In [1]:
#install dependencies
# !pip install pyarrow
# !pip install  duckdb --upgrade --pre
# !pip install wittgenstein


In [2]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

from sklearn.model_selection import train_test_split,  GroupShuffleSplit
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, matthews_corrcoef

from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import KBinsDiscretizer, StandardScaler
from sklearn.model_selection import GridSearchCV

import os
import time

## Duck Db imports
import pyarrow.parquet as pq
import duckdb

# developed packages
from utils import create_facts_and_examples, performance_metrics



## Import Data


#### create duck db connection

In [3]:
# Open an in-memory DuckDB database; every later SQL statement in the notebook goes through `con`.
con = duckdb.connect(':memory:')
# enable automatic query parallelization (worker threads capped at 2)
con.execute("PRAGMA threads=2")
# enable caching of parquet metadata
con.execute("PRAGMA enable_object_cache")

<duckdb.duckdb.DuckDBPyConnection at 0x7fad63fda570>

#### Reading PAYSIM1 data and creating parquet file

In [4]:
# (Re)create `fraud_tbl` from a Parquet copy of the PaySim CSV log.
# The CSV -> Parquet conversion only runs while 'fraud.parquet' is missing, and
# CREATE OR REPLACE makes the cell idempotent: it can be re-run on a live kernel
# without failing on an already existing table. Real errors (e.g. a missing CSV)
# are no longer swallowed by a bare `except`.
if not os.path.exists('fraud.parquet'):
    con.execute(
    """
    COPY (SELECT * FROM 'DATA/paysim1/PS_20174392719_1491204439457_log.csv')
    TO 'fraud.parquet' (FORMAT PARQUET);
""")
    print("fraud.parquet created from the PaySim CSV")

con.execute("""CREATE OR REPLACE TABLE fraud_tbl AS SELECT * FROM 'fraud.parquet';""")

#### Flagging external origin and destination accounts, and imputing their zero balances

In [5]:
# Load the transactions into pandas, flagging "external" accounts (balance is zero
# both before and after the transaction) and adding imputed balance columns in
# which those zero balances are replaced by the transaction amount.
df_fraud = con.sql(""" 
SELECT *,
-- evaluating external accounts
    oldbalanceOrg==0 and newbalanceOrig==0 as external_orig,
    oldbalanceDest==0 and newbalanceDest==0 as external_dest,
-- Imputing zero values for external accounts
    CASE WHEN external_orig==True 
        THEN  amount
        ELSE oldbalanceOrg
    END AS oldbalanceOrg_imputed,
    CASE WHEN external_dest==True 
        THEN  amount
        ELSE newbalanceDest
    END AS newbalanceDest_imputed, 
FROM fraud_tbl
""").df()

# testing: on external rows the imputed balance has to equal the amount
slice_index = df_fraud['external_orig'].eq(True)
assert (
    df_fraud.loc[slice_index, 'amount'] == df_fraud.loc[slice_index, 'oldbalanceOrg_imputed']
).all(), "Wrong imputation for external origin"

slice_index = df_fraud['external_dest'].eq(True)
assert (
    df_fraud.loc[slice_index, 'amount'] == df_fraud.loc[slice_index, 'newbalanceDest_imputed']
).all(), "Wrong imputation for external destination"


## Split data to train/test/val 

#### train and test sets

In [6]:
# Hold out 15% of the destination accounts (groups = nameDest) as the test set,
# so that no destination account ends up on both sides of the split.
gss = GroupShuffleSplit(n_splits=1, test_size=0.15, random_state=42)
# n_splits=1: the generator yields exactly one (train, test) pair.
train_index, test_index = next(gss.split(df_fraud, groups=df_fraud['nameDest']))
# split() returns positional row numbers, so select with .iloc; label-based .loc
# is only equivalent while the frame carries a default RangeIndex.
df_train_total = df_fraud.iloc[train_index]
df_test_       = df_fraud.iloc[test_index]

#### validation set is a part of train set

In [None]:
# Renumber the rows 0..n-1 so that the positions returned by split() are also
# valid labels for .loc below.
df_train_total = df_train_total.reset_index(drop=True)

# Carve 15% of the remaining destination accounts out as the validation set.
gss = GroupShuffleSplit(n_splits=1, test_size=0.15, random_state=42)
# n_splits=1 -> a single (train, validation) pair of row indices
train_index, val_index = next(
    gss.split(df_train_total, groups=df_train_total['nameDest'])
)
df_train_, df_val_ = df_train_total.loc[train_index], df_train_total.loc[val_index]

# Preprocessing

#### aggregation definition
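Compute the average and the maximum transfer amount over the last 3 and the last 7 transactions (current one included, ordered by `step`) of each destination account. The windows count rows rather than calendar days, even though the derived column names end in `_days`.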

In [None]:
def aggregates(df):
    """Add rolling per-destination amount statistics to a transaction frame.

    The rows are sorted by `step` and numbered in a new `index` column. DuckDB
    window functions partitioned by `nameDest` then compute the mean and the
    maximum of `amount` over the current row plus the 6 (resp. 2) preceding
    rows, as well as `amount` minus each of those aggregates. The windows count
    rows, not days, despite the `_days` suffix of the deviation columns.

    Returns a new DataFrame ordered by `index` (which stays in the result as a
    column); the input frame is left untouched.
    """
    # The local has to be called `data`: the SQL below selects FROM data.
    data = df.sort_values(by='step').reset_index(drop=True).reset_index()
    return con.sql("""
        SELECT 
            *,

-- calculate aggregetions (Average and Max) for last 7 including the current row for each name Destination
            AVG(amount) OVER (PARTITION BY nameDest ORDER BY index ROWS BETWEEN 6 PRECEDING AND CURRENT ROW) AS mean_last_7,
            MAX(amount) OVER (PARTITION BY nameDest ORDER BY index ROWS BETWEEN 6 PRECEDING AND CURRENT ROW) AS max_last_7,
       
-- calculate aggregetions (Average and Max) for last 3 including the current row for each name Destination
            AVG(amount) OVER (PARTITION BY nameDest ORDER BY index ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS mean_last_3,
            MAX(amount) OVER (PARTITION BY nameDest ORDER BY index ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS max_last_3,

-- deviation from the aggregated values
            amount-mean_last_7 as deviation_from_mean_7_days,
            amount-mean_last_3 as deviation_from_mean_3_days,
            amount-max_last_7 as deviation_from_max_7_days,
            amount-max_last_3 as deviation_from_max_3_days,

        FROM 
            data
            ORDER BY index
            """).df()

#### Preprocessing class: fitted on the training data, then applied to every split

In [None]:
class Preprocess:
    """Scale numeric columns, add rolling aggregates and one-hot encode categories.

    Both the scaler statistics and the categorical levels are learned in `fit`
    (from the training frame only) and re-applied unchanged in `transform`, so
    every split ends up with the same one-hot columns even when a split does not
    contain every category.
    """

    def __init__(self, scaler_columns):
        """
        Args:
            scaler_columns: names of the numeric columns to standardise.
        """
        self.scaler = StandardScaler()

        self.scaler_columns = scaler_columns

        # Category levels learned in fit(); None means "not fitted yet", in which
        # case transform() encodes whatever levels the given frame contains.
        self.type_categories = None
        self.dist_type_categories = None

    def fit(self, train_data_frame):
        """Learn scaling statistics and categorical levels from the training frame."""
        self.scaler.fit(train_data_frame[self.scaler_columns])

        # Sorted so that the dummy columns come out in the same order as
        # pd.get_dummies produces for plain string columns.
        self.type_categories = sorted(train_data_frame['type'].dropna().unique())
        self.dist_type_categories = sorted(
            train_data_frame['nameDest'].str[0].dropna().unique()
        )

    def transform(self, data_frame):
        """Return a preprocessed copy of `data_frame`.

        Steps: standardise `scaler_columns`, add the rolling aggregates computed
        by `aggregates` (on the already scaled amounts), one-hot encode `type`
        and the first character of `nameDest`, then drop the identifier columns.
        The input frame is not modified.
        """
        df = data_frame.copy()

        # scale numerical columns
        df[self.scaler_columns] = self.scaler.transform(df[self.scaler_columns])

        # run aggregation
        df = aggregates(df)

        # one hot encoding and drop columns
        df['distType'] = df["nameDest"].str[0]
        if self.type_categories is not None:
            # Fixed levels: a level absent from this split still gets an
            # (all-False) column; a level unseen during fit encodes as all-False.
            df['type'] = pd.Categorical(df['type'], categories=self.type_categories)
            df['distType'] = pd.Categorical(df['distType'], categories=self.dist_type_categories)
        df = pd.get_dummies(df, columns=['type', "distType"])

        return df.drop(columns=["step", "nameOrig", "nameDest", "isFlaggedFraud"])

    
# Numeric columns handed to the preprocessor as `scaler_columns`.
columns = ['amount', 'oldbalanceOrg_imputed', 'newbalanceOrig', 'oldbalanceDest',
   'newbalanceDest_imputed']

preprocessor = Preprocess(scaler_columns=columns)

# Fit on the training split only (df_train_); the validation and test splits
# are not passed to fit().
preprocessor.fit(df_train_)

#### preprocessing 

In [None]:
# Apply the fitted preprocessor to each split; the raw frames (trailing
# underscore) are kept as they are because transform() works on a copy.
df_train = preprocessor.transform(df_train_)
df_test = preprocessor.transform(df_test_)
df_val = preprocessor.transform(df_val_)

#### performance

In [None]:
def performance(model, x_test, y_test, title=''):
    """Predict on `x_test` with `model` and report the metrics against `y_test`.

    The predictions are passed to `performance_metrics` together with the label
    order [True, False] and the given `title`. Nothing is returned.
    """
    performance_metrics(
        model.predict(x_test),
        y_test,
        labels=[True, False],
        title=title,
    )

## Decision Tree

In [None]:
# Separate the target column `isFraud` from the predictors of each split.
X_train, y_train = df_train.drop(columns=['isFraud']), df_train['isFraud']
X_test, y_test = df_test.drop(columns=['isFraud']), df_test['isFraud']
X_val, y_val = df_val.drop(columns=['isFraud']), df_val['isFraud']

# Columns the decision tree is trained and evaluated on.
features = [
    'amount',
    'external_dest',
    'oldbalanceOrg_imputed',
    'newbalanceOrig',
    'oldbalanceDest',
    'newbalanceDest_imputed',
    'deviation_from_max_7_days',
    'deviation_from_max_3_days',
    'type_CASH_IN',
    'type_CASH_OUT',
    'type_DEBIT',
    'type_PAYMENT',
    'type_TRANSFER',
]

In [None]:
# %%script true
# (presumably a disabled cell magic kept as a toggle for skipping this cell - confirm)
# hyper parameter tuning
dtc_model=DecisionTreeClassifier(random_state=42)

# Candidate tree sizes: max_leaf_nodes = 3, 4, ..., 8.
parameters = {'max_leaf_nodes': range(3,9,1)}

gs_clf = GridSearchCV(dtc_model, parameters, cv=5, scoring='matthews_corrcoef')

# NOTE(review): the 5-fold search runs on the validation split only; the training
# split is used further down to refit the selected estimator.
gs_clf.fit(X_val[features], y_val)
dtc_model=gs_clf.best_estimator_
print(gs_clf.best_estimator_)
print('grid search score, cv=5,', gs_clf.best_score_)
# Refit the selected estimator on the training split. `dtc_model` is the same
# object as `gs_clf.best_estimator_`, so that attribute is refitted as well.
dtc_model.fit(X_train[features],y_train)

In [None]:
# Draw the fitted tree, labelling the split nodes with the training feature names.
plot_tree(dtc_model, feature_names=dtc_model.feature_names_in_, filled=True, fontsize=4, class_names=True)

In [None]:
# Report the tuned tree on the held-out test split, then on its own training split.
performance(dtc_model, x_test=X_test[features], y_test=y_test, title='dtc performance test set')
performance(dtc_model, x_test=X_train[features], y_test=y_train, title='dtc performance train set')


In [None]:
def create_cart_predicates(cart_model, input_df, negation=True):
    """Turn the split rules of a fitted decision tree into boolean predicate columns.

    Every distinct (feature, threshold) pair used by a split node of the tree
    becomes a column named '{feature, threshold}' holding
    `input_df[feature] <= threshold`. With `negation=True` a companion column
    'NOT{feature, threshold}' holding `input_df[feature] > threshold` follows it.

    Args:
        cart_model: fitted tree exposing `tree_.feature`, `tree_.threshold` and
            `feature_names_in_`.
        input_df: frame containing the tree's feature columns and 'isFraud'.
        negation: whether to add the 'NOT...' columns.

    Returns:
        (df, predicates): the predicate frame with 'isFraud' as its last column,
        and the Index of predicate column names (every column but 'isFraud').
    """
    tree = cart_model.tree_
    # Only nodes with a non-negative feature id are kept (i.e. actual splits).
    is_split = tree.feature >= 0
    names = cart_model.feature_names_in_[tree.feature[is_split]]
    thresholds = tree.threshold[is_split]

    # Cast to built-in str/float before formatting: NumPy >= 2 renders its scalars
    # as e.g. "np.float64(0.5)", which would leak into the column labels.
    rules = sorted({(str(name), float(thr)) for name, thr in zip(names, thresholds)})

    # Collect the columns first and build the frame once, instead of growing a
    # DataFrame one column at a time.
    columns = {}
    for name, threshold in rules:
        label = str((name, threshold)).replace('(', '{').replace(')', '}').replace('\'', '')
        columns[label] = input_df[name] <= threshold
        if negation:
            columns['NOT' + label] = input_df[name] > threshold

    df = pd.DataFrame(columns, index=input_df.index)
    df['isFraud'] = input_df['isFraud']
    predicates = df.columns[:-1]
    return df, predicates

# Predicate tables with the negated ('NOT...') columns included.
train_predicates_cart, predicates = create_cart_predicates(dtc_model, df_train)
test_predicates_cart,a = create_cart_predicates(dtc_model, df_test)

create_facts_and_examples(df_=train_predicates_cart, target='isFraud', \
                          predicates=predicates, output_dir='examples/fraud-cart')

test_predicates_cart.to_parquet(path='examples/fraud-cart/df_test.parquet')

## No Negation

# CAUTION: `predicates` is rebound here, so from this line on it holds the
# no-negation predicate names only.
train_predicates_cart_no_negation, predicates = create_cart_predicates(dtc_model, df_train, negation=False)
test_predicates_cart_no_negation,a = create_cart_predicates(dtc_model, df_test, negation=False)

create_facts_and_examples(df_=train_predicates_cart_no_negation, target='isFraud', \
                          predicates=predicates, output_dir='examples/fraud-cart-no-negation')
test_predicates_cart_no_negation.to_parquet(path='examples/fraud-cart-no-negation/df_test.parquet')


In [None]:
#short version
df_1 = train_predicates_cart[train_predicates_cart['isFraud']==0]
df_2 = train_predicates_cart[train_predicates_cart['isFraud']==1]
fraud_background = pd.concat([df_1.iloc[0:100], df_2.iloc[0:100]], ignore_index=True)


create_facts_and_examples(df_=fraud_background, target='isFraud', \
                          predicates=predicates, output_dir='examples/fraud-cart-short')

test_predicates_cart.to_parquet(path='examples/fraud-cart-short/df_test.parquet')


## No Negation

df_1 = train_predicates_cart_no_negation[train_predicates_cart_no_negation['isFraud']==0]
df_2 = train_predicates_cart_no_negation[train_predicates_cart_no_negation['isFraud']==1]
fraud_background = pd.concat([df_1.iloc[0:100], df_2.iloc[0:100]], ignore_index=True)


create_facts_and_examples(df_=fraud_background, target='isFraud', \
                          predicates=predicates, output_dir='examples/fraud-cart-short-no-negation')

test_predicates_cart_no_negation.to_parquet(path='examples/fraud-cart-short-no-negation/df_test.parquet')

In [None]:
import copy
# Balanced sample: first 100 non-fraud and first 100 fraud rows of the training split.
df_1 = df_train[df_train['isFraud']==0]
df_2 = df_train[df_train['isFraud']==1]
fraud_background_100_100 = pd.concat([df_1.iloc[0:100], df_2.iloc[0:100]], ignore_index=True)
# Shallow copy of the tuned tree (hyper-parameters carried over), refitted on the
# small sample only.
dtc_model_100_100 = copy.copy(dtc_model)
dtc_model_100_100.fit(fraud_background_100_100[features], fraud_background_100_100['isFraud'])
# Small-sample tree on the test split, then both trees on the 100/100 sample itself.
performance(dtc_model_100_100, x_test=X_test[features], y_test=y_test, title='dtc_model_100_100 performance test set')
performance(dtc_model, x_test=fraud_background_100_100[features], y_test=fraud_background_100_100['isFraud'], title='dtc_model performance train set 100/100')
performance(dtc_model_100_100, x_test=fraud_background_100_100[features], y_test=fraud_background_100_100['isFraud'], title='dtc_model_100_100 performance train set 100/100')


In [None]:
#short version 1000-10: first 1000 non-fraud + first 10 fraud rows of the CART predicate table
df_1 = train_predicates_cart[train_predicates_cart['isFraud']==0]
df_2 = train_predicates_cart[train_predicates_cart['isFraud']==1]
fraud_background = pd.concat([df_1.iloc[0:1000], df_2.iloc[0:10]], ignore_index=True)
# Take the predicate names from the frame itself (every column but the trailing
# 'isFraud'). The global `predicates` was last bound to the no-negation names,
# which would silently leave the NOT... columns out of this export.
create_facts_and_examples(df_=fraud_background, target='isFraud', \
                          predicates=fraud_background.columns[:-1], output_dir='examples/fraud-cart-short-1000-10')

test_predicates_cart.to_parquet(path='examples/fraud-cart-short-1000-10/df_test.parquet')

## No Negation

df_1 = train_predicates_cart_no_negation[train_predicates_cart_no_negation['isFraud']==0]
df_2 = train_predicates_cart_no_negation[train_predicates_cart_no_negation['isFraud']==1]
fraud_background = pd.concat([df_1.iloc[0:1000], df_2.iloc[0:10]], ignore_index=True)

# Same rule: the predicate names come from the frame that is being exported.
create_facts_and_examples(df_=fraud_background, target='isFraud', \
                          predicates=fraud_background.columns[:-1], output_dir='examples/fraud-cart-short-1000-10-no-negation')

test_predicates_cart_no_negation.to_parquet(path='examples/fraud-cart-short-1000-10-no-negation/df_test.parquet')

In [None]:
# Imbalanced sample: first 1000 non-fraud and first 10 fraud rows of the training split.
df_1 = df_train[df_train['isFraud']==0]
df_2 = df_train[df_train['isFraud']==1]
fraud_background_1000_10 = pd.concat([df_1.iloc[0:1000], df_2.iloc[0:10]], ignore_index=True)
# Shallow copy of the tuned tree (hyper-parameters carried over), refitted on the
# small sample only. `copy` is imported in the 100/100 cell above.
dtc_model_1000_10 = copy.copy(dtc_model)
dtc_model_1000_10.fit(fraud_background_1000_10[features], fraud_background_1000_10['isFraud'])

performance(dtc_model_1000_10, x_test=X_test[features], y_test=y_test, title='dtc_model_1000_10 performance test set')
performance(dtc_model, x_test=fraud_background_1000_10[features], y_test=fraud_background_1000_10['isFraud'], title='dtc_model performance train set 1000/10')
# Evaluate the 1000/10 tree on its own training sample. This call used to score the
# 100/100 sample under a "100/100" title, a copy-paste slip from the previous cell.
performance(dtc_model_1000_10, x_test=fraud_background_1000_10[features], y_test=fraud_background_1000_10['isFraud'], title='dtc_model_1000_10 performance train set 1000/10')


## Prepare background knowledge, positive, and negative examples for DILP based on symbolic regression rule 

In [None]:
def create_symbolic_predicates(input_df, max_7_deviation_threshold=-0.15):
    """Build the boolean predicate table for the symbolic-regression rule.

    The rule being encoded is:
      * type = transfer, and
      * externalDest = True, and
      * amount - maxDest7 > -0.15

    A copy of `input_df` is returned in which 'deviation_from_max_7_days' is
    replaced by the boolean `deviation_from_max_7_days > max_7_deviation_threshold`;
    all other columns are carried over unchanged and the input is not modified.

    Args:
        input_df: frame containing at least 'deviation_from_max_7_days'.
        max_7_deviation_threshold: cut-off applied to the deviation column;
            defaults to the rule's -0.15.

    Returns:
        (fraud_background, predicates): the modified copy and the list of column
        names to be used as predicates.
    """
    fraud_background = input_df.copy()
    fraud_background['deviation_from_max_7_days'] = (
        fraud_background['deviation_from_max_7_days'] > max_7_deviation_threshold
    )

    predicates = ['external_dest', 'type_CASH_IN', 'type_CASH_OUT', 'type_DEBIT', 'type_PAYMENT',
                  'type_TRANSFER', 'deviation_from_max_7_days']

    return fraud_background, predicates

In [None]:
# Symbolic-rule predicate tables for the training and the test split.
train_predicates_symb, predicates_symb = create_symbolic_predicates(df_train)
test_predicates_symb, a = create_symbolic_predicates(df_test)

# Export built from the full training split.
create_facts_and_examples(df_=train_predicates_symb, target='isFraud', 
                          predicates=predicates_symb, output_dir='examples/fraud-symb-full')
test_predicates_symb.to_parquet(path='examples/fraud-symb-full/df_test.parquet')


# Balanced export: first 100 non-fraud and first 100 fraud rows.
df_1 = train_predicates_symb[train_predicates_symb['isFraud']==0]
df_2 = train_predicates_symb[train_predicates_symb['isFraud']==1]
fraud_background = pd.concat([df_1.iloc[0:100], df_2.iloc[0:100]], ignore_index=True)


create_facts_and_examples(df_=fraud_background, target='isFraud', 
                          predicates=predicates_symb, output_dir='examples/fraud-symb-100-100')

# The complete test table is written next to the sampled training export.
test_predicates_symb.to_parquet(path='examples/fraud-symb-100-100/df_test.parquet')


In [None]:
# Recomputed from df_train / df_test so that this cell does not rely on the previous one.
train_predicates_symb, predicates_symb = create_symbolic_predicates(df_train)
test_predicates_symb,a = create_symbolic_predicates(df_test)


# Imbalanced export: first 1000 non-fraud and first 10 fraud rows. Note that the
# output directory spells this '10-1000', whereas the CART export uses '1000-10'.
df_1 = train_predicates_symb[train_predicates_symb['isFraud']==0]
df_2 = train_predicates_symb[train_predicates_symb['isFraud']==1]
fraud_background = pd.concat([df_1.iloc[0:1000], df_2.iloc[0:10]], ignore_index=True)


create_facts_and_examples(df_=fraud_background, target='isFraud', 
                          predicates=predicates_symb, output_dir='examples/fraud-symb-10-1000')

test_predicates_symb.to_parquet(path='examples/fraud-symb-10-1000/df_test.parquet')

In [None]:
# Deliberate stop: this assertion always fails, so "Run All" halts here and the
# RIPPER section below only runs when its cells are executed by hand.
assert 0, "break before ripper"

In [None]:
# Count the test rows flagged as both TRANSFER and CASH_OUT.
# NOTE(review): both columns are dummies of the same `type` variable, so the
# expected count is presumably 0 - confirm.
np.sum(test_predicates_symb['type_TRANSFER']&test_predicates_symb['type_CASH_OUT'])

## RIPPER

In [None]:
import wittgenstein as lw

In [None]:
#https://pypi.org/project/wittgenstein/
#https://www.geeksforgeeks.org/ripper-algorithm/
#https://github.com/imoscovitz/wittgenstein#useful-references
ripper_clf = lw.RIPPER() # Or irep_clf = lw.IREP() to build a model using IREP
# Fitted on every column of X_train, not only on the `features` list used for the tree.
# NOTE(review): no seed is passed to RIPPER() - check whether the rule set is
# reproducible across runs.
ripper_clf.fit(X_train,y_train) # Or pass X and y data to .fit
ripper_clf

In [None]:
# Show the rule set of the fitted RIPPER model.
ripper_clf.out_model()

In [None]:
# Evaluate the RIPPER model on the test split (all columns of X_test; title left at its default).
performance(ripper_clf, x_test=X_test, y_test=y_test)