In [15]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, auc

import numpy as np
import pandas as pd

%matplotlib inline

In [3]:
data_file = "~/.kaggle/creditcard.csv"

# Data loading

In [6]:
raw = pd.read_csv(data_file)

print(raw.shape)
raw.head()

(284807, 31)


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [8]:
raw.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Time,284807.0,94813.86,47488.145955,0.0,54201.5,84692.0,139320.5,172792.0
V1,284807.0,3.91956e-15,1.958696,-56.40751,-0.920373,0.018109,1.315642,2.45493
V2,284807.0,5.688174e-16,1.651309,-72.715728,-0.59855,0.065486,0.803724,22.057729
V3,284807.0,-8.769071e-15,1.516255,-48.325589,-0.890365,0.179846,1.027196,9.382558
V4,284807.0,2.782312e-15,1.415869,-5.683171,-0.84864,-0.019847,0.743341,16.875344
V5,284807.0,-1.552563e-15,1.380247,-113.743307,-0.691597,-0.054336,0.611926,34.801666
V6,284807.0,2.010663e-15,1.332271,-26.160506,-0.768296,-0.274187,0.398565,73.301626
V7,284807.0,-1.694249e-15,1.237094,-43.557242,-0.554076,0.040103,0.570436,120.589494
V8,284807.0,-1.927028e-16,1.194353,-73.216718,-0.20863,0.022358,0.327346,20.007208
V9,284807.0,-3.137024e-15,1.098632,-13.434066,-0.643098,-0.051429,0.597139,15.594995


In [12]:
X = raw[["Amount"] + ["V%d" % i for i in range(1, 29)]]
y = raw.Class.astype(int)

X.shape, y.shape

((284807, 29), (284807,))

In [11]:
X.columns

Index(['Amount', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28'],
      dtype='object')

In [13]:
y.unique()

array([0, 1])

# Std ML

Now for a bit of RF.

In [14]:
from sklearn.ensemble import RandomForestClassifier


In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

clf = RandomForestClassifier(n_jobs=4, bootstrap=0.7, n_estimators=200)
clf.fit(X_train, y_train)

y_train_hat = clf.predict_proba(X_train)[:,1]
y_test_hat = clf.predict_proba(X_test)[:,1]

roc_train = roc_auc_score(y_train, y_train_hat)
roc_test = roc_auc_score(y_test, y_test_hat)

print("ROC AUC for train = {} and test = {}".format(roc_train, roc_test))

ROC AUC for train = 1.0 and test = 0.9670806518614272


In [34]:
importance = pd.DataFrame(
    {'imp': clf.feature_importances_, 'f': range(X.shape[1])})
importance.sort_values(by="imp", ascending=False).head(15)

Unnamed: 0,imp,f
17,0.17607,17
12,0.129282,12
14,0.12025,14
10,0.084435,10
11,0.082694,11
16,0.057558,16
9,0.035003,9
7,0.026107,7
18,0.025558,18
4,0.025492,4


# Rules gen with skope

In [36]:
from skrules import SkopeRules

In [81]:
feature_names = X_train.columns

# Train a skope-rules-boosting classifier
skope_rules_clf = SkopeRules(feature_names=feature_names, random_state=42, n_estimators=120,
                             n_jobs = 4,
                             recall_min=0.005, precision_min=0.8,
                             max_samples=0.7,
                             bootstrap = 0.6,
                             max_depth_duplication= 4, max_depth = 4)
skope_rules_clf.fit(X_train, y_train)

SkopeRules(bootstrap=0.6, bootstrap_features=False,
      feature_names=Index(['Amount', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28'],
      dtype='object'),
      max_depth=4, max_depth_duplication=4, max_features=1.0,
      max_samples=0.7, max_samples_features=1.0, min_samples_split=2,
      n_estimators=120, n_jobs=4, precision_min=0.8, random_state=42,
      recall_min=0.005, verbose=0)

In [84]:
y_train_hat = skope_rules_clf.score_top_rules(X_train)
y_test_hat = skope_rules_clf.score_top_rules(X_test)

roc_train = roc_auc_score(y_train, y_train_hat)
roc_test = roc_auc_score(y_test, y_test_hat)

print("ROC AUC for train = {} and test = {}".format(roc_train, roc_test))

ROC AUC for train = 0.9127995474570109 and test = 0.9157196942583625


In [85]:
# Get number of survival rules created
print(str(len(skope_rules_clf.rules_)) + ' rules have been built with ' +
      'SkopeRules.\n')

n_to_keep = 10

print('The {} most performing rules are the following one:\n'.format(n_to_keep))
for i_rule, rule in enumerate(skope_rules_clf.rules_[:n_to_keep]):
    print(rule[0])
    print(rule[1])

40 rules have been built with SkopeRules.

The 10 most performing rules are the following one:

V1 > -31.419588088989258 and V14 <= -2.7047367095947266 and V17 <= -2.785845637321472 and V7 <= 0.44506019353866577
(0.9424083769633508, 0.6766917293233082, 2)
V14 <= -2.6140414476394653 and V17 <= -2.75789737701416 and V27 <= 1.6969460844993591 and V7 <= 0.15012067556381226
(0.9502762430939227, 0.6231884057971014, 2)
V11 > 0.39566717483103275 and V17 <= -2.7893006801605225 and V27 <= 1.6969460844993591 and V7 <= 2.3501495718955994
(0.9064039408866995, 0.6366782006920415, 2)
V12 <= -1.7209022641181946 and V14 <= -2.7853721380233765 and V17 <= -2.7690573930740356 and V27 <= 1.7496772408485413
(0.9055555555555556, 0.6293436293436293, 2)
V1 > -31.41292381286621 and V17 <= -2.7965283393859863 and V26 > -0.2658992409706116 and V4 > 1.1661997437477112
(0.9510869565217391, 0.5952380952380952, 2)
V12 <= -4.575529336929321 and V26 > -0.2644702345132828 and V4 > 1.214595913887024 and V9 <= 0.821277722

### Are there any rules with Amount?

In [86]:
[(i, rule[0]) for (i, rule) in enumerate(skope_rules_clf.rules_) if 'Amount' in rule[0]]

[]

In [87]:
def extract_vars(rule_text):
    strings = rule_text.split(' ')
    return {one_string for one_string in strings if one_string[0] in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'}

def var_in_rules(clf, n_rules):
    var_set = set()
    for i in range(n_rules):
        e = extract_vars(skope_rules_clf.rules_[i][0])
        var_set = var_set | e
    return var_set

### Extract variables used in rules, starting from the first and then combining

In [88]:
for i in range(1, len(skope_rules_clf.rules_)):
    vars = sorted(list(var_in_rules(skope_rules_clf, i)))
    print(i, '=> {} / {}'.format(len(vars), str(vars)) )

1 => 4 / ['V1', 'V14', 'V17', 'V7']
2 => 5 / ['V1', 'V14', 'V17', 'V27', 'V7']
3 => 6 / ['V1', 'V11', 'V14', 'V17', 'V27', 'V7']
4 => 7 / ['V1', 'V11', 'V12', 'V14', 'V17', 'V27', 'V7']
5 => 9 / ['V1', 'V11', 'V12', 'V14', 'V17', 'V26', 'V27', 'V4', 'V7']
6 => 10 / ['V1', 'V11', 'V12', 'V14', 'V17', 'V26', 'V27', 'V4', 'V7', 'V9']
7 => 11 / ['V1', 'V11', 'V12', 'V14', 'V17', 'V26', 'V27', 'V3', 'V4', 'V7', 'V9']
8 => 13 / ['V1', 'V10', 'V11', 'V12', 'V14', 'V17', 'V26', 'V27', 'V28', 'V3', 'V4', 'V7', 'V9']
9 => 14 / ['V1', 'V10', 'V11', 'V12', 'V14', 'V16', 'V17', 'V26', 'V27', 'V28', 'V3', 'V4', 'V7', 'V9']
10 => 14 / ['V1', 'V10', 'V11', 'V12', 'V14', 'V16', 'V17', 'V26', 'V27', 'V28', 'V3', 'V4', 'V7', 'V9']
11 => 15 / ['V1', 'V10', 'V11', 'V12', 'V14', 'V16', 'V17', 'V2', 'V26', 'V27', 'V28', 'V3', 'V4', 'V7', 'V9']
12 => 15 / ['V1', 'V10', 'V11', 'V12', 'V14', 'V16', 'V17', 'V2', 'V26', 'V27', 'V28', 'V3', 'V4', 'V7', 'V9']
13 => 15 / ['V1', 'V10', 'V11', 'V12', 'V14', 'V16', 'V1