<a href="https://colab.research.google.com/github/tpprymjmdr/notebooks/blob/master/BrainFinance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import Relevant Modules

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.combine import SMOTEENN
from collections import Counter

## Fraud Detection

#### Get The Data

In [275]:
df = pd.read_csv('fraud_prep.csv.gz')
df.shape

(284807, 31)

#### Preprocess

In [276]:
print(df.dtypes.unique().tolist())

[dtype('float64'), dtype('int64')]


In [277]:
dtypes = df.dtypes
dtypes[dtypes == np.int64].index.tolist()

['Class']

In [278]:
nulls = df.isnull().sum()
nulls[nulls > 0]

Series([], dtype: int64)

In [0]:
df.drop('Time', inplace=True, axis=1)

In [280]:
df.drop_duplicates(inplace=True)
df.shape

(275663, 30)

#### Rectify class imbalance using SMOTE and undersampling

In [281]:
df['Class'].value_counts()

0    275190
1       473
Name: Class, dtype: int64

In [282]:
rows_to_keep = df.loc[df['Class'] == 1].index.tolist() \
               + np.random.choice(df.loc[df['Class'] == 0].index, 
                                  int(0.5 * df['Class'].value_counts()[0]), 
                                  replace=False).tolist()
len(rows_to_keep)

138068

In [0]:
df = df.loc[rows_to_keep]
y = df.pop('Class').values
X = df.values

In [0]:
X_res, y_res = SMOTE().fit_resample(X, y)

In [286]:
X_res.shape

(275190, 29)

In [287]:
Counter(y_res)

Counter({0: 137595, 1: 137595})

#### Split into training and test

In [291]:
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, stratify=y_res)
y_train.shape, y_test.shape

((220152,), (55038,))

#### Train using Random Forest

In [301]:
rfc = RandomForestClassifier()
grid_param = {  
    'n_estimators': [500],
    'bootstrap': [True],
    'max_depth': [10]
}

grid_search = GridSearchCV(estimator=rfc,  
                           param_grid=grid_param,
                           cv=3,
                           scoring='roc_auc',
                           n_jobs=-1,
                           verbose=3)

grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed: 12.2min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid

In [303]:
rfc = grid_search.best_estimator_
rfc.fit(X_train, y_train)
rfc.score(X_test, y_test)

0.9875177150332498

#### Train using Logistic Regression

In [308]:
lrc = LogisticRegression()
grid_param = {  
    'penalty': ['l2'],
    'fit_intercept': [True],
    'C': [10.]
}

grid_search = GridSearchCV(estimator=lrc,  
                           param_grid=grid_param,
                           cv=3,
                           scoring='roc_auc',
                           n_jobs=-1,
                           verbose=3)

grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   16.2s finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'C': [10.0], 'fit_intercept': [True],
                         'penalty': ['l2']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='roc_auc', verbose=3)

In [307]:
lrc = grid_search.best_estimator_
lrc.fit(X_train, y_train)
lrc.score(X_test, y_test)



0.9592463388931284

## Crime Dataset

#### Read the dataset

In [152]:
df = pd.read_csv('crime_prep.csv.gz')
df.shape

(1994, 128)

In [153]:
df.head(1)

Unnamed: 0,target,v_cont_0,v_cat_0,v_cat_1,v_cat_2,v_cat_3,v_cont_5,v_cont_6,v_cont_7,v_cont_8,v_cont_9,v_cont_10,v_cont_11,v_cont_12,v_cont_13,v_cont_14,v_cont_15,v_cont_16,v_cont_17,v_cont_18,v_cont_19,v_cont_20,v_cont_21,v_cont_22,v_cont_23,v_cont_24,v_cont_25,v_cont_26,v_cont_27,v_cont_28,v_cont_29,v_cont_30,v_cont_31,v_cont_32,v_cont_33,v_cont_34,v_cont_35,v_cont_36,v_cont_37,v_cont_38,...,v_cont_87,v_cont_88,v_cont_89,v_cont_90,v_cont_91,v_cont_92,v_cont_93,v_cont_94,v_cont_95,v_cont_96,v_cont_97,v_cont_98,v_cont_99,v_cont_100,v_cont_101,v_cont_102,v_cont_103,v_cont_104,v_cont_105,v_cont_106,v_cont_107,v_cont_108,v_cont_109,v_cont_110,v_cont_111,v_cont_112,v_cont_113,v_cont_114,v_cont_115,v_cont_116,v_cont_117,v_cont_118,v_cont_119,v_cont_120,v_cont_121,v_cont_122,v_cont_123,v_cont_124,v_cont_125,v_cont_126
0,0.2,8,,,Lakewoodcity,1,0.19,0.33,0.02,0.9,0.12,0.17,0.34,0.47,0.29,0.32,0.2,1.0,0.37,0.72,0.34,0.6,0.29,0.15,0.43,0.39,0.4,0.39,0.32,0.27,0.27,0.36,0.41,0.08,0.19,0.1,0.18,0.48,0.27,0.68,...,0.36,0.35,0.38,0.34,0.38,0.46,0.25,0.04,0.0,0.12,0.42,0.5,0.51,0.64,0.03,0.13,0.96,0.17,0.06,0.18,0.44,0.13,0.94,0.93,0.03,0.07,0.1,0.07,0.02,0.57,0.29,0.12,0.26,0.2,0.06,0.04,0.9,0.5,0.32,0.14


#### Drop duplicates and columns with more than 25% null values, then impute nulls with mean for the remaining columns

In [154]:
df.drop_duplicates(inplace=True)
df.shape

(1994, 128)

In [0]:
null_perc = df.isnull().sum() * 100 / df.shape[0]
cols_to_keep = null_perc[null_perc < 25].index.tolist()
df = df[cols_to_keep]
empty_cols = null_perc[(null_perc < 25) & (null_perc > 0)].index.tolist()
for col in empty_cols:
    df[col].fillna(df[col].mean(), inplace=True)

#### Find highly correlated columns with the target variable, and drop them

In [157]:
corr_matrix = df.corr().abs()
highly_correlated_cols = corr_matrix[corr_matrix.target > 0.70].index.tolist()
highly_correlated_cols

['target', 'v_cont_48', 'v_cont_49', 'v_cont_55']

In [0]:
y = df['target'].values
df.drop(highly_correlated_cols, inplace=True, axis=1)

#### Find other pairs of correlated columns, and drop them

In [159]:
corr_matrix = df.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.75)]
print(len(to_drop))

57


In [160]:
df.drop(to_drop, inplace=True, axis=1)
df.shape

(1994, 43)

#### Find categorial columns, and created dummy variables for them

In [161]:
dtypes = df.dtypes
non_float_cols = dtypes[dtypes != np.float].index.tolist()
cols_to_drop = []
cat_cols = []
for col in non_float_cols:
    if df[col].nunique() / df.shape[0] > 0.9:
        cols_to_drop.append(col)
    elif dtypes.loc[col] == np.int and df[col].nunique() < 20:
        cat_cols.append(col)
    
cols_to_drop, cat_cols

(['v_cat_2'], ['v_cat_3'])

In [162]:
df.drop(cols_to_drop, axis=1, inplace=True)
num_cols = set(df.columns.tolist()).difference(set(cat_cols))
X = df[num_cols].values
X.shape

(1994, 41)

#### Stack with numeric columns to get the predictor matrix

In [163]:
for col in cat_cols:
    X_cat = pd.get_dummies(df[col], drop_first=True).values
    X = np.hstack([X, X_cat])
X.shape

(1994, 50)

#### Split into train and test sets

In [166]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
y_train.shape, y_test.shape

((1595,), (399,))

#### Do grid search on Random Forest

In [0]:
rfr = RandomForestRegressor()
grid_param = {  
    'n_estimators': [100, 500, 1000],
    'bootstrap': [True, False],
    'max_depth': [10, None]
}

grid_search = GridSearchCV(estimator=rfr,  
                           param_grid=grid_param,
                           cv=3,
                           scoring='r2',
                           n_jobs=-1,
                           verbose=3)

grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:  4.6min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=RandomForestRegressor(bootstrap=True, criterion='mse',
                                             max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators='warn', n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'bootstrap': [True, False], 'max_

In [198]:
grid_search.best_params_, grid_search.best_score_

({'bootstrap': True, 'max_depth': None, 'n_estimators': 1000},
 0.6258873378751005)

#### Do grid search on Lasso regression

In [217]:
lasso = Lasso()
grid_param = {  
    'fit_intercept': [True, False],
    'normalize': [True, False],
    'tol': [1e-5],
    'alpha': [0, 0.01, 0.1, 1., 10., 100, ]
}

grid_search = GridSearchCV(estimator=lasso,  
                           param_grid=grid_param,
                           cv=3,
                           scoring='r2',
                           n_jobs=-1,
                           verbose=3)

grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:    0.5s finished
  self.best_estimator_.fit(X, y, **fit_params)
  positive)
  positive)


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True,
                             max_iter=1000, normalize=False, positive=False,
                             precompute=False, random_state=None,
                             selection='cyclic', tol=0.0001, warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'alpha': [0, 0.01, 0.1, 1.0, 10.0, 100],
                         'fit_intercept': [True, False],
                         'normalize': [True, False], 'tol': [1e-05]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='r2', verbose=3)

In [218]:
grid_search.best_params_, grid_search.best_score_

({'alpha': 0, 'fit_intercept': True, 'normalize': False, 'tol': 1e-05},
 0.625799539016251)

#### Pick the model with the best score, and evaluate on the test set

In [219]:
lasso = grid_search.best_estimator_
lasso.fit(X_train, y_train)
lasso.score(X_test, y_test)

  
  positive)
  positive)


0.6550884956913887