In [28]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [4]:
train_url = "./Data/Train/train_data.csv"
test_url = "./Data/Test/test_data.csv"

train_data = pd.read_csv(train_url)
test_data = pd.read_csv(test_url)

print(train_data.shape)
print(test_data.shape)
train_data.head()

(800000, 8)
(200000, 8)


Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
0,5.572427,4.850337,1.14987,1.0,0.0,0.0,0.0,0.0
1,5.32753,3.933291,1.682739,1.0,1.0,0.0,1.0,0.0
2,78.945633,0.023222,2.740935,1.0,1.0,0.0,1.0,0.0
3,10.119026,26.548445,1.945075,1.0,0.0,0.0,0.0,0.0
4,33.646388,1.90924,0.888281,1.0,0.0,0.0,0.0,0.0


In [17]:
train_X, train_Y = train_data.drop("fraud", axis=1), train_data["fraud"]
test_X, test_Y = test_data.drop("fraud", axis=1), test_data["fraud"]
print("Train Data Shape", train_X.shape, train_Y.shape)
print(train_Y.value_counts())
print("Test Data Shape", test_X.shape, test_Y.shape)
print(test_Y.value_counts())
train_X.head()

Train Data Shape (800000, 7) (800000,)
0.0    730181
1.0     69819
Name: fraud, dtype: int64
Test Data Shape (200000, 7) (200000,)
0.0    182416
1.0     17584
Name: fraud, dtype: int64


Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order
0,5.572427,4.850337,1.14987,1.0,0.0,0.0,0.0
1,5.32753,3.933291,1.682739,1.0,1.0,0.0,1.0
2,78.945633,0.023222,2.740935,1.0,1.0,0.0,1.0
3,10.119026,26.548445,1.945075,1.0,0.0,0.0,0.0
4,33.646388,1.90924,0.888281,1.0,0.0,0.0,0.0


In [21]:
clf = RandomForestClassifier(n_estimators=100, max_depth=5)

In [22]:
clf.fit(train_X, train_Y)

RandomForestClassifier(max_depth=5)

In [24]:
pred_Y = clf.predict(test_X)
accuracy_score(test_Y, pred_Y)

0.999655

In [27]:
confusion_matrix(test_Y, pred_Y)

array([[182416,      0],
       [    69,  17515]], dtype=int64)

In [41]:
n_estimators_range = range(5, 51, 5)
max_depth_range = range(5, 20, 5)
param_grid ={'n_estimators': n_estimators_range, 'max_depth': max_depth_range}

clf_grid = GridSearchCV(estimator=RandomForestClassifier(),
             param_grid=param_grid, scoring='accuracy', verbose=3)
clf_grid.fit(train_X, train_Y)

print(f'Best score obtained with RFC: {rfc_grid.best_score_}')
print('Best Params', rfc_grid.best_params_)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV 1/5] END .......max_depth=5, n_estimators=5;, score=0.987 total time=   3.3s
[CV 2/5] END .......max_depth=5, n_estimators=5;, score=0.998 total time=   3.0s
[CV 3/5] END .......max_depth=5, n_estimators=5;, score=0.983 total time=   3.4s
[CV 4/5] END .......max_depth=5, n_estimators=5;, score=0.997 total time=   3.9s
[CV 5/5] END .......max_depth=5, n_estimators=5;, score=0.986 total time=   3.5s
[CV 1/5] END ......max_depth=5, n_estimators=10;, score=0.999 total time=   6.7s
[CV 2/5] END ......max_depth=5, n_estimators=10;, score=0.982 total time=   6.4s
[CV 3/5] END ......max_depth=5, n_estimators=10;, score=0.999 total time=   6.7s
[CV 4/5] END ......max_depth=5, n_estimators=10;, score=1.000 total time=   7.0s
[CV 5/5] END ......max_depth=5, n_estimators=10;, score=0.999 total time=   6.7s
[CV 1/5] END ......max_depth=5, n_estimators=15;, score=0.997 total time=  10.0s
[CV 2/5] END ......max_depth=5, n_estimators=15

[CV 2/5] END ......max_depth=15, n_estimators=5;, score=1.000 total time=   4.5s
[CV 3/5] END ......max_depth=15, n_estimators=5;, score=1.000 total time=   4.1s
[CV 4/5] END ......max_depth=15, n_estimators=5;, score=1.000 total time=   5.0s
[CV 5/5] END ......max_depth=15, n_estimators=5;, score=1.000 total time=   4.9s
[CV 1/5] END .....max_depth=15, n_estimators=10;, score=1.000 total time=   8.9s
[CV 2/5] END .....max_depth=15, n_estimators=10;, score=1.000 total time=   8.1s
[CV 3/5] END .....max_depth=15, n_estimators=10;, score=1.000 total time=   9.4s
[CV 4/5] END .....max_depth=15, n_estimators=10;, score=1.000 total time=   8.7s
[CV 5/5] END .....max_depth=15, n_estimators=10;, score=1.000 total time=   9.3s
[CV 1/5] END .....max_depth=15, n_estimators=15;, score=1.000 total time=  12.1s
[CV 2/5] END .....max_depth=15, n_estimators=15;, score=1.000 total time=  13.4s
[CV 3/5] END .....max_depth=15, n_estimators=15;, score=1.000 total time=  13.7s
[CV 4/5] END .....max_depth=

NameError: name 'rfc_grid' is not defined

In [42]:
print(f'Best score obtained with RFC: {clf_grid.best_score_}')
print('Best Params', clf_grid.best_params_)

Best score obtained with RFC: 0.999995
Best Params {'max_depth': 15, 'n_estimators': 15}
