# Import necessary packages and load the dataset


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from xgboost import XGBClassifier

# Seed NumPy's global random number generator.
np.random.seed(1)

# Read the Universal Bank CSV directly from its public GitHub URL.
DATA_URL = 'https://github.com/timcsmith/MIS536-Public/raw/master/Data/UniversalBank.csv'
df = pd.read_csv(DATA_URL)

# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,0
3,4,35,9,100,94112,1,2.7,2,0,0,0,0,0,0
4,5,35,8,45,91330,4,1.0,2,0,0,0,0,0,1


# Explore the dataset

# Preview the first five rows, the column names, summary statistics, and the dtype / non-null information of the dataset


In [2]:
# Quick look at the data: first rows, column labels and summary statistics.
print(df.head())
print(df.columns)
print(df.describe())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()

   ID  Age  Experience  Income  ZIP Code  Family  CCAvg  Education  Mortgage  \
0   1   25           1      49     91107       4    1.6          1         0   
1   2   45          19      34     90089       3    1.5          1         0   
2   3   39          15      11     94720       1    1.0          1         0   
3   4   35           9     100     94112       1    2.7          2         0   
4   5   35           8      45     91330       4    1.0          2         0   

   Personal Loan  Securities Account  CD Account  Online  CreditCard  
0              0                   1           0       0           0  
1              0                   1           0       0           0  
2              0                   0           0       0           0  
3              0                   0           0       0           0  
4              0                   0           0       0           1  
Index(['ID', 'Age', 'Experience', 'Income', 'ZIP Code', 'Family', 'CCAvg',
       'Education'

# Based on findings from data exploration, we need to clean up the column names, as there are some leading whitespace characters. We also drop the ID and ZIP Code columns so they are not used as predictors.


In [3]:
# Trim surrounding whitespace from every column label.
df.columns = df.columns.str.strip()
df.columns

# Remove ID and ZIP Code; every remaining non-target column is later used as a predictor.
df = df.drop(['ID', 'ZIP Code'], axis=1)

# Translate the Education categories into dummy variables


In [4]:
# One-hot encode Education in a single call: the original column is replaced by
# Edu_* indicator columns appended at the end, and drop_first removes the first
# level so only Edu_2 and Edu_3 remain.
df = pd.get_dummies(df, columns=['Education'], prefix='Edu', drop_first=True)

df.head(3)

Unnamed: 0,Age,Experience,Income,Family,CCAvg,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard,Edu_2,Edu_3
0,25,1,49,4,1.6,0,0,1,0,0,0,0,0
1,45,19,34,3,1.5,0,0,1,0,0,0,0,0
2,39,15,11,1,1.0,0,0,0,0,0,0,0,0


# Construct datasets for analysis


In [5]:
# The label column; every other column of df is treated as a predictor.
target = 'Personal Loan'
predictors = [name for name in df.columns if name != target]

# Feature matrix and label vector used by the split below.
y = df[target]
X = df[predictors]

# Create the training set and the test set 


In [6]:
# Hold out 30% of the rows as the test set; random_state=1 fixes the shuffle.
split_parts = train_test_split(X, y, test_size=0.3, random_state=1)
X_train, X_test, y_train, y_test = split_parts

# Random forest with randomized search cv


In [7]:
# Base estimator handed to the hyperparameter search; only the seed is fixed
# here, every other setting is left at the library default.
rf = RandomForestClassifier(
    random_state=1,
)

# Specify the hyperparameter distributions to be searched


In [8]:
# Candidate values for each RandomForestClassifier hyperparameter; the
# randomized search samples combinations from these lists.
param_dist = dict(
    n_estimators=[10, 50, 100, 200, 500],
    max_features=['sqrt', 'log2'],
    max_depth=[5, 10, 20, 30, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
)

# Create the randomized search cv object with 10 iterations


In [9]:
# Randomized search over param_dist: 10 sampled candidates, each scored with
# 5-fold cross-validation; random_state=1 fixes which candidates are sampled.
# NOTE(review): no `scoring` is passed, so candidates are ranked with the
# estimator's default score (accuracy for classifiers) even though the report
# below highlights recall -- confirm this is intended.
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=10,
    cv=5,
    random_state=1,
)

# Fit the randomized search cv object


In [10]:
# Run the search on the training split. The return value is bound to `_` so the
# cell has no trailing expression to display.
_ = random_search.fit(X=X_train, y=y_train)

# Get the best hyperparameters and the corresponding model


In [11]:
# Estimator built from the best-scoring candidate of the search.
best_rf = random_search.best_estimator_
# Hyperparameter combination that produced that estimator.
best_params = random_search.best_params_

# Evaluate the performance of the best model on the test set


In [12]:
# Predict on the held-out test set.
y_pred = best_rf.predict(X_test)

# (label, metric function) pairs in report order; recall comes first and is
# flagged with ">>".
report_rows = (
    ('>> Recall Score:', recall_score),
    ('Accuracy Score: ', accuracy_score),
    ('Precision Score: ', precision_score),
    ('F1 Score: ', f1_score),
)

print(f"{'Model':^18}{'Score':^18}")
print("************************************")
for label, metric in report_rows:
    # Labels are left-aligned in an 18-character column, followed by the raw score.
    print(f"{label:18}{metric(y_test, y_pred)}")

      Model             Score       
************************************
>> Recall Score:  0.8590604026845637
Accuracy Score:   0.982
Precision Score:  0.9552238805970149
F1 Score:         0.9045936395759717


# Get the recall score of the best model


In [13]:
# Recall of the tuned forest on the test set, kept in a variable and reported
# to four decimal places.
best_rf_recall = recall_score(y_true=y_test, y_pred=y_pred)
print(f"Best RandomForestClassifier Recall Score: {best_rf_recall:.4f}")

Best RandomForestClassifier Recall Score: 0.8591


# Conclusion:

#Based on the evaluation metrics, the RandomForestClassifier model performed well on the given dataset with an accuracy score of 0.982. The model achieved a high precision score of 0.9552 and a relatively high recall score of 0.8591. The F1 score of 0.9046 indicates a good balance between precision and recall.

#Overall, the model seems to perform well in predicting the `Personal Loan` target. However, there is still room for improvement, particularly in recall: about 14% of the positive cases in the test set are missed. One possible approach would be to collect more data, which would allow the model to learn more complex patterns and improve its accuracy. Another approach could be to try out different algorithms or ensemble models to further enhance the performance of the model.

#In addition, the current model is trained on a static dataset and may not perform well on new or unseen data. Therefore, it is crucial to regularly update the model with new data to ensure that it stays accurate and effective in predicting the `Personal Loan` target.