In [40]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Inbalanced Classes
## In this lab, we are going to explore a case of imbalanced classes. 


Like we disussed in class, when we have noisy data, if we are not careful, we can end up fitting our model to the noise in the data and not the 'signal'-- the factors that actually determine the outcome. This is called overfitting, and results in good results in training, and in bad results when the model is applied to real data. Similarly, we could have a model that is too simplistic to accurately model the signal. This produces a model that doesnt work well (ever). 


### Note: before doing the first commit, make sure you don't include the large csv file, either by adding it to .gitignore, or by deleting it.

### First, download the data from: https://www.kaggle.com/datasets/ealaxi/paysim1. Import the dataset and provide some discriptive statistics and plots. What do you think will be the important features in determining the outcome?
### Note: don't use the entire dataset, use a sample instead, with n=100000 elements, so your computer doesn't freeze.

In [16]:
# Your code here
paysim = pd.read_csv("dataset/paysim.csv").sample(100000)
display(paysim.head())

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
4641774,330,TRANSFER,202580.66,C589587874,0.0,0.0,C1801993250,11209917.13,11412497.8,0,0
1577167,155,PAYMENT,5948.7,C547446279,24510.79,18562.09,M1379218191,0.0,0.0,0,0
957893,44,TRANSFER,46276.94,C1199059971,9330.0,0.0,C1449067028,54905.15,101182.09,0,0
3509959,259,CASH_OUT,290881.39,C1555220705,27469.0,0.0,C700491314,526492.58,817373.97,0,0
1724677,160,PAYMENT,8075.41,C682336650,0.0,0.0,M1679501402,0.0,0.0,0,0


### What is the distribution of the outcome? 

In [17]:
# Your response here
print(paysim["isFraud"].unique())
print(paysim["isFlaggedFraud"].unique())
print(paysim["isFraud"].mode())


[0 1]
[0]
0    0
Name: isFraud, dtype: int64


### Clean the dataset. How are you going to integrate the time variable? Do you think the step (integer) coding in which it is given is appropriate?

In [18]:
# Your code here
missing_values=[]
missing = pd.DataFrame(paysim.isnull().any(axis=1), columns=["IsNull"])
for index in range(0,len(missing["IsNull"])):
    if missing.iloc[index]["IsNull"] == True:
        missing_values.append(index)
display(missing)
print(missing_values)

Unnamed: 0,IsNull
4641774,False
1577167,False
957893,False
3509959,False
1724677,False
...,...
1639142,False
3186983,False
4753939,False
2330008,False


[]


In [19]:
#no null values
#no, should've been a more accurate date tracking datatype like datetime

#back up string type columns
paysim_categorical= pd.DataFrame([paysim["type"], paysim["nameOrig"], paysim["nameDest"]], columns=["type", "nameOrig", "nameDest"])
paysim.drop(columns=["type", "nameOrig", "nameDest"], inplace=True)

#standardize data
scaler = StandardScaler()
paysim_stdized= pd.DataFrame(scaler.fit_transform(paysim), columns=paysim.columns)
paysim_stdized

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,0.604914,0.038186,-0.287852,-0.291535,2.939695,2.740955,-0.03771,0.0
1,-0.618164,-0.289893,-0.279285,-0.285122,-0.322041,-0.332104,-0.03771,0.0
2,-1.393944,-0.222606,-0.284591,-0.291535,-0.306065,-0.304859,-0.03771,0.0
3,0.108694,0.185515,-0.278251,-0.291535,-0.168848,-0.112009,-0.03771,0.0
4,-0.583218,-0.286345,-0.287852,-0.291535,-0.322041,-0.332104,-0.03771,0.0
...,...,...,...,...,...,...,...,...
99995,-0.604185,-0.299619,-0.286321,-0.290063,-0.322041,-0.332104,-0.03771,0.0
99996,-0.024097,-0.292306,-0.286788,-0.291535,-0.322041,-0.332104,-0.03771,0.0
99997,0.625881,0.015252,-0.278772,-0.291535,-0.085074,-0.061960,-0.03771,0.0
99998,-0.387526,-0.296031,-0.287491,-0.291535,-0.322041,-0.332104,-0.03771,0.0


### Run a logisitc regression classifier and evaluate its accuracy.

In [37]:
# Your code here
label = paysim["isFraud"]
features = paysim_stdized.drop(columns=["isFraud", "isFlaggedFraud"])

X_train, X_test, y_train, y_test = train_test_split(features, label, random_state=0)

In [43]:
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
print("Score by test values: ", log_reg.score(X_test, y_test))
print("Score by train values: ", log_reg.score(X_train, y_train))
pred = log_reg.predict(X_test)

confusion_matrix(y_test, pred)

Score by test values:  0.99864
Score by train values:  0.9989333333333333


array([[24957,     0],
       [   34,     9]], dtype=int64)

### Now pick a model of your choice and evaluate its accuracy.

In [44]:
# Your code here
dec_tree = DecisionTreeClassifier()
dec_tree.fit(X_train, y_train)

print("Score by test values: ", dec_tree.score(X_test, y_test))
print("Score by train values: ", dec_tree.score(X_train, y_train))
pred = dec_tree.predict(X_test)

confusion_matrix(y_test, pred)

Score by test values:  0.99896
Score by train values:  1.0


array([[24947,    10],
       [   16,    27]], dtype=int64)

### Which model worked better and how do you know?

In [22]:
# Your response here
# the decision tree, since it had the lesser amount of false negatives on the confusion matrix (16<34)

### Note: before doing the first commit, make sure you don't include the large csv file, either by adding it to .gitignore, or by deleting it.