## Naive Bayes Classifier

**Simon Grishin**

In [2]:
import pandas as pd
import numpy as np

In [52]:
# Import data
train_data = pd.read_csv('fraudTrain.csv').drop(columns=['Unnamed: 0'])
test_data = pd.read_csv('fraudTest.csv').drop(columns=['Unnamed: 0'])

In [34]:
train_data.head()

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,Malad City,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [35]:
train_data['merchant'].value_counts()

merchant
fraud_Kilback LLC                       4403
fraud_Cormier LLC                       3649
fraud_Schumm PLC                        3634
fraud_Kuhn LLC                          3510
fraud_Boyer PLC                         3493
                                        ... 
fraud_Douglas, DuBuque and McKenzie      775
fraud_Treutel-King                       775
fraud_Medhurst, Labadie and Gottlieb     759
fraud_Reichert-Weissnat                  753
fraud_Hahn, Douglas and Schowalter       727
Name: count, Length: 693, dtype: int64

In [84]:
train_data.nunique()

trans_date_trans_time    1274791
cc_num                       983
merchant                     693
category                      14
amt                        52928
first                        352
last                         481
gender                         2
street                       983
city                         894
state                         51
zip                          970
lat                          968
long                         969
city_pop                     879
job                          494
dob                          968
trans_num                1296675
unix_time                1274823
merch_lat                1247805
merch_long               1275745
is_fraud                       2
dtype: int64

In [82]:
train_data['is_fraud'].value_counts()

is_fraud
0    1289169
1       7506
Name: count, dtype: int64

In [36]:
train_data.describe()

Unnamed: 0,cc_num,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud
count,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0
mean,4.17192e+17,70.35104,48800.67,38.53762,-90.22634,88824.44,1349244000.0,38.53734,-90.22646,0.005788652
std,1.308806e+18,160.316,26893.22,5.075808,13.75908,301956.4,12841280.0,5.109788,13.77109,0.07586269
min,60416210000.0,1.0,1257.0,20.0271,-165.6723,23.0,1325376000.0,19.02779,-166.6712,0.0
25%,180042900000000.0,9.65,26237.0,34.6205,-96.798,743.0,1338751000.0,34.73357,-96.89728,0.0
50%,3521417000000000.0,47.52,48174.0,39.3543,-87.4769,2456.0,1349250000.0,39.36568,-87.43839,0.0
75%,4642255000000000.0,83.14,72042.0,41.9404,-80.158,20328.0,1359385000.0,41.95716,-80.2368,0.0
max,4.992346e+18,28948.9,99783.0,66.6933,-67.9503,2906700.0,1371817000.0,67.51027,-66.9509,1.0


### Exploratory Data Analysis:

**Drop:**
* Transaction number
* cc_num
* first name
* last_name
* unix_time

**Transformed Variables:**
* Locational variables for merchant "merch_lat" and "merch_long"
* Locational variables for cardholders "lat" and "long"
* Convert dob to age in numerical value 
* Convert date time to month, day, year (numerical)

**Categorical Variables:**
* merchant
* category
* city
* state
* zip
* gender

**Numerical Variables:**
* city_pop
* amt

In [122]:
# Data Preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from datetime import datetime
import reverse_geocoder as rg

transformed_data = train_data.copy()

# Computee distance between merchant and customer
transformed_data['distance'] = np.sqrt((transformed_data['lat'] - transformed_data['merch_lat'])**2 + (transformed_data['long'] - transformed_data['merch_long'])**2)

# Transforming dob to age
transformed_data['dob'] = pd.to_datetime(transformed_data['dob'])
transformed_data['age'] = (datetime.now() - transformed_data['dob']).dt.days / 365
transformed_data.drop(columns=['dob'], inplace=True)

# Transforming date to day, month, year
date = transformed_data['trans_date_trans_time'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
transformed_data['hour'] = date.dt.hour
transformed_data['day'] = date.dt.dayofweek
transformed_data['month'] = date.dt.month
transformed_data.drop(columns=['trans_date_trans_time'], inplace=True)

# Log-transform skewed data
transformed_data['amt'] = np.log1p(transformed_data['amt'])

transformed_data.drop(columns=['first', 'last', 'street', 'lat', 'long', 'merch_lat', 'merch_long', 'cc_num', 'trans_num', 'unix_time'], inplace=True)

# Transforming categorical data (Label encoding)
label_encoder = LabelEncoder()
label_vars = ['gender']

for var in label_vars:
    transformed_data[var] = label_encoder.fit_transform(transformed_data[var])

# Transform categorical data (frequency encoding)
frequency_vars = ['merchant', 'city', 'job', 'category', 'state']

for col in frequency_vars:
    freq_encoding = transformed_data[col].value_counts(normalize=True).to_dict()
    transformed_data[col] = transformed_data[col].map(freq_encoding)


In [123]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix

X = transformed_data.drop(columns=['is_fraud'])
y = transformed_data['is_fraud']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Standardizing numerical features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(transformed_Y_data.drop(columns=['is_fraud']))

# Train Naive Bayes Classifier
gnb = GaussianNB()
gnb.fit(X_train, y_train)

# Predictions
y_pred = gnb.predict(X_val)

def compute_utility(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    TP, FN, FP = cm[1, 1], cm[1, 0], cm[0, 1]
    utility = TP * (50) - FN * (100) - FP * (5)
    return utility

# Evaluate model performance
utility_score = compute_utility(y_val, y_pred)
print("Validation Utility Score:", utility_score)



Validation Utility Score: -149330


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# Split data into features and target
trainin



In [6]:
# Utility Function
def calculate_utility(TN, FN, FP):
    return 50*TP - 100*FN - 5*FP
