In [29]:
import numpy as np
import pandas as pd
import tensorflow as tf
from geopy import distance
import geopandas
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

import keras 
from keras.models import Sequential
from keras.layers import Dense, Dropout
import tensorflow as tf
from keras.utils import np_utils
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder


In [30]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from imblearn.over_sampling import RandomOverSampler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

# Assignment 4 - Simple Neural Networks

For this assigment you'll do a realistic task - predicting fraud from transaction data. 
### Some Things to Note

<ul>
<li> The dataset is imbalanced. See: https://www.tensorflow.org/tutorials/structured_data/imbalanced_data for some ideas
<li> The locations, time, dob all likely aren't super useful on their own, but can be made into something more useful without much code or trouble. Think about how it may be useful to represent them. The data doesn't have missing rows, so this is the main data prep portion. 
<li> With respect to the above, and the other data here, we have a lot of rows of data. That means that we can generally handle data that is reasonably wide...
</ul>

### Deliverables

Your final goal is to produce a function that can be called to classify a transaction:
<ul>
<li> Please submit two .ipynb files - one where you did your work, and another that can use your model to make predictions. 
<li> In that prediction file, please ensure:
    <ul>
    <li> You have a function where I can load a file, and the end result is a classificaiton matrix of your prediction accuracy. 
    <li> You load a trained model. There's no training here. 
    <li> Any data prep stuff that is needed for your data should be built in here. I'm going to run a test file that is the exact same setup as the training data.
    <li> I should be able to open the prediction file, load the test data, and click RUN ALL and things should work. 
    <li> In addition to that, please include a short (~1-2 paragraph) description of what you did. Include anything that was innovative/different as well as a note on:
        <ul>
        <li> Any imbalanced data steps. 
        <li> Treatment of the location and time variables. What did you do to them?
        <li> Model structure (layers/size)
        <li> Any optimization steps included - regularization, dropouts, feature selection, etc...
        </ul>
    </ul>
</ul>

### Grades

The grade breakdown is as follows:

<ul>
<li> Code preduces predictions - 40
<li> Accuracy - 30
<li> Explaination - 20
<li> Balance/variable transformations - 10
</ul>



In [31]:
# Load some data
df = pd.read_csv("https://jrssbcrsefilesnait.blob.core.windows.net/3950data1/fraudTrain.csv.zip")
df.drop(columns={"Unnamed: 0"}, inplace=True)
df.head()


Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,Malad City,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [32]:
df.describe(include="all").T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
trans_date_trans_time,1296675.0,1274791.0,2019-04-22 16:02:01,4.0,,,,,,,
cc_num,1296675.0,,,,4.1719204207972666e+17,1.3088064470007892e+18,60416207185.0,180042946491150.0,3521417320836166.0,4642255475285942.0,4.992346398065154e+18
merchant,1296675.0,693.0,fraud_Kilback LLC,4403.0,,,,,,,
category,1296675.0,14.0,gas_transport,131659.0,,,,,,,
amt,1296675.0,,,,70.351035,160.316039,1.0,9.65,47.52,83.14,28948.9
first,1296675.0,352.0,Christopher,26669.0,,,,,,,
last,1296675.0,481.0,Smith,28794.0,,,,,,,
gender,1296675.0,2.0,F,709863.0,,,,,,,
street,1296675.0,983.0,0069 Robin Brooks Apt. 695,3123.0,,,,,,,
city,1296675.0,894.0,Birmingham,5617.0,,,,,,,


In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 22 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   trans_date_trans_time  1296675 non-null  object 
 1   cc_num                 1296675 non-null  int64  
 2   merchant               1296675 non-null  object 
 3   category               1296675 non-null  object 
 4   amt                    1296675 non-null  float64
 5   first                  1296675 non-null  object 
 6   last                   1296675 non-null  object 
 7   gender                 1296675 non-null  object 
 8   street                 1296675 non-null  object 
 9   city                   1296675 non-null  object 
 10  state                  1296675 non-null  object 
 11  zip                    1296675 non-null  int64  
 12  lat                    1296675 non-null  float64
 13  long                   1296675 non-null  float64
 14  city_pop          

In [34]:
df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])

# Create new features for hour of the day and day of the week
df['hour_of_day'] = df['trans_date_trans_time'].dt.hour
df['day_of_week'] = df['trans_date_trans_time'].dt.dayofweek

In [35]:
df.shape

(1296675, 24)

### Deal with Lat/Lon

We can utilize lat/lon of the home and merchant in a useful way?

Note: I left the section headers in from when I did it. You can remove them if you want. 

By comparing the latitude and longitude of a transaction's origin (e.g., the customer's home) with the location of the merchant, it's possible to identify suspicious transactions that occur far from the customer's usual location.

In [36]:
from math import sin, cos, sqrt, atan2, radians

def calc_distance(lat1, lon1, lat2, lon2):
    R = 6371.0  # Earth radius in km
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])
    lon = lon2 - lon1
    lat = lat2 - lat1
    a = sin(lat / 2)**2 + cos(lat1) * cos(lat2) * sin(lon / 2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    distance = R * c
    return distance

In [37]:
# Create a new column called "distance" that calculates the distance between the home and merchant locations
df['distance'] = np.vectorize(calc_distance)(df['lat'], df['long'], df['merch_lat'], df['merch_long'])

### Deal with Time

Can we make date/time and the date of birth into something useful?

Comparing the date of birth of the customer with the date of transaction can help verify the age of the customer. For example, if a customer claims to be over 18 years old but their date of birth suggests they are under 18, this could be a red flag for potential fraud.

In [38]:
# Convert dob column to datetime format and calculate age at time of transaction
df['dob'] = pd.to_datetime(df['dob'])
df['age'] = (df['trans_date_trans_time'] - df['dob']).dt.days // 365

Analyzing the date of transaction can help identify patterns in a customer's behavior that may be indicative of fraudulent activity. For example, if a customer makes multiple transactions in a short period of time, especially with different merchants or in different locations, this could suggest that someone is using their account without their authorization.

In [39]:
# converting to date column to datetime type:
df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])

# Create new features for hour of the day and day of the week
df['Day_Hour'] = df['trans_date_trans_time'].dt.hour
df['week_Day'] = df['trans_date_trans_time'].dt.dayofweek

### Check Target Balance

As indicated below our data is highly imbalance which will lead to poor performance of ML models. 

In [40]:
df['is_fraud'].value_counts()

0    1289169
1       7506
Name: is_fraud, dtype: int64

### Prepare Data


In [41]:
# Drop unnecessary columns
df.drop(columns=['trans_date_trans_time', 'cc_num', 'merchant', 'first', 'last', 'street', 'city', 'state', 'zip',
                 'lat', 'long', 'merch_lat', 'merch_long', 'job', 'dob', 'trans_num', 'unix_time'], inplace=True)
# One-hot encode categorical variables
df = pd.get_dummies(df, columns=['category', 'gender'])


### Split Data

In [42]:
# Split into features and target
X = df.drop(columns=['is_fraud'])
y = df['is_fraud']

In [43]:
# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

Application of PCA will allow us the dimentionality of the dataset

In [44]:
# Apply PCA to reduce the dimensionality
pca = PCA(n_components=5)
X_pca = pca.fit_transform(X_scaled)

In [45]:
# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.3, random_state=42)

The implementation of Random oversampler will combat the imbalance in data by creating instances of the minority class 

In [46]:
# Address imbalance using RandomOverSampler
oversampler = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train, y_train)

### Model

In [47]:
# Build the neural network model

model = Sequential()
model.add(Dense(64, input_dim=X_train_resampled.shape[1], activation='relu'))
model.add(Dropout(0.2)) # Add dropout layer
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [48]:
# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train_resampled, y_train_resampled, epochs=10, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [49]:
# Evaluate the model on the test data
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print("Test Loss:", test_loss)
print("Test Accuracy:", test_accuracy)

Test Loss: 0.12926200032234192
Test Accuracy: 0.9490492343902588


In [50]:
model.save('my_model.h5')