In [1]:
#import libraries
#Binary text classification of IMDB dataset to classify movies as good or bad
import tensorflow as tf
print(tf.__version__)
from tensorflow.keras.datasets import imdb
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, Dense, Flatten, GlobalMaxPooling1D, Embedding, Input, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import pandas as pd
from tensorflow.keras.regularizers import l2
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_curve, auc, roc_auc_score
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.optimizers import Adam

2.18.0


In [7]:
#Load and preprocess data
#increasing max_features past 5000 does not noticeably improve accuracy for both epochs while loss stays about the same
#decreasing max_features below 5000 does not noticeably improve accuracy for both epochs while loss stays about the same
max_features=5000
max_len = 100
word_embedding_dims=50
num_filters=250
kernel_size=3
hidden_dims=250
#increasing batch size past 32 does not noticeably imporve accuracy for both epochs, but does increase loss
#decreasing batch size below 32 improves accuracy and loss of epoch 1 while accuracy and loss for epoch 2 stays the same
batch_size=32
epochs=2
threshold=0.5

#num_words needs to be capped at max_features otherwise an error will occur at the Embedding() function
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=max_features)
#y_train = to_categorical(y_train, 10)
#y_test = to_categorical(y_test, 10)


#X_train = np.array(X_train, dtype='object')
#X_test = np.array(X_test, dtype='object')

X_train = pad_sequences(X_train, maxlen=max_len)
X_test = pad_sequences(X_test, maxlen=max_len)

print(X_train.dtype)
print(y_train.dtype)

int32
int64


In [8]:
#Define model
model = Sequential([
    Input(shape=(max_len,)),
    Embedding(max_features, word_embedding_dims),
    Conv1D(num_filters, kernel_size, padding="valid", activation="relu", strides=1),
    #Conv1D(num_filters // 2, kernel_size, padding="valid", activation="relu", strides=1),
    GlobalMaxPooling1D(),
    #accuracy drops from 0.006 to 0.005
    Dense(hidden_dims, activation="relu", kernel_regularizer=l2(0.006)),
    Dense(1, activation="sigmoid", kernel_regularizer=l2(0.006))
])

In [9]:
#compile model
model.compile(optimizer=Adam(learning_rate=0.0001), 
              loss="binary_crossentropy", 
              metrics=["accuracy"])


model.summary()

model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, y_test))

Epoch 1/2
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 22ms/step - accuracy: 0.5362 - loss: 1.6609 - val_accuracy: 0.7160 - val_loss: 0.7808
Epoch 2/2
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 22ms/step - accuracy: 0.7422 - loss: 0.6683 - val_accuracy: 0.7838 - val_loss: 0.5050


<keras.src.callbacks.history.History at 0x1ff01825dd0>

In [10]:
#Train model
#model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, y_test))
#Predictions
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > threshold).astype("int")
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Accuracy: " + str(accuracy * 100))
print("Precision: " + str(precision * 100))
print("Recall: " + str(recall * 100))
print("F1 Score: " + str(f1 * 100))
#ROC Curve
fpr,tpr,thresholds = roc_curve(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

print("AUC:", roc_auc)

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step
Accuracy: 78.38000000000001
Precision: 75.82065652522017
Recall: 83.336
F1 Score: 79.40089180227905
AUC: 0.7837999999999999


In [None]:
#Creative Task: Binary classification for fradulent e-commerce transactions
#Dataset used (too large for Github): https://www.kaggle.com/datasets/shriyashjagtap/fraudulent-e-commerce-transactions?resource=download
transactions = pd.read_csv("Fraudulent_E-Commerce_Transaction_Data.csv")

print(transactions.head())
transactions.isnull().sum()
transactions.shape
transactions.info()
transactions["Is Fraudulent"].value_counts()
#less fradulent transactions than valid ones
is_fraud = transactions[transactions["Is Fraudulent"] == 1]
not_fraud = transactions[transactions["Is Fraudulent"] == 0]
not_fraud = not_fraud[:len(is_fraud)]
data = pd.concat([is_fraud, not_fraud], ignore_index=True)
print(data.head())
tX = data.drop("Is Fraudulent", axis=1).select_dtypes(include=['number'])
ty = data["Is Fraudulent"]

tX_train, tX_test, ty_train, ty_test = train_test_split(tX, ty, test_size=0.3, random_state=42, stratify=ty)

scaler = StandardScaler()

                         Transaction ID                           Customer ID  \
0  15d2e414-8735-46fc-9e02-80b472b2580f  d1b87f62-51b2-493b-ad6a-77e0fe13e785   
1  0bfee1a0-6d5e-40da-a446-d04e73b1b177  37de64d5-e901-4a56-9ea0-af0c24c069cf   
2  e588eef4-b754-468e-9d90-d0e0abfc1af0  1bac88d6-4b22-409a-a06b-425119c57225   
3  4de46e52-60c3-49d9-be39-636681009789  2357c76e-9253-4ceb-b44e-ef4b71cb7d4d   
4  074a76de-fe2d-443e-a00c-f044cdb68e21  45071bc5-9588-43ea-8093-023caec8ea1c   

   Transaction Amount     Transaction Date Payment Method Product Category  \
0               58.09  2024-02-20 05:58:41  bank transfer      electronics   
1              389.96  2024-02-25 08:09:45     debit card      electronics   
2              134.19  2024-03-18 03:42:55         PayPal    home & garden   
3              226.17  2024-03-16 20:41:31  bank transfer         clothing   
4              121.53  2024-01-15 05:08:17  bank transfer         clothing   

   Quantity  Customer Age Customer Location 

In [13]:
#Define model
tX_train = scaler.fit_transform(tX_train)
tX_test = scaler.transform(tX_test)

ty_train = np.array(ty_train)
ty_test = np.array(ty_test)
tX_train = tX_train.reshape(-1, tX_train.shape[1], 1)
tX_test = tX_test.reshape(-1, tX_test.shape[1], 1)

tModel = Sequential([
    Conv1D(32, 2, activation="relu", input_shape=(tX_train.shape[1], 1)),
    Dropout(0.2),
    Conv1D(64, 2, activation="relu"),
    Dropout(0.5),
    Flatten(),
    Dense(64, activation="relu"),
    Dropout(0.5),
    Dense(1, activation="sigmoid")
])

tModel.compile(optimizer="adam",
              loss="binary_crossentropy",
              metrics=["accuracy"])

tModel.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [14]:
#Train data
tModel.fit(tX_train, ty_train, epochs=epochs, batch_size=32, validation_data=(tX_test, ty_test))

Epoch 1/2
[1m3231/3231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 5ms/step - accuracy: 0.6878 - loss: 0.5827 - val_accuracy: 0.7329 - val_loss: 0.5284
Epoch 2/2
[1m3231/3231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 4ms/step - accuracy: 0.7287 - loss: 0.5394 - val_accuracy: 0.7344 - val_loss: 0.5260


<keras.src.callbacks.history.History at 0x1ff045286d0>

In [15]:
#Evaluations
ty_pred_prob = tModel.predict(tX_test)
ty_pred = (ty_pred_prob > threshold).astype("int")
accuracy = accuracy_score(ty_test, ty_pred)
precision = precision_score(ty_test, ty_pred)
recall = recall_score(ty_test, ty_pred)
f1 = f1_score(ty_test, ty_pred)

print("Accuracy: " + str(accuracy * 100))
print("Precision: " + str(precision * 100))
print("Recall: " + str(recall * 100))
print("F1 Score: " + str(f1 * 100))

fpr,tpr,thresholds = roc_curve(ty_test, ty_pred)
roc_auc = roc_auc_score(ty_test, ty_pred)

print("AUC:" + str(roc_auc * 100))

[1m1385/1385[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step
Accuracy: 73.43972191499446
Precision: 72.17097958835083
Recall: 76.29903841813011
F1 Score: 74.17762075094912
AUC:73.43978645355764
