In [4]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
import datetime
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_recommenders as tfrs
from tensorflow.keras.layers import Layer
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import roc_auc_score, classification_report

In [8]:
# Load the training transactions and discard the "Unnamed: 0" column
# (presumably a row index that was written out with the CSV - confirm).
train_df = pd.read_csv('fraudTrain.csv').drop(columns=['Unnamed: 0'])

In [9]:
# Feature engineering for the training frame.
# Objects created here and reused by later cells: lat_bins, long_bins,
# cc_num_frequency_counts, ohe, ohe_features, ohe_encoded_features.

# Store log columns as new features
train_df["log_amount"] = np.log(train_df["amt"])
train_df["log_city_pop"] = np.log(train_df["city_pop"])
# Create age column from date of birth (the reference year 2023 is hard-coded)
train_df["age"] = 2023 - pd.to_datetime(train_df["dob"]).dt.year
# Create hour and month columns from the transaction datetime column.
# The timestamp column is parsed once and reused for both derived columns.
train_trans_datetime = pd.to_datetime(train_df["trans_date_trans_time"])
train_df["hour"] = train_trans_datetime.dt.hour
train_df["month"] = train_trans_datetime.dt.month
#### Combine latitude and longitude columns and bucketize them
lat_bins = [15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70]
long_bins = [-170, -150, -130, -110, -90, -70, -50]
train_df['lat_binned'] = pd.cut(train_df['lat'], lat_bins)
train_df['long_binned'] = pd.cut(train_df['long'], long_bins)
train_df['merch_lat_binned'] = pd.cut(train_df['merch_lat'], lat_bins)
train_df['merch_long_binned'] = pd.cut(train_df['merch_long'], long_bins)
# Each row gets a (lat_bin, long_bin) tuple that acts as one categorical value
train_df["long_lat_binned"] = list(zip(train_df["lat_binned"], train_df["long_binned"]))
train_df["merch_long_lat_binned"] = list(zip(train_df["merch_lat_binned"], train_df["merch_long_binned"]))
# Encode the frequency distribution of the credit card numbers
cc_num_frequency_counts = train_df["cc_num"].value_counts()
train_df["cc_num_frequency"] = train_df["cc_num"].map(cc_num_frequency_counts)
# Only gender is one-hot encoded; category is listed among the sparse
# (embedded) features further down.
print(f"Unique categories for category: {len(train_df.category.unique())}")
ohe_features = ["gender"]
# Encode lower dimensional categorical features using one-hot encoding.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and later releases
# removed the old keyword, so try the new name first and fall back for older
# installations.
try:
    ohe = OneHotEncoder(handle_unknown="error", sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(handle_unknown="error", sparse=False)
ohe_train_matrix = ohe.fit_transform(train_df[ohe_features])
ohe_encoded_features = list(ohe.get_feature_names_out())
# Reuse train_df's index so the column-wise concat aligns row by row even if
# the frame's index is not the default 0..n-1 range.
ohe_encoded_X_train = pd.DataFrame(
    ohe_train_matrix,
    columns=ohe_encoded_features,
    index=train_df.index,
)
# Drop one-hot columns left over from a previous run of this cell so that
# re-running it does not produce duplicate column names.
train_df = pd.concat(
    [train_df.drop(columns=ohe_encoded_features, errors="ignore"), ohe_encoded_X_train],
    axis=1,
)

Unique categories for category: 14




In [10]:
# Final features
# Categorical columns; each one gets an embedding table further down.
sparse_feature_names = [
    "merchant",
    "category",
    "city",
    "state",
    "zip",
    "job",
    "long_lat_binned",
    "merch_long_lat_binned",
]
# Numeric columns followed by the one-hot encoded columns.
dense_feature_names = [
    "log_amount",
    "log_city_pop",
    "age",
    "hour",
    "month",
    "cc_num_frequency",
    *ohe_encoded_features,
]
# Column subsets of the training frame for each feature group.
sparse_features = train_df.loc[:, sparse_feature_names]
dense_features = train_df.loc[:, dense_feature_names]

In [11]:
# Create vocabularies dictionary: for every sparse feature, the list of
# distinct values seen in the training frame, rendered as strings.
vocabularies = {
    feat: train_df[feat].astype("str").unique().tolist()
    for feat in sparse_feature_names
}

In [12]:
# Embedding width per sparse feature; every feature uses 16 dimensions.
embedding_dims = dict.fromkeys(
    [
        "merchant",
        "category",
        "city",
        "state",
        "zip",
        "job",
        "long_lat_binned",
        "merch_long_lat_binned",
    ],
    16,
)

From tfrs.experimental.models.Ranking docs: https://www.tensorflow.org/recommenders/api_docs/python/tfrs/experimental/models/Ranking

embedding_layer:	The embedding layer is applied to categorical features. It expects a string-to-tensor (or SparseTensor/RaggedTensor) dict as an input, and outputs a dictionary of string-to-tensor of feature_name, embedded_value pairs. {feature_name_i: tensor_i} -> {feature_name_i: emb(tensor_i)}.


In [13]:
class EmbeddingLayer(Layer):
    """Keras layer that embeds a dict of sparse features, one table per feature.

    Args:
        sparse_feature_names: Names of the sparse features to embed.
        vocabularies: Mapping of feature name to its vocabulary; only the
            vocabulary length is used, to size the embedding table.
        embedding_dims: Mapping of feature name to embedding width.
    """

    def __init__(self, sparse_feature_names, vocabularies, embedding_dims):
        super().__init__()
        self.sparse_feature_names = sparse_feature_names
        self.vocabularies = vocabularies
        self.embedding_dims = embedding_dims

    def build(self, input_shape):
        """Create one `tf.keras.layers.Embedding` per sparse feature.

        `input_shape` is accepted for the Keras API but not used.
        """
        self.embbedings = {}
        for name in self.sparse_feature_names:
            vocab_size = len(self.vocabularies[name])
            # One row more than the vocabulary size - presumably for the
            # out-of-vocabulary index emitted by the lookup layer; confirm.
            table = tf.keras.layers.Embedding(vocab_size + 1, self.embedding_dims[name])
            self.embbedings[name] = table

    def call(self, sparse_features_dict):
        """Return `{feature_name: embedding(sparse_features_dict[feature_name])}`."""
        return {
            name: table(sparse_features_dict[name])
            for name, table in self.embbedings.items()
        }

In [14]:
# Instantiate the per-feature embedding layer from the objects built above.
embedding_layer = EmbeddingLayer(
    sparse_feature_names=sparse_feature_names,
    vocabularies=vocabularies,
    embedding_dims=embedding_dims,
)

In [15]:
# Build the TFRS Ranking model around the custom embedding layer; every other
# constructor argument is left at its default.
model = tfrs.experimental.models.Ranking(embedding_layer=embedding_layer)

In [16]:
# Configure optimizer, loss and metric for training.
# NOTE(review): TFRS models may compute their loss through their own task
# object - confirm that the `loss` passed here is the one actually used.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['AUC'],
)

In [17]:
target_variable = "is_fraud"

# "Balanced" class weights computed from the label column of the training frame.
classes = np.unique(train_df[target_variable])
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=classes,
    y=train_df[target_variable],
)
class_weight_dict = {label: weight for label, weight in zip(classes, class_weights)}

print("Class Weights:", class_weight_dict)

# Weights in the same order as `classes`; the training loop indexes this
# tensor with the label values via tf.gather.
class_weights_tensor = tf.constant(
    list(class_weight_dict.values()),
    dtype=tf.float32,
)

Class Weights: {0: 0.5029111776656126, 1: 86.37589928057554}


In [None]:
batch_size = 64

# Build one StringLookup layer per sparse feature up front. The vocabulary is
# fixed, so constructing a fresh layer for every feature on every batch (as
# this loop did before) repeated the same work for an identical result.
train_lookup_layers = {
    feat: tf.keras.layers.StringLookup(vocabulary=vocabularies[feat])
    for feat in sparse_feature_names
}

# Walk over the training frame in consecutive, unshuffled mini-batches and run
# one fit call per mini-batch.
for i in range(0, train_df.shape[0], batch_size):
    # Slice the mini-batch once instead of re-slicing the frame per column
    batch_df = train_df.iloc[i: i + batch_size]

    # Create inputs dict
    inputs = {}

    # Create label
    y = batch_df[target_variable]
    label_tensor = tf.convert_to_tensor(y)
    inputs["label_tensor"] = label_tensor

    # Create sparse features: string values -> integer ids via the lookup layers
    sparse_features_dict = {}
    for feat in sparse_feature_names:
        feat_tensor = tf.convert_to_tensor(batch_df[feat].astype(str))
        sparse_features_dict[feat] = train_lookup_layers[feat](feat_tensor)
    inputs["sparse_features"] = sparse_features_dict

    # Create dense features
    dense_features = batch_df[dense_feature_names]
    inputs["dense_features"] = tf.convert_to_tensor(dense_features)

    # Create sample weights by indexing the class-weight tensor with the labels.
    # NOTE(review): the weights are only stored inside the feature dict and are
    # not passed through fit(sample_weight=...); the model presumably ignores
    # them, so the class weighting may have no effect - confirm.
    sample_weight = tf.gather(class_weights_tensor, label_tensor)
    inputs["sample_weight"] = sample_weight

    model.fit(inputs, label_tensor, verbose=0)

In [19]:
# Drop the training frame and force a garbage-collection pass before the test
# data is loaded; the cell displays the value returned by gc.collect().
import gc

del train_df

gc.collect()

239281

In [21]:
# Load the held-out transactions used for evaluation.
test_df = pd.read_csv(filepath_or_buffer="fraudTest.csv")

In [22]:
# Feature engineering for the test frame. It mirrors the training cell and
# reuses the objects fitted there: lat_bins, long_bins,
# cc_num_frequency_counts, ohe, ohe_features and ohe_encoded_features.

# Store log columns as new features
test_df["log_amount"] = np.log(test_df["amt"])
test_df["log_city_pop"] = np.log(test_df["city_pop"])
# Create age column from date of birth (the reference year 2023 is hard-coded)
test_df["age"] = 2023 - pd.to_datetime(test_df["dob"]).dt.year
# Create hour and month columns from the transaction datetime column.
# The timestamp column is parsed once and reused for both derived columns.
test_trans_datetime = pd.to_datetime(test_df["trans_date_trans_time"])
test_df["hour"] = test_trans_datetime.dt.hour
test_df["month"] = test_trans_datetime.dt.month
#### Combine latitude and longitude columns and bucketize them
test_df['lat_binned'] = pd.cut(test_df['lat'], lat_bins)
test_df['long_binned'] = pd.cut(test_df['long'], long_bins)
test_df['merch_lat_binned'] = pd.cut(test_df['merch_lat'], lat_bins)
test_df['merch_long_binned'] = pd.cut(test_df['merch_long'], long_bins)
test_df["long_lat_binned"] = list(zip(test_df["lat_binned"], test_df["long_binned"]))
test_df["merch_long_lat_binned"] = list(zip(test_df["merch_lat_binned"], test_df["merch_long_binned"]))
# Encode the frequency distribution of the credit card numbers using the
# counts computed on the training data. Card numbers absent from those counts
# map to NaN and are marked with a frequency of -1.
# The result is assigned back to the column: the previous
# `test_df[col].fillna(-1, inplace=True)` was a chained in-place update, which
# does not modify the frame when pandas Copy-on-Write is active.
test_df["cc_num_frequency"] = test_df["cc_num"].map(cc_num_frequency_counts).fillna(-1)

# Encode lower dimensional categorical features using one-hot encoding.
# Reuse test_df's index so the column-wise concat aligns row by row.
ohe_encoded_X_test = pd.DataFrame(
    ohe.transform(test_df[ohe_features]),
    columns=ohe_encoded_features,
    index=test_df.index,
)
# Drop one-hot columns left over from a previous run of this cell so that
# re-running it does not produce duplicate column names.
test_df = pd.concat(
    [test_df.drop(columns=ohe_encoded_features, errors="ignore"), ohe_encoded_X_test],
    axis=1,
)

In [None]:
preds = []

# Build one StringLookup layer per sparse feature up front. The vocabulary is
# fixed, so constructing a fresh layer for every feature on every batch (as
# this loop did before) repeated the same work for an identical result.
test_lookup_layers = {
    feat: tf.keras.layers.StringLookup(vocabulary=vocabularies[feat])
    for feat in sparse_feature_names
}

# Score the test frame in consecutive mini-batches of `batch_size` rows
# (`batch_size` is defined in the training cell).
for i in range(0, test_df.shape[0], batch_size):
    # Slice the mini-batch once instead of re-slicing the frame per column
    batch_df = test_df.iloc[i: i + batch_size]

    # Create inputs dict
    inputs = {}

    # Create label (stored in the inputs dict exactly as in the training loop)
    y = batch_df[target_variable]
    label_tensor = tf.convert_to_tensor(y)
    inputs["label_tensor"] = label_tensor

    # Create sparse features: string values -> integer ids via the lookup layers
    sparse_features_dict = {}
    for feat in sparse_feature_names:
        feat_tensor = tf.convert_to_tensor(batch_df[feat].astype(str))
        sparse_features_dict[feat] = test_lookup_layers[feat](feat_tensor)
    inputs["sparse_features"] = sparse_features_dict

    # Create dense features
    dense_features = batch_df[dense_feature_names]
    inputs["dense_features"] = tf.convert_to_tensor(dense_features)

    # Collect the per-row scores of this batch
    pred = model.predict(inputs, verbose=0)
    preds.extend(pred)

In [None]:
# Ground-truth labels of the test frame as a NumPy array.
true_y = test_df[target_variable].to_numpy()

# Scores collected by the prediction loop, as an array.
preds = np.array(preds)

# Hard 0/1 predictions: a score of 0.5 or more counts as class 1.
pred_y = np.greater_equal(preds, 0.5).astype(int)

In [24]:
# ROC AUC of the raw scores against the true labels, rounded to 4 decimals.
print(
    f"Test AUC: {round(roc_auc_score(true_y, preds), 4)}"
)

Test AUC: 0.5862


In [25]:
# Per-class precision / recall / F1 for the thresholded predictions.
print("Classification report: \n")
report_text = classification_report(true_y, pred_y)
print(report_text)

Classification report: 

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       1.00      0.08      0.14      2145

    accuracy                           1.00    555719
   macro avg       1.00      0.54      0.57    555719
weighted avg       1.00      1.00      0.99    555719

