In [None]:
# import libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn import compose
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import RandomOverSampler
import matplotlib.pyplot as plt

# Data Preprocessing

* Drop features (i.e. columns) which have more than 60% NaN values
* Convert numerical feature 'Timestamp' to hour
* Impute NaN data in numerical features with the median and in categorical features with the most frequent value, respectively
* Standardize numerical features
* Encode categorical features as an integer array
* Undersample the majority class on the class boundary

References:
* https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html
* https://machinelearningmastery.com/undersampling-algorithms-for-imbalanced-classification/
* https://imbalanced-learn.org/stable/references/generated/imblearn.over_sampling.RandomOverSampler.html

In [None]:
# Load the training data
df_train = pd.read_csv('/kaggle/input/suspicious-transaction-detection/train.csv', sep=',')

# Drop features (columns) with too many NaN values: a column is kept only if
# at least 40% of its rows are non-NaN. `thresh` is passed on its own because
# pandas does not allow combining it with `how` (newer versions raise a
# TypeError). Rounding up keeps the cut-off identical to comparing the integer
# non-NaN count against the fractional value.
min_non_nan = int(np.ceil(df_train.shape[0] * 0.4))
df_train = df_train.dropna(thresh=min_non_nan, axis=1)

# Reduce the Timestamp feature to the hour of day
df_train['Timestamp'] = pd.to_datetime(df_train['Timestamp']).dt.hour

# Split features and label by position: column 1 is used as the label and
# every column from index 2 onwards as a feature.
# NOTE(review): column 0 is presumably the transaction id - confirm.
X_train = df_train.iloc[:, 2:]
y_train = df_train.iloc[:, 1]

# Names of all feature columns, in their original order
col = X_train.columns.values.tolist()
del df_train

In [None]:
# Columns to be treated as categorical. A name listed here that is not among
# the training feature columns in `col` is simply never selected below.
categorical_features = ['Goods', 'A_0', 'A_1', 'E_0', 'E_1', 'M_0', 'M_1',
                        'C_0', 'C_1', 'C_2', 'C_3', 'C_4', 'C_5', 'C_6',
                        'C_7', 'C_8', 'C_23', 'C_24', 'C_25', 'C_26', 'C_27',
                        'C_28', 'O_2', 'O_4', 'O_6', 'O_7', 'O_8', 'O_10',
                        'O_11', 'O_12', 'O_13', 'O_14', 'O_15', 'O_16', 'O_17',
                        'O_19', 'O_21', 'O_23', 'O_24', 'O_25', 'O_26', 'O_27',
                        'O_28', 'O_30', 'O_31', 'O_32', 'O_35', 'O_36', 'O_39']

# Partition the feature columns into numerical and categorical ones,
# preserving the order in which they appear in `col`.
num_features = [f for f in col if f not in categorical_features]
cat_features = [f for f in col if f in categorical_features]

# Numerical features: fill NaNs with the median, then standardize.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Categorical features: fill NaNs with the most frequent value, then encode
# each category as an integer. OrdinalEncoder only accepts 'error' or
# 'use_encoded_value' for handle_unknown ('ignore' is a OneHotEncoder option),
# so categories that were not seen during fit are mapped to -1 at transform time.
categorical_transformer = Pipeline(steps=[
    ('cat_imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))])

# The transformed matrix holds the numerical columns first, followed by the
# categorical columns (the order of the transformers list).
preprocessor = compose.ColumnTransformer(transformers=[('num', numerical_transformer, num_features),
                                                       ('cat', categorical_transformer, cat_features)])

X_train_enc = preprocessor.fit_transform(X_train)
print(X_train_enc.shape)

In [None]:
# Class imbalance: the fraud class is the minority class. TomekLinks finds
# pairs of points lying on the boundary between the two classes and removes
# some of the majority-class points close to that boundary.
tl = TomekLinks(sampling_strategy='auto')
X_train_tl, y_train_tl = tl.fit_resample(X_train_enc, y_train)

# Report the shapes of the resampled features and labels
print(*(part.shape for part in (X_train_tl, y_train_tl)))

# The un-resampled encoded matrix is no longer needed
del X_train_enc

In [None]:
# Read the test set and reduce its Timestamp column to the hour of day,
# mirroring what is done for the training data.
df_test = pd.read_csv('/kaggle/input/suspicious-transaction-detection/test.csv', sep = ',')
df_test['Timestamp'] = pd.to_datetime(df_test['Timestamp']).dt.hour

# Keep the first column (presumably the transaction id - confirm) for the
# submission file, and select exactly the feature columns used in training.
T_id, X_test = df_test.iloc[:, 0], df_test[col]
del df_test

# Apply the already-fitted preprocessor; it is not refitted on the test data.
X_test_enc = preprocessor.transform(X_test)
del X_test

# Model selection
We trained a Random Forest model because it is an ensemble learning method and can lower the risk of overfitting. We didn't use Naive Bayes because we were not confident in assuming a Gaussian distribution for some features such as time, transaction amount, etc. We didn't use neural networks because there are too many features in our dataset and it would require too much computation to train a neural network model. The Random Forest model runs more efficiently on our dataset. Another benefit is that it is a tree-based method, so the model is easy to interpret.
In the training process, we used Pipeline to set up the estimation steps and GridSearchCV to choose the hyperparameters (i.e. the number of trees in the forest and the maximum depth of each tree) from a pre-specified hyperparameter range: n_estimators = [200,500,800], max_depth = [20,50,100]. We used 5-fold cross-validation to choose the hyperparameters that maximize the roc_auc score (i.e. the area under the receiver operating characteristic curve). GridSearchCV returns the best model (n_estimators=800, max_depth=50), which is then refitted on the whole dataset. Its mean validation AUC score is 0.93, which is acceptable. However, due to possible timeout or memory-leak problems with GridSearchCV, we directly use the tuned hyperparameters (n_estimators=800, max_depth=50) below to train the final model.

In [None]:
# Hyperparameter search that produced the settings used below. It is kept as
# comments and is NOT executed here. Note that the search has to be fitted
# before best_estimator_ / best_score_ become available.
#
# estimator_rf = Pipeline(steps=[('rf', RandomForestClassifier())])
# param_rf = {'rf__n_estimators': [200, 500, 800], 'rf__max_depth': [20, 50, 100]}
#
# # 5-fold cross-validation, selecting the combination with the best ROC AUC
# model_rf = GridSearchCV(estimator_rf, param_grid=param_rf, cv=5,
#                         scoring='roc_auc', verbose=0, n_jobs=-1)
# model_rf.fit(X_train_tl, y_train_tl)
#
# print(model_rf.best_estimator_)
# print(model_rf.best_score_)

# Directly use the tuned hyperparameters found by the grid search.
# random_state pins the bootstrap samples and feature sub-sampling so that the
# predictions and feature importances are reproducible across runs; n_jobs=-1
# builds the trees on all available cores.
model_rf = RandomForestClassifier(n_estimators=800, max_depth=50,
                                  n_jobs=-1, random_state=42)

# Train the model on the undersampled training data
model_rf.fit(X_train_tl, y_train_tl)

In [None]:
# Score the encoded test data. predict_proba yields one column per class;
# column index 1 is taken as the probability that the transaction is
# suspicious ('Target'=1).
prob = model_rf.predict_proba(X_test_enc)
df2 = pd.DataFrame(prob[:, 1], columns=['Target'])

# Put the first test-file column next to the predicted probabilities
result = pd.concat((T_id, df2), axis='columns')

# Write the submission file and show its first rows
result.to_csv('/kaggle/working/submission_rf.csv', index=False, header=True)
print(result.head())

Sort the features by importance and plot the 10 most important features

In [None]:
# importances = model_rf.best_estimator_.named_steps["rf"].feature_importances_
importances = model_rf.feature_importances_

# Positions of the features, from most to least important
indices = np.argsort(importances)[::-1]

# The model was trained on the output of the ColumnTransformer, which places
# the numerical columns first and the categorical columns second. The
# importances therefore follow that order, NOT the original column order held
# in `col`, so the names must be taken from num_features + cat_features.
feature_names = np.asarray(num_features + cat_features)
assert len(feature_names) == len(importances), \
    "feature names do not line up with the model's input columns"

# Keep `col` as an array in case code outside this cell relies on that type
col = np.asarray(col)
col_sort = feature_names[indices]

# Print the feature ranking
print("Feature ranking:")

for rank, idx in enumerate(indices, start=1):
    print("%d. feature %s (%f)" % (rank, feature_names[idx], importances[idx]))

# Plot the impurity-based feature importances of the forest
# (at most 10 bars, fewer if the model has fewer input features)
top_n = min(10, len(indices))
plt.figure()
plt.title("Top 10 important features")
plt.bar(range(top_n), importances[indices][:top_n])
plt.xticks(range(top_n), col_sort[:top_n], rotation=30)
plt.ylabel("Impurity-based importance")
plt.show()