# Import the necessary packages

In [15]:
import pandas as pd 
import logging
from sklearn.model_selection import train_test_split, cross_val_score 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, roc_curve
import scikitplot as skplt
import glob
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.combine import SMOTETomek 
from collections import Counter
import random

# Create Data Reader function
This function drops the ordered_product_key and the campaign_key columns from a given file. Whilst these columns were needed for joins etc. when creating the dataset, they serve no use when trying to model fraud.

In [16]:
def data_reader(file):
    """Read one site dataset from CSV and drop the join-only key columns.

    Parameters
    ----------
    file : str or path-like
        Location of the CSV file, passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The file contents without the ``Ordered_Product_Key`` and
        ``Campaign_Key`` columns.

    Raises
    ------
    KeyError
        If either key column is missing from the file.
    """
    data_read = pd.read_csv(file)
    # Name the axis through ``columns=``: passing the axis positionally
    # (``drop(labels, 1)``) is no longer accepted by pandas >= 2.0.
    return data_read.drop(columns=["Ordered_Product_Key", "Campaign_Key"])

# Create NaN function
Naturally, it is impossible to model fraud if no fraud actually exists in a specific site. For a given dataframe, this function returns the number of NaN values within the dataset as well as the total number of fraudulent observations, if any at all. 


In [17]:
def file_checker(dataframe):
    """Count the missing values and the fraud cases in a dataset.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data containing a ``fraud_status`` column.

    Returns
    -------
    tuple
        ``(num_nan, num_fraud)``: the number of null cells across all
        columns, and the sum of the ``fraud_status`` column.
    """
    missing_per_column = dataframe.isnull().sum()
    fraud_total = dataframe['fraud_status'].sum()
    return missing_per_column.sum(), fraud_total

# Create SMOTE function
This function applies Tomek links and SMOTE to the dataset, allowing fraud to be better modelled.

In [18]:
def get_smote(feature, label):
    """Resample a dataset with SMOTETomek and report the class counts.

    The class distribution is printed before and after resampling.

    Parameters
    ----------
    feature : array-like
        Feature matrix to resample.
    label : array-like
        Class labels matching the rows of ``feature``.

    Returns
    -------
    tuple
        ``(feature_resampled, label_resampled)`` as returned by the sampler.
    """
    print("Raw Data: " + str(sorted(Counter(label).items())))
    smt = SMOTETomek(random_state=42)
    # ``fit_sample`` was removed from imbalanced-learn; ``fit_resample`` is
    # the supported name for the same operation.
    feature_resampled, label_resampled = smt.fit_resample(feature, label)
    print("Resampled: " + str(sorted(Counter(label_resampled).items())))
    return feature_resampled, label_resampled

# Plot the cross-validation scores
For each of the 6 sites, this function plots the cross-validation scores vs. the number of trees. Additionally, this saves each graph as a .png file within a local directory. This function was used when determining the number of trees needed.


In [19]:
def plotter(scores, array1, array2, tree_list, directory, file_write):
    """Plot cross-validation scores against tree count and save a PNG.

    Three lines are drawn over ``tree_list``: ``scores`` plus the dashed
    curves ``array1 + array2`` and ``array1 - array2``.

    Parameters
    ----------
    scores : sequence
        CV score for each entry of ``tree_list``.
    array1, array2 : array-like
        Must support element-wise ``+`` and ``-`` (e.g. NumPy arrays).
        Presumably the mean score and its spread; confirm with the caller.
    tree_list : sequence
        Number of trees used for each score.
    directory : str
        Output prefix; the image goes to ``directory + 'accuracy_plots/'``.
    file_write : str
        Dataset path; its part after the module-level ``file_dir`` prefix
        and an optional leading ``dataset`` becomes the file name suffix.

    Notes
    -----
    This draws on the current pyplot figure and does not clear or close
    it, so successive calls overlay their curves unless the caller opens
    a new figure first.
    """
    plt.plot(tree_list, scores)
    plt.plot(tree_list, array1 + array2, 'b--')
    plt.plot(tree_list, array1 - array2, 'b--')
    plt.ylabel('CV score')
    plt.xlabel('# of trees')

    # ``str.strip(file_dir)`` removes any *characters* found in ``file_dir``
    # from both ends, not the prefix itself, so it could eat arbitrary parts
    # of the name. Remove the intended prefixes explicitly instead; dropping
    # "dataset" keeps it from being duplicated in the output name below.
    suffix = str(file_write)
    for prefix in (file_dir, 'dataset'):
        if suffix.startswith(prefix):
            suffix = suffix[len(prefix):]
    plt.savefig(directory + 'accuracy_plots/dataset' + suffix + ".png")

# Create a Metric function
This function calculates and returns the recall, precision, accuracy, F-score, and AUC of a particular model. 

In [20]:
def data_scorer(model, features, labels, folds):
    """Cross-validate a model and return its mean scores.

    Parameters
    ----------
    model : estimator
        Scikit-learn compatible classifier; it is refitted on every fold.
    features : array-like
        Feature matrix.
    labels : array-like
        Binary class labels matching the rows of ``features``.
    folds : int or CV splitter
        Forwarded as the ``cv`` argument.

    Returns
    -------
    tuple
        ``(accuracy, recall, precision, f1, auc)``, each the mean of the
        per-fold scores.
    """
    # Function-scope import: the file's top-level import only brings in
    # ``cross_val_score``.
    from sklearn.model_selection import cross_validate

    metrics = ["accuracy", "recall", "precision", "f1", "roc_auc"]
    # One cross-validation run with several scorers fits the model once per
    # fold, instead of once per fold *per metric*, and guarantees that all
    # five metrics describe the same fitted models.
    cv_results = cross_validate(model, X=features, y=labels, cv=folds,
                                scoring=metrics, n_jobs=-1)
    accuracy, recall, precision, f1, auc = (
        np.mean(cv_results["test_" + name]) for name in metrics
    )
    return accuracy, recall, precision, f1, auc

In [23]:
# Seed both global generators. scikit-learn estimators created without an
# explicit ``random_state`` draw from NumPy's global RNG, which
# ``random.seed`` alone does not touch.
random.seed(123)
np.random.seed(123)

In [12]:
# Common path prefix of every per-site dataset file.
# NOTE(review): the trailing space is part of the glob pattern below;
# confirm it is intentional.
file_dir = "/Users/Nick/Desktop/datafundamentals/thgfd/data/stratified "
# glob returns matches in arbitrary order; sort them so the sites are
# processed, and the results written, in a stable order.
files = sorted(glob.glob(file_dir + "dataset*.csv"))
# Collects one row of metrics per modelled site.
data_list = []

# Processing and Modelling the data
This is where the modelling happens, via a for loop. This whole process is automated by looping through a directory containing each of the datasets, producing a model, storing the model's metrics, writing to a .csv file and then moving onto the next file.

#### Data read and validate
The data is first read in, a variable of the file's name is created and then the data is checked for the presence of fraud. If this presence is confirmed, then the data is split into a testing and training set, with labels split out too. 

#### Modelling
The training data is then passed through a random forest, using 80 trees. Once the model has been trained, variable importances are extracted and unimportant variables are then removed from the training and testing set. The model is then fitted again, using only the important variables.

#### Testing
The testing data is then passed through the model and, using cross validation, accuracy, precision, recall, f-score and auc are extracted. A confusion matrix and ROC plot are also created and saved to a .png file. The final model metrics are then written to csv and the loop proceeds to the next file.

In [13]:
import warnings

# Silence all library warnings for the duration of the run.
warnings.filterwarnings('ignore')

# Output locations for the figures and the selected-variable lists.
plot_dir = "/Users/sunmengnan/Desktop/FD/plot/"
result_dir = "/Users/sunmengnan/Desktop/FD/result/"

# For every dataset file: fit a forest to rank the variables, keep the
# informative ones, refit, score, plot, and collect one row of metrics.
for file in files:
    # ``file.strip(file_dir)`` would strip a *set of characters* from both
    # ends rather than the prefix, mangling the name; cut the prefix itself.
    filename = file[len(file_dir):] if file.startswith(file_dir) else file
    site = data_reader(file)
    _, num_fraud = file_checker(site)
    if not num_fraud > 0:
        # No fraud recorded for this site, so there is nothing to model.
        continue

    train, test = train_test_split(site, test_size=0.5, random_state=42)
    train_x = train.loc[:, train.columns != "fraud_status"]
    train_y = train['fraud_status']
    test_y = test['fraud_status'].values

    # --- Pass 1: rank every variable by importance -----------------------
    # Seeded like the final model so the variable selection is reproducible.
    selector = RandomForestClassifier(80, random_state=42)
    selector.fit(train_x, train_y)
    var_importance = pd.DataFrame(
        {"Names": train_x.columns,
         "Importances": selector.feature_importances_},
        columns=["Names", "Importances"],
    )
    var_importance = var_importance.sort_values(
        by="Importances", ascending=False).reset_index(drop=True)

    sns.set_style('ticks')
    fig, ax = plt.subplots()
    fig.set_size_inches(w=20, h=20)
    sns.barplot(y="Names", x="Importances", data=var_importance, ax=ax)
    sns.despine()
    fig.savefig(plot_dir + filename + "variable_importance_no_smote.png")
    # Close each figure once saved; three 20x20 inch figures per site would
    # otherwise stay in memory for the whole run.
    plt.close(fig)

    # Keep every variable whose importance is not below the cut-off.
    keep_mask = ~(var_importance["Importances"] < 0.0001)
    important = var_importance.loc[keep_mask, "Names"]
    important.to_csv(result_dir + filename + "important_variables.csv",
                     index=True)

    # --- Pass 2: refit on the retained variables only --------------------
    # ``.values`` replaces ``DataFrame.as_matrix()``, which pandas removed.
    train_x = train_x[important].values
    test_x = test[important].values

    clf = RandomForestClassifier(80, n_jobs=-1, random_state=42)
    clf.fit(train_x, train_y)

    # Use at most 10 folds, and no more folds than there are fraud cases in
    # the test half. Cross-validation needs at least 2 folds, so a site with
    # fewer fraud cases is skipped instead of aborting the whole run.
    test_fraud = int(test_y.sum())
    if test_fraud < 2:
        logging.warning(
            "Skipping evaluation of %s: only %d fraud case(s) in the test split",
            filename, test_fraud)
        continue
    cv = min(test_fraud, 10)

    # NOTE(review): data_scorer cross-validates on the test half, refitting
    # the estimator per fold, so these metrics do not come from the ``clf``
    # fitted above -- confirm this is the intended evaluation.
    accuracy, recall, precision, f_score, auc = data_scorer(clf, test_x, test_y, cv)
    results = [filename, accuracy, recall, precision, f_score, auc]

    y_prob = clf.predict_proba(test_x)
    y_pred = clf.predict(test_x)

    # NOTE(review): newer scikit-plot releases deprecate ``plot_roc_curve``
    # in favour of ``plot_roc`` -- confirm against the installed version.
    fig, ax = plt.subplots()
    fig.set_size_inches(w=20, h=20)
    skplt.metrics.plot_roc_curve(test_y, y_prob,
                                 title="Random Forest ROC Site " + filename,
                                 ax=ax, title_fontsize=46, text_fontsize=30)
    fig.savefig(plot_dir + filename + "rf_roc_no_smote.png")
    plt.close(fig)

    fig, ax = plt.subplots()
    fig.set_size_inches(w=20, h=20)
    skplt.metrics.plot_confusion_matrix(test_y, y_pred, normalize=True, ax=ax,
                                        text_fontsize=30, title_fontsize=46,
                                        title="Normalised Confusion Matrix")
    fig.savefig(plot_dir + filename + "rf_confusion_no_smote.png")
    plt.close(fig)

    data_list.append(results)

# Save Results
This saves all results into a .csv file called "random_forest_results.csv".

In [None]:
# Assemble the per-site metric rows into one table and write it to disk.
# The "recall" label previously carried a stray leading space (" recall"),
# which leaked into the CSV header.
results_df = pd.DataFrame(
    data_list,
    columns=["File", "Accuracy", "recall", "precision", "f_score", "auc"],
)
results_df.to_csv("random_forest_results.csv",
                  index=False)