<a href="https://colab.research.google.com/github/zingisamatwana/Fraud-Detection-Challenge/blob/master/Copy_of_fraudDetection_Hakathon.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [0]:
# UPLOADING DATA
# ----------------------------------
# from google.colab import files
# files.upload()
# files.upload()
# ---------------------------------

In [0]:
! pip install -U imbalanced-learn

Collecting imbalanced-learn
[?25l  Downloading https://files.pythonhosted.org/packages/e6/62/08c14224a7e242df2cef7b312d2ef821c3931ec9b015ff93bb52ec8a10a3/imbalanced_learn-0.5.0-py3-none-any.whl (173kB)
[K     |████████████████████████████████| 174kB 2.8MB/s 
Installing collected packages: imbalanced-learn
  Found existing installation: imbalanced-learn 0.4.3
    Uninstalling imbalanced-learn-0.4.3:
      Successfully uninstalled imbalanced-learn-0.4.3
Successfully installed imbalanced-learn-0.5.0


In [0]:
train = pd.read_csv('training.csv', parse_dates=['TransactionStartTime'])
test = pd.read_csv('test.csv',parse_dates=['TransactionStartTime'])

In [0]:
removed_cols = ['TransactionId','BatchId','AccountId','SubscriptionId', 'CustomerId','CurrencyCode', 'CountryCode']
train_df = train.drop(removed_cols, axis=1)
test_df = test.drop(removed_cols, axis=1)

In [0]:

def create_date_cols(input_df, date_col):
  df = input_df.copy()
  df['year'] = df[date_col].dt.year
  df['month'] = df[date_col].dt.month
  df['day'] = df[date_col].dt.day 
  df['hour'] = df[date_col].dt.hour
  df['minutes'] = df[date_col].dt.minute  
  df['seconds'] = df[date_col].dt.second
  df['dayofweek'] = df[date_col].dt.dayofweek
  return df

In [0]:

train_df = create_date_cols(train_df, 'TransactionStartTime')
test_df = create_date_cols(test_df, 'TransactionStartTime')


In [0]:
price_strats_dict = {0:'zero',1:'one',2:'two',4:'four'}
train.PricingStrategy=train.PricingStrategy.map(price_strats_dict)
test.PricingStrategy=test.PricingStrategy.map(price_strats_dict)

In [0]:
# to dummies
to_dummy_cols = ['ProviderId','ProductId','ProductCategory','ChannelId','PricingStrategy']
train_dummy_df = pd.get_dummies(train_df,columns=to_dummy_cols, drop_first=True)

In [0]:
# to dummies
test_dummy_df = pd.get_dummies(test_df,columns=to_dummy_cols, drop_first=True)


In [0]:
# create training and test data
trainer = train_dummy_df.drop('TransactionStartTime', axis=1)
X_test = test_dummy_df.drop('TransactionStartTime', axis=1)

In [0]:
# Separate input features and target
y_train = trainer.FraudResult
X_train = trainer.drop('FraudResult', axis=1)

# training the models

In [0]:
# import models
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [0]:
X_test = X_test[[col for col in X_test.columns.values if col in X_train.columns.values]]
X_train = X_train[[col for col in X_train.columns.values if col in X_test.columns.values]]


## Ensemble Methods Random Forest And Gradient Boosting Algorithms

In [0]:
# enseble algorithms random forest amd gradient boositng 
rf = RandomForestClassifier(n_estimators=100, n_jobs=-1,class_weight='balanced').fit(X_train,y_train)
pred_rf = rf.predict(X_test)
submission_binary_rf = pd.DataFrame({'TransactionId':test.TransactionId, 'FraudResult': pred_rf})
submission_binary_rf.to_csv('finale_rf2.csv',sep=',', index=False)

In [0]:
pred_test_rf = rf.predict(X_train)
print('confusion_metrics: \n',metrics.confusion_matrix(y_train,pred_test_rf))
print('f1_score: ', metrics.f1_score(y_train,pred_test_rf))

confusion_metrics: 
 [[95469     0]
 [    0   193]]
f1_score:  1.0


## Data Preprocessing And Feature Engineering 

In [0]:
from  imblearn.over_sampling import SMOTE

In [0]:
smote_algo = SMOTE(random_state=0)

In [0]:
smote_data_X, smote_data_Y = smote_algo.fit_resample(X_train,y_train)
smote_data_X= pd.DataFrame(data=smote_data_X, columns=X_train.columns.values)
smote_data_Y= pd.DataFrame(data=smote_data_Y, columns=['FraudResult'])

In [0]:
smote_data_X.shape,X_train.shape,smote_data_Y.shape,y_train.shape


((190938, 45), (95662, 45), (190938, 1), (95662,))

In [0]:
rf_smote = RandomForestClassifier(n_estimators=3000, n_jobs=-1).fit(smote_data_X, smote_data_Y.values.ravel())

In [0]:
pred_test_rf_smote = rf_smote.predict(smote_data_X)
print('f1_score: ', metrics.f1_score(smote_data_Y,pred_test_rf_smote))
print('confusion_metrics: \n',metrics.confusion_matrix(smote_data_Y,pred_test_rf_smote))


f1_score:  1.0
confusion_metrics: 
 [[95469     0]
 [    0 95469]]


In [0]:
pred_rf_smote = rf_smote.predict(X_test)
submission_binary_rf_smote = pd.DataFrame({'TransactionId':test.TransactionId, 'FraudResult': pred_rf_smote})
submission_binary_rf_smote.to_csv('final_rf_smote.csv',sep=',', index=False)
