### Data Cleaining and Preprocessing

- Load data into a dataframe
- Look into each of the columns
- over_sampling/under_sampling for imbalance of class (is_fraud) (To-do)
- feature selection (to-do)
- split data into train and test


In [1]:
import pandas as pd

from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report

import pickle

import warnings 
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', 500)

In [7]:
# pip install pandas scikit-learn xgboost pyspark numpy matplotlib imblearn

In [3]:
# df_train = pd.read_csv('./data/fraudTrain.csv', low_memory=False, index_col=0)
# df_test = pd.read_csv('./data/fraudTest.csv', low_memory=False, index_col=0)

# df = pd.concat([df_train, df_test],ignore_index=True)

In [None]:
df = pd.read_csv('./data/sample.csv', low_memory=False, index_col=0)

In [26]:
df.shape

(1852394, 22)

In [27]:
df.is_fraud.value_counts()

is_fraud
0    1842743
1       9651
Name: count, dtype: int64

In [28]:
df.head()

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,Malad City,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


### Looking into columns and cleaning

In [11]:
df_removed = df[['cc_num','first', 'last', 'street', 'city', 'state', 'zip', 'dob', 'trans_num','trans_date_trans_time']]
df = df.drop(['cc_num','first', 'last', 'street', 'city', 'state', 'zip', 'dob', 'trans_num','trans_date_trans_time'],axis=1)


In [12]:
df.head(2)

Unnamed: 0,merchant,category,amt,gender,lat,long,city_pop,job,unix_time,merch_lat,merch_long,is_fraud
0,"fraud_Rippin, Kub and Mann",misc_net,4.97,F,36.0788,-81.1781,3495,"Psychologist, counselling",1325376018,36.011293,-82.048315,0
1,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,F,48.8878,-118.2105,149,Special educational needs teacher,1325376044,49.159047,-118.186462,0


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1852394 entries, 0 to 1852393
Data columns (total 12 columns):
 #   Column      Dtype  
---  ------      -----  
 0   merchant    object 
 1   category    object 
 2   amt         float64
 3   gender      object 
 4   lat         float64
 5   long        float64
 6   city_pop    int64  
 7   job         object 
 8   unix_time   int64  
 9   merch_lat   float64
 10  merch_long  float64
 11  is_fraud    int64  
dtypes: float64(5), int64(3), object(4)
memory usage: 169.6+ MB


### Sampling and handling the minority class using SMOTE

### Encoding object columns

In [29]:
cols = ['merchant', 'category', 'gender', 'job']
def encode(df):
    df_obj = df.select_dtypes(include=['object'])
    encoders = {}
    for col in cols:
        encoder = LabelEncoder()
        df[col+'_label'] = encoder.fit_transform(df[col])
        encoders[col] = encoder
    with open('./encoders/LE_model_v1.pkl', 'wb') as f:
        pickle.dump(encoders, f)
    return df

df = encode(df)

In [30]:
df.head()

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud,merchant_label,category_label,gender_label,job_label
0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,...,1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0,514,8,0,372
1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,...,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0,241,4,0,431
2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,Malad City,...,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0,390,0,1,308
3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,...,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0,360,2,1,330
4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,...,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0,297,9,1,116


### Splitting data into train and test

In [31]:
train, test = train_test_split(df, test_size=0.1, shuffle=True, random_state=42)
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)
train.shape, test.shape

((1667154, 26), (185240, 26))

In [36]:
train = train.drop(['merchant','gender','job','category','cc_num','first', 'last', 'street', 'city', 'state', 'zip', 'dob','trans_date_trans_time'],axis=1)



In [37]:
train.to_csv('./data/clean_train.csv')
test.to_csv('./data/clean_test.csv')

In [17]:
train.head()

Unnamed: 0,merchant,category,amt,gender,lat,long,city_pop,job,unix_time,merch_lat,merch_long,is_fraud
0,514,8,4.97,0,36.0788,-81.1781,3495,372,1325376018,36.011293,-82.048315,0
1,241,4,107.23,0,48.8878,-118.2105,149,431,1325376044,49.159047,-118.186462,0
2,390,0,220.11,1,42.1808,-112.262,4154,308,1325376051,43.150704,-112.154481,0
3,360,2,45.0,1,46.2306,-112.1138,1939,330,1325376076,47.034331,-112.561071,0
4,297,9,41.96,1,38.4207,-79.4629,99,116,1325376186,38.674999,-78.632459,0


In [38]:
train.head(2)

Unnamed: 0,amt,lat,long,city_pop,trans_num,unix_time,merch_lat,merch_long,is_fraud,merchant_label,category_label,gender_label,job_label
0,85.75,36.4899,-79.4736,3402,db9872c1e5c88708e700a23a04435ef3,1341848490,36.974911,-80.4626,0,260,5,1,251
1,8.45,40.817,-74.0,13835,61cb27a0aa621405f8adde59ea363003,1344061258,41.076082,-74.337634,0,421,11,1,368
