# Data preprocessing for credit card transaction data with Pandas

This notebook describes how to preprocess data using pandas.
The main steps are:

<ul>
<li> normalize the dataset </li>
<li> split the data into multiple chunks (training, validation, testing)</li>
<li> shuffle the data using the ".sample(frac=1)" command</li>
</ul>


In [24]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

# Load the raw transaction table from disk and preview its first five rows.
df_csv = pd.read_csv(filepath_or_buffer='./data/creditcard.csv')
df_csv.head(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [25]:
# Min-max scale every column of df_csv into [0, 1]: (x - min) / (max - min).
# Every column is scaled, including `Class`; its values come out as floats
# (0.0 in the preview rows).
# NOTE(review): the min/max statistics are taken over the full table before it
# is split, so validation/test rows influence the scaling of the training rows.
col_min = df_csv.min()
col_range = df_csv.max() - col_min

# A constant column has max == min, which would turn the whole column into NaN
# (0 / 0). Use a divisor of 1 there so such a column maps to 0 instead.
col_range = col_range.replace(0, 1)

df_norm = (df_csv - col_min) / col_range
df_norm.head()


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,0.935192,0.76649,0.881365,0.313023,0.763439,0.267669,0.266815,0.786444,0.475312,...,0.561184,0.522992,0.663793,0.391253,0.585122,0.394557,0.418976,0.312697,0.005824,0.0
1,0.0,0.978542,0.770067,0.840298,0.271796,0.76612,0.262192,0.264875,0.786298,0.453981,...,0.55784,0.480237,0.666938,0.33644,0.58729,0.446013,0.416345,0.313423,0.000105,0.0
2,6e-06,0.935217,0.753118,0.868141,0.268766,0.762329,0.281122,0.270177,0.788042,0.410603,...,0.565477,0.54603,0.678939,0.289354,0.559515,0.402727,0.415489,0.311911,0.014739,0.0
3,6e-06,0.941878,0.765304,0.868484,0.213661,0.765647,0.275559,0.266803,0.789434,0.414999,...,0.559734,0.510277,0.662607,0.223826,0.614245,0.389197,0.417669,0.314371,0.004807,0.0
4,1.2e-05,0.938617,0.77652,0.864251,0.269796,0.762975,0.263984,0.268968,0.782484,0.49095,...,0.561327,0.547271,0.663392,0.40127,0.566343,0.507497,0.420561,0.31749,0.002724,0.0


In [26]:

# Spot-check the row at position 1 of the normalized frame.
# The original `df_norm.iloc[1][30]` relied on Series.__getitem__ treating an
# integer key as a position on a string-labelled index, which newer pandas
# deprecates. The two-axis `.iloc[row, col]` form is explicitly positional.

# Columns at positions 1..28 (presumably V1..V28 -- confirm against the header).
# Evaluated but not displayed, because it is not the cell's last expression.
df_norm.iloc[1, 1:29]

# Column at position 30 (presumably the Class label -- confirm); this value is
# what the cell displays.
df_norm.iloc[1, 30]



0.0

In [27]:
# Persist the normalized frame as a bare CSV: no header row, no index column.
NORM_FILE = './data/creditcard_norm.csv'
df_norm.to_csv(path_or_buf=NORM_FILE, header=False, index=False)

In [28]:
# Build the training / validation / test sets from the normalized data.
#
# - Non-fraud rows are cut 60% / 20% / 20% by row position (no shuffle happens
#   before the cut, so each part is a contiguous slice of the original order).
# - Fraud rows are cut 50% / 50% between validation and test.
# - The training set therefore contains only non-fraud rows
#   (presumably intended for anomaly-detection style training -- confirm).
# - Each resulting set is shuffled with `.sample(frac=1)`.

# Fixed seed so the shuffles below give the same row order on every run.
SEED = 42

# Split normalized data by label.
df_norm_fraud = df_norm[df_norm.Class == 1.0]      # fraud
df_norm_nonfraud = df_norm[df_norm.Class == 0.0]   # non-fraud

# Positional slicing replaces np.split(DataFrame, ...), which newer pandas
# deprecates; the cut points are the same as before.
n_nonfraud = len(df_norm_nonfraud)
train_end = int(.6 * n_nonfraud)
validate_end = int(.8 * n_nonfraud)
df_norm_nonfraud_train = df_norm_nonfraud.iloc[:train_end]
df_norm_nonfraud_validate = df_norm_nonfraud.iloc[train_end:validate_end]
df_norm_nonfraud_test = df_norm_nonfraud.iloc[validate_end:]

fraud_mid = int(0.5 * len(df_norm_fraud))
df_norm_fraud_validate = df_norm_fraud.iloc[:fraud_mid]
df_norm_fraud_test = df_norm_fraud.iloc[fraud_mid:]

# A single pre-formatted string prints the same on Python 2 and Python 3
# (the earlier mix of `print(...)` and the `print` statement did not).
print('number of non fraud training, test, validation dataset = {} {} {}'.format(
    len(df_norm_nonfraud_train),
    len(df_norm_nonfraud_test),
    len(df_norm_nonfraud_validate)))

print('number of fraud test,fraud validation dataset = {} {}'.format(
    len(df_norm_fraud_test),
    len(df_norm_fraud_validate)))

# Create the train, validate and test datasets, each shuffled.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df_train = df_norm_nonfraud_train.sample(frac=1, random_state=SEED)
df_validate = pd.concat(
    [df_norm_nonfraud_validate, df_norm_fraud_validate]
).sample(frac=1, random_state=SEED)
df_test = pd.concat(
    [df_norm_nonfraud_test, df_norm_fraud_test]
).sample(frac=1, random_state=SEED)

print('size of train,validate,test data = {} {} {}'.format(
    len(df_train), len(df_validate), len(df_test)))

('number of non fraud training, test, validation dataset = ', 170589, 56863, 56863)
('number of fraud test,fraud validation dataset =', 246, 246)
size of train,validate,test data = 170589 57109 57109


# Training data set

In [29]:
# Preview the first five rows of the shuffled training set.
df_train.head(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
159846,0.65396,0.99444,0.765953,0.807044,0.253598,0.768589,0.25732,0.265826,0.783348,0.495935,...,0.564832,0.542583,0.664601,0.392792,0.600635,0.35863,0.41643,0.311987,3.9e-05,0.0
36776,0.223847,0.94122,0.769319,0.843473,0.267792,0.778211,0.253344,0.266262,0.788193,0.446112,...,0.564783,0.523146,0.665778,0.316106,0.551206,0.333208,0.419669,0.317717,3.9e-05,0.0
73035,0.317989,0.943973,0.773825,0.843239,0.306767,0.772997,0.255055,0.271795,0.780835,0.451908,...,0.561573,0.552387,0.673772,0.39645,0.538665,0.371704,0.425698,0.313827,0.000212,0.0
156260,0.623183,0.947559,0.780913,0.853596,0.371119,0.773915,0.268569,0.273704,0.785388,0.427252,...,0.565912,0.541062,0.664909,0.472773,0.578883,0.424164,0.416922,0.316218,0.004323,0.0
2256,0.010394,0.977896,0.758391,0.869791,0.305722,0.752537,0.265998,0.257369,0.788043,0.481708,...,0.554006,0.491249,0.666001,0.477012,0.599689,0.37665,0.418852,0.314106,0.000973,0.0


# Test data set

In [30]:
# Preview the first five rows of the shuffled test set.
df_test.head(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
273774,0.958904,0.945167,0.774487,0.875228,0.227278,0.765247,0.266729,0.265748,0.788674,0.479787,...,0.561679,0.518685,0.660648,0.459015,0.603621,0.390414,0.422669,0.316114,0.000467,0.0
261981,0.927485,0.993764,0.769756,0.787331,0.267158,0.774262,0.254221,0.270104,0.780702,0.465429,...,0.561637,0.517479,0.663442,0.383752,0.604682,0.543243,0.414582,0.312112,0.001541,0.0
273067,0.957255,0.912098,0.793833,0.826742,0.295973,0.762403,0.252298,0.267936,0.794169,0.437742,...,0.564464,0.521599,0.660692,0.435123,0.642749,0.38417,0.401864,0.313718,0.000984,0.0
252788,0.902634,0.941462,0.780733,0.838821,0.29033,0.772977,0.25936,0.27257,0.787013,0.418952,...,0.563119,0.519496,0.656935,0.290815,0.648394,0.401774,0.414934,0.312023,0.001714,0.0
273429,0.958297,0.99334,0.765582,0.824819,0.253608,0.76611,0.263462,0.262885,0.784811,0.501292,...,0.558979,0.497789,0.669555,0.384363,0.567052,0.275285,0.417694,0.312637,3.9e-05,0.0


# Validate data set

In [31]:
# Preview the first five rows of the shuffled validation set.
df_validate.head(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
218267,0.817399,0.99446,0.757625,0.804437,0.198703,0.765546,0.257372,0.264055,0.783162,0.435827,...,0.562615,0.507757,0.665952,0.232055,0.576666,0.381421,0.415278,0.311628,0.002997,0.0
180616,0.721214,0.924666,0.782946,0.831813,0.190927,0.764579,0.258103,0.274213,0.790166,0.440624,...,0.559019,0.490665,0.663499,0.485732,0.639555,0.415175,0.413515,0.310747,0.006506,0.0
215932,0.811861,0.993317,0.767575,0.799144,0.24581,0.770539,0.25194,0.269565,0.781157,0.45968,...,0.564846,0.539882,0.664961,0.493571,0.601463,0.455206,0.414711,0.311539,0.000866,0.0
207128,0.790147,0.996977,0.758945,0.806687,0.203366,0.764429,0.254372,0.263195,0.781797,0.450289,...,0.567342,0.557755,0.663301,0.252945,0.59567,0.447022,0.415648,0.31159,0.000973,0.0
204859,0.784058,0.961924,0.766434,0.849226,0.224125,0.7705,0.258358,0.267605,0.781708,0.498991,...,0.561251,0.529163,0.669633,0.330218,0.441919,0.255702,0.416564,0.311063,9.7e-05,0.0


In [37]:
# Count the validation rows whose Class label equals 1.
# NOTE(review): the saved output below (56863) matches the non-fraud validation
# count printed by the split cell, not its 246 fraud rows -- the output looks
# stale (execution counts are also non-sequential); re-run to confirm.
len(df_validate.loc[df_validate['Class'] == 1])

56863

# Write into files
Write the training, testing, and validation data into files.


In [47]:
# Output paths for the three splits (relative to the working directory).
# NOTE(review): 'creditcard_tesring.csv' looks like a typo for "testing"; it is
# left unchanged because downstream readers of this path are not visible here.
TRAIN_FILE = 'creditcard_training.csv'
TEST_FILE = 'creditcard_tesring.csv'
VALIDATE_FILE = 'creditcard_validation.csv'

# Write each split as a bare CSV: no header row, no index column.
for frame, path in (
    (df_train, TRAIN_FILE),
    (df_validate, VALIDATE_FILE),
    (df_test, TEST_FILE),
):
    frame.to_csv(path, index=False, header=False)


In [48]:
# Shell out to `head` to show the first lines of the written training file;
# IPython substitutes the Python variable TRAIN_FILE into the command.
!head $TRAIN_FILE

0.261400990787,0.920081173661,0.739618573549,0.846635483264,0.211875205478,0.747219863054,0.265201445517,0.289390808274,0.777545474788,0.4048072684,0.491891094326,0.265551327178,0.664452868446,0.44376239723,0.631950627163,0.284476651591,0.470581452676,0.742050573509,0.56383930726,0.565842969614,0.604834165582,0.573748782649,0.535247306924,0.693081227577,0.467797602723,0.641372908539,0.389837325093,0.407267961081,0.311111755229,0.037989720978,0.0
0.506742210288,0.973964159764,0.738640423594,0.82010776525,0.196860877767,0.753955093069,0.255521771603,0.265874836226,0.78235249496,0.563993732223,0.471831807744,0.238617297587,0.770681953793,0.516385949707,0.631182799881,0.355103125787,0.413136868551,0.734736065727,0.642395251379,0.653631887907,0.590838866382,0.564060756051,0.494823338943,0.661687908698,0.395672246834,0.558785394119,0.243907627883,0.416475806625,0.314553037595,0.0225595107422,0.0
0.307450576416,0.931388300646,0.777800900498,0.851944763997,0.253852162842,0.767592518089,0.261