# data = Kaggle IEEE fraud
## n = 10K
## binary classification (is fraud)

### fn -- check for non-nums

In [1]:
# print n unique vals for non-num cols
def col_types(df):
    for c in df.columns:
        if not np.issubdtype(df[c].dtype, np.number):
            print(c , ' NOT num...', df[c].nunique())
            if (df[c].nunique() <= 5):
                print(df[c].unique())
            print('')

### csv -> pd

In [2]:
%%time
# ~ 15 secs

import pandas as pd

df = pd.read_csv('train_transaction.csv', low_memory=True)

CPU times: user 14.1 s, sys: 2.55 s, total: 16.7 s
Wall time: 16.5 s


### check types

In [3]:
col_types(df)

ProductCD  NOT num... 5
['W' 'H' 'C' 'S' 'R']

card4  NOT num... 4
['discover' 'mastercard' 'visa' 'american express' nan]

card6  NOT num... 4
['credit' 'debit' nan 'debit or credit' 'charge card']

P_emaildomain  NOT num... 59

R_emaildomain  NOT num... 60

M1  NOT num... 2
['T' nan 'F']

M2  NOT num... 2
['T' nan 'F']

M3  NOT num... 2
['T' nan 'F']

M4  NOT num... 3
['M2' 'M0' nan 'M1']

M5  NOT num... 2
['F' 'T' nan]

M6  NOT num... 2
['T' 'F' nan]

M7  NOT num... 2
[nan 'F' 'T']

M8  NOT num... 2
[nan 'F' 'T']

M9  NOT num... 2
[nan 'F' 'T']



### booleans (T/F) -> integers

In [4]:
for i in range(1,10):
    df['M' + str(i)] = df['M' + str(i)].eq('T').mul(1)

### remove cols w/ large n unique vals

In [5]:
df = df.drop(columns=['P_emaildomain', 'R_emaildomain'], axis=1)

### one hot encode

In [6]:
df = pd.concat([df, pd.get_dummies(df['ProductCD'], drop_first=True)], axis=1)
df = df.drop(columns=['ProductCD'], axis=1)

In [7]:
df = pd.concat([df, pd.get_dummies(df['card4'])], axis=1)
df = df.drop(columns=['card4'], axis=1)

In [8]:
df = pd.concat([df, pd.get_dummies(df['card6'])], axis=1)
df = df.drop(columns=['card6'], axis=1)

### check types again

In [9]:
col_types(df)

### impute

In [10]:
# impute with mean value
for c in df.columns:
    df[c] = df[c].fillna(df[c].mean())

### shuffle, split, save

In [11]:
n_div_4 = int(df.shape[0] / 4)
n_div_4

147635

In [12]:
from sklearn.model_selection import train_test_split

splits = []

df, df_0   = train_test_split(df, test_size=n_div_4, random_state=0, shuffle=True)
splits.append(df_0)

df, df_1   = train_test_split(df, test_size=n_div_4, random_state=0, shuffle=True)
splits.append(df_1)

df_3, df_2 = train_test_split(df, test_size=n_div_4, random_state=0, shuffle=True)
splits.append(df_2)
splits.append(df_3)

In [14]:
# save memory
del df

### save splits

In [20]:
for i, split in enumerate(splits):
    print(i)
    split.to_csv('fraud_' + str(i) + '.csv')

0
1
2
3


In [27]:
!wc -l *.csv

  147636 fraud_0.csv
  147636 fraud_1.csv
  147636 fraud_2.csv
  147636 fraud_3.csv
  590541 train_transaction.csv
 1181085 total


In [28]:
!ls -l *.csv

-rw-r--r--  1 wihill  staff  583288129 Jan 21 22:05 fraud_0.csv
-rw-r--r--  1 wihill  staff  583028037 Jan 21 22:06 fraud_1.csv
-rw-r--r--  1 wihill  staff  583227557 Jan 21 22:07 fraud_2.csv
-rw-r--r--  1 wihill  staff  583322065 Jan 21 22:08 fraud_3.csv
-rw-r--r--@ 1 wihill  staff  683351067 Dec 11 23:13 train_transaction.csv


In [29]:
!du -h *.csv

560M	fraud_0.csv
561M	fraud_1.csv
560M	fraud_2.csv
560M	fraud_3.csv
655M	train_transaction.csv


In [24]:
!head -2 fraud_0.csv | tail -1

7681,2994681,0,242834,25.0,9803,583.0,150.0,226.0,269.0,87.0,118.50218030881064,231.85542296754988,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,169.56323145317248,28.343347718075332,140.0024405978872,42.33596485915518,69.80571686228429,41.63894955931855,7.791666030883789,0.7916659712791443,123.98213667937759,146.6214654959149,54.03753264211876,17.901294550619834,57.724444015630276,163.744578971615,0,0,0,0,0,0,0,0,0,0.9999453820525425,1.0452043835722065,1.0780747494803262,0.8464560984151157,0.8769907438643162,1.0456863066380082,1.072869980369667,1.0277041506427247,1.0415289169903583,0.4639152072429824,0.47898654792082324,0.5597113128733233,0.5991657385216156,0.9995004538677894,0.12234215216913816,0.12345981374898682,0.1340396954517977,0.13536339551419235,0.8163711180697693,0.8478425243990382,0.12968373092929186,0.13229225586869517,1.0347913471612369,1.0580970208001679,0.9776603747179119,0.9880400492159847,0.0007755599484515042,0.0008299852079919606,0.3878402307631004,0.40

In [25]:
!head -2 fraud_1.csv | tail -1

577787,3564787,0,15370039,26.95,13960,206.0,150.0,166.0,123.0,87.0,118.50218030881064,231.85542296754988,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,169.56323145317248,28.343347718075332,0.0,42.33596485915518,69.80571686228429,41.63894955931855,146.0581076468787,0.5610571265961662,0.0,0.0,54.03753264211876,17.901294550619834,57.724444015630276,0.0,1,1,1,0,0,0,0,0,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.036439074989322105,1.073915431081823,1.12526694734273,0.0376960156