# Classifying the Risk of Default Payments of Purchases for an Online Trader

# Data Preparation

Name: Terry Lay

Student Number: N01601584

In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

## Importing the Data

In [2]:
# Both splits are tab-separated text files; load them with identical options.
training, testing = (
    pd.read_csv(path, sep='\t')
    for path in ('data/risk-train.txt', 'data/risk-test.txt')
)

In [3]:
def check_missing(dataset, placeholder='?'):
    """Count placeholder entries per column of a DataFrame.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table to inspect.
    placeholder : object, default '?'
        Cell value that is counted as a missing-data marker.

    Returns
    -------
    pandas.Series
        Number of cells equal to ``placeholder`` in each column, restricted
        to the columns where that count is non-zero. Empty when no cell
        matches.
    """
    # isin() performs an exact match, so NaN cells are not counted here.
    missing = dataset.isin([placeholder]).sum(axis=0)
    return missing[missing != 0]

#### Replacing the generic `credit_card` payment method with the card type from Z_CARD_ART, then dropping Z_CARD_ART, Z_CARD_VALID and Z_LAST_NAME.

In [4]:
# Where the payment method is the generic 'credit_card' label, substitute the
# value held in Z_CARD_ART for the same row.
paid_by_card = training['Z_METHODE'] == 'credit_card'
training.loc[paid_by_card, 'Z_METHODE'] = training['Z_CARD_ART']
paid_by_card = testing['Z_METHODE'] == 'credit_card'
testing.loc[paid_by_card, 'Z_METHODE'] = testing['Z_CARD_ART']

# Z_CARD_ART has been folded into Z_METHODE; it is removed together with
# Z_CARD_VALID and Z_LAST_NAME.
card_columns = ['Z_CARD_ART', 'Z_CARD_VALID', 'Z_LAST_NAME']
training = training.drop(card_columns, axis='columns')
testing = testing.drop(card_columns, axis='columns')

#### Dropping the ANUMMER_01 to ANUMMER_10 columns, which hold item IDs.

In [5]:
# The columns to remove are ANUMMER_01 through ANUMMER_10; build the names
# once and drop the same set from both splits.
item_id_columns = ['ANUMMER_{:02d}'.format(slot) for slot in range(1, 11)]

training = training.drop(item_id_columns, axis='columns')
testing = testing.drop(item_id_columns, axis='columns')

#### Dropping B_BIRTHDATE, TIME_ORDER and columns related to returning customers.

In [6]:
# Same seven columns are removed from both splits, so list them once.
unused_columns = [
    'B_BIRTHDATE',
    'AMOUNT_ORDER_PRE',
    'VALUE_ORDER_PRE',
    'DATE_LORDER',
    'MAHN_AKT',
    'MAHN_HOECHST',
    'TIME_ORDER',
]

training = training.drop(unused_columns, axis='columns')
testing = testing.drop(unused_columns, axis='columns')

#### Scaling the Data

In [7]:
scaled_columns = ['VALUE_ORDER', 'SESSION_TIME']

# The scaler is fitted on the training split only; the test split is
# transformed with the ranges observed in training.
scaler = MinMaxScaler()
scaler.fit(training[scaled_columns])
training[scaled_columns] = scaler.transform(training[scaled_columns])
testing[scaled_columns] = scaler.transform(testing[scaled_columns])

#### Creating Dummy Variables (One-Hot Encoding)

In [8]:
# Expand the categorical columns into indicator columns; drop_first omits the
# first level of each so k levels yield k-1 columns.
training_dummies = pd.get_dummies(data=training, drop_first=True)
training_dummies.columns

Index(['ORDER_ID', 'VALUE_ORDER', 'AMOUNT_ORDER', 'SESSION_TIME', 'CLASS_yes',
       'B_EMAIL_yes', 'B_TELEFON_yes', 'FLAG_LRIDENTISCH_yes',
       'FLAG_NEWSLETTER_yes', 'Z_METHODE_Eurocard', 'Z_METHODE_Visa',
       'Z_METHODE_check', 'Z_METHODE_debit_card', 'Z_METHODE_debit_note',
       'WEEKDAY_ORDER_Monday', 'WEEKDAY_ORDER_Saturday',
       'WEEKDAY_ORDER_Sunday', 'WEEKDAY_ORDER_Thursday',
       'WEEKDAY_ORDER_Tuesday', 'WEEKDAY_ORDER_Wednesday', 'CHK_LADR_yes',
       'CHK_RADR_yes', 'CHK_KTO_yes', 'CHK_CARD_yes', 'CHK_COOKIE_yes',
       'CHK_IP_yes', 'FAIL_LPLZ_yes', 'FAIL_LORT_yes', 'FAIL_LPLZORTMATCH_yes',
       'FAIL_RPLZ_yes', 'FAIL_RORT_yes', 'FAIL_RPLZORTMATCH_yes',
       'NEUKUNDE_yes'],
      dtype='object')

In [9]:
# Encoding the test split on its own with drop_first=True makes its columns
# depend on which categories happen to occur in it: a level missing from the
# test data drops a column (or causes a different "first" level to be
# removed), and an unseen level adds one. Instead, encode every level and then
# align to the training feature columns, so the test matrix always has exactly
# the training schema. Levels absent from the test split become all-zero
# columns; levels unseen in training are discarded.
feature_columns = training_dummies.columns.drop('CLASS_yes', errors='ignore')
testing_dummies = pd.get_dummies(testing).reindex(columns=feature_columns,
                                                  fill_value=0)
testing_dummies.columns

Index(['ORDER_ID', 'VALUE_ORDER', 'AMOUNT_ORDER', 'SESSION_TIME',
       'B_EMAIL_yes', 'B_TELEFON_yes', 'FLAG_LRIDENTISCH_yes',
       'FLAG_NEWSLETTER_yes', 'Z_METHODE_Eurocard', 'Z_METHODE_Visa',
       'Z_METHODE_check', 'Z_METHODE_debit_card', 'Z_METHODE_debit_note',
       'WEEKDAY_ORDER_Monday', 'WEEKDAY_ORDER_Saturday',
       'WEEKDAY_ORDER_Sunday', 'WEEKDAY_ORDER_Thursday',
       'WEEKDAY_ORDER_Tuesday', 'WEEKDAY_ORDER_Wednesday', 'CHK_LADR_yes',
       'CHK_RADR_yes', 'CHK_KTO_yes', 'CHK_CARD_yes', 'CHK_COOKIE_yes',
       'CHK_IP_yes', 'FAIL_LPLZ_yes', 'FAIL_LORT_yes', 'FAIL_LPLZORTMATCH_yes',
       'FAIL_RPLZ_yes', 'FAIL_RORT_yes', 'FAIL_RPLZORTMATCH_yes',
       'NEUKUNDE_yes'],
      dtype='object')

In [10]:
# Write each encoded split to its CSV file, leaving out the row index.
for encoded, out_path in ((training_dummies, 'training-set.csv'),
                          (testing_dummies, 'testing-set.csv')):
    encoded.to_csv(out_path, index=False)