# Classifying the Risk of Default Payments of Purchases for an Online Trader

Name: Terry Lay

Student Number: N01601584

In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

## Model Training

### Importing the Training Data

In [2]:
# Read the training set and discard the ORDER_ID column in one step, so the
# frame contains only the columns used for modelling.
training = pd.read_csv('training-set.csv').drop(columns=['ORDER_ID'])

In [3]:
# Show the remaining column names (ORDER_ID has already been dropped).
training.columns

Index(['VALUE_ORDER', 'AMOUNT_ORDER', 'SESSION_TIME', 'CLASS_yes',
       'B_EMAIL_yes', 'B_TELEFON_yes', 'FLAG_LRIDENTISCH_yes',
       'FLAG_NEWSLETTER_yes', 'Z_METHODE_Eurocard', 'Z_METHODE_Visa',
       'Z_METHODE_check', 'Z_METHODE_debit_card', 'Z_METHODE_debit_note',
       'WEEKDAY_ORDER_Monday', 'WEEKDAY_ORDER_Saturday',
       'WEEKDAY_ORDER_Sunday', 'WEEKDAY_ORDER_Thursday',
       'WEEKDAY_ORDER_Tuesday', 'WEEKDAY_ORDER_Wednesday', 'CHK_LADR_yes',
       'CHK_RADR_yes', 'CHK_KTO_yes', 'CHK_CARD_yes', 'CHK_COOKIE_yes',
       'CHK_IP_yes', 'FAIL_LPLZ_yes', 'FAIL_LORT_yes', 'FAIL_LPLZORTMATCH_yes',
       'FAIL_RPLZ_yes', 'FAIL_RORT_yes', 'FAIL_RPLZORTMATCH_yes',
       'NEUKUNDE_yes'],
      dtype='object')

### Fitting a Logistic Regression Model

In [4]:
# CLASS_yes is the target; every other column is used as a feature.
y_train = training['CLASS_yes']
X_train = training.drop(columns=['CLASS_yes'])

In [5]:
# Errors on class 1 are weighted ten times as heavily as errors on class 0;
# random_state is pinned so repeated runs build the same estimator.
log_model = LogisticRegression(
    class_weight={0: 1, 1: 10},
    random_state=0,
    n_jobs=-1,
)

In [6]:
# Fit on the full training frame; the fitted estimator is the cell's output.
log_model.fit(X=X_train, y=y_train)

LogisticRegression(class_weight={0: 1, 1: 10}, n_jobs=-1, random_state=0)

In [7]:
# In-sample check: predict on the same rows the model was fitted on and
# tabulate true class (rows) against predicted class (columns).
y_pred = log_model.predict(X=X_train)
cm = confusion_matrix(y_true=y_train, y_pred=y_pred)
print(cm)

[[23670  4584]
 [  869   877]]


In [8]:
# Per-class precision / recall / F1 for the in-sample predictions.
train_report = classification_report(y_true=y_train, y_pred=y_pred)
print(train_report)

              precision    recall  f1-score   support

           0       0.96      0.84      0.90     28254
           1       0.16      0.50      0.24      1746

    accuracy                           0.82     30000
   macro avg       0.56      0.67      0.57     30000
weighted avg       0.92      0.82      0.86     30000



## Classifying the Test Data

### Importing the Test Data

In [9]:
# Keep the full test frame (ORDER_ID is needed for the result file) and build
# a separate feature matrix without the ORDER_ID column.
testing = pd.read_csv('testing-set.csv')
X_test = testing.drop(columns=['ORDER_ID'])

In [10]:
# Predict a class label for every row of the test set with the fitted model.
predictions = log_model.predict(X=X_test)

In [11]:
# Attach the predicted label to each order as the text 'yes' / 'no'.
# The numeric predictions are translated to text *before* the column is
# created, so CLASS never starts as an integer column that is then overwritten
# with strings through .loc. Writing a string into an integer column is a
# setitem with an incompatible dtype, which pandas 2.1+ deprecates.
# The model is fitted on a 0/1 target, so every prediction is 0 or 1.
label_names = {0: 'no', 1: 'yes'}
testing['CLASS'] = pd.Series(predictions, index=testing.index).map(label_names)

In [12]:
# The result file uses a hyphenated header for the order id column.
testing = testing.rename({'ORDER_ID': 'ORDER-ID'}, axis='columns')

In [13]:
# Keep only the order id and its predicted label for the output file.
result = testing.loc[:, ['ORDER-ID', 'CLASS']]

In [14]:
# Write the predictions as a tab-separated text file without the row index.
result.to_csv(
    'classification-result.txt',
    sep='\t',
    index=False,
)