In [41]:
#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.feature_selection import SelectFromModel

In [42]:
data = pd.read_csv("C:\\Users\\Wimukthi\\Desktop\\DataStorm\\credit_card_default_train.csv") #import dataset

In [43]:
data.shape

(24000, 25)

In [44]:
data.isnull().sum().sum() #no. of missing Values

0

In [45]:
data.dtypes

Client_ID             object
Balance_Limit_V1      object
Gender                object
EDUCATION_STATUS      object
MARITAL_STATUS        object
AGE                   object
PAY_JULY               int64
PAY_AUG                int64
PAY_SEP                int64
PAY_OCT                int64
PAY_NOV                int64
PAY_DEC                int64
DUE_AMT_JULY           int64
DUE_AMT_AUG            int64
DUE_AMT_SEP            int64
DUE_AMT_OCT            int64
DUE_AMT_NOV            int64
DUE_AMT_DEC            int64
PAID_AMT_JULY          int64
PAID_AMT_AUG           int64
PAID_AMT_SEP           int64
PAID_AMT_OCT           int64
PAID_AMT_NOV           int64
PAID_AMT_DEC           int64
NEXT_MONTH_DEFAULT     int64
dtype: object

In [46]:
#replaceing M and K with relavant number of zeros
data['Balance_Limit_V1']=data['Balance_Limit_V1'].str.replace("M","000000")  
data['Balance_Limit_V1']=data['Balance_Limit_V1'].str.replace("K","000")
data['Balance_Limit_V1']=data['Balance_Limit_V1'].astype("float") #convert datatype into float

In [47]:
data["NEXT_MONTH_DEFAULT"].value_counts() #value counts of the target variable

0    18670
1     5330
Name: NEXT_MONTH_DEFAULT, dtype: int64

In [48]:
data.columns

Index(['Client_ID', 'Balance_Limit_V1', 'Gender', 'EDUCATION_STATUS',
       'MARITAL_STATUS', 'AGE', 'PAY_JULY', 'PAY_AUG', 'PAY_SEP', 'PAY_OCT',
       'PAY_NOV', 'PAY_DEC', 'DUE_AMT_JULY', 'DUE_AMT_AUG', 'DUE_AMT_SEP',
       'DUE_AMT_OCT', 'DUE_AMT_NOV', 'DUE_AMT_DEC', 'PAID_AMT_JULY',
       'PAID_AMT_AUG', 'PAID_AMT_SEP', 'PAID_AMT_OCT', 'PAID_AMT_NOV',
       'PAID_AMT_DEC', 'NEXT_MONTH_DEFAULT'],
      dtype='object')

In [49]:
#standardizing numerical features
col_to_norm = ['Balance_Limit_V1','DUE_AMT_JULY', 'DUE_AMT_AUG', 'DUE_AMT_SEP','DUE_AMT_OCT', 
               'DUE_AMT_NOV', 'DUE_AMT_DEC', 'PAID_AMT_JULY','PAID_AMT_AUG', 'PAID_AMT_SEP', 
               'PAID_AMT_OCT', 'PAID_AMT_NOV','PAID_AMT_DEC']
data[col_to_norm]=data[col_to_norm].apply(lambda x: (x-np.mean(x))/np.std(x))

In [50]:
#Creating Dummy Variable for Categorical data
cat_vars=['Gender', 'EDUCATION_STATUS','MARITAL_STATUS', 'AGE', 'PAY_JULY', 'PAY_AUG', 
          'PAY_SEP', 'PAY_OCT','PAY_NOV', 'PAY_DEC']
for var in cat_vars:
    cat_list="var"+'_'+var
    cat_list=pd.get_dummies(data[var], prefix=var)
    data=data.join(cat_list)
data_var=data.columns.values.tolist()
to_keep=[i for i in data_var if i not in cat_vars]

In [51]:
data_final=data[to_keep]
data_final.columns.values

array(['Client_ID', 'Balance_Limit_V1', 'DUE_AMT_JULY', 'DUE_AMT_AUG',
       'DUE_AMT_SEP', 'DUE_AMT_OCT', 'DUE_AMT_NOV', 'DUE_AMT_DEC',
       'PAID_AMT_JULY', 'PAID_AMT_AUG', 'PAID_AMT_SEP', 'PAID_AMT_OCT',
       'PAID_AMT_NOV', 'PAID_AMT_DEC', 'NEXT_MONTH_DEFAULT', 'Gender_F',
       'Gender_M', 'EDUCATION_STATUS_Graduate',
       'EDUCATION_STATUS_High School', 'EDUCATION_STATUS_Other',
       'MARITAL_STATUS_Other', 'MARITAL_STATUS_Single', 'AGE_31-45',
       'AGE_46-65', 'AGE_Less than 30', 'AGE_More than 65', 'PAY_JULY_-2',
       'PAY_JULY_-1', 'PAY_JULY_0', 'PAY_JULY_1', 'PAY_JULY_2',
       'PAY_JULY_3', 'PAY_JULY_4', 'PAY_JULY_5', 'PAY_JULY_6',
       'PAY_JULY_7', 'PAY_JULY_8', 'PAY_AUG_-2', 'PAY_AUG_-1',
       'PAY_AUG_0', 'PAY_AUG_1', 'PAY_AUG_2', 'PAY_AUG_3', 'PAY_AUG_4',
       'PAY_AUG_5', 'PAY_AUG_6', 'PAY_AUG_7', 'PAY_AUG_8', 'PAY_SEP_-2',
       'PAY_SEP_-1', 'PAY_SEP_0', 'PAY_SEP_1', 'PAY_SEP_2', 'PAY_SEP_3',
       'PAY_SEP_4', 'PAY_SEP_5', 'PAY_SEP_6', 'PAY_S

In [52]:
data_final.shape

(24000, 90)

In [53]:
X = data_final[['Balance_Limit_V1', 'DUE_AMT_JULY', 'DUE_AMT_AUG',
       'DUE_AMT_SEP', 'DUE_AMT_OCT', 'DUE_AMT_NOV', 'DUE_AMT_DEC',
       'PAID_AMT_JULY', 'PAID_AMT_AUG', 'PAID_AMT_SEP', 'PAID_AMT_OCT',
       'PAID_AMT_NOV', 'PAID_AMT_DEC', 'Gender_F',
        'EDUCATION_STATUS_Graduate',
       'EDUCATION_STATUS_High School', 
       'MARITAL_STATUS_Other',  'AGE_31-45',
       'AGE_46-65', 'AGE_Less than 30',  'PAY_JULY_-2',
       'PAY_JULY_-1', 'PAY_JULY_1', 'PAY_JULY_2',
       'PAY_JULY_3', 'PAY_JULY_4', 'PAY_JULY_5', 'PAY_JULY_6',
       'PAY_JULY_7', 'PAY_JULY_8', 'PAY_AUG_-2', 'PAY_AUG_-1',
        'PAY_AUG_1', 'PAY_AUG_2', 'PAY_AUG_3', 'PAY_AUG_4',
       'PAY_AUG_5', 'PAY_AUG_6', 'PAY_AUG_7', 'PAY_AUG_8', 'PAY_SEP_-2',
       'PAY_SEP_-1',  'PAY_SEP_1', 'PAY_SEP_2', 'PAY_SEP_3',
       'PAY_SEP_4', 'PAY_SEP_5', 'PAY_SEP_6', 'PAY_SEP_7', 'PAY_SEP_8',
       'PAY_OCT_-2', 'PAY_OCT_-1',  'PAY_OCT_1', 'PAY_OCT_2',
       'PAY_OCT_3', 'PAY_OCT_4', 'PAY_OCT_5', 'PAY_OCT_6', 'PAY_OCT_7',
       'PAY_OCT_8', 'PAY_NOV_-2', 'PAY_NOV_-1',  'PAY_NOV_2',
       'PAY_NOV_3', 'PAY_NOV_4', 'PAY_NOV_5', 'PAY_NOV_6', 'PAY_NOV_7',
       'PAY_NOV_8', 'PAY_DEC_-2', 'PAY_DEC_-1',  'PAY_DEC_2',
       'PAY_DEC_3', 'PAY_DEC_4', 'PAY_DEC_5', 'PAY_DEC_6', 'PAY_DEC_7',
       'PAY_DEC_8']]
y=data_final["NEXT_MONTH_DEFAULT"]

In [54]:
#split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
print("train set:",X_train.shape)
print("test set:",X_test.shape)

train set: (16800, 78)
test set: (7200, 78)


In [55]:
#logistic regression
logistic_regression = LogisticRegression()
logistic_regression.fit(X_train,y_train) #model fit



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [56]:
y_pred = logistic_regression.predict(X_test) #predicting

In [57]:
#model accuracy
from sklearn import metrics 
accuracy = metrics.accuracy_score(y_test, y_pred)
accuracy_percentage = 100 * accuracy
print("Accuracy of the Logistic Regression Model:",accuracy_percentage)

Accuracy of the Logistic Regression Model: 82.43055555555556


In [58]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred)) #classification report

              precision    recall  f1-score   support

           0       0.84      0.95      0.89      5647
           1       0.67      0.36      0.47      1553

    accuracy                           0.82      7200
   macro avg       0.76      0.66      0.68      7200
weighted avg       0.81      0.82      0.80      7200



### Prediction

In [59]:
td = pd.read_csv("C:\\Users\\Wimukthi\\Desktop\\DataStorm\\credit_card_default_test.csv") #import test data

In [60]:
td.shape

(6000, 24)

In [61]:
td.isnull().sum().sum() #no. of missing Values

0

In [62]:
td.dtypes

Client_ID           object
Balance_Limit_V1    object
Gender              object
EDUCATION_STATUS    object
MARITAL_STATUS      object
AGE                 object
PAY_JULY             int64
PAY_AUG              int64
PAY_SEP              int64
PAY_OCT              int64
PAY_NOV              int64
PAY_DEC              int64
DUE_AMT_JULY         int64
DUE_AMT_AUG          int64
DUE_AMT_SEP          int64
DUE_AMT_OCT          int64
DUE_AMT_NOV          int64
DUE_AMT_DEC          int64
PAID_AMT_JULY        int64
PAID_AMT_AUG         int64
PAID_AMT_SEP         int64
PAID_AMT_OCT         int64
PAID_AMT_NOV         int64
PAID_AMT_DEC         int64
dtype: object

In [63]:
#replaceing M and K with relavant number of zeros
td['Balance_Limit_V1']=td['Balance_Limit_V1'].str.replace("M","000000")  
td['Balance_Limit_V1']=td['Balance_Limit_V1'].str.replace("K","000")
td['Balance_Limit_V1']=td['Balance_Limit_V1'].astype("float") #convert datatype into float

In [64]:
#standardizing numerical features
col_to_norm = ['Balance_Limit_V1','DUE_AMT_JULY', 'DUE_AMT_AUG', 'DUE_AMT_SEP','DUE_AMT_OCT', 
               'DUE_AMT_NOV', 'DUE_AMT_DEC', 'PAID_AMT_JULY','PAID_AMT_AUG', 'PAID_AMT_SEP', 
               'PAID_AMT_OCT', 'PAID_AMT_NOV','PAID_AMT_DEC']
td[col_to_norm]=td[col_to_norm].apply(lambda x: (x-np.mean(x))/np.std(x))

In [65]:
#Creating Dummy Variable for Categorical data
cat_vars=['Gender', 'EDUCATION_STATUS','MARITAL_STATUS', 'AGE', 'PAY_JULY', 'PAY_AUG', 
          'PAY_SEP', 'PAY_OCT','PAY_NOV', 'PAY_DEC']
for var in cat_vars:
    cat_list="var"+'_'+var
    cat_list=pd.get_dummies(td[var], prefix=var)
    td=td.join(cat_list)
data_var=td.columns.values.tolist()
to_keep=[i for i in data_var if i not in cat_vars]

In [66]:
td_final=td[to_keep]
td_final.columns.values

array(['Client_ID', 'Balance_Limit_V1', 'DUE_AMT_JULY', 'DUE_AMT_AUG',
       'DUE_AMT_SEP', 'DUE_AMT_OCT', 'DUE_AMT_NOV', 'DUE_AMT_DEC',
       'PAID_AMT_JULY', 'PAID_AMT_AUG', 'PAID_AMT_SEP', 'PAID_AMT_OCT',
       'PAID_AMT_NOV', 'PAID_AMT_DEC', 'Gender_F', 'Gender_M',
       'EDUCATION_STATUS_Graduate', 'EDUCATION_STATUS_High School',
       'EDUCATION_STATUS_Other', 'MARITAL_STATUS_Other',
       'MARITAL_STATUS_Single', 'AGE_31-45', 'AGE_46-65',
       'AGE_Less than 30', 'AGE_More than 65', 'PAY_JULY_-2',
       'PAY_JULY_-1', 'PAY_JULY_0', 'PAY_JULY_1', 'PAY_JULY_2',
       'PAY_JULY_3', 'PAY_JULY_4', 'PAY_JULY_5', 'PAY_JULY_6',
       'PAY_JULY_7', 'PAY_JULY_8', 'PAY_AUG_-2', 'PAY_AUG_-1',
       'PAY_AUG_0', 'PAY_AUG_1', 'PAY_AUG_2', 'PAY_AUG_3', 'PAY_AUG_4',
       'PAY_AUG_5', 'PAY_AUG_6', 'PAY_AUG_7', 'PAY_SEP_-2', 'PAY_SEP_-1',
       'PAY_SEP_0', 'PAY_SEP_2', 'PAY_SEP_3', 'PAY_SEP_4', 'PAY_SEP_5',
       'PAY_SEP_6', 'PAY_SEP_7', 'PAY_SEP_8', 'PAY_OCT_-2', 'PAY_OCT_-1',


In [67]:
td_final['PAY_SEP_1'] = 0
td_final['PAY_OCT_1'] = 0
td_final['PAY_AUG_8'] = 0
td_final['PAY_OCT_8'] = 0
td_final['PAY_NOV_8'] = 0
td_final['PAY_DEC_8'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .l

In [68]:
#Final Columns to keep
X=td_final[['Balance_Limit_V1', 'DUE_AMT_JULY', 'DUE_AMT_AUG',
       'DUE_AMT_SEP', 'DUE_AMT_OCT', 'DUE_AMT_NOV', 'DUE_AMT_DEC',
       'PAID_AMT_JULY', 'PAID_AMT_AUG', 'PAID_AMT_SEP', 'PAID_AMT_OCT',
       'PAID_AMT_NOV', 'PAID_AMT_DEC', 'Gender_F',
        'EDUCATION_STATUS_Graduate',
       'EDUCATION_STATUS_High School', 
       'MARITAL_STATUS_Other',  'AGE_31-45',
       'AGE_46-65', 'AGE_Less than 30',  'PAY_JULY_-2',
       'PAY_JULY_-1', 'PAY_JULY_1', 'PAY_JULY_2',
       'PAY_JULY_3', 'PAY_JULY_4', 'PAY_JULY_5', 'PAY_JULY_6',
       'PAY_JULY_7', 'PAY_JULY_8', 'PAY_AUG_-2', 'PAY_AUG_-1',
        'PAY_AUG_1', 'PAY_AUG_2', 'PAY_AUG_3', 'PAY_AUG_4',
       'PAY_AUG_5', 'PAY_AUG_6', 'PAY_AUG_7', 'PAY_AUG_8', 'PAY_SEP_-2',
       'PAY_SEP_-1',  'PAY_SEP_1', 'PAY_SEP_2', 'PAY_SEP_3',
       'PAY_SEP_4', 'PAY_SEP_5', 'PAY_SEP_6', 'PAY_SEP_7', 'PAY_SEP_8',
       'PAY_OCT_-2', 'PAY_OCT_-1',  'PAY_OCT_1', 'PAY_OCT_2',
       'PAY_OCT_3', 'PAY_OCT_4', 'PAY_OCT_5', 'PAY_OCT_6', 'PAY_OCT_7',
       'PAY_OCT_8', 'PAY_NOV_-2', 'PAY_NOV_-1',  'PAY_NOV_2',
       'PAY_NOV_3', 'PAY_NOV_4', 'PAY_NOV_5', 'PAY_NOV_6', 'PAY_NOV_7',
       'PAY_NOV_8', 'PAY_DEC_-2', 'PAY_DEC_-1',  'PAY_DEC_2',
       'PAY_DEC_3', 'PAY_DEC_4', 'PAY_DEC_5', 'PAY_DEC_6', 'PAY_DEC_7',
       'PAY_DEC_8']]
#y=td_final["NEXT_MONTH_DEFAULT"]

In [69]:
y_test_predict = logistic_regression.predict(X) #predicting

In [71]:
credit_default = pd.DataFrame(y_test_predict,columns = ['NEXT_MONTH_DEFAULT']) #convert to a dataframe

In [72]:
#merge the predictions and test data set
td['index'] = td.index 
credit_default['index'] = credit_default.index
result = pd.merge(td,credit_default, on= 'index')

In [75]:
result.head()

Unnamed: 0,Client_ID,Balance_Limit_V1,Gender,EDUCATION_STATUS,MARITAL_STATUS,AGE,PAY_JULY,PAY_AUG,PAY_SEP,PAY_OCT,...,PAY_DEC_-1,PAY_DEC_0,PAY_DEC_2,PAY_DEC_3,PAY_DEC_4,PAY_DEC_5,PAY_DEC_6,PAY_DEC_7,index,NEXT_MONTH_DEFAULT
0,A20170,0.172446,M,Other,Single,31-45,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,A16887,-0.699964,F,Other,Single,31-45,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
2,A3430,-0.118357,M,Other,Other,Less than 30,-1,-1,2,0,...,1,0,0,0,0,0,0,0,2,0
3,A3696,-1.281566,M,Graduate,Single,31-45,1,-1,-1,-2,...,0,0,0,0,0,0,0,0,3,0
4,A20474,-0.990767,F,High School,Other,Less than 30,0,0,-2,-2,...,0,0,0,0,0,0,0,0,4,0


In [83]:
predictions = result[["Client_ID","NEXT_MONTH_DEFAULT"]]
predictions.to_csv("C:\\Users\\Wimukthi\\Desktop\\DataStorm\\predictions_1.csv", index=False) #export prediction to a csv file