In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [4]:
train_dir = "input/lt-vehicle-loan-default-prediction/train.csv"

train_data = pd.read_csv(train_dir)
train_data.set_index("UniqueID")

#conversion of the date columns to int type
import datetime as dt
#first convert to datetime
train_data['Date.of.Birth'] = pd.to_datetime(train_data['Date.of.Birth'])
train_data['Date.of.Birth'] = train_data['Date.of.Birth'].map(dt.datetime.toordinal)

#separate X and Y
train_data_y = train_data['loan_default']
train_data_y = train_data_y.astype('int64')

train_data_x = train_data.drop(['loan_default'], axis =1)

In [5]:
#separating train data (80%) and test data (20%)
from sklearn.model_selection import train_test_split
train_data_x, test_data, y_train, y_test = train_test_split(train_data_x, train_data_y, train_size = 0.8, test_size = 0.2)
train_data_x.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 186523 entries, 144812 to 62019
Data columns (total 40 columns):
 #   Column                               Non-Null Count   Dtype  
---  ------                               --------------   -----  
 0   UniqueID                             186523 non-null  int64  
 1   disbursed_amount                     186523 non-null  int64  
 2   asset_cost                           186523 non-null  int64  
 3   ltv                                  186523 non-null  float64
 4   branch_id                            186523 non-null  int64  
 5   supplier_id                          186523 non-null  int64  
 6   manufacturer_id                      186523 non-null  int64  
 7   Current_pincode_ID                   186523 non-null  int64  
 8   Date.of.Birth                        186523 non-null  int64  
 9   Employment.Type                      180357 non-null  object 
 10  DisbursalDate                        186523 non-null  object 
 11  State_ID 

Looking at the dataset, some columns will bequite useful in our prediction, like the AVERAGE.ACCT.AGE and CREDIT.HISTORY.LENGTH but these columns are in formats that's cant be used by our model, so we intervene.

Things to clean up:
1. Fix missing values in Employment.Type
1. Remove DisbursalDate, PERFORM_CNS.SCORE.DESCRIPTION, AVERAGE.ACCT.AGE, CREDIT.HISTORY.LENGTH, branch_id, supplier_id
2. Categorical variable: Employment.Type

In [6]:
from sklearn.impute import SimpleImputer

SI = SimpleImputer(strategy = "most_frequent", missing_values = np.nan)
x_train = pd.DataFrame(SI.fit_transform(train_data_x))
x_test = pd.DataFrame(SI.transform(test_data))

#replace column names that were removed by simple imputer
x_train.columns = train_data_x.columns
x_test.columns = test_data.columns

x_train.index = train_data_x.index
x_test.index = test_data.index
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 186523 entries, 144812 to 62019
Data columns (total 40 columns):
 #   Column                               Non-Null Count   Dtype 
---  ------                               --------------   ----- 
 0   UniqueID                             186523 non-null  object
 1   disbursed_amount                     186523 non-null  object
 2   asset_cost                           186523 non-null  object
 3   ltv                                  186523 non-null  object
 4   branch_id                            186523 non-null  object
 5   supplier_id                          186523 non-null  object
 6   manufacturer_id                      186523 non-null  object
 7   Current_pincode_ID                   186523 non-null  object
 8   Date.of.Birth                        186523 non-null  object
 9   Employment.Type                      186523 non-null  object
 10  DisbursalDate                        186523 non-null  object
 11  State_ID              

In [7]:
drop_cols = ['DisbursalDate', 'PERFORM_CNS.SCORE.DESCRIPTION', 'AVERAGE.ACCT.AGE', 'CREDIT.HISTORY.LENGTH', 'branch_id', 'supplier_id']
x_train = x_train.drop(drop_cols, axis = 1)
x_test = x_test.drop(drop_cols, axis = 1)
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 186523 entries, 144812 to 62019
Data columns (total 34 columns):
 #   Column                               Non-Null Count   Dtype 
---  ------                               --------------   ----- 
 0   UniqueID                             186523 non-null  object
 1   disbursed_amount                     186523 non-null  object
 2   asset_cost                           186523 non-null  object
 3   ltv                                  186523 non-null  object
 4   manufacturer_id                      186523 non-null  object
 5   Current_pincode_ID                   186523 non-null  object
 6   Date.of.Birth                        186523 non-null  object
 7   Employment.Type                      186523 non-null  object
 8   State_ID                             186523 non-null  object
 9   Employee_code_ID                     186523 non-null  object
 10  MobileNo_Avl_Flag                    186523 non-null  object
 11  Aadhar_flag           

In [9]:
from sklearn.preprocessing import OneHotEncoder


hot_encoder = OneHotEncoder(sparse = False, handle_unknown = 'ignore')
OH_train = pd.DataFrame(hot_encoder.fit_transform(x_train[['Employment.Type']]))
OH_test = pd.DataFrame(hot_encoder.transform(x_test[['Employment.Type']]))

#adding back the index
OH_train.index = x_train.index
OH_test.index = x_test.index

#removing raw categorical variable
x_train = x_train.drop(['Employment.Type'], axis = 1)
x_test = x_test.drop(['Employment.Type'], axis = 1)

#replacing the dropped categorical vars
x_train = pd.concat([x_train, OH_train], axis = 1)
x_test = pd.concat([x_test, OH_test], axis = 1)

#transform changed dtype to object, we convert it back to int64
x_train = x_train.astype('int64')
x_test = x_test.astype('int64')

print(x_train.info())

KeyError: "None of [Index(['Employment.Type'], dtype='object')] are in the [columns]"

Now that we have cleaned the data, it's time to find a classification model, I have decided to use logistic regression, we'll compare its accuracy with that of decision trees and random forest models to see which works best.

In [None]:
from sklearn.linear_model import LogisticRegression

#defining the model
#I used a max iteration of 500 because
LR_model = LogisticRegression(random_state = 0, max_iter = 500, solver = 'liblinear')

#fitting the model
LR_model.fit(x_train, y_train)

#predict
prediction = LR_model.predict(x_test)

#evaluate
from sklearn.metrics import mean_absolute_error as mae
accuracy = 1 - mae(y_test,prediction)
print("the accuracy is ", accuracy * 100, "%")