In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
import re

Importing Data

In [19]:
raw_data = pd.read_csv('..\Sample Datasets\credit_data_train.csv', low_memory = False)
test = pd.read_csv('..\Sample Datasets\credit_data_test.csv', low_memory = False)

Exploring Data

In [None]:
raw_data.head(3)

In [None]:
print(raw_data.shape)
raw_data.describe()

In [None]:
raw_data.dtypes

Removing Unnecessary Columns

In [20]:
drop_col = ['ID', 'Name', 'SSN', 'Num_Bank_Accounts', 'Num_Credit_Card', 'Num_of_Loan', 'Changed_Credit_Limit', 'Num_Credit_Inquiries', 'Credit_Mix']
df = raw_data.drop(drop_col, axis=1).copy()
df.head(3)

Unnamed: 0,Customer_ID,Month,Age,Occupation,Annual_Income,Monthly_Inhand_Salary,Interest_Rate,Type_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,CUS_0xd40,January,23,Scientist,19114.12,1824.843333,3,"Auto Loan, Credit-Builder Loan, Personal Loan,...",3,7.0,809.98,26.82262,22 Years and 1 Months,No,49.574949,80.4152954390025,High_spent_Small_value_payments,312.494088679437,Good
1,CUS_0xd40,February,23,Scientist,19114.12,,3,"Auto Loan, Credit-Builder Loan, Personal Loan,...",-1,,809.98,31.94496,,No,49.574949,118.280221622367,Low_spent_Large_value_payments,284.629162496072,Good
2,CUS_0xd40,March,-500,Scientist,19114.12,,3,"Auto Loan, Credit-Builder Loan, Personal Loan,...",3,7.0,809.98,28.609352,22 Years and 3 Months,No,49.574949,81.699521264648,Low_spent_Medium_value_payments,331.209862853791,Good


Checking Null Values

In [None]:
df.isnull().sum()

Pre-processing Data

In [21]:
def check_and_copy_row_data(df, column_name):
    last_valid_value = {}

    for i in range(len(df)):
        customer_id = df.loc[i, 'Customer_ID']
        
        if pd.isnull(df.loc[i, column_name]) or df.loc[i, column_name] == '':
            if customer_id in last_valid_value:
                df.loc[i, column_name] = last_valid_value[customer_id]
                print(f"Filled missing value at index {i} with last valid value for Customer_ID {customer_id}.")
            else:
                j = i + 1
                while j < len(df) and (pd.isnull(df.loc[j, column_name]) or df.loc[j, column_name] == ''):
                    j += 1
                if j < len(df) and df.loc[j, 'Customer_ID'] == customer_id:
                    df.loc[i, column_name] = df.loc[j, column_name]
                    print(f"Filled missing value at index {i} with value from index {j}.")
                else:
                    print(f"No valid value found for Customer_ID {customer_id} at index {i}.")
        else:
            last_valid_value[customer_id] = df.loc[i, column_name]

    return df

In [26]:
df['Monthly_Inhand_Salary'].unique()

array([ 1824.84333333,  3037.98666667, 12187.22      , ...,
        3097.00833333,  1929.90666667,  3359.41583333])

In [27]:
filtered_df = df[df['Customer_ID'] == 'CUS_0x5c7d'] #CUS_0x5c7d #CUS_0x95ee
print(filtered_df['Monthly_Inhand_Salary'])

89672    661.321667
89673    661.321667
89674    661.321667
89675    661.321667
89676    661.321667
89677    661.321667
89678    661.321667
89679    661.321667
Name: Monthly_Inhand_Salary, dtype: float64


In [28]:
df['Monthly_Inhand_Salary'].isnull().sum()

0

In [25]:
df['Monthly_Inhand_Salary'] = check_and_copy_row_data(df, 'Monthly_Inhand_Salary')

Filled missing value at index 1 with last valid value for Customer_ID CUS_0xd40.
Filled missing value at index 2 with last valid value for Customer_ID CUS_0xd40.
Filled missing value at index 3 with last valid value for Customer_ID CUS_0xd40.
Filled missing value at index 5 with last valid value for Customer_ID CUS_0xd40.
Filled missing value at index 11 with last valid value for Customer_ID CUS_0x21b1.
Filled missing value at index 14 with last valid value for Customer_ID CUS_0x21b1.
Filled missing value at index 18 with last valid value for Customer_ID CUS_0x2dbc.
Filled missing value at index 33 with last valid value for Customer_ID CUS_0x1cdb.
Filled missing value at index 37 with last valid value for Customer_ID CUS_0x1cdb.
Filled missing value at index 40 with value from index 41.
Filled missing value at index 53 with last valid value for Customer_ID CUS_0x284a.
Filled missing value at index 54 with last valid value for Customer_ID CUS_0x284a.
Filled missing value at index 56 wit

ValueError: Columns must be same length as key

In [None]:
#Annual Income And Monthly Inhand Salary
df['Annual_Income'] = df['Annual_Income'].map(lambda x: x.rstrip('_'))
df['Annual_Income'] = df['Annual_Income'].astype('float64')

In [None]:
#Age
#df['Age'].unique()
df[df['Age']>'55']['Age']

In [None]:
df['Age'] = df['Age'].map(lambda x: x.lstrip('-').rstrip('_'))

In [None]:
df['Age'] = df['Age'].astype('int64')
df['Age'].dtypes

In [None]:
#Type of Loan
df['Type_of_Loan'] = df['Type_of_Loan'].fillna('NA')

In [None]:
#Delay from due date
#df['Delay_from_due_date'].unique()
df['Delay_from_due_date'] = df.apply(lambda x: (x['Delay_from_due_date']*-1) if x['Delay_from_due_date'] < 0 else x['Delay_from_due_date'], axis=1)

In [None]:
df['Num_of_Delayed_Payment'].unique()

In [None]:
df['Delay_from_due_date'] = df['Delay_from_due_date'].astype('str')

In [None]:
df['Num_of_Delayed_Payment'] = df['Delay_from_due_date'].map(lambda x: x.lstrip('-').rstrip('_'))