In [1]:
import pandas as pd
import plotly.express as pe

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.svm import SVC

from sklearn.metrics import classification_report, confusion_matrix


## step 1: load the data

In [2]:
path = "/home/harshit/Desktop/TataSteelML2023/dataset/Loan_Status_Classification.csv"
df = pd.read_csv(path)
df

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,0,1,1,0,0,6608,0,137,180,1,1,1
1,0,1,2,0,0,4226,1040,110,360,1,1,1
2,1,1,0,1,0,3167,2283,154,360,1,2,1
3,0,0,0,1,1,6950,0,175,180,1,2,1
4,0,1,0,1,0,3993,3274,207,360,1,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...
293,1,0,0,1,0,3846,0,111,360,1,2,1
294,0,0,0,1,0,2435,0,75,360,1,1,0
295,0,0,2,1,0,4923,0,166,360,0,2,1
296,0,1,3,0,0,2071,754,94,480,1,2,1


## Step 2: Data preprocessing & data exploration

In [3]:
print(df.columns, df.shape, sep="\n")

Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')
(298, 12)


In [4]:
categorical_features = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed','Loan_Amount_Term', 'Credit_History', 'Property_Area']
real_value_features = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']


In [5]:
df[real_value_features].describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount
count,298.0,298.0,298.0
mean,5351.265101,1673.026846,143.560403
std,6306.080712,2892.404818,80.395182
min,150.0,0.0,9.0
25%,2883.75,0.0,99.25
50%,3854.0,1106.0,125.5
75%,5721.5,2281.0,171.5
max,81000.0,33837.0,600.0


In [6]:
sc = StandardScaler()

for col in real_value_features:
    df[[col]] = sc.fit_transform(df[[col]])


df[real_value_features].describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount
count,298.0,298.0,298.0
mean,-4.7687430000000005e-17,2.980464e-18,-1.311404e-16
std,1.001682,1.001682,1.001682
min,-0.8261889,-0.5793937,-1.676553
25%,-0.3919496,-0.5793937,-0.5520845
50%,-0.2378313,-0.1963697,-0.2250232
75%,0.05880953,0.21055,0.3481128
max,12.01634,11.13885,5.687


In [89]:
pe.box(y=real_value_features, data_frame=df)

## categorical data!

In [90]:
for col in categorical_features:
    
    fig = pe.histogram(x=col, data_frame=df, color='Loan_Status')

    display(fig)

### 3 real value features: ApplicantIncome, CoapplicantIncome, LoanAmount


In [91]:
import itertools

options = list ( itertools.combinations(     real_value_features, 2   ) )
print(options)

[('ApplicantIncome', 'CoapplicantIncome'), ('ApplicantIncome', 'LoanAmount'), ('CoapplicantIncome', 'LoanAmount')]


## a column with numbers (numerical values int or float) will be considered as real_value_feature by default in plotly

In [92]:
status_string = [str(x) for x in df.Loan_Status]

for combination in options:
    
    fig = pe.scatter(  x=combination[0], y=combination[1], data_frame=df, color=status_string  )
    display(fig)

In [93]:
df.isna().sum()

Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [94]:
df[categorical_features].nunique()

Gender              2
Married             2
Dependents          4
Education           2
Self_Employed       2
Loan_Amount_Term    9
Credit_History      2
Property_Area       3
dtype: int64

In [95]:
df.Loan_Status.value_counts()

1    150
0    148
Name: Loan_Status, dtype: int64

### step 2:

        separated real_value & categorical columns
        missing values
        count of unique values per category
        scale the real_value
        outliers in the data were detected
        relation between categorical features and loan_status i.e impact of categorical category on loan_status

# step 3: splitting the data 

In [96]:
df['TotalIncome'] = df.ApplicantIncome + df.CoapplicantIncome

df

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,TotalIncome
0,0,1,1,0,0,0.199625,-0.579394,-0.081739,180,1,1,1,-0.379769
1,0,1,2,0,0,-0.178741,-0.219226,-0.418145,360,1,1,1,-0.397968
2,1,1,0,1,0,-0.346957,0.211243,0.130072,360,1,2,1,-0.135714
3,0,0,0,1,1,0.253949,-0.579394,0.391721,180,1,2,1,-0.325444
4,0,1,0,1,0,-0.215752,0.554440,0.790424,360,1,2,1,0.338688
...,...,...,...,...,...,...,...,...,...,...,...,...,...
293,1,0,0,1,0,-0.239102,-0.579394,-0.405686,360,1,2,1,-0.818496
294,0,0,0,1,0,-0.463231,-0.579394,-0.854227,360,1,1,0,-1.042624
295,0,0,2,1,0,-0.068027,-0.579394,0.279586,360,0,2,1,-0.647421
296,0,1,3,0,0,-0.521050,-0.318272,-0.617497,480,1,2,1,-0.839322


In [97]:
features = real_value_features.copy() + ['TotalIncome'] #create a copy of it in new variable

target = ["Loan_Status"]


In [98]:
x_train, x_test, y_train, y_test = train_test_split(
    df[features],
    df[target],
    stratify=df[target],
    test_size=0.2,
    random_state=42
)

### step 4: train the model

In [111]:
model = SVC(C=10)

model.fit(x_train,y_train)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



In [112]:
predicted_values = model.predict(   x_test  )

In [113]:
ans = confusion_matrix(  y_test,  predicted_values  )

print(ans)

[[ 9 21]
 [ 9 21]]


In [114]:
ans = classification_report(  y_test,  predicted_values  )

print(ans)

              precision    recall  f1-score   support

           0       0.50      0.30      0.37        30
           1       0.50      0.70      0.58        30

    accuracy                           0.50        60
   macro avg       0.50      0.50      0.48        60
weighted avg       0.50      0.50      0.48        60

