## Data Import, Cleaning and Processing

In [29]:
%matplotlib inline 
import pandas as pd 

# Load the raw assessment data set from the working directory.
data = pd.read_csv("Assessment4Data.csv")

In [30]:
# Preview the first five rows of the freshly loaded frame.
data.head(n=5)

Unnamed: 0,Age,Income,Experience,LoanAmount,InterestRate,CreditHistoryLength,CreditScore,LoanApprovalStatus
0,24,80145,1,25000.0,9.91,2,636.0,Yes
1,22,80303,0,25000.0,13.06,3,554.0,Yes
2,22,241048,0,9000.0,6.17,4,587.0,No
3,144,241424,121,6000.0,11.86,2,807.0,No
4,25,241005,4,18000.0,16.32,3,708.0,No


In [31]:
# Remove fully duplicated rows (the first occurrence is kept), then report
# the remaining (rows, columns).
data = data.drop_duplicates(keep='first')
print(data.shape)

(20000, 8)


In [32]:
# Frame dimensions, followed by per-column non-null counts and dtypes.
print(tuple(data.shape))
data.info()

(20000, 8)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 20000 entries, 0 to 19999
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Age                  20000 non-null  int64  
 1   Income               20000 non-null  int64  
 2   Experience           20000 non-null  int64  
 3   LoanAmount           19999 non-null  float64
 4   InterestRate         20000 non-null  float64
 5   CreditHistoryLength  20000 non-null  int64  
 6   CreditScore          19999 non-null  float64
 7   LoanApprovalStatus   20000 non-null  object 
dtypes: float64(3), int64(4), object(1)
memory usage: 1.4+ MB


In [33]:
# Discard every row that has at least one missing value, then report the
# remaining (rows, columns).
# NOTE(review): the data preview shows a row with Age 144 and Experience 121;
# dropna() does not remove such values - consider a range check if they are
# data-entry errors.
data = data.dropna(axis='index')
print(data.shape)

(19998, 8)


In [34]:
# Encoding: turn the target label into integers ('Yes' -> 1, 'No' -> 0).
status_codes = {'Yes': 1, 'No': 0}
status = data['LoanApprovalStatus']

# Series.map() replaces every value that is not a key of the dict with NaN, so
# mapping a column that already holds 0/1 would erase the whole target.
# Skip the mapping in that case so this cell can safely be re-run.
if not status.isin(list(status_codes.values())).all():
    encoded = status.map(status_codes)

    # Fail loudly on labels other than 'Yes'/'No' instead of silently turning
    # them into NaN (missing values in the input are left as they are).
    unknown = status[encoded.isna() & status.notna()].unique()
    if len(unknown) > 0:
        raise ValueError(f"Unexpected LoanApprovalStatus values: {list(unknown)}")

    # assign() builds a new frame rather than writing a column into the frame
    # returned by dropna(), which avoids the chained-assignment warning pandas
    # can emit for that pattern.
    data = data.assign(LoanApprovalStatus=encoded)

data.head()

Unnamed: 0,Age,Income,Experience,LoanAmount,InterestRate,CreditHistoryLength,CreditScore,LoanApprovalStatus
0,24,80145,1,25000.0,9.91,2,636.0,1
1,22,80303,0,25000.0,13.06,3,554.0,1
2,22,241048,0,9000.0,6.17,4,587.0,0
3,144,241424,121,6000.0,11.86,2,807.0,0
4,25,241005,4,18000.0,16.32,3,708.0,0


In [59]:
# Data processing: separate the target from the features and hold out 20% of
# the rows as a test set.
from sklearn.model_selection import train_test_split

y = data['LoanApprovalStatus']
X = data.drop(columns=['LoanApprovalStatus'])

# train_test_split returns (X_train, X_test, y_train, y_test); each piece gets
# a fresh 0..n-1 index so later positional concatenation lines up.
# random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = [
    part.reset_index(drop=True)
    for part in train_test_split(X, y, test_size=0.2, random_state=250002)
]

In [60]:
# Standardize the feature columns.
# The scaler is fitted on the training split only and then re-used for the
# test split, so no test-set statistics leak into the scaling.
# NOTE: X_train and X_test are overwritten with their scaled versions, so
# running this cell a second time would scale already-scaled data.

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train = pd.DataFrame(
    data=scaler.fit_transform(X_train),
    index=X_train.index,
    columns=X_train.columns,
)
X_test = pd.DataFrame(
    data=scaler.transform(X_test),
    index=X_test.index,
    columns=X_test.columns,
)

## Model Building and Evaluation

## Logistic Regression

In [61]:
from sklearn import linear_model

# Logistic regression with scikit-learn's default settings, fitted on the
# standardized training split.
log_regress = linear_model.LogisticRegression()
log_regress.fit(X=X_train, y=y_train)

In [62]:
# Fitted parameters: the intercept first, then one coefficient per feature
# column, each on its own line.
print(log_regress.intercept_, log_regress.coef_, sep='\n')

[-1.73463991]
[[ 0.0841158  -1.55839181 -0.03419546  0.68493335  0.85482631  0.03836073
  -0.03420638]]


In [69]:
# Testing the model on the held-out test split.

# Predicted class probabilities.
# Assumes predict_proba orders its columns as class 0 ('No'), class 1 ('Yes').
pred_prob = pd.DataFrame(log_regress.predict_proba(X_test),
                         columns=['Denied', 'Approved'])

# Predicted class labels for the same rows.
pred_class = pd.Series(log_regress.predict(X_test), name='Prediction')

# Actual labels, re-indexed from 0 so they align row by row with the
# predictions above.
original_result = y_test.reset_index(drop=True).rename('Original Result')

# Side-by-side table: probabilities, predicted class, actual class.
# pred_prob and pred_class were just built with a default 0..n-1 index, so
# they need no further re-indexing before the column-wise concat.
result = pd.concat([pred_prob, pred_class, original_result], axis=1)

print(result.head(10))

     Denied  Approved  Prediction  Original Result
0  0.697030  0.302970           0                1
1  0.935332  0.064668           0                0
2  0.455243  0.544757           1                0
3  0.750997  0.249003           0                0
4  0.746152  0.253848           0                0
5  0.900966  0.099034           0                1
6  0.400971  0.599029           1                1
7  0.564491  0.435509           0                0
8  0.832640  0.167360           0                0
9  0.827191  0.172809           0                0


In [70]:
# Cross-tabulate predictions (rows) against actual labels (columns).
print("Confusion Matrix")
confusion_table = None  # placeholder removed below; keeps the cell to a single print
del confusion_table
print(
    pd.crosstab(
        index=result["Prediction"],
        columns=result["Original Result"],
        rownames=["Predicted"],
        colnames=["Actual"],
    )
)

Confusion Matrix
Actual        0    1
Predicted           
0          2981  567
1           166  286


In [71]:
from sklearn import metrics

# scikit-learn's confusion matrix puts the actual labels on the rows and the
# predicted labels on the columns, i.e. the transpose of the crosstab layout
# that uses predictions as rows.
print(metrics.confusion_matrix(y_test, pred_class))

[[2981  166]
 [ 567  286]]


In [72]:
# Share of test rows classified correctly by the logistic regression.
print('Accuracy', log_regress.score(X_test, y_test), sep='\n')

Accuracy
0.81675


In [73]:
# Summary of common classification metrics (precision, recall, f1-score and
# support per class) for the logistic regression predictions.

print('Metrics')
print(metrics.classification_report(y_test, pred_class))

Metrics
              precision    recall  f1-score   support

           0       0.84      0.95      0.89      3147
           1       0.63      0.34      0.44       853

    accuracy                           0.82      4000
   macro avg       0.74      0.64      0.66      4000
weighted avg       0.80      0.82      0.79      4000



## Support Vector Machine (SVM)

In [74]:
from sklearn import svm
from sklearn.svm import SVC 

In [75]:
# Support vector classifier with a linear kernel.
svm_classifier = SVC(kernel='linear')

In [76]:
# Train the classifier on the training data set.
svm_classifier.fit(X=X_train, y=y_train)

In [80]:
# Predict class labels for the test data set with the trained SVM.
y_pred = svm_classifier.predict(X=X_test)

In [83]:
# Accuracy of the SVM: share of test rows whose predicted label matches the
# actual label.
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.80125


In [86]:
from sklearn.metrics import confusion_matrix, classification_report

# Confusion matrix for the SVM predictions, printed under a heading line.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("Confusion Matrix", cm, sep="\n")

Confusion Matrix
[[3133   14]
 [ 781   72]]


In [87]:
# Per-class precision / recall / f1-score summary for the SVM predictions,
# printed under a heading line.
report = classification_report(y_true=y_test, y_pred=y_pred)
print("Classification Report", report, sep="\n")

Classification Report
              precision    recall  f1-score   support

           0       0.80      1.00      0.89      3147
           1       0.84      0.08      0.15       853

    accuracy                           0.80      4000
   macro avg       0.82      0.54      0.52      4000
weighted avg       0.81      0.80      0.73      4000

