In [1]:
#importing the necessary libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

#loading the dataset for data preprocessing
try:
    df=pd.read_csv('data.csv')
    print("Dataset loaded successfully.")
except FileNotFoundError:
    print("File not found. Please ensure the dataset is in the correct directory.")
    exit()

#we'll drop the column profile_id since it is not useful for our analysis
if 'Profile_ID' in df.columns:
    df = df.drop('Profile_ID', axis=1)

#creating a copy of the dataset for further processing
df_encoded = df.copy()

#since the dataset contains non numerical data, we will encode it using LabelEncoder
#We will store the encoders to potentially decode later if needed
encoders={}
for column in df_encoded.select_dtypes(include=['object']).columns:
    le=LabelEncoder()
    df_encoded[column] = le.fit_transform(df_encoded[column])
    encoders[column] = le

print("\nDataset after converting categorical columns to numbers:")
print(df_encoded.head())

#Model 1: we'll be using the KMeans clustering algorithm to find patterns in the data and then creating the 'Behavior' column in the dataset for psychometric analysis
behavioral_cols = [
    'Utility_Payment_History',
    'Rent_Payment_History',
    'Telecommunications_Payment_History',
    'Online_Shopping_Behavior',
    'Social_Media_Footprint',
    'Gig_Economy_Participation'
]

x_behavior=df_encoded[behavioral_cols]
kmeans=KMeans(n_clusters=3, random_state=42,n_init=10)
behavior_clusters=kmeans.fit_predict(x_behavior)

# Add the new behavior column to both the original and encoded DataFrames
df['Behavior'] = behavior_clusters
df_encoded['Behavior'] = behavior_clusters

print("\n'Behavior' column created and added to the dataset.")
print("Distribution of user behaviors (clusters):")
print(df['Behavior'].value_counts())

#now we have to create the target variable which is 'credit worthy or not', 
consistent_payment_mask = (df['Utility_Payment_History'] == 'Consistent') & \
                          (df['Rent_Payment_History'] == 'Consistent') & \
                          (df['Telecommunications_Payment_History'] == 'Consistent')

# A simple rule: Creditworthy if all payments are consistent and they are employed/self-employed
df_encoded['Creditworthy'] = (consistent_payment_mask & df['Employment_Status'].isin(['Employed', 'Self-employed'])).astype(int)

print("\n'Creditworthy' target column created.")
print("Distribution of Creditworthy status (1 = Yes, 0 = No):")
print(df_encoded['Creditworthy'].value_counts())


#now we have to create the second model which will predict whether based on the behavioral patterns and other features the user is eligible for credit or not
#we'll use a random forest classifier for this purpose
x=df_encoded.drop(['Creditworthy'], axis=1)
y=df_encoded['Creditworthy']

#splitting the dataset into training and testing sets about 80% training and 20% testing
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

print(f"\nData split into training and testing sets.")
print(f"Training set size: {x_train.shape[0]} samples")
print(f"Testing set size: {x_test.shape[0]} samples")

model=RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(x_train, y_train)
print("\nFinal prediction model has been trained.")


#making predictions on the test set
y_pred=model.predict(x_test)

#accuracy metrics of the model
print("\n--- Model Evaluation Results ---")
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['Not Creditworthy', 'Creditworthy']))

# Display feature importances to see which factors were most influential
feature_importances = pd.DataFrame({
    'feature': x.columns,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)


print("\nMost Important Features for Predicting Creditworthiness:")
print(feature_importances)

print("\n--- Process Complete ---")
print("\nFinal dataset with 'Behavior' and 'Creditworthy' columns (first 5 rows):")
final_df_preview = df_encoded.head()
# Map numeric back to categorical for the preview's target column for clarity
final_df_preview['Creditworthy'] = final_df_preview['Creditworthy'].map({1: 'Yes', 0: 'No'})
print(final_df_preview)

Dataset loaded successfully.

Dataset after converting categorical columns to numbers:
   Age  Employment_Status  Residential_Stability  Bank_Account_Tenure  \
0   64                  2                     10                   15   
1   27                  1                     14                   17   
2   30                  1                     16                   21   
3   19                  0                     20                    7   
4   24                  1                     10                   13   

   Utility_Payment_History  Rent_Payment_History  \
0                        1                     2   
1                        2                     2   
2                        1                     1   
3                        2                     1   
4                        0                     2   

   Telecommunications_Payment_History  Educational_Background  \
0                                   1                       2   
1                              

[WinError 2] The system cannot find the file specified
  File "c:\Users\saaja\AppData\Local\Programs\Python\Python310\lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
  File "c:\Users\saaja\AppData\Local\Programs\Python\Python310\lib\subprocess.py", line 503, in run
    with Popen(*popenargs, **kwargs) as process:
  File "c:\Users\saaja\AppData\Local\Programs\Python\Python310\lib\subprocess.py", line 971, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Users\saaja\AppData\Local\Programs\Python\Python310\lib\subprocess.py", line 1456, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,



Final prediction model has been trained.

--- Model Evaluation Results ---
Accuracy: 0.99

Confusion Matrix:
[[97  0]
 [ 1  2]]

Classification Report:
                  precision    recall  f1-score   support

Not Creditworthy       0.99      1.00      0.99        97
    Creditworthy       1.00      0.67      0.80         3

        accuracy                           0.99       100
       macro avg       0.99      0.83      0.90       100
    weighted avg       0.99      0.99      0.99       100


Most Important Features for Predicting Creditworthiness:
                               feature  importance
5                 Rent_Payment_History    0.162240
4              Utility_Payment_History    0.155815
6   Telecommunications_Payment_History    0.104323
1                    Employment_Status    0.101761
0                                  Age    0.098164
11                            Behavior    0.081914
2                Residential_Stability    0.079709
3                  Bank_Accoun

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df_preview['Creditworthy'] = final_df_preview['Creditworthy'].map({1: 'Yes', 0: 'No'})
