In [None]:
#importing the necessary libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Load the dataset for preprocessing. Only the read itself sits inside the
# `try`, so an unrelated error cannot be mistaken for a missing file.
try:
    df = pd.read_csv('data.csv')
except FileNotFoundError:
    print("File not found. Please ensure the dataset is in the correct directory.")
    # Abort with a non-zero status. The bare `exit()` helper is provided by the
    # `site` module for interactive use, and calling it without an argument
    # reports success (status 0) even though loading failed.
    raise SystemExit(1)
else:
    print("Dataset loaded successfully.")

# Drop the 'Profile_ID' column when present; it is not used in this analysis.
if 'Profile_ID' in df.columns:
    df = df.drop(columns='Profile_ID')

# Work on a copy so that `df` keeps its original, human-readable values.
df_encoded = df.copy()

# Replace every object-dtype column with integer codes, using one
# LabelEncoder per column. Each fitted encoder is kept in `encoders`, keyed by
# column name, so the codes can be decoded later if needed.
# NOTE(review): missing values in these columns are handed to LabelEncoder
# unfilled; confirm that is intended.
object_columns = df_encoded.select_dtypes(include=['object'])
encoders = {}
for column, values in object_columns.items():
    le = LabelEncoder()
    df_encoded[column] = le.fit_transform(values)
    encoders[column] = le

print("\nDataset after converting categorical columns to numbers:")
print(df_encoded.head())

# Model 1: cluster the rows with KMeans on the behavioural features and store
# each row's cluster id in a new 'Behavior' column for psychometric analysis.
behavioral_cols = [
    'Utility_Payment_History',
    'Rent_Payment_History',
    'Telecommunications_Payment_History',
    'Online_Shopping_Behavior',
    'Social_Media_Footprint',
    'Gig_Economy_Participation',
]

# Features are taken from the encoded frame, where these columns are numeric.
x_behavior = df_encoded[behavioral_cols]

# Three clusters; fixed seed and explicit n_init for repeatable runs.
kmeans = KMeans(
    n_clusters=3,
    random_state=42,
    n_init=10,
)
behavior_clusters = kmeans.fit_predict(x_behavior)

# Attach the cluster ids to both the original and the encoded DataFrame.
for frame in (df, df_encoded):
    frame['Behavior'] = behavior_clusters

print(
    "\n'Behavior' column created and added to the dataset.",
    "Distribution of user behaviors (clusters):",
    sep="\n",
)
print(df['Behavior'].value_counts())

#Next, we have to create the target variable: whether a profile is creditworthy or not.

Dataset loaded successfully.

Dataset after converting categorical columns to numbers:
   Age  Employment_Status  Residential_Stability  Bank_Account_Tenure  \
0   64                  2                     10                   15   
1   27                  1                     14                   17   
2   30                  1                     16                   21   
3   19                  0                     20                    7   
4   24                  1                     10                   13   

   Utility_Payment_History  Rent_Payment_History  \
0                        1                     2   
1                        2                     2   
2                        1                     1   
3                        2                     1   
4                        0                     2   

   Telecommunications_Payment_History  Educational_Background  \
0                                   1                       2   
1                              

Unnamed: 0,Age,Employment_Status,Residential_Stability,Bank_Account_Tenure,Utility_Payment_History,Rent_Payment_History,Telecommunications_Payment_History,Educational_Background,Online_Shopping_Behavior,Social_Media_Footprint,Gig_Economy_Participation,Behavior
0,64,Student,10,15,Inconsistent,New,Inconsistent,Master's,Occasional,High,,1
1,27,Self-employed,14,17,New,New,Consistent,High School,,High,Part-Time,2
2,30,Self-employed,16,21,Inconsistent,Inconsistent,New,High School,Frequent,High,,0
3,19,Employed,20,7,New,Inconsistent,Consistent,PhD,,Medium,Full-Time,2
4,24,Self-employed,10,13,Consistent,New,Inconsistent,PhD,,Low,Full-Time,1
...,...,...,...,...,...,...,...,...,...,...,...,...
495,40,Employed,6,23,Inconsistent,New,Inconsistent,High School,Frequent,Medium,Part-Time,2
496,49,Employed,20,17,Consistent,Consistent,Inconsistent,PhD,Occasional,Low,,1
497,47,Unemployed,15,29,Consistent,Inconsistent,New,Bachelor's,Occasional,High,Full-Time,1
498,40,Employed,22,4,Consistent,Consistent,Inconsistent,High School,,Medium,,1
