# Analisa Data

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
import joblib

# Load dataset
df = pd.read_csv('/content/credit_risk_dataset.csv')

# Impute missing values with the column mean (employment length is rounded to
# whole years). The result is assigned back to the column instead of calling
# fillna(inplace=True) on the df[...] selection: that chained in-place form is
# deprecated in pandas 2.x and does not modify df under copy-on-write.
df['loan_int_rate'] = df['loan_int_rate'].fillna(df['loan_int_rate'].mean())
df['person_emp_length'] = df['person_emp_length'].fillna(
    round(df['person_emp_length'].mean())
)

# Define features and label
features = [
    'person_income', 'loan_grade', 'loan_int_rate', 'loan_percent_income',
    'person_home_ownership', 'cb_person_default_on_file'
]
target = 'loan_status'

X = df[features]
y = df[target]

# Hold out 20% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Numeric columns are min-max scaled; categorical columns are one-hot encoded
numeric_features = ['person_income', 'loan_int_rate', 'loan_percent_income']
categorical_features = ['loan_grade', 'person_home_ownership', 'cb_person_default_on_file']

# handle_unknown='ignore' lets the encoder accept categories it did not see
# during fit (they are encoded as all zeros) instead of raising.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

# Preprocessing and classifier bundled together so the saved artifact accepts
# raw feature columns directly.
pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('classifier', DecisionTreeClassifier(criterion='entropy', max_depth=8, random_state=0))
])

# Fit pipeline
pipeline.fit(X_train, y_train)

# Save pipeline
joblib.dump(pipeline, 'pipeline_model.pkl')


In [None]:
# ANSI escape sequences used to make the console output more readable
class color:
    """Namespace of ANSI terminal escape codes for text colour and style."""

    # Foreground colours
    PURPLE = '\x1b[95m'
    CYAN = '\x1b[96m'
    DARKCYAN = '\x1b[36m'
    BLUE = '\x1b[94m'
    GREEN = '\x1b[92m'
    YELLOW = '\x1b[93m'
    RED = '\x1b[91m'
    # Text attributes
    BOLD = '\x1b[1m'
    UNDERLINE = '\x1b[4m'
    # Combined attribute;foreground;background sequence
    GREYBACKBLACK = '\x1b[0;30;47m'
    # Reset all attributes
    END = '\x1b[0m'

In [None]:
# Show a preview (first rows) of the dataset
print(f"{color.BOLD}{color.GREYBACKBLACK}Berikut adalah dataset yang akan kita gunakan{color.END}")
display(df.head())
print()

In [None]:
# Show the column summary (dtypes and non-null counts) of the dataset
print(color.BOLD + color.GREYBACKBLACK + "Berikut adalah dataset yang akan kita gunakan"+color.END)
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in display() only rendered an extra "None" after the summary.
df.info()
print("")

In [None]:
# Show the size (rows, columns) of the loaded CSV data
df.shape

In [None]:
# Show all column names of the CSV data
print(df.columns)

In [None]:
# Show the descriptive statistics of the CSV data
df.describe()

# Missing Value

In [None]:
# Check whether the data contains any missing value
print(f"{color.BOLD}Periksa isi data apakah sudah siap digunakan{color.END}")
print("Terdapat data hilang? ", df.isna().values.any())

In [None]:
# Count the missing values in each column
df.isna().sum()

In [None]:
# Replace missing loan_int_rate values with the column mean
loanmean = df['loan_int_rate'].mean()
df['loan_int_rate'] = df['loan_int_rate'].fillna(loanmean)

In [None]:
# Replace missing person_emp_length values with the column mean, rounded to
# the nearest whole number
persempmean = round(df['person_emp_length'].mean())
df['person_emp_length'] = df['person_emp_length'].fillna(persempmean)

In [None]:
# Re-count missing values per column after the imputation steps
df.isna().sum()

In [None]:
# Cast person_emp_length to int64, then show the per-column summary
df['person_emp_length']=df['person_emp_length'].astype('int64')
df.info()

In [None]:
# Show the descriptive statistics again after the missing values were handled
df.describe()

# Memeriksa Data Pengganggu

In [None]:
# Inspect the distinct values of the non-continuous columns to spot
# unexpected or noisy categories
for column_name in (
    'person_home_ownership',
    'loan_intent',
    'loan_grade',
    'cb_person_default_on_file',
    'loan_status',
):
    display(df[column_name].unique())

# Penanganan Outliers

In [None]:
# Detect outliers: one box plot per numeric column, side by side
import matplotlib.pyplot as plt
import seaborn as sns
dfq = ['person_age','person_income','person_emp_length','loan_amnt','loan_int_rate','loan_percent_income','cb_person_cred_hist_length']
plt.figure(figsize=(10,5))
for position, column in enumerate(dfq, start=1):
    plt.subplot(1, len(dfq), position)
    sns.boxplot(y=df[column])
# The layout only needs to be computed once, after every subplot exists
plt.tight_layout()

In [None]:
# Show the rows whose person_age is greater than 100
df.loc[df['person_age'] >100]

In [None]:
# Remove the rows with person_age > 100 (the rows listed by the preceding cell).
# Selecting by condition instead of hard-coded row labels keeps the cell
# re-runnable (no KeyError once the rows are gone) and independent of how the
# file happens to be indexed.
# NOTE(review): this drops every row matching the condition; the original
# listed labels 81, 183, 575 and 747 explicitly - confirm no other row matches
# if exactly those four were intended.
df.drop(df.index[df['person_age'] > 100], axis=0, inplace=True)

In [None]:
# Show the rows whose employment length exceeds the person's age
df.loc[df['person_emp_length'] >df['person_age'] ]

In [None]:
# Remove the rows whose employment length exceeds the person's age (the rows
# listed by the preceding cell). Selecting by condition instead of hard-coded
# row labels keeps the cell re-runnable and independent of the row labels.
# NOTE(review): the original listed labels 0 and 210 explicitly - confirm no
# other row matches if exactly those two were intended.
df.drop(df.index[df['person_emp_length'] > df['person_age']], axis=0, inplace=True)

In [None]:
# Re-draw the box plots to check the numeric columns after the row removals
import matplotlib.pyplot as plt
import seaborn as sns
dfq = ['person_age','person_income','person_emp_length','loan_amnt','loan_int_rate','loan_percent_income','cb_person_cred_hist_length']
plt.figure(figsize=(10,5))
for position, column in enumerate(dfq, start=1):
    plt.subplot(1, len(dfq), position)
    sns.boxplot(y=df[column])
# The layout only needs to be computed once, after every subplot exists
plt.tight_layout()

# Visualisasi Data

In [None]:
import warnings # tambahan agar tidak muncul warnings saat running
# Suppress every warning for the rest of the session (affects all later cells)
warnings.simplefilter("ignore")

In [None]:
# Requires df to be loaded already, e.g. df = pd.read_csv('your_data.csv')

# Columns drawn as count plots vs. columns drawn as histograms
dfcount=['person_age','person_home_ownership','person_emp_length','loan_intent','loan_grade','loan_status','cb_person_default_on_file','cb_person_cred_hist_length']
dfhisto=['person_income','loan_amnt','loan_int_rate','loan_percent_income']

# Plotting libraries (re-imported so the cell can run on its own)
import matplotlib.pyplot as plt
import seaborn as sns

# Count plot for every listed column that exists in the frame
for column in dfcount:
    if column not in df.columns:
        print(f"Column '{column}' not found in DataFrame")
        continue
    plt.figure(figsize=(25,5))
    sns.countplot(x=df[column])
    plt.title(f'Count plot of {column}')
    plt.xticks(rotation=45)  # rotated tick labels are easier to read
    plt.show()

# Histogram with a KDE overlay for every listed column that exists
for column in dfhisto:
    if column not in df.columns:
        print(f"Column '{column}' not found in DataFrame")
        continue
    plt.figure(figsize=(25,5))
    sns.histplot(df[column], kde=True)
    plt.title(f'Histogram of {column}')
    plt.show()

In [None]:
# Rows with loan_status == 1 only
df_tidakberhasil = df.loc[df['loan_status'] == 1]
# Columns to visualise for that subset: count plots and histograms
dfcount=['person_age','person_home_ownership','person_emp_length','loan_intent','loan_grade','cb_person_default_on_file','cb_person_cred_hist_length']
dfhisto=['person_income','loan_amnt','loan_int_rate','loan_percent_income']

# One count plot per column, addressed by name through the data= argument
for column in dfcount:
    plt.figure(figsize=(25,5))
    sns.countplot(data=df_tidakberhasil, x=column)
    plt.show()

# One histogram (with KDE overlay) per column
for column in dfhisto:
    plt.figure(figsize=(25,5))
    sns.histplot(data=df_tidakberhasil, x=column, kde=True)
    plt.show()

# Encoding

In [None]:
# Show the columns that still have the object dtype (candidates for encoding)
df.select_dtypes(include=['object'])

In [None]:
# One-hot encode the nominal (unordered) categorical columns
nominal_columns = ['person_home_ownership', 'loan_intent', 'cb_person_default_on_file']
df = pd.get_dummies(df, columns=nominal_columns)

# Ordinal-encode loan_grade with an explicit A..G order so that the codes are
# always A=0 ... G=6, matching loan_grade_dict in the prediction cell.
# Letting pandas infer the categories would shift the codes whenever a grade
# is absent from the data. Values outside A..G (including NaN) become -1.
grade_order = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
df['loan_grade'] = pd.Categorical(df['loan_grade'], categories=grade_order, ordered=True).codes

df

In [None]:
# Check the frame summary to confirm no object-dtype column remains
df.info()

# Uji Multikolinearitas dengan Variance Inflation Factor (VIF)

In [None]:
# Multicollinearity check with the Variance Inflation Factor (VIF)
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
import pandas as pd

# Keep only the int64/float64 columns and leave out the target: VIF measures
# collinearity among predictors, so loan_status must not be one of them.
numerical_df = df.select_dtypes(include=['float64', 'int64']).drop(
    columns=['loan_status'], errors='ignore'
)

# variance_inflation_factor regresses each column on all the others and does
# not add an intercept itself, so a constant column is prepended to the design
# matrix; without it the reported VIFs are inflated.
design_matrix = add_constant(numerical_df, has_constant='add')

# Index 0 is the constant, so the predictors start at index 1
vif_data = pd.DataFrame()
vif_data['feature'] = numerical_df.columns
vif_data['VIF'] = [
    variance_inflation_factor(design_matrix.values, column_index)
    for column_index in range(1, design_matrix.shape[1])
]
pd.set_option('display.max_rows', None)

# VIF per column, largest first
vif_data.sort_values(by='VIF', ascending=False)  # VIF cut-off = 11

# Feature Selection dengan Pearson Correlation

In [None]:
# List the current column names
df.columns

In [None]:
# Reorder the columns so that the target variable (loan_status) comes last
ordered_columns = [
    'person_age', 'person_income', 'person_emp_length', 'loan_grade',
    'loan_amnt', 'loan_int_rate', 'loan_percent_income',
    'cb_person_cred_hist_length',
    'person_home_ownership_MORTGAGE', 'person_home_ownership_OTHER',
    'person_home_ownership_OWN', 'person_home_ownership_RENT',
    'loan_intent_DEBTCONSOLIDATION', 'loan_intent_EDUCATION',
    'loan_intent_HOMEIMPROVEMENT', 'loan_intent_MEDICAL',
    'loan_intent_PERSONAL', 'loan_intent_VENTURE',
    'cb_person_default_on_file_N', 'cb_person_default_on_file_Y',
    'loan_status',
]
df = df.loc[:, ordered_columns]

In [None]:
# Feature selection (Pearson correlation)
print(color.BOLD + color.BLUE + 'Dilakukan seleksi fitur dengan Pearson Correlation' + color.END)
plt.figure(figsize = (15,15))
korelasi = df.corr()
display(korelasi)
sns.heatmap(korelasi, annot = True, cmap = plt.cm.Blues)
plt.show()

# Absolute correlation of every feature with the target. The target's own
# entry is removed by label; the previous positional slice [1:-1] also
# discarded the first feature (person_age), so it could never be selected and
# was left out of the mean used as threshold.
korelasi_target = korelasi['loan_status'].drop('loan_status').abs()

# Features whose absolute correlation exceeds the mean over all features are kept
print(color.BOLD + color.RED + 'Didapat nilai absolut korelasi antar variabel dengan variabel output adalah sebagai berikut' + color.END)
print(korelasi_target)
bataskor = korelasi_target.mean()

print('\n')

print('Mean dari korelasi tiap variabel yang ada adalah ' + color.BOLD + '{0}'.format(bataskor) + color.END)
print('Sehingga,' + color.BOLD + color.DARKCYAN + ' variabel yang dipilih sebagai hasil seleksi fitur adalah' + color.END)
# Mask and series share the same index, so the boolean selection is aligned
hasilseleksi = korelasi_target[korelasi_target > bataskor]
print(hasilseleksi)

In [None]:
# Feature names kept as a note; the same list is used to build X in the cells below
'person_income', 'loan_grade', 'loan_int_rate', 'loan_percent_income', 'person_home_ownership_MORTGAGE', 'person_home_ownership_RENT', 'cb_person_default_on_file_N', 'cb_person_default_on_file_Y'

# Feature Selection dengan Backward Elimination

In [None]:
# Pearson correlation was used above; now feature selection is done with
# backward elimination.
# set X and y
X = df[['person_income', 'loan_grade', 'loan_int_rate', 'loan_percent_income',
        'person_home_ownership_MORTGAGE', 'person_home_ownership_RENT', 'cb_person_default_on_file_N',
        'cb_person_default_on_file_Y']]
y = df['loan_status']

# train - test split (20% of the rows held out, fixed seed)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

import pandas as pd # import pandas
import numpy as np #import numpy
import statsmodels.api as sm # import statsmodels

# Backward elimination based on OLS p-values
def backward_regression(X, y,
                           initial_list=None,
                           threshold_in = 0.01,
                           threshold_out = 0.05,
                           verbose = True):
    """Select features by backward elimination on OLS p-values.

    Starting from every column of ``X``, an OLS model (with intercept) is
    fitted repeatedly; after each fit the feature with the largest p-value is
    dropped, until no remaining p-value exceeds ``threshold_out``.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate features; values must be convertible to float.
    y : array-like
        Response variable, aligned with the rows of ``X``.
    initial_list : list, optional
        Unused; kept for call compatibility.
    threshold_in : float
        Unused; kept for call compatibility.
    threshold_out : float
        A feature is removed while its p-value is above this level.
    verbose : bool
        Print every removed feature together with its p-value.

    Returns
    -------
    list
        Names of the columns that survived the elimination.
    """
    included = list(X.columns)
    while included:
        # OLS needs numeric input; boolean dummy columns are cast to float
        X_temp = pd.DataFrame(X[included], dtype=float)
        model = sm.OLS(y, sm.add_constant(X_temp)).fit()
        # Drop the intercept by label: add_constant adds no column when X
        # already holds a constant, so a positional [1:] slice would then
        # discard a real feature instead.
        pvalues = model.pvalues.drop('const', errors='ignore')
        worst_pval = pvalues.max()
        # A NaN maximum compares as False, which also ends the loop
        if not worst_pval > threshold_out:
            break
        worst_feature = pvalues.idxmax()
        included.remove(worst_feature)
        if verbose:
            print('Drop {} with p-value {}'.format(worst_feature, worst_pval))
    return included

# Run the elimination on the training split; the cell shows the returned list
backward_regression(X_train, y_train)

# Splitting dan Normalisasi Data

In [None]:
# Feature matrix (selected columns) and target vector
X = df[[
    'person_income', 'loan_grade', 'loan_int_rate', 'loan_percent_income',
    'person_home_ownership_MORTGAGE', 'person_home_ownership_RENT',
    'cb_person_default_on_file_N', 'cb_person_default_on_file_Y',
]]
y = df['loan_status']

# Hold out 20% of the rows for testing, with a fixed seed
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Min-max scaling: the ranges are learned from the training split only and
# then applied to both splits
from sklearn.preprocessing import MinMaxScaler
sc = MinMaxScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)
# Persist the fitted scaler
joblib.dump(sc, 'scaler.pkl')

# Tuning Hyperparameter

In [None]:
# Hyperparameter tuning for LogisticRegression, DecisionTreeClassifier and SVC
# Import the candidate models
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.svm import SVC

# Candidate models with their parameter grids
model_params = {
    'Logistic_Regression': {
        'model': LogisticRegression(),
        'params' : {
            # None (not the string 'none') disables regularisation; the
            # string form was removed from scikit-learn and this file already
            # uses penalty=None when fitting the final model.
            'penalty':['l1', 'l2', 'elasticnet', None],
            'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
            'multi_class': ['auto', 'ovr', 'multinomial'],
            # The grid previously listed 0.1 twice, which only repeated fits.
            # NOTE(review): if a range like np.arange(0.1, 8, 0.1) was meant,
            # replace this list accordingly.
            'C' : [0.1, 8]
        }
    },
    'decision_tree':{
        # Seeded so that splitter='random' / max_features give repeatable scores
        'model':DecisionTreeClassifier(random_state=0),
        'params':{
            'splitter':['best','random'],
            # 'auto' was an alias of 'sqrt' and has been removed from
            # scikit-learn; None (use all features) is searched instead.
            'max_features': [None, 'sqrt', 'log2'],
            'max_depth' : [4,5,6,7,8],
            'criterion' :['gini', 'entropy']
        }
    },
    'SVM_Classifier':{
        'model' : SVC(),
        'params':{
            'kernel' : ['rbf','poly','sigmoid','linear'],
            'gamma' : ['scale','auto'],
        }
    }
    }
scores = []

# Search the best parameters of every model with a 3-fold GridSearchCV
from sklearn.model_selection import GridSearchCV
for model_name, mp in model_params.items():
    clf =  GridSearchCV(mp['model'], mp['params'], cv=3, return_train_score=False)
    clf.fit(X_train,y_train) # fit the grid search on the training split
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })
# One row per model, in the order of model_params
best = pd.DataFrame(scores,columns=['model','best_score','best_params'])
best

In [None]:
# Best parameters for Logistic Regression (row 0: first entry of model_params)
best.best_params[0]

In [None]:
# Best parameters for the Decision Tree (row 1: second entry of model_params)
best.best_params[1]

In [None]:
# Best parameters for the SVM classifier (row 2: third entry of model_params)
best.best_params[2]

In [None]:
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression  # model class

# Build the model, then fit it on the scaled training split
LR = LogisticRegression(C=0.1, solver='saga', multi_class='ovr', penalty=None, max_iter=1000)
LR.fit(X_train, y_train)

# Predictions for the held-out split
yhat = LR.predict(X_test)

# Classification report
print(f"{color.BOLD}{color.DARKCYAN}Classification Report Logistic Regression Model{color.END}")
from sklearn.metrics import classification_report
print(classification_report(y_test, yhat))

# F1 is reported for class 0 (pos_label=0); accuracy over both classes
print(color.BOLD + "f1-score: " + color.END, f1_score(y_test, yhat, pos_label=0))
print(color.BOLD + "accuracy-score: " + color.END, accuracy_score(y_test, yhat))

# Confusion matrix: rows are actual classes, columns are predicted classes
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
conf_matrix = confusion_matrix(y_true=y_test, y_pred=yhat)

fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
# Write each count in the centre of its cell
for row_idx, row in enumerate(conf_matrix):
    for col_idx, count in enumerate(row):
        ax.text(x=col_idx, y=row_idx, s=count, va='center', ha='center', size='xx-large')

print("")
print(f"{color.BOLD}{color.DARKCYAN}Confusion Matrix Logistic Regression Model{color.END}")
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()
# Persist the fitted model
joblib.dump(LR, 'logistic_regression_model.pkl')
print("Model saved as 'logistic_regression_model.pkl'")

# Decision Tree

In [None]:
# Build the decision tree and fit it on the scaled training split
from sklearn.tree import DecisionTreeClassifier
credittree = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=8,
    random_state=0,
    splitter='best',
)
credittree.fit(X_train, y_train)

# Predictions for the held-out split
y_pred = credittree.predict(X_test)

# Classification report
from sklearn.metrics import classification_report
print(f"{color.BOLD}{color.DARKCYAN}Classification Report Decision Tree Model{color.END}")
print(classification_report(y_test, y_pred))
print('\n')


# F1 is reported for class 0 (pos_label=0); accuracy over both classes
print(color.BOLD + "f1-score: " + color.END, f1_score(y_test, y_pred, pos_label=0))
print(color.BOLD + "accuracy-score: " + color.END, accuracy_score(y_test, y_pred))

# Confusion matrix: rows are actual classes, columns are predicted classes
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)

fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
# Write each count in the centre of its cell
for row_idx, row in enumerate(conf_matrix):
    for col_idx, count in enumerate(row):
        ax.text(x=col_idx, y=row_idx, s=count, va='center', ha='center', size='xx-large')

print("")
print(f"{color.BOLD}{color.DARKCYAN}Confusion Matrix Decision Tree Model{color.END}")
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()
# Persist the fitted model
joblib.dump(credittree, 'decision_tree_model.pkl')
print("Model saved as 'decision_tree_model.pkl'")

# Counts noted from an earlier run: TN = 4977, TP = 876, FN = 476, FP = 186
# Precision = TP / (TP + FP)
# Recall    = TP / (TP + FN)
# 1 / F1    = 1/2 * (1/Precision + 1/Recall), i.e. F1 is the harmonic mean of
#             precision and recall
# Accuracy  = (TP + TN) / (TP + TN + FP + FN)

# Support Vector Machines (SVM)

In [None]:
import numpy
from sklearn import svm
import pandas as pd

# Polynomial-kernel SVM fitted on the scaled training split
clf = svm.SVC(kernel='poly', gamma='scale', C=1)
clf.fit(X_train, y_train)

# Predictions for the held-out split
yhat = clf.predict(X_test)

# Classification report
from sklearn.metrics import classification_report
print(f"{color.BOLD}{color.DARKCYAN}Classification Report SVM Model{color.END}")
print(classification_report(y_test, yhat))

# F1 is reported for class 0 (pos_label=0); accuracy over both classes
print(color.BOLD + "f1-score: " + color.END, f1_score(y_test, yhat, pos_label=0))
print(color.BOLD + "accuracy-score: " + color.END, accuracy_score(y_test, yhat))

# Confusion matrix: rows are actual classes, columns are predicted classes
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
conf_matrix = confusion_matrix(y_true=y_test, y_pred=yhat)

fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
# Write each count in the centre of its cell
for row_idx, row in enumerate(conf_matrix):
    for col_idx, count in enumerate(row):
        ax.text(x=col_idx, y=row_idx, s=count, va='center', ha='center', size='xx-large')

print("")
print(f"{color.BOLD}{color.DARKCYAN}Confusion Matrix SVM Model{color.END}")
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()

# Testing Model

In [None]:
# Put the true labels and the predictions held in y_pred side by side
y_test2 = y_test.to_numpy()
dftest = [y_test2, y_pred]

# Each array becomes one labelled row; transposing turns them into columns
dfframe = pd.DataFrame(dftest, index=['y_test', 'y_pred']).T
dfframe.head()

In [None]:
# Interactive prediction for one applicant, using the fitted scaler (sc) and
# decision tree (credittree) from the cells above.
import numpy as np

# Ordinal codes for loan_grade
loan_grade_dict = {
    "A": 0,
    "B": 1,
    "C": 2,
    "D": 3,
    "E": 4,
    "F": 5,
    "G": 6,
}

# Feature columns, in the order the scaler and the model were fitted on
feature_names = [
    'person_income', 'loan_grade', 'loan_int_rate', 'loan_percent_income',
    'person_home_ownership_MORTGAGE', 'person_home_ownership_RENT',
    'cb_person_default_on_file_N', 'cb_person_default_on_file_Y',
]
# Categories that exist as dummy columns in the data; only MORTGAGE and RENT
# are model features, so OWN and OTHER are encoded as both flags being 0.
valid_home_ownership = ('MORTGAGE', 'RENT', 'OWN', 'OTHER')
valid_default_flags = ('Y', 'N')

print(color.BOLD + color.DARKCYAN + "Credit Risk Prediction Programming Algorithm"+color.END)


person_income = int(input("Person Income: "))
loan_grade = input("Loan Grade: ").strip().upper()
loan_int_rate = float(input("Loan Interest Rate: "))
loan_percent_income = float(input("Loan Percent Income: "))
person_home_ownership = input("Person Home Ownership: ").strip().upper()
cb_person_default_on_file = input("CB Person Default on File: ").strip().upper()

# Reject unknown categories explicitly: previously a bad grade surfaced as a
# bare KeyError and a typo in the other two fields was silently encoded as a
# different category, yielding a prediction for the wrong applicant.
if loan_grade not in loan_grade_dict:
    raise ValueError(
        "Loan Grade must be one of {}, got {!r}".format(sorted(loan_grade_dict), loan_grade)
    )
if person_home_ownership not in valid_home_ownership:
    raise ValueError(
        "Person Home Ownership must be one of {}, got {!r}".format(
            list(valid_home_ownership), person_home_ownership)
    )
if cb_person_default_on_file not in valid_default_flags:
    raise ValueError(
        "CB Person Default on File must be Y or N, got {!r}".format(cb_person_default_on_file)
    )

# One-hot flags matching the dummy columns used for training
person_home_ownership_MORTGAGE = int(person_home_ownership == 'MORTGAGE')
person_home_ownership_RENT = int(person_home_ownership == 'RENT')
cb_person_default_on_file_Y = int(cb_person_default_on_file == 'Y')
cb_person_default_on_file_N = int(cb_person_default_on_file == 'N')

x_data = [
    person_income,
    loan_grade_dict[loan_grade],
    loan_int_rate,
    loan_percent_income,
    person_home_ownership_MORTGAGE,
    person_home_ownership_RENT,
    cb_person_default_on_file_N,
    cb_person_default_on_file_Y,
]

# Single-row frame with the training column names, so the scaler (fitted on a
# DataFrame) receives matching feature names
x_data_frame = pd.DataFrame([x_data], columns=feature_names)

# MinMaxScaler: apply the ranges learned from the training split
x_data_fit = sc.transform(x_data_frame)

# Stored under its own name so the test-set predictions in y_pred are not
# overwritten by this single-applicant result
y_pred_input = credittree.predict(x_data_fit)

print(color.BOLD + "Kelayakan Penerima Nasabah Kredit: "+color.END)
if y_pred_input[0]==0:
    print(color.BOLD + color.BLUE + "Layak"+color.END)
else:
    print(color.BOLD + color.RED + "Tidak Layak"+color.END)