# Machine Learning

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from numpy.linalg import inv
from sklearn.linear_model import Lasso, Ridge



## 資料前處理

### data

In [18]:
data = pd.read_csv(r"C:\Users\chewei\Downloads\loan_train.csv")

In [19]:
data.head()

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,0,credit_card,0.1197,481.4,11.77529,10.85,682,5071.0,966,13.1,5,0,0,0
1,1,all_other,0.0907,238.75,11.0021,4.86,752,7626.958333,8575,25.0,1,0,0,0
2,1,debt_consolidation,0.1222,148.28,11.835009,19.0,677,6059.958333,27587,74.8,1,1,0,0
3,1,all_other,0.0774,156.1,10.803649,4.71,797,4200.041667,1299,4.2,2,0,0,0
4,0,major_purchase,0.1253,267.73,10.819778,21.29,697,3060.041667,34938,59.6,4,1,0,0


In [20]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9478 entries, 0 to 9477
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   credit.policy      9478 non-null   int64  
 1   purpose            9478 non-null   object 
 2   int.rate           9478 non-null   float64
 3   installment        9478 non-null   float64
 4   log.annual.inc     9478 non-null   float64
 5   dti                9478 non-null   float64
 6   fico               9478 non-null   int64  
 7   days.with.cr.line  9478 non-null   float64
 8   revol.bal          9478 non-null   int64  
 9   revol.util         9478 non-null   float64
 10  inq.last.6mths     9478 non-null   int64  
 11  delinq.2yrs        9478 non-null   int64  
 12  pub.rec            9478 non-null   int64  
 13  not.fully.paid     9478 non-null   int64  
dtypes: float64(6), int64(7), object(1)
memory usage: 1.0+ MB


In [21]:
# 檢查缺失值與data是否重複
print(data.isnull().sum())
print('duplicate: ', data.duplicated().sum())

credit.policy        0
purpose              0
int.rate             0
installment          0
log.annual.inc       0
dti                  0
fico                 0
days.with.cr.line    0
revol.bal            0
revol.util           0
inq.last.6mths       0
delinq.2yrs          0
pub.rec              0
not.fully.paid       0
dtype: int64
duplicate:  0


### 1. 處理變數 puepose
1. label Encoder
2. dummy variables

In [22]:
def purpose_transformation(data, method):
    '''將 purpose 的型態轉換'''
    if method == 'label':
        label_encoder = LabelEncoder()
        data['purpose'] = label_encoder.fit_transform(data['purpose'])
    elif method == 'dummy' or method == 'one_hot':
        data = pd.get_dummies(data=data, columns=['purpose'], drop_first=True)
    else:
        raise ValueError("Method must be either 'label' or 'one_hot'")
    return data

purpose_transformation(data, 'label')

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,0,1,0.1197,481.40,11.775290,10.85,682,5071.000000,966,13.1,5,0,0,0
1,1,0,0.0907,238.75,11.002100,4.86,752,7626.958333,8575,25.0,1,0,0,0
2,1,2,0.1222,148.28,11.835009,19.00,677,6059.958333,27587,74.8,1,1,0,0
3,1,0,0.0774,156.10,10.803649,4.71,797,4200.041667,1299,4.2,2,0,0,0
4,0,5,0.1253,267.73,10.819778,21.29,697,3060.041667,34938,59.6,4,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9473,1,4,0.1253,133.87,10.858999,1.34,742,1770.000000,2386,11.3,4,0,0,1
9474,1,3,0.1253,167.34,10.308953,0.00,697,1860.000000,0,0.0,1,1,0,1
9475,1,1,0.1461,861.88,11.695247,20.52,707,5580.000000,78468,84.3,2,0,0,1
9476,1,0,0.0894,305.01,11.350407,12.01,742,5039.958333,47474,56.1,0,0,0,1


### 2.極端值處理 - IQR

In [23]:
skewColumns = ['int.rate', 'installment', 'dti', 'fico', 'days.with.cr.line', 'revol.util']
skewness = data[skewColumns].skew()
print(skewness)

int.rate             0.164135
installment          0.910142
dti                  0.023261
fico                 0.469720
days.with.cr.line    1.156222
revol.util           0.062470
dtype: float64


In [24]:
def Outlier_IQR(data, column):
    '''透過 IQR 處理極端值'''
    Q1 = data[column].quantile(0.25)
    Q3 = data[column].quantile(0.75)
    IQR = Q3-Q1
    maxOutlier = Q3 + IQR*1.5
    minOutlier = Q1 - IQR*1.5
    data[column] = data[column].clip(lower=minOutlier, upper=maxOutlier) 
    return data

Outlier_IQR(data, 'installment')
Outlier_IQR(data, 'days.with.cr.line')

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,0,1,0.1197,481.40000,11.775290,10.85,682,5071.000000,966,13.1,5,0,0,0
1,1,0,0.0907,238.75000,11.002100,4.86,752,7626.958333,8575,25.0,1,0,0,0
2,1,2,0.1222,148.28000,11.835009,19.00,677,6059.958333,27587,74.8,1,1,0,0
3,1,0,0.0774,156.10000,10.803649,4.71,797,4200.041667,1299,4.2,2,0,0,0
4,0,5,0.1253,267.73000,10.819778,21.29,697,3060.041667,34938,59.6,4,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9473,1,4,0.1253,133.87000,10.858999,1.34,742,1770.000000,2386,11.3,4,0,0,1
9474,1,3,0.1253,167.34000,10.308953,0.00,697,1860.000000,0,0.0,1,1,0,1
9475,1,1,0.1461,836.91125,11.695247,20.52,707,5580.000000,78468,84.3,2,0,0,1
9476,1,0,0.0894,305.01000,11.350407,12.01,742,5039.958333,47474,56.1,0,0,0,1


### 3.資料對數化

In [25]:
def Column_ln(data, column):
    ''' column 取 log'''
    data[column] = np.log(data[column])
    return data

Column_ln(data, 'int.rate')

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,0,1,-2.122767,481.40000,11.775290,10.85,682,5071.000000,966,13.1,5,0,0,0
1,1,0,-2.400198,238.75000,11.002100,4.86,752,7626.958333,8575,25.0,1,0,0,0
2,1,2,-2.102096,148.28000,11.835009,19.00,677,6059.958333,27587,74.8,1,1,0,0
3,1,0,-2.558768,156.10000,10.803649,4.71,797,4200.041667,1299,4.2,2,0,0,0
4,0,5,-2.077044,267.73000,10.819778,21.29,697,3060.041667,34938,59.6,4,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9473,1,4,-2.077044,133.87000,10.858999,1.34,742,1770.000000,2386,11.3,4,0,0,1
9474,1,3,-2.077044,167.34000,10.308953,0.00,697,1860.000000,0,0.0,1,1,0,1
9475,1,1,-1.923464,836.91125,11.695247,20.52,707,5580.000000,78468,84.3,2,0,0,1
9476,1,0,-2.414635,305.01000,11.350407,12.01,742,5039.958333,47474,56.1,0,0,0,1


### 4.資料標準化

In [26]:
def Standard_data(data, method):
    ''' 資料標準化 '''
    stdVars = ['int.rate', 'installment', 'log.annual.inc', 'dti', 'fico', 'days.with.cr.line', 'revol.bal', 'revol.util', 'inq.last.6mths', 'delinq.2yrs', 'pub.rec']
    if method == 'standardization':
        scaler = StandardScaler()
    elif method == 'min-max scaling':
        scaler = MinMaxScaler()
    else:
        raise ValueError("standard_data method must be either 'standardization' or 'min-max scaling'")
    data[stdVars] = scaler.fit_transform(data[stdVars])
    return data

Standard_data(data, 'standardization')

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,0,1,0.003856,0.794432,1.371606,-0.254420,-0.760466,0.254116,-0.471535,-1.159523,1.554484,-0.299776,-0.236378,0
1,1,0,-1.218811,-0.388700,0.112801,-1.124810,1.081185,1.372369,-0.246349,-0.749289,-0.261372,-0.299776,-0.236378,0
2,1,2,0.094953,-0.829821,1.468833,0.929832,-0.892012,0.686794,0.316305,0.967489,-0.261372,1.526238,-0.236378,0
3,1,0,-1.917648,-0.791692,-0.210290,-1.146606,2.265104,-0.126935,-0.461680,-1.466337,0.192592,-0.299776,-0.236378,0
4,0,5,0.205359,-0.247397,-0.184030,1.262585,-0.365826,-0.625695,0.533856,0.443493,1.100520,1.526238,-0.236378,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9473,1,4,0.205359,-0.900083,-0.120177,-1.636290,0.818092,-1.190098,-0.429510,-1.221575,1.100520,-0.299776,-0.236378,1
9474,1,3,0.205359,-0.736887,-1.015689,-1.831002,-0.365826,-1.150723,-0.500123,-1.611125,-0.261372,1.526238,-0.236378,1
9475,1,1,0.882204,2.527863,1.241291,1.150699,-0.102733,0.476808,1.822112,1.294987,0.192592,-0.299776,-0.236378,1
9476,1,0,-1.282435,-0.065624,0.679868,-0.085864,0.818092,0.240535,0.904855,0.322836,-0.715336,-0.299776,-0.236378,1


## 變數篩選

### 1.分割 data

In [27]:
def split_data(data, size):
    '''training data or testing data'''
    data_paid = data[data['not.fully.paid'] == 0]
    data_unpaid = data[data['not.fully.paid'] == 1]
    
    data_paid_train, data_paid_test = train_test_split(data_paid, train_size=size, random_state=1117)
    data_unpaid_train, data_unpaid_test = train_test_split(data_unpaid, train_size=size, random_state=1117)
    
    data_train = pd.concat([data_paid_train, data_unpaid_train], ignore_index=True)
    data_test = pd.concat([data_paid_test, data_unpaid_test], ignore_index=True)
    
    x_train = data_train.drop('not.fully.paid', axis=1)
    x_test = data_test.drop('not.fully.paid', axis=1)
    y_train = data_train['not.fully.paid']
    y_test = data_test['not.fully.paid']
    return data_train, data_test, x_train, x_test, y_train, y_test    


data_train, data_test, x_train, x_test, y_train, y_test = split_data(data, 0.7)

### 2.特徵選擇
1. forward stepwise( $R^2 \rightarrow AIC$)
2. lasso
3. ridge

In [31]:
def forward_stepwise_selection(data):
    '''使用 forward stepwise 進行變數篩選'''
    x = data.drop('not.fully.paid', axis=1)
    x = np.array(x)
    y = data['not.fully.paid']
    y = np.array(y)
    y = y[:, np.newaxis]
    n, k = x.shape
    
    SST = y.T@y
    remainingIndices = list(range(k))
    selectedIndices = []
    xk = x[:, selectedIndices]
    AIC = np.zeros((k+1, 1))
    AIC[0, 0] = SST /n
 
    minAIC = np.inf
    minAIC_indices = []

    for i in range(k):
        Rsquared = np.zeros([k]) - 999
        for m in remainingIndices:
            x1 = np.concatenate((xk, x[:, m:m+1]), axis=1)
            bhats = inv(x1.T@x1)@x1.T@y
            SSR = (y-x1@bhats).T @ (y-x1@bhats)
            Rsquared[m] = (1-SSR/SST).item()
        '''求出使 R squared 最大的 X並加入selectedIndices中'''
        selectedIndices.append(np.argmax(Rsquared))         
        remainingIndices.remove(selectedIndices[-1])        
        xk = x[:, selectedIndices]
        AIC[i+1, 0] = (y - xk @ inv(xk.T@xk) @ xk.T@y).T@(y - xk @ inv(xk.T@xk) @ xk.T@y) / n + 2*(i+1)/n
        
        '''select the minimum AIC'''
        if AIC[i+1, 0] < minAIC:
            minAIC = AIC[i+1, 0]
            minAIC_indices = selectedIndices.copy()
            
        selectedIndices_name = data.columns[selectedIndices].tolist()
        minAIC_indicesName = data.columns[minAIC_indices].tolist()
    return selectedIndices_name, minAIC_indicesName

def lasso_selection(x_train, y_train, alpha):
    '''使用 lasso 進行變數篩選'''
    lasso = Lasso(alpha=alpha)
    lasso.fit(x_train, y_train)
    coef = lasso.coef_
    selected_features = x_train.columns[coef != 0].tolist()
    return selected_features

def ridge_selection(x_train, y_train, alpha):
    '''使用 ridge 進行變數篩選'''
    ridge = Ridge(alpha=alpha)
    ridge.fit(x_train, y_train)
    coef = ridge.coef_
    selected_features = x_train.columns[abs(coef) > 1e-5].tolist()
    return selected_features

forward_columnName, selectedFeatures_forward = forward_stepwise_selection(data)
print("Selected features by Forward Stepwise Model Selection:", selectedFeatures_forward)
selectedFeatures_lasso = lasso_selection(x_train, y_train, alpha=0.01)
print("Selected features by Lasso:", selectedFeatures_lasso)
selectedFeatures_ridge = ridge_selection(x_train, y_train, alpha=0.01)
print("Selected features by Ridge:", selectedFeatures_ridge)


  AIC[0, 0] = SST /n
  AIC[i+1, 0] = (y - xk @ inv(xk.T@xk) @ xk.T@y).T@(y - xk @ inv(xk.T@xk) @ xk.T@y) / n + 2*(i+1)/n


Selected features by Forward Stepwise Model Selection: ['purpose', 'fico', 'credit.policy', 'inq.last.6mths', 'revol.bal', 'log.annual.inc', 'installment']
Selected features by Lasso: ['credit.policy', 'purpose', 'int.rate', 'installment', 'log.annual.inc', 'fico', 'revol.bal', 'inq.last.6mths', 'pub.rec']
Selected features by Ridge: ['credit.policy', 'purpose', 'int.rate', 'installment', 'log.annual.inc', 'dti', 'fico', 'days.with.cr.line', 'revol.bal', 'revol.util', 'inq.last.6mths', 'delinq.2yrs', 'pub.rec']


In [32]:
#%% data_features

def data_features(method):
    if method == 'forward stepwise':        
        xTrain = x_train[selectedFeatures_forward]
        xTest = x_test[selectedFeatures_forward]
    elif method == 'lasso':
        xTrain = x_train[selectedFeatures_lasso]
        xTest = x_test[selectedFeatures_lasso]
    elif method == 'ridge':
        xTrain = x_train[selectedFeatures_ridge]
        xTest = x_test[selectedFeatures_ridge]
    return xTrain, xTest

x_train, x_test = data_features(method = 'lasso')


In [33]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

def train_and_evaluate_models(X_train, X_test, y_train, y_test):
    models = {
        'Logistic Regression': LogisticRegression(random_state=42),
        'Decision Tree': DecisionTreeClassifier(random_state=42),
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
        'SVM': SVC(random_state=42),
        'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
        'Neural Network': MLPClassifier(hidden_layer_sizes=(100,), random_state=42)
    }

    for model_name, model in models.items():
        print(f"Training and evaluating {model_name}:")
        
        # Train the model
        model.fit(X_train, y_train)
        
        # Make predictions on the test set
        y_pred = model.predict(X_test)
        
        # Calculate evaluation metrics
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        
        print("Accuracy:", accuracy)
        print("Precision:", precision)
        print("Recall:", recall)
        print("F1-score:", f1)
        
        # Perform cross-validation
        cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
        print("Cross-validation scores:", cv_scores)
        print("Mean cross-validation score:", cv_scores.mean())
        
        print()

# 假設您已經將數據拆分為 X_train, X_test, y_train, y_test
train_and_evaluate_models(x_train, x_test, y_train, y_test)


Training and evaluating Logistic Regression:
Accuracy: 0.8431786216596343
Precision: 0.47058823529411764
Recall: 0.017977528089887642
F1-score: 0.03463203463203463
Cross-validation scores: [0.84476262 0.83948757 0.84626978 0.8417483  0.84313725]
Mean cross-validation score: 0.8430811058409798

Training and evaluating Decision Tree:
Accuracy: 0.7380450070323488
Precision: 0.21264367816091953
Recall: 0.24943820224719102
F1-score: 0.2295760082730093
Cross-validation scores: [0.75056518 0.73850791 0.75659382 0.7317257  0.7413273 ]
Mean cross-validation score: 0.7437439830143407

Training and evaluating Random Forest:
Accuracy: 0.8361462728551337
Precision: 0.3018867924528302
Recall: 0.035955056179775284
F1-score: 0.0642570281124498
Cross-validation scores: [0.84024115 0.83496609 0.83948757 0.84024115 0.8438914 ]
Mean cross-validation score: 0.8397654696914415

Training and evaluating SVM:
Accuracy: 0.8438818565400844
Precision: 0.6
Recall: 0.006741573033707865
F1-score: 0.01333333333333333



Accuracy: 0.840365682137834
Precision: 0.3902439024390244
Recall: 0.035955056179775284
F1-score: 0.06584362139917696




Cross-validation scores: [0.84024115 0.83195177 0.8455162  0.83647325 0.84238311]
Mean cross-validation score: 0.8393130946657255





In [34]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

def train_and_evaluate_models(X_train, X_test, y_train, y_test):
    models = {
        'Logistic Regression': LogisticRegression(random_state=1117),
        'Decision Tree': DecisionTreeClassifier(random_state=1117),
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=1117),
        'SVM': SVC(random_state=42),
        'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=1117),
        'Neural Network': MLPClassifier(hidden_layer_sizes=(100,), random_state=1117)
    }

    for model_name, model in models.items():
        print(f"Training and evaluating {model_name}:")
        
        # Train the model
        model.fit(X_train, y_train)
        
        # Make predictions on the test set
        y_pred = model.predict(X_test)
        
        # Calculate evaluation metrics
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        
        print("Accuracy:", accuracy)
        print("Precision:", precision)
        print("Recall:", recall)
        print("F1-score:", f1)
        
        # Perform cross-validation
        cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
        print("Cross-validation scores:", cv_scores)
        print("Mean cross-validation score:", cv_scores.mean())
        
        print()

# 假設您已經將數據拆分為 X_train, X_test, y_train, y_test
train_and_evaluate_models(x_train, x_test, y_train, y_test)


Training and evaluating Logistic Regression:
Accuracy: 0.8431786216596343
Precision: 0.47058823529411764
Recall: 0.017977528089887642
F1-score: 0.03463203463203463
Cross-validation scores: [0.84476262 0.83948757 0.84626978 0.8417483  0.84313725]
Mean cross-validation score: 0.8430811058409798

Training and evaluating Decision Tree:
Accuracy: 0.7454289732770746
Precision: 0.21818181818181817
Recall: 0.24269662921348314
F1-score: 0.2297872340425532
Cross-validation scores: [0.74453655 0.73549359 0.74453655 0.73021854 0.73906486]
Mean cross-validation score: 0.7387700173107328

Training and evaluating Random Forest:
Accuracy: 0.8340365682137834
Precision: 0.2631578947368421
Recall: 0.033707865168539325
F1-score: 0.05976095617529881
Cross-validation scores: [0.84400904 0.83195177 0.84250188 0.84024115 0.83559578]
Mean cross-validation score: 0.8388599240055422

Training and evaluating SVM:
Accuracy: 0.8438818565400844
Precision: 0.6
Recall: 0.006741573033707865
F1-score: 0.0133333333333333



Cross-validation scores: [0.8417483  0.83496609 0.8417483  0.84250188 0.84162896]
Mean cross-validation score: 0.8405187082078788



