<a href="https://colab.research.google.com/github/wuliubao/ML-000/blob/main/Week16/graduation_project_and_summary.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**毕业设计**

In [9]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error, accuracy_score

In [37]:
#处理数据
df_train = pd.read_csv("train_final.csv")
df_test = pd.read_csv("test_final.csv")

df_train.fillna(0,inplace=True)
df_test.fillna(0,inplace=True)

seed = 42 # for the same data division

kf = KFold(n_splits=5, random_state=seed,shuffle=True)
X_train = df_train.drop(columns=['loan_status']).values
Y_train = df_train['loan_status'].values.astype(int)

X_test = df_test.drop(columns=['loan_status']).values
Y_test = df_test['loan_status'].values.astype(int)

# 一.构建衍生变量

1.组合新特征：根据业务逻辑，构造变量：还款压力

In [None]:
#把每月的还款:continuous_installment和月收入即年收入/12:continuous_annual_inc/12比值，得到结果即还款的压力
X_train['pressure'] = X_train['continuous_installment'] / (X_train['continuous_annual_inc'] / 12)
X_test['pressure'] = X_test['continuous_installment'] / (X_test['continuous_annual_inc'] / 12)

2.数据处理

In [None]:
#对大数据取对数展开信息
X_train['log_annual_inc'] = np.floor(np.log10(X_train['continuous_annual_inc'])
X_test['log_annual_inc'] = np.floor(np.log10(X_test['continuous_annual_inc'])

3.特征离散化

In [None]:
# 等宽离散
k = 5 # 分为5个等宽区间
X_train['area_dti'] = pd.cut(X_train['continuous_dti'], k, labels=range(k)])
X_test['area_dti'] = pd.cut(X_test['continuous_dti'], k, labels=range(k)])

# 二.传统模型：逻辑回归、高斯朴素贝叶斯、随机森林

1.逻辑回归

In [39]:
from sklearn import linear_model

lr_model = linear_model.LogisticRegression(C=1e9)

lr_model.fit(X_train, Y_train)
lr_model.score(X_test, Y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.91132

2.高斯朴素贝叶斯

In [40]:
from sklearn import naive_bayes

bayes_clf = naive_bayes.GaussianNB()
bayes_clf.fit(X_train, Y_train)
bayes_clf.score(X_test, Y_test)

0.90478

3.随机森林

In [41]:
from sklearn import ensemble

rf_clf = ensemble.RandomForestClassifier()
rf_clf.fit(X_train, Y_train)
rf_clf.score(X_test, Y_test)

0.91628

# 三.集成学习：基于lightGBM

In [None]:
import lightgbm as lgb
from sklearn.model_selection import KFold

X_train.shape, Y_train.shape

lgb_train = lgb.Dataset(x_train, y_train, free_raw_data=False)
lgb_eval = lgb.Dataset(x_test, y_test, reference=lgb_train, free_raw_data=False)

five_fold_data = []

for train_index, eval_index in kf.split(X_train):
    x_train, x_eval = X_train[train_index], X_train[eval_index]
    y_train, y_eval = Y_train[train_index], Y_train[eval_index]
    
    five_fold_data.append([(x_train, y_train), (x_eval, y_eval)])

def get_model(param):
    model_list = []
    for idx, [(x_train, y_train), (x_eval, y_eval)] in enumerate(five_fold_data):
        print('{}-th model is training:'.format(idx))
        train_data = lgb.Dataset(x_train, label=y_train)
        validation_data = lgb.Dataset(x_eval, label=y_eval)
        bst = lgb.train(param, train_data, valid_sets=[validation_data])
        model_list.append(bst)
    return model_list

param_base = {'num_leaves': 31, 'objective': 'binary', 'metric': 'binary', 'num_round':1000}

param_fine_tuning = {'num_thread': 8,'num_leaves': 128, 'metric': 'binary', 'objective': 'binary', 'num_round': 1000, 
                     'learning_rate': 3e-3, 'feature_fraction': 0.6, 'bagging_fraction': 0.8}


# base param train
param_base_model = get_model(param_base)

# param fine tuning
param_fine_tuning_model = get_model(param_fine_tuning)

def test_model(model_list):
    data = X_test
    five_fold_pred = np.zeros((5, len(X_test)))
    for i, bst in enumerate(model_list):
        ypred = bst.predict(data, num_iteration=bst.best_iteration)
        five_fold_pred[i] = ypred
    ypred_mean = (five_fold_pred.mean(axis=-2)>0.5).astype(int)
    return accuracy_score(ypred_mean, Y_test)

base_score = test_model(param_base_model)
fine_tuning_score = test_model(param_fine_tuning_model)

In [43]:
print('base: {}, fine tuning: {}'.format(base_score, fine_tuning_score))

base: 0.91626, fine tuning: 0.9176


# 四.深度学习：基于tabNet

In [None]:
!pip install pytorch-tabnet

In [42]:
from pytorch_tabnet.tab_model import TabNetClassifier, TabNetRegressor

tabnet_clf = TabNetClassifier()
tabnet_clf.fit(X_train, Y_train, max_epochs=50, patience=5)
pre_test = tabnet_clf.predict(X_test)
accuracy_score(Y_test, pre_test)

Device used : cpu
No early stopping will be performed, last training weights will be used.
epoch 0  | loss: 0.3637  |  0:00:07s
epoch 1  | loss: 0.22322 |  0:00:15s
epoch 2  | loss: 0.21633 |  0:00:22s
epoch 3  | loss: 0.21387 |  0:00:30s
epoch 4  | loss: 0.21347 |  0:00:37s
epoch 5  | loss: 0.21252 |  0:00:44s
epoch 6  | loss: 0.21078 |  0:00:52s
epoch 7  | loss: 0.21084 |  0:00:59s
epoch 8  | loss: 0.20861 |  0:01:06s
epoch 9  | loss: 0.20852 |  0:01:13s
epoch 10 | loss: 0.20786 |  0:01:20s
epoch 11 | loss: 0.20745 |  0:01:27s
epoch 12 | loss: 0.20732 |  0:01:34s
epoch 13 | loss: 0.20591 |  0:01:41s
epoch 14 | loss: 0.20517 |  0:01:48s
epoch 15 | loss: 0.2058  |  0:01:55s
epoch 16 | loss: 0.20533 |  0:02:02s
epoch 17 | loss: 0.2057  |  0:02:09s
epoch 18 | loss: 0.20717 |  0:02:16s
epoch 19 | loss: 0.2051  |  0:02:23s
epoch 20 | loss: 0.2058  |  0:02:30s
epoch 21 | loss: 0.20426 |  0:02:37s
epoch 22 | loss: 0.20468 |  0:02:44s
epoch 23 | loss: 0.20402 |  0:02:51s
epoch 24 | loss: 0.20

0.91412

**毕业总结**

通过这几个月跟在王老师后面的学习，收获非常多，在对整个机器学习的路上进步很多。总结了一下：

1.   对机器学习知识面整体有个概论，对业务实现有个流程了解。
2.   所需的数学知识学习有了思路
3.   手写代码，加深了对机器学习的理解

最后感谢王老师、助教、班主任同学的帮助，让我打开了机器学习之门，后面学习的路还能艰辛和漫长，需要继续努力！



