In [1]:
import pandas as pd
import copy
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
from sklearn.metrics import roc_curve, accuracy_score
from sklearn.preprocessing import StandardScaler
from scipy import stats
from sklearn.model_selection import StratifiedKFold
import warnings
from imblearn.combine import SMOTEENN
from imblearn.combine import SMOTETomek
warnings.filterwarnings('ignore')
%matplotlib inline

#### 1、训练四分类器 M1

In [2]:
# Read the training and test tables from the working directory.
train = pd.read_csv('EdTech_train_data.csv')
test = pd.read_csv('EdTech_test_data.csv')

# Target is the 'rank' column; features are every column except 'No' and 'rank'.
y = train['rank']
X = train.drop(columns=['No', 'rank'])

In [3]:
# Feature scaling experiments (neither result is used by the cells visible below).
from sklearn import preprocessing

# Standardize each feature column (axis=0 is the library default, spelled out here).
standardized_X = preprocessing.scale(X, axis=0)
# Rescale each sample row to unit L2 norm (library defaults, spelled out here).
normalized_X = preprocessing.normalize(X, norm='l2', axis=1)

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

# Baseline: 10-fold stratified cross-validation of a single random forest
# trained on all classes.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
accs = []
conf_mats = []
# One fixed, sorted label list shared by every fold, so each confusion matrix
# has the same shape and row/column order and the matrices can be summed later.
labels = sorted(set(y))
model = RandomForestClassifier(n_estimators=100, random_state=2, max_depth=100)
for train_index, test_index in skf.split(X, y):
    # split() yields positional indices, so rows are selected by position
    # (.iloc) rather than by index label (.loc).
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    conf_mat = confusion_matrix(y_test, y_pred, labels=labels)
    conf_mats.append(conf_mat)
    acc = accuracy_score(y_test, y_pred)
    accs.append(acc)
    print('accuracy:', acc)
print('mean_acc', np.mean(np.array(accs)))

accuracy: 0.9120879120879121
accuracy: 0.8791208791208791
accuracy: 0.8681318681318682
accuracy: 0.8888888888888888
accuracy: 0.9111111111111111
accuracy: 0.9111111111111111
accuracy: 0.8555555555555555
accuracy: 0.8666666666666667
accuracy: 0.9101123595505618
accuracy: 0.9090909090909091
mean_acc 0.8911877261315462


查看混淆矩阵

In [5]:
# Add up the per-fold confusion matrices into one matrix over all folds.
# The shape and the number of folds are taken from conf_mats itself rather
# than being hard-coded, so changing n_splits or the class count still works.
conf_mat_total = np.zeros(conf_mats[0].shape)
for fold_mat in conf_mats:
    conf_mat_total += fold_mat
print(conf_mat_total)
# The main thing hurting classification performance is confusion between classes 0 and 1.

[[ 87.  73.   0.   0.]
 [ 24. 214.   1.   0.]
 [  0.   0. 333.   0.]
 [  0.   0.   0. 168.]]


#### 2、划分交叉验证数据集

In [6]:
# Materialize the 10 stratified folds so that one of them can be picked by position.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
train_indexes = []
test_indexes = []
for train_index, test_index in skf.split(X, y):
    train_indexes.append(train_index)
    test_indexes.append(test_index)

# Use the fold at position 6 (i.e. the seventh fold) as the validation set.
VALIDATION_FOLD = 6
# The stored indices are positional, hence .iloc rather than .loc.
X_train, X_test = X.iloc[train_indexes[VALIDATION_FOLD]], X.iloc[test_indexes[VALIDATION_FOLD]]
y_train, y_test = y.iloc[train_indexes[VALIDATION_FOLD]], y.iloc[test_indexes[VALIDATION_FOLD]]

一、训练过程

1)训练四分类模型Model1

In [7]:
# Model1: random forest trained on every class of the training fold.
Model1 = RandomForestClassifier(n_estimators=100, random_state=2)
Model1.fit(X_train, y_train)
y_pred = Model1.predict(X_test)
# Sorted labels give the confusion matrix a deterministic row/column order
# (iteration order of a bare set is not guaranteed).
labels = sorted(set(y_test))
conf_mat = confusion_matrix(y_test, y_pred, labels=labels)
acc = accuracy_score(y_test, y_pred)
print('accuracy:', acc)
print('conf_mat:\n', conf_mat)
# Class 0 is very easily mispredicted as class 1.

accuracy: 0.8555555555555555
conf_mat:
 [[ 6 10  0  0]
 [ 3 21  0  0]
 [ 0  0 33  0]
 [ 0  0  0 17]]


2)训练三分类模型Model2

In [8]:
# Model2: random forest fitted only on training rows whose label is not 0.
non_zero = y_train != 0
Model2 = RandomForestClassifier(n_estimators=100, random_state=2, max_depth=100)
Model2.fit(X_train.loc[non_zero], y_train.loc[non_zero])
# Scored on the whole validation fold: rows whose true class is 0 can never be
# predicted correctly by this model, which caps the accuracy printed below.
y_pred = Model2.predict(X_test)
# Sorted labels give the confusion matrix a deterministic row/column order.
labels = sorted(set(y_test))
conf_mat = confusion_matrix(y_test, y_pred, labels=labels)
acc = accuracy_score(y_test, y_pred)
print('accuracy:', acc)
print('conf_mat:\n', conf_mat)
# Validation rows whose true class is 1, 2 or 3 are predicted essentially correctly.

accuracy: 0.8222222222222222
conf_mat:
 [[ 0 16  0  0]
 [ 0 24  0  0]
 [ 0  0 33  0]
 [ 0  0  0 17]]


3)训练二分类模型Model3

In [9]:
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier

# Binary targets for Model3: class 0 stays 0, every other class is recoded to 123.
y_train_M3 = copy.deepcopy(y_train)
y_test_M3 = copy.deepcopy(y_test)
y_train_M3[y_train_M3 != 0] = 123
y_test_M3[y_test_M3 != 0] = 123

# Model3: binary random forest in which class 0 is weighted 5x relative to 123.
# Alternatives that were tried and left disabled: AdaBoostClassifier(),
# XGBClassifier(n_estimators=10), and SMOTEENN / SMOTETomek resampling of the
# training fold before fitting.
Model3 = RandomForestClassifier(n_estimators=100, random_state=2, class_weight={0: 5, 123: 1})
Model3.fit(X_train, y_train_M3)
# Predict on the DataFrame itself (not X_test.values) so the input has the same
# form as the data the model was fitted on, as the later prediction cell does.
y_pred = Model3.predict(X_test)
# Sorted labels give the confusion matrix a deterministic row/column order.
labels = sorted(set(y_test_M3))
conf_mat = confusion_matrix(y_test_M3, y_pred, labels=labels)
acc = accuracy_score(y_test_M3, y_pred)
print('accuracy:', acc)
print('conf_mat:\n', conf_mat)
# Class 0 is more easily misclassified as 123 than the other way round.

accuracy: 0.8666666666666667
conf_mat:
 [[ 6 10]
 [ 2 72]]


使用imbalanced learn

二、预测过程

In [10]:
# Binary model M3: is the sample class 0 or one of the merged classes (123)?
y_pred_M3 = Model3.predict(X_test)
# Four-class model M1: which of the classes 0, 1, 2, 3 does the sample belong to?
y_pred_M1 = Model1.predict(X_test)
# Three-class model M2: which of the classes 1, 2, 3 does the sample belong to?
y_pred_M2 = Model2.predict(X_test)

In [11]:
# Every sample starts as the placeholder 'nan'; where M1 and M3 both vote for
# class 0 the final prediction is set to 0 straight away.
y_pred = [
    0 if (m1_vote == 0 and y_pred_M3[pos] == 0) else 'nan'
    for pos, m1_vote in enumerate(y_pred_M1)
]

In [12]:
# M1 and M3 both place the sample among classes 1/2/3: take the prediction of
# M2, the three-class model.
for pos, _ in enumerate(y_pred):
    if y_pred_M1[pos] == 0 or y_pred_M3[pos] != 123:
        continue
    y_pred[pos] = y_pred_M2[pos]

In [13]:
# M1 votes 0 while M3 votes 123: keep M1's answer, i.e. predict 0.
m1_only_zero = [
    pos for pos in range(len(y_pred))
    if y_pred_M1[pos] == 0 and y_pred_M3[pos] == 123
]
for pos in m1_only_zero:
    y_pred[pos] = y_pred_M1[pos]

In [14]:
# M3 votes 0 while M1 votes a non-zero class: if M1 said 1, side with M3 and
# predict 0; for any other M1 class keep M1's prediction.
for pos in range(len(y_pred)):
    m1_vote = y_pred_M1[pos]
    if m1_vote != 0 and y_pred_M3[pos] == 0:
        y_pred[pos] = 0 if m1_vote == 1 else m1_vote

In [15]:
acc = accuracy_score(y_test, y_pred)
print('accuracy:', acc)
# Build the label list from both the true and the predicted classes, sorted:
# a class that is never predicted still keeps its row, and the row/column
# order is deterministic.
labels = sorted(set(y_test) | set(y_pred))
conf_mat = confusion_matrix(y_test, y_pred, labels=labels)
print('conf_mat:\n', conf_mat)
# In the recorded run this fold scores 0.844 with the combined rule versus
# 0.856 for Model1 alone, so there is no gain on this particular fold (an
# earlier note claiming 0.86 -> 0.89 does not match the printed output).

accuracy: 0.8444444444444444
conf_mat:
 [[ 6 10  0  0]
 [ 4 20  0  0]
 [ 0  0 33  0]
 [ 0  0  0 17]]


该问题最主要的痛点就是如何辨别 rank 是属于 0 还是属于 1、2、3（代码中合并记为 123）  
相当于将原来的四分类问题转化成了二分类问题

#### 3、交叉验证使用该方法

In [16]:
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier,ExtraTreesClassifier

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix


def combine_predictions(pred_m1, pred_m2, pred_m3):
    """Merge the votes of the three models into one label per sample.

    Parameters
    ----------
    pred_m1 : sequence
        Predictions of the four-class model (classes 0, 1, 2, 3).
    pred_m2 : sequence
        Predictions of the three-class model (classes 1, 2, 3).
    pred_m3 : sequence
        Predictions of the binary model; assumed to contain only 0 or 123,
        the two labels that model is fitted on in this cell.

    Returns
    -------
    list
        One combined label per sample, decided as follows:
        * M1 says 0                    -> 0 (whatever M3 says)
        * M1 non-zero and M3 says 123  -> M2's prediction
        * M1 says 1 and M3 says 0      -> 0 (side with M3)
        * M1 says 2 or 3, M3 says 0    -> M1's prediction
    """
    combined = []
    for m1, m2, m3 in zip(pred_m1, pred_m2, pred_m3):
        if m1 == 0:
            combined.append(0)
        elif m3 == 123:
            combined.append(m2)
        elif m1 == 1:
            combined.append(0)
        else:
            combined.append(m1)
    return combined


# 10-fold stratified cross-validation of the combined three-model rule.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
accs = []
conf_mats = []
# One fixed, sorted label list so every fold's confusion matrix has the same layout.
labels = sorted(set(y))
for train_index, test_index in skf.split(X, y):
    # split() yields positional indices, so rows are selected with .iloc.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # M1: four-class model trained on every class.
    Model1 = RandomForestClassifier(n_estimators=100, random_state=2)
    Model1.fit(X_train, y_train)

    # M2: three-class model trained only on rows whose label is not 0.
    non_zero = y_train != 0
    Model2 = RandomForestClassifier(n_estimators=100, random_state=3)
    Model2.fit(X_train.loc[non_zero], y_train.loc[non_zero])

    # M3: binary model (0 vs. everything else recoded as 123), class 0 weighted 10x.
    y_train_M3 = copy.deepcopy(y_train)
    y_train_M3[y_train_M3 != 0] = 123
    Model3 = RandomForestClassifier(n_estimators=100, random_state=2, class_weight={0: 10, 123: 1})
    Model3.fit(X_train, y_train_M3)

    y_pred_M1 = Model1.predict(X_test)
    y_pred_M3 = Model3.predict(X_test)
    y_pred_M2 = Model2.predict(X_test)
    y_pred = combine_predictions(y_pred_M1, y_pred_M2, y_pred_M3)

    acc = accuracy_score(y_test, y_pred)
    print('accuracy:', acc)
    # Keep each fold's confusion matrix so it can be inspected after the loop.
    conf_mat = confusion_matrix(y_test, y_pred, labels=labels)
    conf_mats.append(conf_mat)
    accs.append(acc)
print('mean_acc', np.mean(np.array(accs)))

accuracy: 0.9120879120879121
accuracy: 0.9120879120879121
accuracy: 0.8901098901098901
accuracy: 0.8777777777777778
accuracy: 0.9111111111111111
accuracy: 0.9111111111111111
accuracy: 0.8333333333333334
accuracy: 0.8666666666666667
accuracy: 0.9101123595505618
accuracy: 0.9090909090909091
mean_acc 0.8933488982927184


#### 4、预测输出

In [18]:
# Reload both tables; from here on X_train / y_train hold the complete
# training file and X_test holds the features of the test file.
train = pd.read_csv('EdTech_train_data.csv')
test = pd.read_csv('EdTech_test_data.csv')

y_train = train['rank']
X_train = train.drop(columns=['No', 'rank'])
X_test = test.drop(columns=['No'])

In [19]:
# Fit the three models on the complete training set and predict the test file.
# No labels are read from the test file in this notebook, so nothing is scored here.

# M1: four-class model trained on every class.
Model1 = RandomForestClassifier(n_estimators=100, random_state=2)
Model1.fit(X_train, y_train)

# M2: three-class model trained only on rows whose label is not 0.
non_zero = y_train != 0
Model2 = RandomForestClassifier(n_estimators=100, random_state=3)
Model2.fit(X_train.loc[non_zero], y_train.loc[non_zero])

# M3: binary model (0 vs. everything else recoded as 123), class 0 weighted 10x.
y_train_M3 = copy.deepcopy(y_train)
y_train_M3[y_train_M3 != 0] = 123
Model3 = RandomForestClassifier(n_estimators=100, random_state=2, class_weight={0: 10, 123: 1})
Model3.fit(X_train, y_train_M3)

# M1 decides among 0/1/2/3, M3 decides 0 vs. 123, M2 decides among 1/2/3.
y_pred_M1 = Model1.predict(X_test)
y_pred_M3 = Model3.predict(X_test)
y_pred_M2 = Model2.predict(X_test)

# Combine the three votes in a single pass (M3 is assumed to emit only 0 or
# 123, the two labels it was fitted on):
#   M1 says 0                    -> 0 (whatever M3 says)
#   M1 non-zero and M3 says 123  -> M2's prediction
#   M1 says 1 and M3 says 0      -> 0 (side with M3)
#   M1 says 2 or 3, M3 says 0    -> M1's prediction
y_pred = []
for m1_vote, m2_vote, m3_vote in zip(y_pred_M1, y_pred_M2, y_pred_M3):
    if m1_vote == 0:
        y_pred.append(0)
    elif m3_vote == 123:
        y_pred.append(m2_vote)
    elif m1_vote == 1:
        y_pred.append(0)
    else:
        y_pred.append(m1_vote)