In [2]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics

In [3]:
import scipy.stats as stats
def resumetable(df):
    print(f"Dataset Shape: {df.shape}")
    summary = pd.DataFrame(df.dtypes,columns=['dtypes'])
    summary = summary.reset_index()
    summary['Name'] = summary['index']
    summary = summary[['Name','dtypes']]
    summary['Missing'] = df.isnull().sum().values    
    summary['Uniques'] = df.nunique().values
    summary['First Value'] = df.loc[0].values
    summary['Second Value'] = df.loc[1].values
    summary['Third Value'] = df.loc[2].values

    for name in summary['Name'].value_counts().index:
        summary.loc[summary['Name'] == name, 'Entropy'] = round(stats.entropy(df[name].value_counts(normalize=True), base=2),2) 

    return summary

In [28]:
train = pd.read_csv('data/train.csv')

In [29]:
train.sample(10)

Unnamed: 0,uesr_id,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
653,723,46,No,Travel_Rarely,566,Research & Development,7,2,Medical,1,...,2,80,1,13,3,3,8,7,0,7
625,766,50,No,Travel_Rarely,1464,Research & Development,2,4,Medical,1,...,4,80,1,29,2,2,8,1,7,7
782,698,35,No,Travel_Rarely,1219,Sales,18,3,Medical,1,...,2,80,0,5,3,3,5,2,1,0
633,507,29,No,Travel_Rarely,1176,Sales,3,2,Medical,1,...,1,80,1,6,5,2,6,0,1,2
191,1328,27,No,Travel_Rarely,728,Sales,23,1,Medical,1,...,4,80,1,9,5,3,9,8,5,8
152,320,27,No,Travel_Rarely,1377,Sales,2,3,Life Sciences,1,...,1,80,0,5,3,3,5,4,0,4
407,1005,29,No,Travel_Rarely,332,Human Resources,17,3,Other,1,...,1,80,0,10,3,2,10,9,0,9
100,689,20,Yes,Travel_Rarely,129,Research & Development,4,3,Technical Degree,1,...,2,80,0,1,2,3,1,0,0,0
146,590,33,No,Travel_Rarely,213,Research & Development,7,3,Medical,1,...,4,80,0,14,3,4,13,9,3,7
1052,1456,35,No,Travel_Frequently,1199,Research & Development,18,4,Life Sciences,1,...,4,80,2,10,2,4,10,2,0,2


In [30]:
resumetable(train)

Dataset Shape: (1176, 36)


Unnamed: 0,Name,dtypes,Missing,Uniques,First Value,Second Value,Third Value,Entropy
0,uesr_id,int64,0,1176,1374,1092,768,10.2
1,Age,int64,0,43,58,45,40,5.13
2,Attrition,object,0,2,No,No,No,0.63
3,BusinessTravel,object,0,3,Travel_Rarely,Travel_Rarely,Travel_Rarely,1.15
4,DailyRate,int64,0,783,605,950,300,9.46
5,Department,object,0,3,Sales,Research & Development,Sales,1.12
6,DistanceFromHome,int64,0,29,21,28,26,4.35
7,Education,int64,0,5,3,3,3,2.0
8,EducationField,object,0,6,Life Sciences,Technical Degree,Marketing,2.04
9,EmployeeCount,int64,0,1,1,1,1,0.0


In [31]:
def extract_features(df, is_train=False):
    # target
    if is_train:
        attrition_dict = {'No':0,'Yes':1}
        df['Attrition'] = df['Attrition'].map(lambda x: attrition_dict[x])
    # BusinessTravel
    businesstravel_dict = {'Non-Travel':0, 'Travel_Rarely':1, 'Travel_Frequently':2}
    df['BusinessTravel'] = df['BusinessTravel'].map(lambda x: businesstravel_dict[x])
    # Department
    department_dict = {'Sales':0, 'Research & Development':1, 'Human Resources':2}
    df['Department'] = df['Department'].map(lambda x: department_dict[x])
    # EducationField
    educationfield_dict = {'Life Sciences':0, 'Medical':1, 'Marketing':2, 'Technical Degree':3, 'Human Resources':4, 'Other':5}
    df['EducationField'] = df['EducationField'].map(lambda x: educationfield_dict[x])
    # Gender
    gender_dict = {'Male':0, 'Female': 1}
    df['Gender'] = df['Gender'].map(lambda x: gender_dict[x])
    # JobRole
    jobrole_dict = {'Sales Executive':0, 
                    'Research Scientist':1, 
                    'Laboratory Technician':2, 
                    'Manufacturing Director':3, 
                    'Healthcare Representative':4,
                    'Manager':5, 
                    'Sales Representative':6,
                    'Research Director':7,
                    'Human Resources':8
                   }
    df['JobRole'] = df['JobRole'].map(lambda x: jobrole_dict[x])
    # MaritalStatus
    maritalstatus_dict = {'Single':0, 'Married':1, 'Divorced':2}
    df['MaritalStatus'] = df['MaritalStatus'].map(lambda x: maritalstatus_dict[x])
    # Over18
    df = df.drop(['Over18'], axis=1)
    # OverTime
    overtime_dict = {'Yes':0, 'No':1}
    df['OverTime'] = df['OverTime'].map(lambda x: overtime_dict[x])
    return df

In [32]:
train_ex = extract_features(train, True)

In [33]:
train_ex.sample(5)

Unnamed: 0,uesr_id,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
314,200,27,0,2,472,1,1,1,3,1,...,3,80,1,6,1,3,2,2,2,0
413,95,54,0,1,1217,1,2,4,3,1,...,1,80,1,16,5,1,4,3,0,3
85,251,39,0,2,505,1,2,4,3,1,...,4,80,0,20,1,3,19,6,11,8
169,64,36,0,1,1223,1,8,3,3,1,...,2,80,3,17,2,3,17,14,12,8
618,119,43,0,2,394,0,26,2,0,1,...,4,80,2,25,3,4,25,12,4,12


In [34]:
test = pd.read_csv('data/test.csv')

In [35]:
test_ex = extract_features(test, False)

In [121]:
train['Attrition'].value_counts()

0    988
1    188
Name: Attrition, dtype: int64

In [36]:
test_ex.sample(5)

Unnamed: 0,user_id,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
51,182,41,1,1356,0,20,2,2,1,248,...,4,80,0,4,5,2,4,3,0,2
204,572,29,1,657,1,27,3,1,1,793,...,1,80,1,11,3,2,8,7,1,1
124,82,55,1,111,0,1,2,0,1,106,...,4,80,1,24,4,3,1,0,1,0
208,303,31,1,218,0,7,3,3,1,416,...,2,80,1,10,3,2,8,7,7,7
242,18,53,1,1219,0,2,4,0,1,23,...,3,80,0,31,3,3,25,8,3,7


In [55]:
test_ex.shape

(294, 34)

In [42]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import itertools
# 绘制混淆矩阵
def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    Input
    - cm : 计算出的混淆矩阵的值
    - classes : 混淆矩阵中每一行每一列对应的列
    - normalize : True:显示百分比, False:显示个数
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')
    print(cm)
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [46]:
target = 'Attrition'
features = [x for x in train_ex.columns if x not in ['Attrition', 'uesr_id']]

### Lightgbm

In [47]:
params = {'num_leaves': 60, #结果对最终效果影响较大，越大值越好，太大会出现过拟合
          'min_data_in_leaf': 30,
          'objective': 'binary', #定义的目标函数
          'max_depth': -1,
          'learning_rate': 0.03,
          "min_sum_hessian_in_leaf": 6,
          "boosting": "gbdt",
          "feature_fraction": 0.9,  #提取的特征比率
          "bagging_freq": 1,
          "bagging_fraction": 0.8,
          "bagging_seed": 11,
          "lambda_l1": 0.1,             #l1正则
          "verbosity": -1,
          "nthread": -1,                #线程数量，-1表示全部线程，线程越多，运行的速度越快
          'metric': {'binary_logloss', 'auc'},  ##评价函数选择
          "random_state": 2019, #随机数种子，可以防止每次运行的结果不一致
         }

In [57]:
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=2019)
prob_oof = np.zeros((train_ex.shape[0], ))
test_pred_prob = np.zeros((test_ex.shape[0], ))

In [61]:
## train and predict
feature_importance_df = pd.DataFrame()
train_x = train_ex[features].copy()
train_y = train_ex[target]
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_x, train_y)):
    print("fold {}".format(fold_ + 1))
    trn_data = lgb.Dataset(train_x.iloc[trn_idx], label=train_y[trn_idx])
    val_data = lgb.Dataset(train_x.iloc[val_idx], label=train_y[val_idx])

    clf = lgb.train(params,
                    trn_data,
                    valid_sets=[trn_data, val_data],
                    verbose_eval=20,
                    early_stopping_rounds=60)
    prob_oof[val_idx] = clf.predict(train_x.iloc[val_idx], num_iteration=clf.best_iteration)

    fold_importance_df = pd.DataFrame()
    fold_importance_df["Feature"] = features
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

    test_pred_prob += clf.predict(test_ex[features], num_iteration=clf.best_iteration) / folds.n_splits

fold 1
Training until validation scores don't improve for 60 rounds
[20]	training's binary_logloss: 0.362436	training's auc: 0.904865	valid_1's binary_logloss: 0.389455	valid_1's auc: 0.794524
[40]	training's binary_logloss: 0.316149	training's auc: 0.926338	valid_1's binary_logloss: 0.367271	valid_1's auc: 0.803961
[60]	training's binary_logloss: 0.284053	training's auc: 0.943468	valid_1's binary_logloss: 0.360125	valid_1's auc: 0.803296
[80]	training's binary_logloss: 0.259559	training's auc: 0.953882	valid_1's binary_logloss: 0.353103	valid_1's auc: 0.810074
[100]	training's binary_logloss: 0.238779	training's auc: 0.96189	valid_1's binary_logloss: 0.350838	valid_1's auc: 0.810207
Did not meet early stopping. Best iteration is:
[100]	training's binary_logloss: 0.238779	training's auc: 0.96189	valid_1's binary_logloss: 0.350838	valid_1's auc: 0.810207
fold 2
Training until validation scores don't improve for 60 rounds
[20]	training's binary_logloss: 0.36504	training's auc: 0.888295	v

In [63]:
test_pred_prob

array([0.06452175, 0.05525384, 0.1600274 , 0.11032989, 0.73360876,
       0.33149214, 0.3391251 , 0.1223773 , 0.05471959, 0.14636679,
       0.08430259, 0.10360928, 0.074331  , 0.62371538, 0.09204717,
       0.02316301, 0.09601098, 0.08151625, 0.06997116, 0.13679904,
       0.41835902, 0.11354797, 0.06928391, 0.05947818, 0.36182525,
       0.27893989, 0.06511181, 0.05224476, 0.62019878, 0.05739555,
       0.06368382, 0.06047625, 0.22024513, 0.11955981, 0.0837441 ,
       0.05996718, 0.11916848, 0.16043379, 0.05537568, 0.11529996,
       0.08186755, 0.04727629, 0.07104397, 0.08192651, 0.06541345,
       0.56389666, 0.2534212 , 0.04650051, 0.71988045, 0.42331891,
       0.22118788, 0.43281723, 0.11838171, 0.09772702, 0.43488904,
       0.13721028, 0.05455279, 0.09854813, 0.0370498 , 0.23996588,
       0.04660964, 0.19797119, 0.07944034, 0.09607871, 0.36036127,
       0.10582262, 0.18843977, 0.11435005, 0.07568054, 0.18081901,
       0.10525129, 0.33393902, 0.10083872, 0.04566851, 0.07155

In [81]:
fresult = pd.DataFrame(test['user_id'])

In [82]:
threshold = 0.5
result = np.zeros((test.shape[0], ))
for idx, pred in enumerate(test_pred_prob):
    result[idx] = 1 if pred > threshold else 0

In [119]:
fresult['Attrition'] = test_pred_prob

In [117]:
fresult['Attrition'] = fresult['Attrition'].astype('int')

In [90]:
import time

In [120]:
fresult.to_csv(f'result-{int(time.time())}.csv', index=None)

In [97]:
fold_importance_df.sort_values('importance', ascending=False)

Unnamed: 0,Feature,importance,fold
0,Age,87,5
17,MonthlyIncome,76,5
18,MonthlyRate,65,5
20,OverTime,62,5
2,DailyRate,56,5
19,NumCompaniesWorked,56,5
11,HourlyRate,55,5
9,EnvironmentSatisfaction,46,5
4,DistanceFromHome,42,5
25,StockOptionLevel,41,5


### XGBClassifier

In [100]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

import xgboost as xgb

In [106]:
X_train, X_test, y_train, y_test = train_test_split(train_x, train_y, test_size=0.20, random_state=1729)

In [107]:
clf = ExtraTreesClassifier(random_state=1729)
selector = clf.fit(X_train, y_train)



In [108]:
fs = SelectFromModel(selector, prefit=True)

X_train = fs.transform(X_train)
X_test = fs.transform(X_test)
test = fs.transform(test_ex[features])

In [109]:
m2_xgb = xgb.XGBClassifier(n_estimators=110, nthread=-1, max_depth = 4, seed=1729)
m2_xgb.fit(X_train, y_train, eval_metric="auc", verbose = False, eval_set=[(X_test, y_test)])

# calculate the auc score
print("Roc AUC: ", roc_auc_score(y_test, m2_xgb.predict_proba(X_test)[:,1], average='macro'))

Roc AUC:  0.8547787909490038


In [110]:
probs = m2_xgb.predict_proba(test)

In [111]:
test_id = test_ex.user_id

In [112]:
submission = pd.DataFrame({"user_id":test_id, "Attrition": probs[:,1]})
submission.to_csv(f'result-{int(time.time())}.csv', index=False)

In [113]:
probs

array([[0.9517491 , 0.04825092],
       [0.9498245 , 0.05017549],
       [0.9513846 , 0.04861542],
       [0.9574951 , 0.04250488],
       [0.26311904, 0.73688096],
       [0.9765531 , 0.0234469 ],
       [0.7440974 , 0.2559026 ],
       [0.92326665, 0.07673335],
       [0.95909214, 0.04090788],
       [0.7672973 , 0.23270266],
       [0.9873375 , 0.01266249],
       [0.8335949 , 0.16640511],
       [0.98026717, 0.01973284],
       [0.21220803, 0.78779197],
       [0.9873468 , 0.01265316],
       [0.9865611 , 0.01343888],
       [0.93694437, 0.06305563],
       [0.8832462 , 0.11675381],
       [0.9698582 , 0.0301418 ],
       [0.65969133, 0.34030867],
       [0.19703817, 0.8029618 ],
       [0.97723466, 0.02276536],
       [0.9544496 , 0.04555043],
       [0.9753926 , 0.02460744],
       [0.6848353 , 0.31516466],
       [0.953155  , 0.04684501],
       [0.94525576, 0.05474424],
       [0.98857605, 0.01142392],
       [0.06027222, 0.9397278 ],
       [0.97050387, 0.02949616],
       [0.

In [114]:
probs[:,1]

array([0.04825092, 0.05017549, 0.04861542, 0.04250488, 0.73688096,
       0.0234469 , 0.2559026 , 0.07673335, 0.04090788, 0.23270266,
       0.01266249, 0.16640511, 0.01973284, 0.78779197, 0.01265316,
       0.01343888, 0.06305563, 0.11675381, 0.0301418 , 0.34030867,
       0.8029618 , 0.02276536, 0.04555043, 0.02460744, 0.31516466,
       0.04684501, 0.05474424, 0.01142392, 0.9397278 , 0.02949616,
       0.00992165, 0.00946045, 0.06804861, 0.0378196 , 0.00465837,
       0.07071455, 0.05839734, 0.09302641, 0.2064541 , 0.02695853,
       0.01647674, 0.03219299, 0.00643916, 0.04784534, 0.06475449,
       0.6668541 , 0.37102807, 0.00960752, 0.96731913, 0.42801738,
       0.07129821, 0.16135834, 0.26409116, 0.02809616, 0.5109537 ,
       0.12626176, 0.02220662, 0.06819579, 0.011113  , 0.410419  ,
       0.0208216 , 0.02049159, 0.03745092, 0.0462529 , 0.20204404,
       0.00886445, 0.04057272, 0.02281234, 0.134923  , 0.06786575,
       0.06037744, 0.46350402, 0.0344633 , 0.03582943, 0.06543

In [115]:
submission

Unnamed: 0,user_id,Attrition
0,442,0.048251
1,1091,0.050175
2,981,0.048615
3,785,0.042505
4,1332,0.736881
...,...,...
289,1439,0.127115
290,481,0.029794
291,124,0.081470
292,198,0.033474
