In [226]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
from pathlib import Path
import xgboost as xgb
import lightgbm as lgbm
import catboost
from sklearn.model_selection import StratifiedKFold, train_test_split, KFold
from sklearn.metrics import roc_auc_score
from IPython.display import display
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import StandardScaler,QuantileTransformer
import warnings
warnings.filterwarnings('ignore')

In [227]:
train = pd.read_csv("train.csv").drop(columns="id")
test = pd.read_csv("test.csv")
train.at[1398, "DailyRate"] = train["DailyRate"].median()
train.at[24, "DailyRate"] = train["DailyRate"].median()
train.at[26, "DailyRate"] = train["DailyRate"].median()
mean_age = train['Age'].mean()
train['Age'] = train['Age'].replace(60, mean_age)
test_idx = test.id
test = test.drop(columns="id")
original = pd.read_csv('WA_Fn-UseC_-HR-Employee-Attrition.csv')
train.head()

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EnvironmentSatisfaction,Gender,...,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition
0,36.0,Travel_Frequently,599,Research & Development,24,3,Medical,1,4,Male,...,80,1,10,2,3,10,0,7,8,0
1,35.0,Travel_Rarely,921,Sales,8,3,Other,1,1,Male,...,80,1,4,3,3,4,2,0,3,0
2,32.0,Travel_Rarely,718,Sales,26,3,Marketing,1,3,Male,...,80,2,4,3,3,3,2,1,2,0
3,38.0,Travel_Rarely,1488,Research & Development,2,3,Medical,1,3,Female,...,80,0,15,1,1,6,0,0,2,0
4,50.0,Travel_Rarely,1017,Research & Development,5,4,Medical,1,2,Female,...,80,0,31,0,3,31,14,4,10,1


In [228]:
original['Attrition'] = (original['Attrition'] == 'Yes').astype(np.int64)

# in original data, id is termed as "EmployeeNumber", so let's drop it
original.drop(columns="EmployeeNumber", inplace=True)

In [229]:
# now reordering the features in original dataset
original = original[list(train.columns)]

In [230]:
# let's finally concatenate
train_extended = pd.concat([train, original]).reset_index(drop=True)
len(train_extended)

3147

In [231]:
y = train_extended.Attrition
df = pd.concat([train_extended.drop(columns="Attrition"), test])

In [232]:
feats_to_drop = [col for col in df.columns if df[col].nunique()==1]
cat_features = [col for col in df.columns if df[col].nunique() <= 10 and df[col].nunique() > 1]
df.drop(columns=feats_to_drop, inplace=True)

In [233]:
df.head()

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,HourlyRate,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,36.0,Travel_Frequently,599,Research & Development,24,3,Medical,4,Male,42,...,3,2,1,10,2,3,10,0,7,8
1,35.0,Travel_Rarely,921,Sales,8,3,Other,1,Male,46,...,3,4,1,4,3,3,4,2,0,3
2,32.0,Travel_Rarely,718,Sales,26,3,Marketing,3,Male,80,...,3,4,2,4,3,3,3,2,1,2
3,38.0,Travel_Rarely,1488,Research & Development,2,3,Medical,3,Female,40,...,3,3,0,15,1,1,6,0,0,2
4,50.0,Travel_Rarely,1017,Research & Development,5,4,Medical,2,Female,37,...,3,3,0,31,0,3,31,14,4,10


In [234]:
cat=['BusinessTravel','OverTime','Gender','MaritalStatus','Department','EducationField','JobRole']
cols=cat
for col in cols:
    df[col]=df[col].map(df.groupby(col)['Age'].count())

In [235]:
sc = StandardScaler()
df = sc.fit_transform(df)
#Let's seprate test and train sets
X_train = df[:-len(test), :]
X_test = df[-len(test): , :]

In [236]:
# random params values - make sure to tune yours
xgb_params = {'n_estimators': 150,
                 'max_depth': 3,
                 'learning_rate': 0.1,
                 'min_child_weight': 4,
                 'subsample': 0.7,
              'random_state':0,
                 'colsample_bytree': 0.3,'loss_function': 'CrossEntropy',
             }


xgb_clf = xgb.XGBClassifier(**xgb_params)

# cross_validate(X_train, y, xgb_clf)

xgb_clf.fit(X_train, y, verbose=0)

In [237]:
lgbm_params = {'n_estimators': 407,
                 'num_rounds': 274,
                 'learning_rate': 0.1,
                 #'num_leaves': 195,
                 'max_depth': -2,
                 'min_data_in_leaf': 46,
                 'lambda_l1': 0.01,
                 #'lambda_l2': 0.6,
               'random_state':0,
                 'min_gain_to_split': 1.42,
                 'bagging_fraction': 0.45,
                 'feature_fraction': 0.3,'loss_function': 'CrossEntropy'}

In [238]:
lgbm_clf = lgbm.LGBMClassifier(**lgbm_params)

# cross_validate(X_train, y, lgbm_clf)

lgbm_clf.fit(X_train, y)

[LightGBM] [Info] Number of positive: 437, number of negative: 2710
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000436 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1161
[LightGBM] [Info] Number of data points in the train set: 3147, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.138862 -> initscore=-1.824771
[LightGBM] [Info] Start training from score -1.824771


In [239]:
#random params but feel free to tune
catboost_params = {'loss_function': 'CrossEntropy',
                     'learning_rate': 0.76,
                     'l2_leaf_reg': 0.014,
                   'random_state':0,
                     'colsample_bylevel': 0.1,
                     'depth': 1,
                     'boosting_type': 'Ordered',
                     'bootstrap_type': 'MVS',
                     'min_data_in_leaf': 18,
                     'one_hot_max_size': 14,
                     'subsample': 0.99}

catboost_clf = catboost.CatBoostClassifier(**catboost_params)

# cross_validate(X_train, y, catboost_clf)

catboost_clf.fit(X_train, y, verbose=False)

<catboost.core.CatBoostClassifier at 0x7e4fa7aa0520>

In [240]:
xgb_preds = xgb_clf.predict_proba(X_test)[:, 1]
lgbm_preds = xgb_clf.predict_proba(X_test)[:, 1]
cat_preds = catboost_clf.predict_proba(X_test)[:, 1]
#final_preds = np.column_stack([xgb_preds, lgbm_preds, cat_preds]).mean(axis=1)
final_preds = xgb_preds*0.01+lgbm_preds*(1-(0.1+0.01))+cat_preds*0.1
submission = pd.DataFrame({"id": test_idx, "Attrition": final_preds})
submission.head()

Unnamed: 0,id,Attrition
0,1677,0.155953
1,1678,0.151254
2,1679,0.043818
3,1680,0.053341
4,1681,0.556969


In [241]:
submission.to_csv('at5=4oy.csv',index=False)

In [242]:
cc

NameError: ignored

### kk

In [None]:
xgb_preds = xgb_clf.predict_proba(X_test)[:, 1]
lgbm_preds = xgb_clf.predict_proba(X_test)[:, 1]
cat_preds = catboost_clf.predict_proba(X_test)[:, 1]
final_preds = np.column_stack([xgb_preds, lgbm_preds, cat_preds]).mean(axis=1)
submission = pd.DataFrame({"id": test_idx, "Attrition": final_preds})
submission.head()

In [None]:
import lightgbm
import catboost
import xgboost
import sklearn

print(f"LightGBM version: {lightgbm.__version__}")
print(f"CatBoost version: {catboost.__version__}")
print(f"XGBoost version: {xgboost.__version__}")
print(f"scikit-learn version: {sklearn.__version__}")
