In [76]:
import pandas as pd
import numpy as np
import pickle as pkl


from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier


In [77]:
data = pkl.load(open("datos_grupos/attrition_available_2.pkl",'rb'))

Instancias y num atributos

In [78]:
print("num de instancias y atributos:", data.shape)
print("Nombre de los atributos:", data.columns)

num de instancias y atributos: (4410, 31)
Nombre de los atributos: Index(['hrs', 'absences', 'JobInvolvement', 'PerformanceRating',
       'EnvironmentSatisfaction', 'JobSatisfaction', 'WorkLifeBalance', 'Age',
       'Attrition', 'BusinessTravel', 'Department', 'DistanceFromHome',
       'Education', 'EducationField', 'EmployeeCount', 'EmployeeID', 'Gender',
       'JobLevel', 'JobRole', 'MaritalStatus', 'MonthlyIncome',
       'NumCompaniesWorked', 'Over18', 'PercentSalaryHike', 'StandardHours',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'YearsAtCompany', 'YearsSinceLastPromotion', 'YearsWithCurrManager'],
      dtype='object')


tipo de datos

In [79]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4410 entries, 1 to 4409
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   hrs                      3639 non-null   float64
 1   absences                 3575 non-null   float64
 2   JobInvolvement           3585 non-null   float64
 3   PerformanceRating        3534 non-null   float64
 4   EnvironmentSatisfaction  3428 non-null   float64
 5   JobSatisfaction          3637 non-null   float64
 6   WorkLifeBalance          3620 non-null   float64
 7   Age                      3636 non-null   float64
 8   Attrition                4410 non-null   object 
 9   BusinessTravel           3687 non-null   object 
 10  Department               3575 non-null   object 
 11  DistanceFromHome         3681 non-null   float64
 12  Education                3628 non-null   float64
 13  EducationField           4410 non-null   object 
 14  EmployeeCount           

Observamos missing values en la mayoria de atributos; la variavle de salida se corresponde con el índice 8 (Attrition) 

Miramos las columnas una a una; buscamos el tipo del atributo, si es constante (es decir si aporta información) y su proporción de missing values

In [80]:
info = {}
for colum in data:
    print(data[colum].value_counts(dropna = False, normalize = True ).to_frame())
    info[colum] = data[colum].value_counts(dropna = False, normalize = True ).to_frame()


               hrs
NaN       0.174830
6.033902  0.000454
9.853332  0.000454
6.002747  0.000227
5.691867  0.000227
...            ...
6.511941  0.000227
8.338820  0.000227
6.623272  0.000227
6.884605  0.000227
6.511790  0.000227

[3638 rows x 1 columns]
      absences
NaN   0.189342
7.0   0.049206
17.0  0.047846
6.0   0.045805
14.0  0.045578
19.0  0.045578
10.0  0.044671
8.0   0.044218
11.0  0.043991
18.0  0.043991
15.0  0.043991
12.0  0.043311
16.0  0.041270
13.0  0.040590
9.0   0.039683
20.0  0.039229
5.0   0.038095
21.0  0.032653
4.0   0.026304
22.0  0.020635
3.0   0.013605
23.0  0.011111
2.0   0.006576
24.0  0.001587
1.0   0.001134
     JobInvolvement
3.0        0.478912
2.0        0.208163
NaN        0.187075
4.0        0.078912
1.0        0.046939
     PerformanceRating
3.0           0.678458
NaN           0.198639
4.0           0.122902
     EnvironmentSatisfaction
3.0                 0.237415
4.0                 0.236508
NaN                 0.222676
2.0                 0.153515


In [81]:

for colum in data:
    if info[colum].shape[0] < 3:
        print(info[colum])


     Attrition
No    0.838776
Yes   0.161224
     EmployeeCount
1.0       0.790023
NaN       0.209977
       Over18
Y    0.805215
NaN  0.194785
     StandardHours
8.0        0.79161
NaN        0.20839


Consideramos que si una columna solo tiene dos valores (un valor y NaN) esta es constante; en este caso las columnas de Over18 y Standard Hours. Ignoramos la columna de attrition ya que es nuestra variable objetivo y se trata de una clasificación binaria.

Como ya se ha mencionado se trata de un problema de clasificación binaria. Y está desbalanceado (un 83% de la variable de salida se corresponde con una de las clases) 

Dividimos los datos en train y test

In [82]:
X = data.drop("Attrition", axis= 'columns')
Y = data['Attrition']

cat_col = []
num_col = []

for col in X.columns:
    if X[col].dtype != "object":
        num_col.append(col)
        continue
    cat_col.append(col)


train_x, test_x, train_y, test_y = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)



In [83]:
imputer_num = SimpleImputer(strategy='median')
scaler = RobustScaler()
pipeline_num = Pipeline(
    steps=[
        ("imputer", imputer_num),
        ("scaler", scaler)
    ]
)

imputer_cat = SimpleImputer(strategy='most_frequent')
encoder_cat = OneHotEncoder(handle_unknown='ignore')
pipeline_cat = Pipeline(
    steps=[
        ("imputer", imputer_cat),
        ("encoder", encoder_cat)
    ]
)

Log_reg = LogisticRegression(class_weight='balanced', random_state=2)

processor = ColumnTransformer(
    transformers=[
        ("num", pipeline_num, num_col),
        ("cat", pipeline_cat, cat_col),
    ]
)

predictor = Pipeline(
    steps=[
        ("Transformer", processor),
        ("predictor", Log_reg)
    ]
)



In [84]:
predictor.fit(train_x, train_y)

pred = predictor.predict(test_x)

# recall_average = recall_score(Y_test, y_predict, average="binary", pos_label="neg")

print(classification_report(test_y, pred))


              precision    recall  f1-score   support

          No       0.91      0.74      0.82       740
         Yes       0.31      0.62      0.42       142

    accuracy                           0.72       882
   macro avg       0.61      0.68      0.62       882
weighted avg       0.81      0.72      0.75       882



Boosting

In [87]:
gb_clas = GradientBoostingClassifier(random_state=2)

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.05, 0.01]
}

cv = StratifiedKFold(n_splits=5)

grid = GridSearchCV(gb_clas, param_grid, cv=cv, scoring='balanced_accuracy', verbose=4, n_jobs=-1)

predictor = Pipeline(
    steps=[
        ("Transformer", processor),
        ("predictor", grid)
    ]
)

In [88]:
predictor.fit(train_x, train_y)
pred = predictor.predict(test_x)

#f1 = f1_score(test_y, pred)
#bal = balanced_accuracy_score(test_y, pred)

report = classification_report(test_y, pred, zero_division=0)
print(report)


Fitting 5 folds for each of 27 candidates, totalling 135 fits
              precision    recall  f1-score   support

          No       0.92      0.99      0.96       740
         Yes       0.93      0.56      0.70       142

    accuracy                           0.92       882
   macro avg       0.93      0.77      0.83       882
weighted avg       0.92      0.92      0.91       882

