In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for input_path in (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
):
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# 导入工具包
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics

In [None]:
# Load the training and test tables.
DATA_PATH = '/kaggle/input/bi-attrition-predict/'
# DATA_PATH already ends with a separator; os.path.join avoids the doubled
# "//" that plain string formatting produced.
train = pd.read_csv(os.path.join(DATA_PATH, 'train.csv'))
test = pd.read_csv(os.path.join(DATA_PATH, 'test.csv'))
# Show (rows, columns) of both tables.
train.shape, test.shape

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# Preview the first rows of the test data.
test.head()

In [None]:
# Summarise the training frame with DataFrame.info().
train.info()

In [None]:
# List the column names of the training data.
train.columns

In [None]:
# Show the distinct values of the StandardHours column in the training data.
train.StandardHours.unique()

# 1. 信息查看

In [None]:
# Split the columns by role.

# Identifier column
id_col = 'user_id'

# Column to predict
target_col = 'Attrition'

# Numeric columns
digital_cols = [
    'Age',
    'DailyRate',
    'MonthlyIncome',
    'NumCompaniesWorked',
    'PercentSalaryHike',
    'TotalWorkingYears',
    'YearsAtCompany',
    'YearsInCurrentRole',
    'YearsSinceLastPromotion',
    'YearsWithCurrManager',
]

# Categorical columns
category_cols = [
    'BusinessTravel',
    'Department',
    'DistanceFromHome',
    'Education',
    'EducationField',
    'EmployeeNumber',
    'EnvironmentSatisfaction',
    'Gender',
    'JobInvolvement',
    'JobLevel',
    'JobRole',
    'JobSatisfaction',
    'MaritalStatus',
    'Over18',
    'OverTime',
    'PerformanceRating',
    'RelationshipSatisfaction',
    'StandardHours',
    'StockOptionLevel',
    'TrainingTimesLastYear',
    'WorkLifeBalance',
]

In [None]:
# Inspect the categorical columns: number of distinct values and share of
# missing values, in train and in test.
# Author's finding: StandardHours and Over18 hold the same value in every row,
# so they can be dropped before training.
n_train_rows, n_test_rows = len(train), len(test)
for col in category_cols:
    train_values, test_values = train[col], test[col]
    nunique_tr, nunique_te = train_values.nunique(), test_values.nunique()
    # Share of rows whose value is missing.
    na_tr = int(train_values.isna().sum()) / n_train_rows
    na_te = int(test_values.isna().sum()) / n_test_rows
    print(f'Col name:{col:30}\t unique cate num in train:{nunique_tr:5}\tunique cate num in test:{nunique_te:5}\tnull sample in train:{na_tr:.2f}\tnull sample in test:{na_te:.2f}')

In [None]:
# Inspect the numeric columns: basic statistics and share of missing values,
# printed for train and test.
def _column_stats(frame, col):
    """Return (min, max, mean, median, std, missing-value share) of ``frame[col]``."""
    values = frame[col]
    missing_share = int(values.isna().sum()) / len(frame)
    return (values.min(), values.max(), values.mean(),
            values.median(), values.std(), missing_share)


for col in digital_cols:
    min_tr, max_tr, mean_tr, median_tr, std_tr, na_tr = _column_stats(train, col)
    min_te, max_te, mean_te, median_te, std_te, na_te = _column_stats(test, col)
    print(f'Col name:{col:30}')
    print(f'\tIn train data: min value:{min_tr:.2f}\tmax value:{max_tr:.2f}\tmean value:{mean_tr:.2f}\tmedian value:{median_tr:.2f}\tstd value:{std_tr:.2f}\tnan sample rate:{na_tr:.2f}\t')
    print(f'\tIn  test data: min value:{min_te:.2f}\tmax value:{max_te:.2f}\tmean value:{mean_te:.2f}\tmedian value:{median_te:.2f}\tstd value:{std_te:.2f}\tnan sample rate:{na_te:.2f}\t')

In [None]:
# Look at the relationship between Age and Attrition: one bar per Attrition
# value, drawn on a small 4x3 figure.
fig, ax = plt.subplots(figsize=(4, 3))
sns.barplot(data=train, x='Attrition', y='Age', palette='Set2', ax=ax)

# 2. 数据处理

In [None]:
# Convert the target labels to 0/1.
target_col_dict = {'Yes': 1, 'No': 0}
mapped_labels = train[target_col].map(target_col_dict)

# Series.map yields NaN for any value that is not a key of the dict, which
# would silently turn the labels into floats containing NaN. Fail loudly here.
unmapped_values = train.loc[mapped_labels.isna(), target_col].unique()
if len(unmapped_values) > 0:
    raise ValueError(
        f'Unexpected values in {target_col!r}: {list(unmapped_values)}; '
        f'expected one of {list(target_col_dict)}'
    )

# Integer label array, aligned row-for-row with `train`.
train_labels = mapped_labels.astype('int64').values

In [None]:
# Min-max scale the numeric columns. The scaler is fitted on the training data
# only and the same transform is then applied to both train and test.
# (Variable name `sacalar` kept as-is.)
sacalar = MinMaxScaler().fit(train[digital_cols])
train_digital = sacalar.transform(train[digital_cols])
test_digital = sacalar.transform(test[digital_cols])

In [None]:
# One-hot encode the categorical columns.
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Drop columns that carry no information: EmployeeNumber differs for every
# employee; Over18 and StandardHours are the same for everyone.
drop_cols = ['EmployeeNumber', 'Over18', 'StandardHours']
encoded_cols = [col for col in category_cols if col not in drop_cols]

# Fit on train and test together so that a category appearing in only one of
# them still gets a column. OneHotEncoder sorts each column's categories itself,
# so no LabelEncoder pass is needed, and `train` / `test` are left unmodified
# (re-running this cell always starts from the same input).
ohe = OneHotEncoder()
ohe.fit(pd.concat([train[encoded_cols], test[encoded_cols]]))

# .toarray() returns a plain ndarray. .todense() would return np.matrix, which
# propagates through np.hstack and is rejected as estimator input by recent
# scikit-learn releases.
train_category = ohe.transform(train[encoded_cols]).toarray()
test_category = ohe.transform(test[encoded_cols]).toarray()

In [None]:
# Assemble the model inputs: scaled numeric columns followed by the one-hot
# categorical columns.
# np.asarray guarantees plain ndarrays: stacking with an np.matrix operand
# yields np.matrix, which recent scikit-learn releases refuse as input.
train_features = np.asarray(np.hstack((train_digital, train_category)))
test_features = np.asarray(np.hstack((test_digital, test_category)))

# Train and test must expose exactly the same feature columns.
assert train_features.shape[1] == test_features.shape[1], (
    f'feature width mismatch: train {train_features.shape[1]} vs test {test_features.shape[1]}'
)
train_features.shape, test_features.shape

# 3. 模型训练预测 

In [None]:
# Split the labelled data into a training part and a validation part:
# 20% is held out, with a fixed random_state so the split is repeatable.
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    train_features,
    train_labels,
    test_size=0.20,
    random_state=42,
)

In [None]:
# Fit a LinearRegression on the training split and score the validation split.
from sklearn.linear_model import LinearRegression

clf = LinearRegression().fit(X_train, y_train)
ypred = clf.predict(X_val)

# AUC on the validation split (the variable is called `test_auc`, but it is
# computed from y_val / X_val).
test_auc = metrics.roc_auc_score(y_true=y_val, y_score=ypred)
print(test_auc)

In [None]:
# Score the test features with the fitted model, then show the shape of the result.
predictions = clf.predict(test_features)
predictions.shape

In [None]:
# Mean of the predicted scores over the test set.
predictions.mean()

In [None]:
# Build the submission table: the test user ids plus the predicted score.
sub = test[['user_id']].copy()
sub['Attrition'] = predictions

# Keep every score that is >= 0 and replace all others with 0.0005.
# Scores above 1 are left unchanged.
keep_score = sub['Attrition'] >= 0
sub['Attrition'] = sub['Attrition'].where(keep_score, 0.0005)

print(sub.head())
sub.to_csv('submission.csv', index=False)