In [None]:
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from lightgbm import early_stopping, log_evaluation
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
from xgboost import XGBClassifier

In [None]:
# Load the processed workbook; every later cell derives its data from this frame.
df = pd.read_excel('df_processed.xlsx')

In [None]:
# Print the column names, dtypes and non-null counts of the loaded frame.
df.info()

In [None]:
# Dealer / work-order / survey metadata columns that are removed before analysis.
# '大区' is intentionally not in this list: the cell that builds df_all selects
# df['大区'], and dropping it here made that selection raise a KeyError.
metadata_columns = ['小区','经销商销售代码','经销商服务代码','经销商名称','工单号','服务顾问','问卷回复时间','问卷标题']

# Remove the metadata columns first, then every column containing at least one NaN.
# The resulting column set is the same as dropping NaN columns first, but this order
# does not fail when one of the metadata columns itself contains NaN.
# NOTE(review): dropna(axis=1) discards a whole column for a single missing value,
# after which a row-wise dropna has nothing left to remove — confirm this is intended.
df = df.drop(columns = metadata_columns).dropna(axis=1)

In [None]:
# Few values are missing, so rows that contain any NaN are simply discarded.
df = df.dropna(axis=0)

In [None]:
# A "revisit" is defined as coming back within 8 months, so records whose survey
# reply is 240 days old or less relative to the reference date are removed.
days_today = datetime.strptime('10/26/22', '%m/%d/%y')  # reference "today" used for the cut-off
revisit_window = pd.Timedelta(days=240)

df['sales_time'] = pd.to_datetime(df['问卷回复时间_x'])
days_to_today = pd.Timestamp(days_today) - df['sales_time']

# Keep only rows strictly older than the window. The elapsed time is held in a
# local Series instead of temporary columns, and .copy() makes the filtered frame
# independent so later column assignments do not trigger SettingWithCopyWarning.
df = df.loc[days_to_today > revisit_window, :].copy()

In [None]:
# Questionnaire data: the scored survey questions plus the 'target' label.
# The label is listed last on purpose; later cells use columns[:-1] as the feature list.
questionnaire_columns = [
    # questions worded around the purchase experience
    '1.1 总体购车体验及感受',
    '1.2 请您评价销售顾问在入店接待环节的表现',
    '1.2 店内环境设施',  # NOTE(review): numbered 1.2 like the previous question — confirm against the workbook
    '1.4 请您评价销售顾问在介绍产品环节的表现',
    '1.6 请您评价支付/购买环节的体验',
    '1.9 请您评价提新车环节的体验',
    '2.11 购车过程便捷，省心的程度如何？',
    '2.12 您是否愿意将这家4S店推荐给其他人？',
    '2.13 您是否信任这家4S店？',
    '2.14 您是否愿意将福特品牌推荐给其他人？',
    '2.15 您是否信赖福特这个品牌？',
    # questions worded around the maintenance / service experience
    '1.1 请您评价此次4S店总体服务体验',
    '1.2 请您评价服务人员在接待报价环节的表现',
    '1.4 请您评价在店等待环节的体验',
    '1.6 请您评价结算交车环节的体验',
    '1.8 请您评价此次维修保养质量',
    '1.9 请问：此次进店，您的车是一次维修保养好的吗？',
    '2.11 请您评分：此次维修保养过程便捷，省心的程度',
    '2.12 请您评分：这家4S店值得信任',
    '2.13 根据您的体验，您将4S店推荐给他人的可能性有多大？',
    '2.14 请您评分：福特品牌值得信任',
    # The comma after the next entry was missing, which silently glued it to
    # 'target' and produced one non-existent column name (KeyError).
    '2.15 根据您的体验，您将福特品牌推荐给他人的可能性有多大？',
    'target',
]
df_questionnaire = df[questionnaire_columns]

In [None]:
# Combine the aspect label and the sentiment label into one categorical feature
# per text source. Both use the same '_' separator ('service' previously had none).
df['sales'] = df['sales_aspect'] + '_' + df['sales_sentiment']
df['service'] = df['service_aspect'] + '_' + df['service_sentiment']

# One-hot encode only the two combined columns. Encoding the whole of df kept every
# unrelated (and non-numeric) column in df_text_cor, which breaks the correlation
# and model-fitting cells that treat df_text_cor as "text features + target".
# dtype=int yields 0/1 indicators regardless of the pandas version's default.
df_text_cor = pd.get_dummies(df[['sales', 'service']], columns = ['sales','service'], dtype = int)
df_text_cor['target'] = df['target']

In [None]:
### Revisit rate for every answer score of every survey question
question_list = df_questionnaire.columns[:-1]  # every column except the last one
df_revisit_1 = pd.DataFrame(index = range(1,6))  # one row per score 1..5
for q in question_list:
    target_by_score = df_questionnaire.groupby(q)['target']
    # share of rows with target == 1 within each score group
    df_revisit_1[q] = target_by_score.sum() / target_by_score.count()

df_revisit_1

In [None]:
## Revisit rate for every "text aspect + sentiment" indicator
# Only the indicator columns created by get_dummies are examined: those are the
# columns of df_text_cor that do not exist in df (this also excludes 'target').
question_list = [c for c in df_text_cor.columns if c not in df.columns]

# Indicators take the values 0 / 1, so the result has one row per indicator value.
df_revisit_2 = pd.DataFrame(index = [0, 1])
for q in question_list:
    # The label column is 'target' (there is no 'label' column in this notebook).
    rate = df_text_cor.groupby(q)['target'].mean()
    # get_dummies may emit bool or integer indicators depending on the pandas
    # version; normalise the group keys so they align with the 0/1 index.
    rate.index = rate.index.astype(int)
    df_revisit_2[q] = rate

df_revisit_2

问卷问题和文本分别和再次到店的相关性

In [None]:
# Pairwise Pearson correlation between the survey answers and the label,
# rendered as a colour-graded table.
corr1 = df_questionnaire.corr(method='pearson')
corr1.style.background_gradient(cmap='coolwarm')

In [None]:
# Pairwise correlation between the text indicators and the label.
# Restrict to numeric / boolean columns explicitly: pandas >= 2.0 raises on
# non-numeric columns instead of silently skipping them as older versions did.
corr2 = df_text_cor.select_dtypes(include=['number', 'bool']).corr()
corr2.style.background_gradient(cmap='coolwarm')

In [None]:
# The separate aspect / sentiment columns are no longer needed once they have
# been merged into 'sales' and 'service'.
df = df.drop(columns=['sales_aspect','sales_sentiment','service_aspect','service_sentiment'])

用几个特征筛选方法看看特征的重要性

In [None]:
# Importance of the questionnaire answers according to an XGBoost classifier
questionnaire_features = df_questionnaire.drop('target',axis=1)
clf = XGBClassifier()
clf.fit(questionnaire_features, df_questionnaire['target'])

bar_positions = range(df_questionnaire.shape[1]-1)  # one bar per non-label column
plt.figure()
plt.bar(bar_positions, clf.feature_importances_)
plt.xticks(bar_positions, df_questionnaire.columns[:-1])

In [None]:
# Importance of the text-derived features according to an XGBoost classifier
text_features = df_text_cor.drop('target',axis=1)
clf = XGBClassifier()
clf.fit(text_features, df_text_cor['target'])

# One bar per fitted feature. `range(df_text_cor)-1` was a TypeError, and
# `columns[:-1]` assumed 'target' is the last column; taking positions and labels
# from the feature frame itself keeps bars and tick labels aligned.
bar_positions = range(text_features.shape[1])
plt.figure()
plt.bar(bar_positions, clf.feature_importances_)
plt.xticks(bar_positions, text_features.columns)

用Null Importance检查一下树模型的特征筛选表现

In [None]:
def get_feature_importances(data, label, feature_name, shuffle = True):
    """Fit an XGBClassifier and return its feature importances as a DataFrame.

    Parameters
    ----------
    data : feature matrix passed to ``XGBClassifier.fit``.
    label : target values aligned row by row with ``data``; never modified.
    feature_name : iterable of feature names, one per column of ``data``.
    shuffle : when True the labels are randomly permuted before fitting, so the
        importances describe a model trained on noise (the "null" importance).

    Returns
    -------
    pandas.DataFrame with the columns ``feature`` and ``importance``.
    """
    if shuffle:
        # np.random.shuffle swaps elements via x[i] / x[j]; on a pandas Series
        # those are label lookups, which raise KeyError (or shuffle wrongly)
        # whenever the index is not 0..n-1, e.g. after rows were filtered out.
        # Permuting a plain array avoids any index semantics.
        y = np.random.permutation(np.asarray(label))
    else:
        y = label.copy()

    clf = XGBClassifier()
    clf.fit(data, y)

    imp_df = pd.DataFrame({
        'feature': list(feature_name),
        'importance': clf.feature_importances_,
    })
    return imp_df

In [None]:
def null_importance(data, label, feature_name):
    """Print each feature's real importance next to its mean "null" importance.

    The real importance comes from one fit on the unshuffled labels; the null
    importance is the mean over 20 fits on shuffled labels. Nothing is returned;
    one line per feature is written to stdout.
    """
    # importance with the true labels
    true_imp_df = get_feature_importances(data, label, feature_name, shuffle = False)

    # importances from 20 label-shuffled fits, stacked row-wise
    shuffled_runs = [
        get_feature_importances(data, label, feature_name, shuffle=True)
        for _ in range(20)
    ]
    null_imp_df = pd.concat(shuffled_runs, axis=0)

    # side-by-side report
    for feature in feature_name:
        in_true = true_imp_df['feature'] == feature
        in_null = null_imp_df['feature'] == feature
        true_imp = true_imp_df.loc[in_true, 'importance'].values[0]
        null_imp = null_imp_df.loc[in_null, 'importance'].mean()
        print(f'feature {feature}: importance {true_imp}     null_importance {null_imp}')

In [None]:
# Null-importance check for the questionnaire answers
label = df_questionnaire['target']
data = df_questionnaire.drop(columns='target')
feature_name = data.columns
null_importance(data, label, feature_name)

In [None]:
# Null-importance check for the text-derived features
label = df_text_cor['target']
data = df_text_cor.drop(columns='target')
feature_name = data.columns
null_importance(data, label, feature_name)

In [None]:
# One-hot columns are not used for the tree model trained below; the combined text
# labels ('sales', 'service') are carried as single columns so the model can encode
# them itself.
wanted_columns = ['车架号','大区','车型','sales','service']

# Earlier cleaning cells remove columns (by name and by NaN content), so any of the
# wanted columns may be absent. Select the ones still present and report the rest
# instead of failing with a KeyError.
available_columns = [c for c in wanted_columns if c in df.columns]
skipped_columns = [c for c in wanted_columns if c not in df.columns]
if skipped_columns:
    print(f'Columns missing from df and left out of df_all: {skipped_columns}')

df_all = pd.concat([df[available_columns], df_questionnaire], axis = 1)

In [None]:
# Persist the prepared modelling table before training.
df_all.to_excel('df_revisit.xlsx')

In [None]:
# df_all carries the label under the name 'target' (it comes from df_questionnaire);
# there is no 'label' column, so the previous lookups raised KeyError.
y = df_all['target']
x = df_all.drop(['车架号','target'], axis = 1)

# LightGBM only treats pandas columns as categorical when they have the
# 'category' dtype; plain string/object columns are rejected at fit time.
# Casting here, before any split, keeps the category set identical in every fold.
non_numeric_columns = [c for c in x.columns if not pd.api.types.is_numeric_dtype(x[c])]
x[non_numeric_columns] = x[non_numeric_columns].astype('category')

In [None]:
# LightGBM: 5-fold stratified cross-validation with out-of-fold predictions

# Out-of-fold predicted probability of the positive class, filled fold by fold.
prediction_test = np.zeros(df_all.shape[0])
folds = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 0)

for n_fold, (train_idx, test_idx) in enumerate(folds.split(x, y)):
    x_train, y_train = x.iloc[train_idx], y.iloc[train_idx]
    x_test, y_test = x.iloc[test_idx], y.iloc[test_idx]

    clf = LGBMClassifier(n_estimators = 1000,
         num_leaves = 50,
         min_data_in_leaf = 10,
         max_depth = 8,
         learning_rate = 0.01)

    # Early stopping and logging are passed as callbacks: the `early_stopping_rounds`
    # and `verbose` keyword arguments of fit() were removed in LightGBM 4.0.
    # NOTE(review): the held-out fold also drives early stopping, so the scores
    # reported below are optimistic.
    clf.fit(x_train, y_train, eval_set = [(x_train, y_train),(x_test, y_test)],
           eval_metric = 'auc',
           callbacks = [early_stopping(stopping_rounds = 100),
                        log_evaluation(period = 1000)])

    prediction_test[test_idx] = clf.predict_proba(x_test, num_iteration = clf.best_iteration_)[:,1]

auc_score = roc_auc_score(y, prediction_test)

# Accuracy needs hard class labels; comparing y with the raw probabilities is
# almost never equal. Threshold at 0.5 (assumes the label is coded 0/1 — TODO confirm).
predicted_label = (prediction_test >= 0.5).astype(int)
accuracy = (np.asarray(y) == predicted_label).mean()

# The f-string previously referenced an undefined name `score`.
print(f'AUC score: {auc_score}. Accuracy: {accuracy}')


In [None]:
# Store the cross-validated predicted probability next to each row of df_all.
df_all['prediction'] = prediction_test