In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Plot configuration: put SimHei first in the sans-serif font list (the figures
# below use Chinese titles) and turn off the Unicode minus sign on the axes.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

In [None]:
# Load the training and test sets from CSV files in the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train_set.csv', 'test_set.csv'))

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Check on the training set whether the label classes are balanced.
plt.rc('font', family='SimHei', size=13)
fig = plt.figure()
class_counts = train['y'].value_counts()  # computed once, used for both sizes and labels
plt.pie(class_counts,
        labels=class_counts.index,
        autopct='%1.2f%%',
        counterclock=False)
plt.title('购买率')
plt.show()

In [None]:
# Summary of the training frame: columns, dtypes and non-null counts.
train.info()

In [None]:
# 查看是否有空值
data.isnull().sum()

In [None]:
data = pd.concat([train.drop(['y'],axis=1),test],axis=0).reset_index(drop=True)

In [None]:
# Split the columns by dtype:
#   - object columns are collected in str_features and their distinct values printed;
#   - int64 columns other than 'ID' and the label 'y' are collected in num_features.
str_features = [col for col in train.columns if train[col].dtype == 'object']
num_features = [
    col for col in train.columns
    if train[col].dtype == 'int64' and col not in ['ID','y']
]
for col in str_features:
    print(col,':  ',train[col].unique())

In [None]:
# Percentage of entries equal to the string 'unknown', per column.
train.isin(['unknown']).mean().mul(100)

In [None]:
# Show both feature lists, one per line.
print(str_features, num_features, sep='\n')

In [None]:
# One bar chart per categorical feature showing the purchase rate of each category.
# The label y is 0/1, so its mean within a group is the share of rows with y == 1,
# i.e. the purchase rate of that group.
GRID_COLS = 3
# Keep the original 3-row layout as a minimum, but add rows when there are more
# than 9 categorical columns so plt.subplot never gets an out-of-range index.
grid_rows = max(3, (len(str_features) + GRID_COLS - 1) // GRID_COLS)
plt.figure(figsize=(15, 5 * grid_rows))
for i, col in enumerate(str_features, start=1):
    plt.subplot(grid_rows, GRID_COLS, i)
    train.groupby([col])['y'].mean().plot(kind='bar',
                                          stacked=True,
                                          rot=90,
                                          title='Purchase rate of {}'.format(col))
# Spacing between subplots; it applies to the whole figure, so set it once.
plt.subplots_adjust(wspace=0.2,hspace=0.7)
plt.show()

In [None]:
# Display the list of numeric feature names.
num_features

In [None]:
# Summary statistics of the numeric features in the training set.
train[num_features].describe()

In [None]:
from scipy.stats import chi2_contingency       # test for categorical features: association between a feature and the label
from scipy.stats import f_oneway,ttest_ind     # tests intended for numeric features: association between a feature and the label

In [None]:
#---------- dataset handling ----------#
from sklearn.model_selection import train_test_split        # split into training and validation sets
from sklearn.model_selection import KFold,StratifiedKFold   # k-fold cross-validation
from imblearn.combine import SMOTETomek,SMOTEENN            # combined over-/under-sampling
from imblearn.over_sampling import SMOTE                    # over-sampling
from imblearn.under_sampling import RandomUnderSampler      # under-sampling

#---------- data preprocessing ----------#
from sklearn.preprocessing import StandardScaler # standardisation
from sklearn.preprocessing import OneHotEncoder  # one-hot encoding
from sklearn.preprocessing import OrdinalEncoder

In [None]:
# Outlier handling
def outlier_processing(dfx):
    """Return a copy of ``dfx`` with values outside the 1.5*IQR fences capped.

    The fences are ``Q1 - 1.5*IQR`` and ``Q3 + 1.5*IQR``, computed from ``dfx``
    itself. Values above the upper fence are replaced by the largest value that
    is still within it; values below the lower fence are replaced by the
    smallest value that is still within it. The input object is not modified.

    Parameters
    ----------
    dfx : the callers in this notebook pass a single numeric column (Series).

    Returns
    -------
    A copy of ``dfx`` with the out-of-fence values replaced.
    """
    capped = dfx.copy()
    lower_quartile = capped.quantile(q=0.25)
    upper_quartile = capped.quantile(q=0.75)
    whisker = 1.5*(upper_quartile - lower_quartile)
    low_fence = lower_quartile - whisker
    high_fence = upper_quartile + whisker

    # Upper tail first: pull values down to the largest value inside the fence.
    largest_inlier = capped[capped<=high_fence].max()
    capped[capped>high_fence] = largest_inlier

    # Lower tail next, evaluated on the already upper-capped data.
    smallest_inlier = capped[capped>=low_fence].min()
    capped[capped<low_fence] = smallest_inlier
    return capped

In [None]:
# Cap IQR outliers in four numeric columns, first for the training frame and then
# for the test frame. Each column is capped using quantiles of that same column.
# NOTE(review): the test set is capped with its own quantiles, not the training
# set's — confirm this is intended.
for frame in (train, test):
    for capped_col in ['age', 'day', 'duration', 'campaign']:
        frame[capped_col] = outlier_processing(frame[capped_col])

In [None]:
# One-hot encode the categorical columns, then drop the original categorical
# columns together with the identifier (and, for the training set, the label).
dummy_train = (
    train.join(pd.get_dummies(train[str_features]))
         .drop(str_features, axis=1)
         .drop(['ID', 'y'], axis=1)
)
dummy_test = (
    test.join(pd.get_dummies(test[str_features]))
        .drop(str_features, axis=1)
        .drop(['ID'], axis=1)
)
# get_dummies only creates columns for the categories present in the frame it is
# given, so the two frames can end up with different columns or column order.
# Align the test frame to the training layout: missing dummies are filled with 0
# and columns the training frame does not have are dropped.
dummy_test = dummy_test.reindex(columns=dummy_train.columns, fill_value=0)

In [None]:
# Chi-square test between the label and each categorical feature, run on the
# label-by-category contingency table; prints the p-value per feature.
for col in str_features:
    contingency = pd.crosstab(index=train['y'],
                              columns=train[col],
                              rownames=['y'],
                              colnames=[col])
    chi2, p, dof, expect = chi2_contingency(contingency)
    print("{} 卡方检验p值: {:.4f}".format(col,p))

In [None]:
from sklearn.feature_selection import SelectKBest,f_classif

# Score each numeric feature against the label with f_classif. k is the number of
# features minus those whose p-value is above 0.05; SelectKBest then keeps the
# k best-scoring features.
numeric_X, target = train[num_features], train['y']
f, p = f_classif(numeric_X, target)
k = f.shape[0] - (p > 0.05).sum()
selector = SelectKBest(f_classif, k=k).fit(numeric_X, target)

print('scores_:',selector.scores_)
print('pvalues_:',selector.pvalues_)
print('selected index:',selector.get_support(indices=True))

In [None]:
# Standardise the numeric columns. The scaler is fitted on the training frame and
# the same fitted scaler is then applied to both the training and the test frame.
standardScaler=StandardScaler()
ss=standardScaler.fit(dummy_train[num_features])
# Replace the columns outright rather than writing through .loc[:, cols]: the .loc
# form tries to store the float results in the existing int64 columns in place,
# which relies on silent upcasting (deprecated with a FutureWarning since pandas 2.1).
dummy_train[num_features]=ss.transform(dummy_train[num_features])
dummy_test[num_features]=ss.transform(dummy_test[num_features])

In [None]:
# Feature matrix (encoded and scaled training frame) and target vector.
X, y = dummy_train, train['y']

In [None]:
# Hold out 20% of the training data for validation. The split is stratified on the
# label so both parts keep the same class proportions; without it, an imbalanced
# label can leave the validation split with a skewed share of positives.
X_train,X_valid,y_train,y_valid=train_test_split(X,y,test_size=0.2,random_state=2020,stratify=y)

In [None]:
# Resample the training split only with SMOTETomek (seeded); the validation split
# is left untouched.
smote_tomek = SMOTETomek(random_state=2020)
X_resampled, y_resampled = smote_tomek.fit_resample(X=X_train, y=y_train)

In [None]:
#----------建模工具--------------#
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

import lightgbm as lgb
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [None]:
#----------模型评估工具----------#
from sklearn.metrics import confusion_matrix # 混淆矩阵
from sklearn.metrics import classification_report
from sklearn.metrics import recall_score,f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

In [None]:
# Logistic regression: grid-search the penalty type, regularisation strength C and
# solver, scored by ROC AUC with 2-fold cross-validation on the resampled training data.
param = {"penalty": ["l1", "l2", ], "C": [0.1, 1, 10], "solver": ["liblinear","saga"]}
# Both solvers in the grid use random_state when shuffling the data, so seed the
# estimator (same seed as the split and the resampler) to make the search reproducible.
gs = GridSearchCV(estimator=LogisticRegression(random_state=2020),
                  param_grid=param,
                  cv=2,
                  scoring="roc_auc",
                  verbose=10)
gs.fit(X_resampled,y_resampled)
print(gs.best_params_)
# Evaluate the refitted best model on the validation split.
y_pred = gs.best_estimator_.predict(X_valid)
print(classification_report(y_valid, y_pred))

In [None]:
# Training set (resampled): confusion matrix with label order [1, 0].
train_pred = gs.best_estimator_.predict(X_resampled)
confusion_matrix(y_resampled, train_pred, labels=[1,0])

In [None]:
# Validation set: confusion matrix with label order [1, 0].
confusion_matrix(y_true=y_valid, y_pred=y_pred, labels=[1,0])

In [None]:
# Plot the ROC curve and derive a decision threshold from it
def get_rocauc(X,y,clf):
    """Plot the ROC curve of ``clf`` on ``(X, y)`` and return a decision threshold.

    The returned threshold is the one at which ``recall - FPR`` (true-positive
    rate minus false-positive rate) is largest; if several points tie, the first
    one along the curve is used. That point is marked on the plot.

    Parameters
    ----------
    X : feature matrix passed to ``clf.predict_proba``.
    y : true labels, with 1 as the positive class.
    clf : fitted classifier exposing ``predict_proba``; column 1 of its output
        is used as the positive-class score.

    Returns
    -------
    The threshold value from ``roc_curve`` at the selected point.
    """
    # Score once and reuse the result for both the curve and the area.
    pos_scores=clf.predict_proba(X)[:,1]
    FPR,recall,thresholds=roc_curve(y,pos_scores,pos_label=1)
    area=roc_auc_score(y,pos_scores)

    # Index of the first point where recall - FPR is largest.
    maxindex=int(np.argmax(recall-FPR))
    threshold=thresholds[maxindex]

    plt.figure()
    plt.plot(FPR,recall,color='red',label='ROC curve (area = %0.2f)'%area)
    plt.plot([0,1],[0,1],color='black',linestyle='--')  # chance-level diagonal
    plt.scatter(FPR[maxindex],recall[maxindex],c='black',s=30)  # selected operating point
    plt.xlim([-0.05,1.05])
    plt.ylim([-0.05,1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('Recall')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc='lower right')
    plt.show()
    return threshold

In [None]:
# Plot the ROC curve of the best model on the resampled training data and keep
# the threshold it returns.
threshold = get_rocauc(X=X_resampled, y=y_resampled, clf=gs.best_estimator_)

In [None]:
# Threshold adjustment
def get_ypred(X,clf,threshold):
    """Predict 0/1 labels from ``clf`` using a custom decision threshold.

    Parameters
    ----------
    X : feature matrix passed to ``clf.predict_proba``.
    clf : fitted classifier exposing ``predict_proba``; column 1 of its output
        is used as the positive-class score.
    threshold : a sample is labelled 1 only when its score is strictly greater
        than this value, otherwise 0.

    Returns
    -------
    list of int (0 or 1), one entry per row of ``X``.
    """
    positive_scores = clf.predict_proba(X)[:,1]
    return [1 if score > threshold else 0 for score in positive_scores]

# ytrain_pred=get_ypred(Xtrain,clf,threshold)

In [None]:
# Score the test set with the best model and store the positive-class
# probability (column 1 of predict_proba) in a new 'pred' column.
X_test = dummy_test
y_test = gs.best_estimator_.predict_proba(X_test)
positive_class_proba = y_test[:, 1]
test['pred'] = positive_class_proba

In [None]:
# Write the ID and predicted probability columns to lr.csv.
# NOTE(review): the row index is written as an extra leading column (to_csv
# default) — confirm the consumer of this file expects it.
test.to_csv('lr.csv', columns=['ID','pred'])