In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from numpy import nan as NA
from sklearn.preprocessing import OneHotEncoder
from category_encoders import OrdinalEncoder

In [None]:
# Load the training data from the working directory.
train_data = pd.read_csv("train.csv")

# The literal ' ?' (note the leading space) is the missing-value marker used here;
# turn it into NaN everywhere in the frame.
train_data = train_data.replace(' ?', NA)

# Per-column flag of whether any value is missing. As a bare expression in the middle
# of a cell it displays nothing; kept only to preserve the original inspection step.
train_data.isna().any()

# Give missing categorical values their own 'Unknown' label.
# The result is assigned back to the frame on purpose: calling
# `train_data[col].replace(..., inplace=True)` mutates a temporary Series, which pandas
# warns about and which leaves `train_data` untouched when Copy-on-Write is active.
for column_name in ('workclass', 'occupation', 'native-country'):
    train_data[column_name] = train_data[column_name].replace(NA, 'Unknown')

# Collapse native-country into two buckets. Per the original author's note, missing
# countries are filled with the US label because US residents are the most common.
# This cell reloads the CSV at the top, so re-running the whole cell is safe.
train_data['native-country'] = train_data['native-country'].replace(
    {' United-States': 'US', 'Unknown': 'US'}
)

developed_names = [' United States', ' United Kingdom', ' Germany', ' France',
                   ' Japan', ' Italy', ' Canada', ' Russia']
# NOTE(review): 'US' (and the raw spelling ' United-States') is not in this list, so every
# US row ends up in 'Uneveloped_country'. That is almost certainly unintended, but the
# bucketing is left exactly as it was because the test-set preprocessing must bucket the
# same way; fix both places together (and the 'Uneveloped' typo) in one change.
is_developed = train_data['native-country'].isin(developed_names)
train_data['native-country'] = is_developed.map(
    {True: 'Developed_country', False: 'Uneveloped_country'}
)




#     if native_country != ' United-States' and native_country != 'Unknown':
#         train_data['native-country'].replace(native_country, 'Non_US', inplace = True)




In [None]:
# Bar height is the mean of `exceeds50K` within each native-country bucket.
sns.barplot(data=train_data, x='native-country', y='exceeds50K')
# sns.countplot(x='native-country',hue='exceeds50K',data=train_data)

In [None]:
# Regroup the raw education level. Per the original author's note: 1 = never attended
# school, 2-8 = below high school, 9 = high-school diploma, 10 = some college without a
# degree (treated as high school), 11-12 = associate-level.
def education(education_num):
    """Map a raw education level to a coarser ordinal tier.

    Returns 0 for level 1, 1 for levels in (1, 9), 2 for levels in [9, 11),
    3 for levels in [11, 13), and ``education_num - 9`` for every other value
    (so 13 -> 4, 14 -> 5, ...; values below 1 also take this last branch).
    """
    if education_num == 1:
        return 0
    # Level 1 has already returned, so an inclusive lower bound of 1 on the first
    # tier is equivalent to the strict "> 1" comparison.
    tier_bounds = ((1, 9), (9, 11), (11, 13))
    for tier, (lower, upper) in enumerate(tier_bounds, start=1):
        if lower <= education_num < upper:
            return tier
    return education_num - 9
# Overwrite the raw level with its tier from education(). This is not idempotent:
# running the cell a second time would re-map the already-tiered values.
train_data['education-num'] = train_data['education-num'].map(education)

In [None]:
# Bucket weekly working hours into three groups: under 40, exactly 40, over 40.
# The cell that applies this mapping to the column must only be run once.
def workhours(hours_per_week):
    """Return 0 when hours_per_week < 40, 1 when it equals 40, otherwise 2."""
    if hours_per_week == 40:
        return 1
    return 0 if hours_per_week < 40 else 2
# Overwrite the raw hours with the bucket code from workhours() (0 / 1 / 2).
# Not idempotent: a second run would bucket the codes themselves (all < 40 -> 0).
train_data['hours-per-week'] = train_data['hours-per-week'].map(workhours)

In [None]:
# Column dtypes and non-null counts after cleaning.
train_data.info()

In [None]:
# Encode `sex` as integer codes. fit() is given the frame's last column as the target.
# The original note says female -> 1 and male -> 2 -- TODO confirm against the output,
# the actual codes are whatever the encoder assigns.
encoder1 = OrdinalEncoder(cols=['sex'])
encoder1.fit(train_data, train_data.iloc[:, -1])
train_data = encoder1.transform(train_data)

# Encode the two native-country buckets as integer codes, same procedure as above.
encoder2 = OrdinalEncoder(cols=['native-country'])
encoder2.fit(train_data, train_data.iloc[:, -1])
train_data = encoder2.transform(train_data)

In [None]:
# Preview the first 50 rows after encoding.
train_data.head(50)

In [None]:
# Working copy of the data without the `fnlwgt` and `education` columns.
train_Sel = train_data.drop(['fnlwgt', 'education'], axis=1)

# Correlation of every feature with the label.
# Only numeric/boolean columns are correlated: `train_Sel` still holds string columns at
# this point, and DataFrame.corr() raises on those from pandas 2.0 onwards (older
# versions silently skipped them, which is what this selection reproduces).
corrDf = train_Sel.select_dtypes(include=['number', 'bool']).corr()
corrDf['exceeds50K'].sort_values(ascending=True)

In [None]:
# Columns shown in the correlation heatmap (label first).
heatmap_columns = ['exceeds50K', 'capital-loss', 'capital-gain', 'age',
                   'hours-per-week', 'education-num', 'sex', 'native-country']

plt.figure(figsize=(8, 8))
heatmap_corr = train_Sel[heatmap_columns].corr()
sns.heatmap(heatmap_corr, cmap='BrBG', annot=True, linewidths=.5)
# Tilt the x tick labels so the column names stay readable.
plt.xticks(rotation=45)

In [None]:
# One-hot encode the remaining non-numeric columns. Rebinds `train_Sel`, so re-running
# this cell operates on the already-encoded frame.
train_Sel=pd.get_dummies(train_Sel)

In [None]:
#归一化效果更差
# from sklearn import preprocessing
# min_max_scaler = preprocessing.MinMaxScaler()  # 也可以用标准化类，然后调用方法
# train_Sel[['age','education-num','hours-per-week','capital-gain','capital-loss']] = min_max_scaler.fit_transform(train_Sel[['age','education-num','hours-per-week','capital-gain','capital-loss']])


In [None]:
# Display the encoded training frame (Jupyter truncates the output for large frames).
train_Sel

In [None]:
# Feature matrix: every column except the label.
x = train_Sel.drop(columns=['exceeds50K'])
# Target vector: the `exceeds50K` label.
y = train_Sel['exceeds50K']

In [None]:
# Column dtypes and non-null counts of the feature matrix.
x.info()

In [None]:
#导入机器学习算法库
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,ExtraTreesClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV,cross_val_score,StratifiedKFold
from sklearn.ensemble import BaggingClassifier,AdaBoostClassifier
from sklearn.neural_network import MLPClassifier

# Stratified 10-fold splitter, shared by the model comparison and the grid searches.
kfold = StratifiedKFold(n_splits=10)

# Candidate models, each paired with the label used in the results table so the
# names can never drift out of step with the estimators.
named_classifiers = [
    ('SVC', SVC()),
    ('DecisionTreeCla', DecisionTreeClassifier()),
    ('RandomForestCla', RandomForestClassifier()),
    ('ExtraTreesCla', ExtraTreesClassifier()),
    ('GradientBoostingCla', GradientBoostingClassifier()),
    ('KNN', KNeighborsClassifier()),
    ('LR', LogisticRegression()),
    ('LinearDiscrimiAna', LinearDiscriminantAnalysis()),
    # The wrapped estimator is passed positionally: its keyword was renamed from
    # `base_estimator` to `estimator` in newer scikit-learn releases and the old name
    # was later removed, while the first positional slot works on both.
    ('BaggingClassifier', BaggingClassifier(SVC())),
    ('AdaBoostClassifier', AdaBoostClassifier()),
    ('MLPClassifier', MLPClassifier()),
]
classifiers = [estimator for _, estimator in named_classifiers]

# Per-fold F1 scores for every candidate model.
cv_results = [
    cross_val_score(classifier, x, y, scoring='f1', cv=kfold, n_jobs=-1)
    for classifier in classifiers
]

# Mean and standard deviation of the fold scores, one entry per model.
cv_means = [fold_scores.mean() for fold_scores in cv_results]
cv_std = [fold_scores.std() for fold_scores in cv_results]

# Summary table: one row per model.
cvResDf = pd.DataFrame({'cv_mean': cv_means,
                        'cv_std': cv_std,
                        'algorithm': [label for label, _ in named_classifiers]})

In [None]:
# Cross-validation summary: mean and standard deviation of the F1 score per model.
cvResDf

In [None]:
# Horizontal bars of the mean CV score per model, best model first.
cv_sorted = cvResDf.sort_values(by='cv_mean', ascending=False)
cvResFacet = sns.FacetGrid(cv_sorted, sharex=False, sharey=False, aspect=2)
# The error bars must come from the *sorted* frame: the module-level `cv_std` list is in
# the original model order, so passing it here attached each model's spread to the
# wrong bar.
cvResFacet.map(sns.barplot, 'cv_mean', 'algorithm',
               xerr=cv_sorted['cv_std'].values, palette='muted')
# Restrict the x axis to the 0.7-0.9 score range.
cvResFacet.set(xlim=(0.7, 0.9))
cvResFacet.add_legend()


In [None]:
# Grid search for GradientBoostingClassifier, scored by accuracy on the shared folds.
GBC = GradientBoostingClassifier()
# The grid no longer pins `loss`: the value 'deviance' was renamed to 'log_loss' and the
# old spelling is rejected by recent scikit-learn releases. It is the default loss under
# either name, so leaving it out searches the same models on every version.
gb_param_grid = {'n_estimators' : [100,200,300],
              'learning_rate': [0.1, 0.05, 0.01],
              'max_depth': [4, 8],
              'min_samples_leaf': [100,150],
              'max_features': [0.3, 0.1]
              }
modelgsGBC = GridSearchCV(GBC,param_grid = gb_param_grid, cv=kfold,
                                     scoring="accuracy", n_jobs= -1, verbose = 1)
modelgsGBC.fit(x,y)

# Grid search for LogisticRegression, scored by accuracy on the shared folds.
# The grid contains both 'l1' and 'l2' penalties. scikit-learn's current default solver
# (lbfgs) rejects 'l1', which made half of the candidates fail to fit; liblinear
# supports both penalties.
modelLR=LogisticRegression(solver='liblinear')
LR_param_grid = {'C' : [1,2,3],
                'penalty':['l1','l2']}
modelgsLR = GridSearchCV(modelLR,param_grid = LR_param_grid, cv=kfold,
                                     scoring="accuracy", n_jobs= -1, verbose = 1)
modelgsLR.fit(x,y)

# Grid search for AdaBoostClassifier over ensemble size and learning rate,
# scored by accuracy on the shared folds.
ab_param_grid = dict(
    n_estimators=[100, 150, 200],
    learning_rate=[0.2, 0.1, 0.05],
)
Adaboost = AdaBoostClassifier()
modelgsAB = GridSearchCV(
    estimator=Adaboost,
    param_grid=ab_param_grid,
    scoring="accuracy",
    cv=kfold,
    n_jobs=-1,
    verbose=1,
)
modelgsAB.fit(x, y)


In [None]:
# Report the best cross-validated score found by the GBC and LR grid searches.
for score_template, searched_model in (
    ('modelgsGBC模型得分为：%.3f', modelgsGBC),
    ('modelgsLR模型得分为：%.3f', modelgsLR),
):
    print(score_template % searched_model.best_score_)


In [None]:
#stacking

In [None]:
# coding=utf8
 
 
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
# `sklearn.cross_validation` no longer exists in current scikit-learn; the same helpers
# live in `sklearn.model_selection`, which the rest of this notebook already uses.
# (Importing the old StratifiedKFold here also shadowed the one used for `kfold`.)
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
import numpy as np
from sklearn.metrics import roc_auc_score


# Base learners whose out-of-fold probabilities become the meta-features.
clfs = [RandomForestClassifier(n_estimators=5, n_jobs=-1, criterion='gini'),
        RandomForestClassifier(n_estimators=5, n_jobs=-1, criterion='entropy'),
        ExtraTreesClassifier(n_estimators=5, n_jobs=-1, criterion='gini'),
        ExtraTreesClassifier(n_estimators=5, n_jobs=-1, criterion='entropy'),
        GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=5)]

# `data` / `target` were never defined in this notebook. Build them from the encoded
# training frame as plain NumPy arrays, because the fold indices below are positional
# row indices. Deriving them from `train_Sel` (not `x` / `y`) keeps the cell re-runnable.
data = train_Sel.drop('exceeds50K', axis=1).astype(float).values
target = train_Sel['exceeds50K'].values

# Hold out a third of the rows to evaluate the blend.
# NOTE: as in the original cell, this rebinds the notebook-level name `y` to the training
# part of the split; re-run the x/y cell before re-running anything that expects the full
# label column.
X, X_predict, y, y_predict = train_test_split(data, target, test_size=0.33, random_state=2017)


dataset_blend_train = np.zeros((X.shape[0], len(clfs)))
dataset_blend_test = np.zeros((X_predict.shape[0], len(clfs)))

# 5-fold stacking.
n_folds = 5
skf = list(StratifiedKFold(n_splits=n_folds).split(X, y))
for j, clf in enumerate(clfs):
    # Train each base learner in turn.
    dataset_blend_test_j = np.zeros((X_predict.shape[0], len(skf)))
    for i, (train, test) in enumerate(skf):
        # Fit on every fold except the i-th and predict the i-th: those out-of-fold
        # probabilities are the new feature for the rows of fold i.
        X_train, y_train, X_test, y_test = X[train], y[train], X[test], y[test]
        clf.fit(X_train, y_train)
        y_submission = clf.predict_proba(X_test)[:, 1]
        dataset_blend_train[test, j] = y_submission
        dataset_blend_test_j[:, i] = clf.predict_proba(X_predict)[:, 1]
    # For the hold-out rows, average the predictions of the per-fold fits.
    dataset_blend_test[:, j] = dataset_blend_test_j.mean(1)
    print("val auc Score: %f" % roc_auc_score(y_predict, dataset_blend_test[:, j]))

# Meta-learner trained on the out-of-fold predictions.
clf = GradientBoostingClassifier(learning_rate=0.02, subsample=0.5, max_depth=6, n_estimators=30)
clf.fit(dataset_blend_train, y)
y_submission = clf.predict_proba(dataset_blend_test)[:, 1]

print("Linear stretch of predictions to [0,1]")
# Skip the rescale when every prediction is identical; it would divide by zero.
prediction_range = y_submission.max() - y_submission.min()
if prediction_range > 0:
    y_submission = (y_submission - y_submission.min()) / prediction_range
print("blend result")
print("val auc Score: %f" % (roc_auc_score(y_predict, y_submission)))



In [None]:
#testdata


# Load the test data and repeat the training-set preprocessing step by step.
test = pd.read_csv("test.csv")

# The literal ' ?' (note the leading space) is the missing-value marker; turn it into NaN.
test = test.replace(' ?', NA)

# Per-column missing-value flags; displays nothing mid-cell, kept from the original.
test.isna().any()

# Give missing categorical values their own 'Unknown' label.
# Assigned back on purpose: `test[col].replace(..., inplace=True)` mutates a temporary
# Series and leaves `test` untouched when pandas Copy-on-Write is active.
for column_name in ('workclass', 'occupation', 'native-country'):
    test[column_name] = test[column_name].replace(NA, 'Unknown')

# Collapse native-country into two buckets; missing countries are filled with the US
# label (per the original author's note, US residents are the most common).
test['native-country'] = test['native-country'].replace(
    {' United-States': 'US', 'Unknown': 'US'}
)
developed_names = [' United States', ' United Kingdom', ' Germany', ' France',
                   ' Japan', ' Italy', ' Canada', ' Russia']
# NOTE(review): 'US' is not in this list, so US rows end up in 'Uneveloped_country'.
# Left exactly as the training-set preprocessing does it so both sets agree; fix both
# places together.
is_developed = test['native-country'].isin(developed_names)
test['native-country'] = is_developed.map(
    {True: 'Developed_country', False: 'Uneveloped_country'}
)

# Re-use education() and workhours() defined earlier in the notebook for the training
# data; the byte-identical copies previously re-declared here only shadowed them.
test['education-num'] = test['education-num'].map(education)
test['hours-per-week'] = test['hours-per-week'].map(workhours)

# Ordinal-encode `sex` and `native-country`.
# NOTE(review): these encoders are fit on the test frame itself, so their integer codes
# are not guaranteed to match the ones learnt from the training data -- confirm the
# codes agree, or re-use the training encoders.
encoder1 = OrdinalEncoder(cols = ['sex']).fit(test,test.iloc[:,-1])
test = encoder1.transform(test)
encoder2 = OrdinalEncoder(cols = ['native-country']).fit(test,test.iloc[:,-1])
test = encoder2.transform(test)

test_Sel=test.drop(['fnlwgt','education'],axis=1)
test_Sel=pd.get_dummies(test_Sel)
# Line the columns up with the training feature matrix `x`: a category absent from the
# test file gets an all-zero column, columns the models never saw are dropped, and the
# column order matches what the fitted models expect.
test_Sel = test_Sel.reindex(columns=x.columns, fill_value=0)


In [None]:
#归一化效果更差
# from sklearn import preprocessing
# min_max_scaler = preprocessing.MinMaxScaler()  # 也可以用标准化类，然后调用方法
# test_Sel[['age','education-num','hours-per-week','capital-gain','capital-loss']] = min_max_scaler.fit_transform(test_Sel[['age','education-num','hours-per-week','capital-gain','capital-loss']])



In [None]:
# Display the encoded test frame (Jupyter truncates the output for large frames).
test_Sel

In [None]:
# Predict the label for every test row with the tuned gradient-boosting model.
GBCpreData_y = modelgsGBC.predict(test_Sel).astype(int)
# Row ids start at 1. The count is taken from the predictions rather than hard-coded
# (the original used range(1, 24422)), so a different-sized test file works too.
ID = np.arange(1, len(GBCpreData_y) + 1)
GBCpreResultDf = pd.DataFrame({'id': ID, 'prediction': GBCpreData_y})


In [None]:
# Write the id/prediction table to 'GBSmodle.csv' in the working directory, without the index.
GBCpreResultDf.to_csv('GBSmodle.csv',index=False)

In [None]:
# pd.set_option('display.max_rows', None)

In [None]:
# Count of each `relationship` value, one panel per `marital-status`.
# `x` is passed by keyword: current seaborn accepts only `data` positionally, so the
# original positional 'relationship' was rejected (older releases merely warned).
sns.catplot(x='relationship', col='marital-status', data=train_data, kind='count')

In [None]:
# Frequency of each occupation value (including the 'Unknown' placeholder).
train_data['occupation'].value_counts()

In [None]:
# Label distribution among rows with an unknown occupation and hours-per-week equal to 0.
# Once the hours column has been bucketed by workhours(), 0 means "under 40 hours".
occupation_unknown = train_data['occupation'] == 'Unknown'
hours_code_zero = train_data['hours-per-week'] == 0
train_data.loc[occupation_unknown & hours_code_zero]['exceeds50K'].value_counts()

In [None]:
# Frequency of each workclass value (including the 'Unknown' placeholder).
train_data['workclass'].value_counts()

In [None]:
# How many rows have both workclass and occupation unknown.
workclass_unknown = train_data['workclass'] == 'Unknown'
occupation_unknown = train_data['occupation'] == 'Unknown'
train_data.loc[workclass_unknown & occupation_unknown]['workclass'].value_counts()

In [None]:
# For rows whose workclass is ' Never-worked', relabel an 'Unknown' occupation as ' No-work'.
# The original called replace(..., inplace=True) on the filtered copy returned by boolean
# indexing, so `train_data` itself never changed; write the result back with .loc instead.
# Only `occupation` is rewritten: on these rows workclass is ' Never-worked' by
# construction and native-country's 'Unknown' placeholder was already replaced during
# cleaning -- presumably no other column holds 'Unknown'; verify if more are added.
never_worked = train_data['workclass'] == ' Never-worked'
train_data.loc[never_worked, 'occupation'] = (
    train_data.loc[never_worked, 'occupation'].replace('Unknown', ' No-work')
)

In [None]:
# Rows where the person never worked and the occupation is still 'Unknown'.
is_never_worked = train_data['workclass'] == ' Never-worked'
has_unknown_occupation = train_data['occupation'] == 'Unknown'
train_data.loc[is_never_worked & has_unknown_occupation]

In [None]:
# Rows whose workclass is either ' Without-pay' or ' Never-worked' (values keep the raw leading space).
train_data.loc[train_data['workclass'].isin([' Without-pay',' Never-worked'])]

In [None]:
# Rows with unknown workclass, age under 24 and a raw education level of 13 or higher.
# `education-num` was overwritten earlier with the tiers from education() (values 0..7),
# so comparing it with the raw level 13 could never match. Compare against the tier that
# raw level 13 maps to (education(13) == 4) instead.
train_data.loc[(train_data['workclass']== 'Unknown') & (train_data['age'] < 24) & (train_data['education-num'] >= education(13))]