In [78]:
import numpy as np
import pandas as pd
import scipy as sp
from sklearn import preprocessing
from sklearn.cross_validation import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve,auc,log_loss
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.cross_validation import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import matplotlib
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore') #为了整洁，去除弹出的warnings
pd.set_option('precision', 5) #设置精度
pd.set_option('display.float_format', lambda x: '%.5f' % x) #为了直观的显示数字，不采用科学计数法
%matplotlib inline

In [72]:
df = pd.read_csv("train.csv")

In [73]:
df = df[~np.isnan(df['3'])]
encode = preprocessing.LabelEncoder()
for column in df.columns:
    df.loc[:, column] = encode.fit_transform(df[column])

In [74]:
# 处理缺失值
df = df.fillna(10)

In [75]:
x = df.values[:,1:3]
y = df.values[:,3]
print x.shape
print y.shape

(171838L, 2L)
(171838L,)


In [11]:
#Gradient Boosting Decision Tree
gbdt = GradientBoostingClassifier(n_estimators=200)

In [82]:
#knn
knn = KNeighborsClassifier(n_neighbors=5)

In [80]:
def cv(x, y, model, n, model_name,mars=False):
    k_folds = KFold(x.shape[0], n_folds=n, shuffle=True)
    scores = []
    loss_list = []
    for train_indices, validation_indices in k_folds:
        # Generate training data
        x_train_cv = x[train_indices]
        y_train_cv = y[train_indices]
        # Generate validation data
        x_validate = x[validation_indices]
        y_validate = y[validation_indices]

        # Fit model on training data
        model.fit(x_train_cv, y_train_cv)
        # Score on validation data
        scores += [model.score(x_validate, y_validate)]
        # log_loss on validation data
        proba = model.predict_proba(x_validate)
        loss_list += [log_loss(y_validate,proba)]

    # Record and report accuracy
    average_score = np.mean(scores)
    average_log_loss = np.mean(loss_list)
    
    print "Score:", average_score
    print "Log_loss:", average_log_loss

    return average_score,average_log_loss

In [13]:
gbdt_score, gbdt_log_loss = cv(x,y,gbdt,5,"Gradient Boosting Decision Tree")

Score: 0.461958337628
Log_loss: 1.45617699281


In [83]:
knn_score, knn_log_loss = cv(x,y,knn,5,"KNN")

Score: 0.380858932519
Log_loss: 10.1704633383


# 五个特征的模型

In [48]:
data5 = pd.read_csv('MICEimputedTrain.csv')

In [49]:
data5.head()

Unnamed: 0,id,5,4,3,2,1
0,00005408fb82819bf6eef036180ce1d28ca64ac4553788...,1,2,1,1,0
1,000061e45fb216f4ad7fbc0cd86f620441a3091005eeb6...,5,4,3,2,0
2,00008faca7acd5b2edf91b274eedc88e90b1de3b4003f9...,2,7,6,3,0
3,0000b953a8b26886a7086673d8d1b7bd78efe139775728...,5,2,3,7,6
4,00015a0e069313122a1e4043c63625839dcb634b7de275...,6,4,3,1,0


In [67]:
data5 = data5[~np.isnan(data5['5'])]
encode = preprocessing.LabelEncoder()
for column in data5.columns:
    data5.loc[:, column] = encode.fit_transform(data5[column])

In [70]:
x5 = data5.values[:,2:6]
y5 = data5.values[:,1]
print x.shape
print y.shape

(337001L, 4L)
(337001L,)


# 5个feature的test

In [None]:
model5 = gbdt.fit(x5, y5)

In [None]:
test5 = pd.read_csv('testdf.csv')

In [None]:
test5.head()

In [None]:
user_id = test5['id'].values

In [None]:
encode = preprocessing.LabelEncoder()
for column in test2.columns:
    test2.loc[:, column] = encode.fit_transform(test2[column])

# 2个Feature的test

In [None]:
model2 = gbdt.fit(x, y)

In [37]:
test2 = pd.read_csv('test_feature2.csv')

In [38]:
test2.head()

Unnamed: 0,id,1,2
0,0001da55d168196bf25f06a497b5cf414126542f4d357d...,30018.0,30021.0
1,00024eca1053d4268df5c6d3308f0d008cf5e2678c443f...,30027.0,30018.0
2,000273e55809afd4a9ac4fb9175effe5d0ea449ed37e7c...,45003.0,45003.0
3,0002bd1d73c326ad6e337a5687f6787b055f13079d5c52...,30021.0,30027.0
4,00030b6d5b8013bcb9bb23a9bccf394d7b361a01d0d6b7...,30021.0,30027.0


In [39]:
user_id = test2['id'].values

In [40]:
encode = preprocessing.LabelEncoder()
for column in test2.columns:
    test2.loc[:, column] = encode.fit_transform(test2[column])

In [41]:
# 处理缺失值
test2 = test2.fillna(10)

In [42]:
x2 = test2.values[:,1:3]
x2.shape

(100870L, 2L)

In [43]:
proba2 = model2.predict_proba(x2)

In [44]:
result = pd.DataFrame({'0id':user_id,'event_30018':proba2[:,0],'event_30021':proba2[:,1],'event_30024':proba2[:,2],
                       'event_30027':proba2[:,3],'event_30039':proba2[:,4],'event_30042':proba2[:,5],'event_30045':proba2[:,6],
                       'event_30048':proba2[:,7],'event_36003':proba2[:,8],'event_45003':proba2[:,9]})
result.head()

Unnamed: 0,0id,event_30018,event_30021,event_30024,event_30027,event_30039,event_30042,event_30045,event_30048,event_36003,event_45003
0,0001da55d168196bf25f06a497b5cf414126542f4d357d...,0.3883,0.13103,0.04427,0.14448,0.00481,0.00206,0.01512,0.00362,0.2413,0.02501
1,00024eca1053d4268df5c6d3308f0d008cf5e2678c443f...,0.05404,0.39319,0.41101,0.01704,0.00331,0.00109,0.00569,0.0023,0.10508,0.00726
2,000273e55809afd4a9ac4fb9175effe5d0ea449ed37e7c...,0.19868,0.12256,0.06731,0.016,0.00224,0.00172,0.01943,0.02549,0.47916,0.06741
3,0002bd1d73c326ad6e337a5687f6787b055f13079d5c52...,0.14131,0.05557,0.02082,0.01488,0.00242,0.64118,0.01183,0.00587,0.09873,0.0074
4,00030b6d5b8013bcb9bb23a9bccf394d7b361a01d0d6b7...,0.14131,0.05557,0.02082,0.01488,0.00242,0.64118,0.01183,0.00587,0.09873,0.0074


In [45]:
result.to_csv('result_2feature.csv',index=False)