In [1]:
import pandas as pd
from collections import Counter
import datetime
import numpy as np
import warnings
%matplotlib inline

warnings.filterwarnings('ignore')

In [2]:
import xgboost as xgb
from sklearn.metrics import f1_score
from sklearn.grid_search import GridSearchCV



In [3]:
# Read the three feature tables. Later cells fit on data1/data2 and score
# data3.
data1_in, data2_in, data3_in = map(
    pd.read_csv,
    [
        '../Data/data1/data1.csv',
        '../Data/data2/data2.csv',
        '../Data/data3/data3.csv',
    ],
)

In [4]:
# Columns removed from every table before modelling.
drop_cols = [
    # login aggregates and the derived device flag
    'login_sum', 'login_max', 'loginvar', 'loginmean',
    'login_3_cnt', 'login_week_cnt', 'device_map',
    # per-page columns
    'page_sum',
    'page_0_sigle', 'page_1_sigle', 'page_2_sigle',
    'page_3_sigle', 'page_4_sigle',
    # per-action-type columns
    'action_type_sum',
    'action_type_0_sigle', 'action_type_1_sigle', 'action_type_2_sigle',
    'action_type_3_sigle', 'action_type_4_sigle', 'action_type_5_sigle',
]

In [5]:
# Identifier and target followed by a hand-picked feature list. Only the
# commented-out variant of the column filter refers to this list.
select_cols = [
    'user_id', 'label',
    'login_day_min', 'device_type', 'login_week_arg_cnt', 'register_type',
    'act_last_cnt', 'login_day_std', 'action_type_0', 'act_week_cnt',
    'page_1', 'act_3_cnt', 'login_cnt', 'page_0', 'act_day_std', 'actmean',
    'page_2', 'login_3_arg_cnt', 'register_day', 'act_sum', 'act_cnt',
    'action_type_1', 'action_type_2', 'act_day_min', 'actvar',
    'act_day_max', 'act_max', 'page_3', 'video_last_cnt', 'action_type_3',
    'page_4', 'video_3_cnt', 'videomean', 'video_sum', 'is_author',
    'page_3_7_cnt', 'action_type_1_7_cnt', 'video_day_min',
    'video_week_cnt', 'action_type_0_7_cnt', 'video_day_max', 'video_cnt',
    'videovar', 'action_type_2_7_cnt', 'page_3_3_cnt', 'video_day_std',
    'action_type_0_1_cnt', 'action_type_0_3_cnt', 'action_type_5',
    'action_type_1_3_cnt', 'page_0_1_cnt', 'video_max', 'page_0_7_cnt',
    'page_0_3_cnt',
]

In [5]:
def mapDeviceType(thread_value=0.5, frames=None):
    """Return the device types whose mean label exceeds a threshold.

    Args:
        thread_value: cut-off on the per-device mean of ``label``; only
            device types strictly above it are returned.
        frames: optional list of DataFrames, each with ``device_type`` and
            ``label`` columns, to pool before aggregating. Defaults to the
            notebook globals ``[data1_in, data2_in]``.

    Returns:
        A pandas Index holding the qualifying ``device_type`` values.
    """
    if frames is None:
        frames = [data1_in, data2_in]
    con_data = pd.concat(frames)
    # Aggregate once and reuse the result for both the index and the values.
    label_rate = con_data['label'].groupby(con_data['device_type']).mean()
    # `.values` replaces Series.get_values(), which newer pandas no longer has.
    return label_rate.index[label_rate.values > thread_value]

# Device types whose mean label is above the default threshold.
good_index = mapDeviceType()

# Flag every row by its OWN table's device_type. Previously data2_in and
# data3_in were flagged from data1_in['device_type'], which copied index-aligned
# values from the wrong table and left rows beyond data1_in's length as NaN.
data1_in['device_map'] = data1_in['device_type'].isin(good_index).astype(int)
data2_in['device_map'] = data2_in['device_type'].isin(good_index).astype(int)
data3_in['device_map'] = data3_in['device_type'].isin(good_index).astype(int)

In [6]:
# Keep every column that is not named in drop_cols. (An earlier variant,
# since disabled, additionally restricted the tables to select_cols.)
data1 = data1_in.loc[:, ~data1_in.columns.isin(drop_cols)]
data2 = data2_in.loc[:, ~data2_in.columns.isin(drop_cols)]
data3 = data3_in.loc[:, ~data3_in.columns.isin(drop_cols)]

In [7]:
# One shape per line, in the order data1, data2, data3.
print(data1.shape, data2.shape, data3.shape, sep='\n')

(22342, 105)
(26571, 105)
(51709, 104)


# CV调参法

In [8]:
# Candidate learning rates for the grid search.
param_learning_rate = dict(
    learning_rate=[0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5],
)

In [9]:
# Search grids, one dict per tuning stage.

# Number of boosting rounds: 100, 110, ..., 490.
param_n_estimators = dict(n_estimators=list(range(100, 500, 10)))

# Tree depth and minimum child weight.
param_acc1 = dict(
    max_depth=list(range(3, 9, 1)),
    min_child_weight=list(range(1, 10, 1)),
)

# Gamma, stepped by 0.1 from 0.1 up to (but excluding) 1.
param_gamma = dict(gamma=list(np.arange(0.1, 1, 0.1)))

# Row and column subsampling ratios, stepped by 0.1 from 0.5 up to
# (but excluding) 1.
param_acc2 = dict(
    subsample=list(np.arange(0.5, 1, 0.1)),
    colsample_bytree=list(np.arange(0.5, 1, 0.1)),
)

# Regularisation strengths (reg_alpha and reg_lambda).
param_guonihe2 = dict(
    reg_alpha=[0.05, 0.1, 1, 2, 3],
    reg_lambda=[0.05, 0.1, 1, 2, 3],
)

In [10]:
# Baseline classifier; it is the estimator handed to the grid search and the
# default model of buildModelAndPredict.
XGBM = xgb.XGBClassifier(
    # boosting schedule
    n_estimators=100,
    learning_rate=0.1,
    # tree shape
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    # row / column subsampling
    subsample=0.8,
    colsample_bytree=0.8,
    # regularisation
    reg_alpha=0,
    reg_lambda=1,
)

In [11]:
# Training matrix for the grid search: data1 and data2 stacked, with the
# identifier column discarded and the target split off into `label`.
train = pd.concat([data1, data2])
del train['user_id']
label = train.pop('label')

In [13]:
# Grid-search the number of boosting rounds, scoring candidates by F1.
# NOTE(review): `iid` and `grid_scores_` belong to the legacy
# sklearn.grid_search API imported above; confirm the installed scikit-learn
# version still provides them before re-running.
cv1 = GridSearchCV(
    estimator=XGBM,
    param_grid=param_n_estimators,
    scoring='f1',
    iid=False,
)
cv1.fit(train, label)
cv1.grid_scores_, cv1.best_params_, cv1.best_score_

KeyboardInterrupt: 


————————————————————————————————————————————————————————

In [13]:
def sroceF1(pred, real):
    """Print and return precision/recall-based F1 of predicted ids vs. real ids.

    Both arguments are collapsed to sets, so duplicate ids are ignored.

    Args:
        pred: iterable of ids predicted as positive.
        real: iterable of ids that are truly positive.

    Returns:
        The F1 score as a float. Precision, recall and F1 each fall back to
        0.0 when their denominator would be zero.
    """
    M = set(pred)
    N = set(real)
    hits = len(M & N)

    # An empty prediction set (e.g. a threshold nobody clears) or an empty
    # truth set used to raise ZeroDivisionError; score those cases as 0.
    Precision = hits / len(M) if M else 0.0
    Recall = hits / len(N) if N else 0.0
    if Precision + Recall:
        F1 = 2 * Precision * Recall / (Precision + Recall)
    else:
        # No overlap at all: the harmonic mean is undefined, report 0.
        F1 = 0.0

    print("Precision=",Precision,"| Recall=",Recall)
    print("F1=",F1)
    return F1

In [14]:
def buildModelAndPredict(isOnLine=True, isTest=False, yuzhi=0.4, model=XGBM):
    """Fit `model` and either produce submission ids or print validation scores.

    Args:
        isOnLine: when truthy, fit on data1 + data2 and score data3;
            otherwise fit on data1 and validate against the labels in data2.
        isTest: only used when `isOnLine` is falsy. When truthy, print the
            scores for every threshold in np.arange(0.3, 0.8, 0.05) instead
            of just `yuzhi`.
        yuzhi: probability threshold above which a user is predicted positive.
        model: estimator exposing `fit` and `predict_proba`; it is fitted in
            place.

    Returns:
        When `isOnLine` is truthy, the `user_id` Series of data3 rows whose
        predicted probability exceeds `yuzhi`; otherwise None (the scores are
        only printed).
    """
    if not isOnLine:
        # Offline validation: data1 is the fit set, data2 the hold-out.
        # (The original author noted 0.6 as the best threshold here.)
        fit_X = data1.copy()
        holdout_X = data2.copy()

        # Strip the identifier from the fit set and split off its target.
        fit_X.pop('user_id')
        fit_y = fit_X.pop('label')

        # Remember the true positives, then strip id and target from the
        # hold-out features.
        positives = holdout_X[holdout_X.label == 1]['user_id']
        holdout_ids = holdout_X.pop('user_id')
        holdout_X.pop('label')

        scored = pd.DataFrame(holdout_ids)
        model.fit(fit_X, fit_y)
        scored['pre_act'] = model.predict_proba(holdout_X)[:, 1]

        if not isTest:
            # Score the single requested threshold.
            chosen = scored[scored.pre_act > yuzhi]['user_id']
            print (len(chosen), len(positives))
            sroceF1(chosen, positives)
            return None

        # Sweep a fixed range of thresholds, printing each one after its
        # scores.
        for cutoff in np.arange(0.3, 0.8, 0.05):
            chosen = scored[scored.pre_act > cutoff]['user_id']
            sroceF1(chosen, positives)
            print (cutoff)
        return None

    # Online mode: fit on data1 + data2 and score data3.
    fit_X = pd.concat([data1, data2])
    submit_X = data3.copy()
    fit_X.pop('user_id')
    fit_y = fit_X.pop('label')

    model.fit(fit_X, fit_y)
    submit_ids = submit_X.pop('user_id')
    print (len(submit_ids))
    scored = pd.DataFrame(submit_ids)
    scored['pre_act'] = model.predict_proba(submit_X)[:, 1]
    return scored[scored.pre_act > yuzhi]['user_id']

In [17]:
# Online run: returns the data3 user ids scored above 0.4 (isTest keeps its
# default of False and is not consulted in online mode).
user_pre = buildModelAndPredict(isOnLine=True, yuzhi=0.4)

51709


In [18]:
# Number of users selected for submission.
user_pre.shape[0]

23672

In [21]:
# Read importances from XGBM, which buildModelAndPredict fitted in place
# (XGBM is that function's default `model` and the call above did not
# override it).
feature_importances = XGBM.feature_importances_

In [22]:
# Rescale so that the most important feature scores exactly 100.
feature_importances = 100 * (feature_importances / feature_importances.max())
feature_importances

array([  1.82222214e+01,   2.27407417e+01,   1.00000000e+02,
         1.92592597e+00,   2.43703709e+01,   3.54074059e+01,
         1.22222223e+01,   1.03703701e+00,   6.14814854e+00,
         1.35555563e+01,   4.74074078e+00,   7.48148108e+00,
         4.14814806e+00,   1.85185182e+00,   3.18518543e+00,
         2.00000000e+00,   2.14814830e+00,   3.92592597e+00,
         2.00000000e+00,   3.33333302e+00,   5.18518507e-01,
         3.62962961e+00,   1.11111116e+00,   1.05925922e+01,
         1.44444447e+01,   2.63703709e+01,   9.18518543e+00,
         3.42222214e+01,   3.69629631e+01,   4.42222252e+01,
         4.88148155e+01,   2.67407379e+01,   3.33333359e+01,
         7.18518591e+00,   3.45185204e+01,   6.59259272e+00,
         8.88888896e-01,   8.14814866e-01,   3.70370358e-01,
         2.59259272e+00,   3.70370358e-01,   2.22222224e-01,
         3.70370358e-01,   2.22222224e-01,   1.40740740e+00,
         4.44444448e-01,   2.96296299e-01,   8.88888896e-01,
         2.96296299e-01,

# 结果数据提交

In [23]:
# Write the selected user ids, without the index, as the submission file.
# NOTE(review): the filename says 23762 but len(user_pre) displayed above is
# 23672 -- confirm which number was intended.
user_pre.to_csv('../Output/xgb799_23762.csv', index=False)