In [6]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score
import xgboost as xgb
import lightgbm as lgb
from sklearn import preprocessing
# 项目路径
project_path = '/home/wjunneng/Python/2019-Iflytek-Mobile-Advertising-Anti-Fraud-Algorithm-Challenge'
# round1_iflyad_anticheat_testdata_feature.txt文件路径
testdata_feature_path = project_path + '/data/original/round1_iflyad_anticheat_testdata_feature.txt'
# round1_iflyad_anticheat_traindata.txt文件路径
traindata_path = project_path + '/data/original/round1_iflyad_anticheat_traindata.txt'


def one_hot_col(col):
    """标签编码"""
    lbl = preprocessing.LabelEncoder()
    lbl.fit(col)
    return lbl


def calculate_null(data, key, col):
    """
    params:
    data -- input data
    key -- the key used for statistics
    col -- the columns for statistics
    return -- the data of DataFrame type, include two columns,
              first columns id key,second is number of null
    """
    return data.groupby(key, as_index=False)[col].agg({col + '_is_null': 'count'})


def xgb_model(new_train, y, new_test, lr):
    """定义模型"""
    xgb_params = {'booster': 'gbtree',
                  'eta': lr, 'max_depth': 5, 'subsample': 0.8, 'colsample_bytree': 0.8,
                  'objective': 'binary:logistic',
                  'eval_metric': 'auc',
                  'silent': True,
                  }
    # skf=StratifiedKFold(y,n_folds=5,shuffle=True,random_state=2018)
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    oof_xgb = np.zeros(new_train.shape[0])
    prediction_xgb = np.zeros(new_test.shape[0])
    for i, (tr, va) in enumerate(skf.split(new_train, y)):
        print('fold:', i + 1, 'training')
        dtrain = xgb.DMatrix(new_train[tr], y[tr])
        dvalid = xgb.DMatrix(new_train[va], y[va])
        watchlist = [(dtrain, 'train'), (dvalid, 'valid_data')]
        bst = xgb.train(dtrain=dtrain, num_boost_round=30000, evals=watchlist, early_stopping_rounds=200,
                        verbose_eval=50, params=xgb_params)
        oof_xgb[va] += bst.predict(xgb.DMatrix(new_train[va]), ntree_limit=bst.best_ntree_limit)
        prediction_xgb += bst.predict(xgb.DMatrix(new_test), ntree_limit=bst.best_ntree_limit)
    print('the roc_auc_score for train:', roc_auc_score(y, oof_xgb))
    prediction_xgb /= 5
    return oof_xgb, prediction_xgb


def lgb_model(new_train, y, new_test):
    params = {
        'learning_rate': 0.01,
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'num_leaves': 1000,
        'verbose': -1,
        'max_depth': -1,
        #  'reg_alpha':2.2,
        #  'reg_lambda':1.4,
        'seed': 42,
    }
    # skf=StratifiedKFold(y,n_folds=5,shuffle=True,random_state=2018)
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    oof_lgb = np.zeros(new_train.shape[0])  # 用于存放训练集概率，由每折验证集所得
    prediction_lgb = np.zeros(new_test.shape[0])  # 用于存放测试集概率，k折最后要除以k取平均
    feature_importance_df = pd.DataFrame()  # 存放特征重要性，此处不考虑
    for i, (tr, va) in enumerate(skf.split(new_train, y)):
        print('fold:', i + 1, 'training')
        dtrain = lgb.Dataset(new_train[tr], y[tr])
        dvalid = lgb.Dataset(new_train[va], y[va], reference=dtrain)
        # 训练：
        bst = lgb.train(params, dtrain, num_boost_round=30000, valid_sets=dvalid, verbose_eval=400,
                        early_stopping_rounds=200)
        # 预测验证集：
        oof_lgb[va] += bst.predict(new_train[va], num_iteration=bst.best_iteration)
        # 预测测试集：
        prediction_lgb += bst.predict(new_test, num_iteration=bst.best_iteration)
        """
        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = list(new_train.columns)
        fold_importance_df["importance"] = bst.feature_importance(importance_type='split', iteration=bst.best_iteration)
        fold_importance_df["fold"] = i + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        """

    print('the roc_auc_score for train:', roc_auc_score(y, oof_lgb))  # 线下auc评分
    prediction_lgb /= 5
    return oof_lgb, prediction_lgb, feature_importance_df


def get_testdata_feature(**params):
    """
    返回 testdata_feature 文件内容
    :param params:
    :return:
    """
    testdata_feature_data = pd.read_table(testdata_feature_path, sep='\t')

    return testdata_feature_data


def get_traindata(**params):
    """
    返回 traindata 文件内容
    :param params:
    :return:
    """
    traindata_data = pd.read_table(traindata_path, sep='\t')

    return traindata_data


testdata_feature = get_testdata_feature()
traindata = get_traindata()

print('testdata_feature.columns: ', testdata_feature.columns)
print('traindata.columns: ', traindata.columns)



testdata_feature.columns:  Index(['sid', 'pkgname', 'ver', 'adunitshowid', 'mediashowid', 'apptype',
       'nginxtime', 'ip', 'city', 'province', 'reqrealip', 'adidmd5',
       'imeimd5', 'idfamd5', 'openudidmd5', 'macmd5', 'dvctype', 'model',
       'make', 'ntt', 'carrier', 'os', 'osv', 'orientation', 'lan', 'h', 'w',
       'ppi'],
      dtype='object')
traindata.columns:  Index(['sid', 'label', 'pkgname', 'ver', 'adunitshowid', 'mediashowid',
       'apptype', 'nginxtime', 'ip', 'city', 'province', 'reqrealip',
       'adidmd5', 'imeimd5', 'idfamd5', 'openudidmd5', 'macmd5', 'dvctype',
       'model', 'make', 'ntt', 'carrier', 'os', 'osv', 'orientation', 'lan',
       'h', 'w', 'ppi'],
      dtype='object')


In [7]:
def conversion_time(df, column, **params):
    """
    对会话开始和结束时间进行标准化
    :param df:
    :param params:
    :return:
    """
    # 本题所给时间戳为毫秒级，故需除以1000转换为秒级：时间戳转成日期格式
    df[column] = df[column].apply(lambda x: time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(int(x)/1000))))
    
    return df

# 读入测试数据：
testdata_feature['label'] = -1  # 测试集没有标签，可标记为-1

# 请求会话时间
testdata_feature['begintime'] = testdata_feature['sid'].apply(lambda x: int(x.split('-')[-1]))  
# 请求会话时间 与 请求到达服务时间的差
testdata_feature['nginxtime-begintime'] = testdata_feature['nginxtime'] - testdata_feature['begintime']  

# 请求会话时间
traindata['begintime'] = traindata['sid'].apply(lambda x: int(x.split('-')[-1]))
# 请求会话时间 与 请求到达服务时间的差
traindata['nginxtime-begintime'] = traindata['nginxtime'] - traindata['begintime']


print('traindata.shape: \n', traindata.shape)
print('\n')
print('testdata_feature.shape: ', testdata_feature.shape)
print('\n')

# 结合数据，方便提取特征：axis=0 纵向合并；axis=1 横向合并
data = pd.concat([traindata, testdata_feature], axis=0, sort=False).reset_index(drop=True)

print('the shape of data: \n', data.shape)
print('\n')
print('data.nunique(): \n', data.nunique())  # 返回每个字段的所有值组成集合的大小，即集合元素个数




traindata.shape: 
 (1000000, 31)


testdata_feature.shape:  (100000, 31)


the shape of data: 
 (1100000, 31)


data.nunique(): 
 sid                    1100000
label                        3
pkgname                   2368
ver                       3268
adunitshowid               800
mediashowid                313
apptype                     91
nginxtime              1098977
ip                      813719
city                       331
province                     8
reqrealip                 9748
adidmd5                 780369
imeimd5                1021836
idfamd5                    360
openudidmd5              85051
macmd5                  329184
dvctype                      3
model                     7957
make                      2727
ntt                          8
carrier                      5
os                           2
osv                        185
orientation                  4
lan                         33
h                          985
w                          449
pp

In [8]:
z = calculate_null(testdata_feature, 'sid', 'ver')  # 计算缺失值的，下面还没用到

print('label distribution:\n', traindata['label'].value_counts())  # 查看训练集标签分布
print('\n')
object_cols = list(data.dtypes[data.dtypes == np.object].index)  # 返回字段名为object类型的字段
print('object_cols:\n', object_cols)  # 输出object类型的字段




label distribution:
 0    517106
1    482894
Name: label, dtype: int64


object_cols:
 ['sid', 'pkgname', 'ver', 'adunitshowid', 'mediashowid', 'ip', 'city', 'reqrealip', 'adidmd5', 'imeimd5', 'idfamd5', 'openudidmd5', 'macmd5', 'model', 'make', 'os', 'osv', 'lan']


In [9]:
# 本题所给时间戳为毫秒级，故需除以1000转换为秒级：时间戳转成日期格式
print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(data['nginxtime'][0] / 1000)))

# 对object类型的字段进行标签编码：
for col in object_cols:
    if col != 'sid':
        data[col] = one_hot_col(data[col].astype(str)).transform(data[col].astype(str))

print(data['nginxtime'])



2019-06-09 22:31:41
0          1.560091e+12
1          1.560051e+12
2          1.560089e+12
3          1.560063e+12
4          1.560079e+12
               ...     
1099995    1.560166e+12
1099996    1.560169e+12
1099997    1.560123e+12
1099998    1.560159e+12
1099999    1.560106e+12
Name: nginxtime, Length: 1100000, dtype: float64


In [11]:
# 划分数据：
train = data[:traindata.shape[0]]
label = train['label'].values
test = data[traindata.shape[0]:].reset_index(drop=True)

# 模型训练预测：
oof_lgb, prediction_lgb, feature_importance_df = \
    lgb_model(np.array(train.drop(['sid', 'label', 'nginxtime', 'ip', 'reqrealip', 'begintime'], axis=1)),
              label,
              np.array(test.drop(['sid', 'label', 'nginxtime', 'ip', 'reqrealip', 'begintime'], axis=1)))

# 保存结果：
sub = test[['sid']]
sub['label'] = prediction_lgb
sub['label'] = sub['label'].apply(lambda x: 1 if x > 0.5 else 0)  # ∪概率大于0.5的置1，否则置0
print('test pre_label distribution:\n', sub['label'].value_counts())  # 模型预测测试集的标签分布
sub.to_csv('submit0704.csv', index=None)  # 保存为submit0704.csv文件


fold: 1 training
Training until validation scores don't improve for 200 rounds.
[400]	valid_0's auc: 0.984056
[800]	valid_0's auc: 0.98484
[1200]	valid_0's auc: 0.985024
[1600]	valid_0's auc: 0.985059
Early stopping, best iteration is:
[1746]	valid_0's auc: 0.985067
fold: 2 training
Training until validation scores don't improve for 200 rounds.
[400]	valid_0's auc: 0.983201
[800]	valid_0's auc: 0.984107
[1200]	valid_0's auc: 0.984285
[1600]	valid_0's auc: 0.984314
Early stopping, best iteration is:
[1639]	valid_0's auc: 0.984321
fold: 3 training
Training until validation scores don't improve for 200 rounds.
[400]	valid_0's auc: 0.983511
[800]	valid_0's auc: 0.98431
[1200]	valid_0's auc: 0.984512
[1600]	valid_0's auc: 0.984532
Early stopping, best iteration is:
[1431]	valid_0's auc: 0.984536
fold: 4 training
Training until validation scores don't improve for 200 rounds.
[400]	valid_0's auc: 0.983389
[800]	valid_0's auc: 0.984193
[1200]	valid_0's auc: 0.984354
[1600]	valid_0's auc: 0.984

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
