In [1]:
# 使用五折交叉验证

# score = 87.1913

import pandas as pd
import numpy as np

train = pd.read_csv('./datasets/train.csv')
test1 = pd.read_csv('./datasets/test1.csv')

In [2]:
features = train.drop(['Unnamed: 0', 'label'], axis=1)
labels = train['label']
features.columns

Index(['android_id', 'apptype', 'carrier', 'dev_height', 'dev_ppi',
       'dev_width', 'lan', 'media_id', 'ntt', 'os', 'osv', 'package', 'sid',
       'timestamp', 'version', 'fea_hash', 'location', 'fea1_hash',
       'cus_type'],
      dtype='object')

In [3]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# 训练集和测试机合并，统一LabelEncoder

all_df = pd.concat([train, test1])
all_df['osv'] = all_df['osv'].astype('str')
all_df['osv'] = le.fit_transform(all_df['osv'])

features['osv'].value_counts()

8.1.0            151419
9                 71332
6.0.1             31714
7.1.1             26503
9.0.0             24385
                  ...  
Android 5.12          1
4.2.3.2               1
6.0 十核2.0G_HD         1
9.1                   1
2.2.2                 1
Name: osv, Length: 154, dtype: int64

In [4]:
for feature in features.columns:
    print(feature, train[feature].nunique())

# 类别特征
cate_features = ['apptype', 'carrier', 'ntt', 'version', 'location', 'cus_type']

android_id 362258
apptype 89
carrier 5
dev_height 798
dev_ppi 92
dev_width 346
lan 21
media_id 284
ntt 8
os 2
osv 154
package 1950
sid 500000
timestamp 500000
version 22
fea_hash 402980
location 332
fea1_hash 4959
cus_type 58


In [5]:
# 不参与建模的特征 ['os', 'osv', 'lan', 'sid']
remove_list = ['os', 'lan', 'sid']
col = features.columns.tolist()
for i in remove_list:
    col.remove(i)
features = features[col]

In [6]:
import time
from datetime import datetime

def get_date(features):
    
    features2 = features.copy()
    # 除以1000 转化为日期格式
    features2['timestamp'] = features2['timestamp'].apply(lambda x: datetime.fromtimestamp(x/1000))
    
    # 创建时间索引
    temp = pd.DatetimeIndex(features2['timestamp'])
    features2['year'] = temp.year
    features2['month'] = temp.month
    features2['day'] = temp.day
    features2['week_day'] = temp.weekday
    features2['hour'] = temp.hour
    features2['minute'] = temp.minute
    
    
    start_time = features2['timestamp'].min()
    features2['time_diff'] = features2['timestamp'] - start_time
    
    features2['time_diff'] = features2['time_diff'].dt.days * 24 + features2['time_diff'].dt.seconds / 3600
    
    features2.drop(['timestamp', 'year', 'month', 'week_day', 'minute'], axis = 1, inplace=True)
    return features2


features = get_date(features)
features[['day', 'hour', 'time_diff']]

test1 = get_date(test1)

In [7]:
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import accuracy_score

def ensemble_model(clf, train_x, train_y, test, cate_features):
    sk = StratifiedKFold(n_splits=5, shuffle=True, random_state=2021)
    prob = []
    mean_acc = 0
    for k, (train_index, val_index) in enumerate(sk.split(train_x, train_y)):
        train_x_real = train_x.iloc[train_index]
        train_y_real = train_y.iloc[train_index]
        val_x = train_x.iloc[val_index]
        val_y = train_y.iloc[val_index]
        
        clf = clf.fit(train_x_real, train_y_real, categorical_feature=cate_features)
        val_y_pred = clf.predict(val_x)
        
        acc_val = accuracy_score(val_y, val_y_pred)
        print('第{}个子模型， acc{}'.format(k+1, acc_val))
        mean_acc += acc_val / 5
        
        test_y_pred = clf.predict_proba(test)[:, -1]
        prob.append(test_y_pred)
    print(mean_acc)
    mean_prob = sum(prob) / 5
    return mean_prob

In [8]:

# 特征变换，对于数值过大的异常值，设定为0
features['fea_hash'] = features['fea_hash'].map(lambda x: 0 if len(str(x)) > 16 else int(x))
features['fea1_hash'] = features['fea1_hash'].map(lambda x: 0 if len(str(x)) > 16 else int(x))

features['version'] = features['version'].map(lambda x: int(x) if str(x).isdigit() else 0)
# 特征变换，对于数值过大的异常值，设定为0
features['osv'] = all_df[all_df['label'].notnull()]['osv']


# test data 
test_fea = test1[features.columns]

test_fea['fea_hash'] = features['fea_hash'].map(lambda x: 0 if len(str(x)) > 16 else int(x))
test_fea['fea1_hash'] = features['fea1_hash'].map(lambda x: 0 if len(str(x)) > 16 else int(x))
test_fea['version'] = features['version'].map(lambda x: int(x) if str(x).isdigit() else 0)
test_fea['osv'] = all_df[all_df['label'].isnull()]['osv']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_fea['fea_hash'] = features['fea_hash'].map(lambda x: 0 if len(str(x)) > 16 else int(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_fea['fea1_hash'] = features['fea1_hash'].map(lambda x: 0 if len(str(x)) > 16 else int(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_fea['version

In [9]:
import lightgbm as lgb

clf = lgb.LGBMClassifier()
# clf.fit(features, labels, categorical_feature = cate_features)
result = ensemble_model(clf, features, labels, test_fea, cate_features)
print(result)




第1个子模型， acc0.88158




第2个子模型， acc0.88122




第3个子模型， acc0.88185




第4个子模型， acc0.88211




第5个子模型， acc0.88169
0.88169
[0.06830039 0.42071703 0.03394626 ... 0.93690519 0.56215164 0.96564357]


In [10]:

a = pd.DataFrame(test1['sid'])
a['label'] = result


a['label'] = a['label'].apply(lambda x: 0 if x < 0.5 else 1)


import datetime
timestr = datetime.datetime.now().strftime('%y-%m-%d %H %M %S')

a.to_csv('result/v4 ' + timestr + '.csv', index=False)

