In [1]:
# 第三版，使用时间
# score = 86.766
import pandas as pd
import numpy as np

train = pd.read_csv('../../datasets/fraud_prediction/train.csv')
test1 = pd.read_csv('../../datasets/fraud_prediction/test1.csv')

In [2]:
features = train.drop(['Unnamed: 0', 'label'], axis=1)
labels = train['label']
features.columns

Index(['android_id', 'apptype', 'carrier', 'dev_height', 'dev_ppi',
       'dev_width', 'lan', 'media_id', 'ntt', 'os', 'osv', 'package', 'sid',
       'timestamp', 'version', 'fea_hash', 'location', 'fea1_hash',
       'cus_type'],
      dtype='object')

In [3]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# 训练集和测试机合并，统一LabelEncoder

all_df = pd.concat([train, test1])
all_df['osv'] = all_df['osv'].astype('str')
all_df['osv'] = le.fit_transform(all_df['osv'])

features['osv'].value_counts()

8.1.0                                 151419
9                                      71332
6.0.1                                  31714
7.1.1                                  26503
9.0.0                                  24385
                                       ...  
2.2.2                                      1
9.1.0                                      1
Android 4.3.1                              1
f073b_changxiang_v01_b1b8_20180915         1
4                                          1
Name: osv, Length: 154, dtype: int64

In [4]:
for feature in features.columns:
    print(feature, train[feature].nunique())

# 类别特征
cate_features = ['apptype', 'carrier', 'ntt', 'version', 'location', 'cus_type']

android_id 362258
apptype 89
carrier 5
dev_height 798
dev_ppi 92
dev_width 346
lan 21
media_id 284
ntt 8
os 2
osv 154
package 1950
sid 500000
timestamp 500000
version 22
fea_hash 402980
location 332
fea1_hash 4959
cus_type 58


In [5]:
# 不参与建模的特征 ['os', 'osv', 'lan', 'sid']
remove_list = ['os', 'lan', 'sid']
col = features.columns.tolist()
for i in remove_list:
    col.remove(i)
features = features[col]

In [6]:
import time
from datetime import datetime

def get_date(features):
    
    features2 = features.copy()
    # 除以1000 转化为日期格式
    features2['timestamp'] = features2['timestamp'].apply(lambda x: datetime.fromtimestamp(x/1000))
    
    # 创建时间索引
    temp = pd.DatetimeIndex(features2['timestamp'])
    features2['year'] = temp.year
    features2['month'] = temp.month
    features2['day'] = temp.day
    features2['week_day'] = temp.weekday
    features2['hour'] = temp.hour
    features2['minute'] = temp.minute
    
    
    start_time = features2['timestamp'].min()
    features2['time_diff'] = features2['timestamp'] - start_time
    
    features2['time_diff'] = features2['time_diff'].dt.days * 24 + features2['time_diff'].dt.seconds / 3600
    
    features2.drop(['timestamp', 'year', 'month', 'week_day', 'minute'], axis = 1, inplace=True)
    return features2


features = get_date(features)
features[['day', 'hour', 'time_diff']]

test1 = get_date(test1)

In [7]:
features

Unnamed: 0,android_id,apptype,carrier,dev_height,dev_ppi,dev_width,media_id,ntt,osv,package,version,fea_hash,location,fea1_hash,cus_type,day,hour,time_diff
0,316361,1199,46000.0,0.0,0.0,0.0,104,6.0,9,18,8,2135019403,0,2329670524,601,7,15,111.535278
1,135939,893,0.0,0.0,0.0,0.0,19,6.0,8.1,0,4,2782306428,1,2864801071,1000,8,19,139.671944
2,399254,821,0.0,760.0,0.0,360.0,559,0.0,8.1.0,0,0,1392806005,2,628911675,696,6,23,95.971111
3,68983,1004,46000.0,2214.0,0.0,1080.0,129,2.0,8.1.0,0,0,3562553457,3,1283809327,753,9,8,152.993333
4,288999,1076,46000.0,2280.0,0.0,1080.0,64,2.0,8.0.0,0,5,2364522023,4,1510695983,582,7,8,104.472222
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499995,392477,1028,46000.0,1920.0,3.0,1080.0,144,6.0,7.1.2,25,7,861755946,79,140647032,373,6,23,95.238056
499996,346134,1001,0.0,1424.0,0.0,720.0,29,2.0,8.1.0,0,3,1714444511,23,2745131047,525,6,17,89.681111
499997,499635,761,46000.0,1280.0,0.0,720.0,54,6.0,6.0.1,9,0,3843262581,25,1326115882,810,5,3,51.248889
499998,239786,917,46001.0,960.0,0.0,540.0,109,2.0,5.1.1,0,0,1984296118,225,1446741112,772,7,0,96.990556


In [8]:

# 特征变换，对于数值过大的异常值，设定为0
features['fea_hash'] = features['fea_hash'].map(lambda x: 0 if len(str(x)) > 16 else int(x))
features['fea1_hash'] = features['fea1_hash'].map(lambda x: 0 if len(str(x)) > 16 else int(x))

features['version'] = features['version'].map(lambda x: int(x) if str(x).isdigit() else 0)
# 特征变换，对于数值过大的异常值，设定为0
features['osv'] = all_df[all_df['label'].notnull()]['osv']


# test data 
test_fea = test1[features.columns]

test_fea['fea_hash'] = features['fea_hash'].map(lambda x: 0 if len(str(x)) > 16 else int(x))
test_fea['fea1_hash'] = features['fea1_hash'].map(lambda x: 0 if len(str(x)) > 16 else int(x))
test_fea['version'] = features['version'].map(lambda x: int(x) if str(x).isdigit() else 0)
test_fea['osv'] = all_df[all_df['label'].isnull()]['osv']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_fea['fea_hash'] = features['fea_hash'].map(lambda x: 0 if len(str(x)) > 16 else int(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_fea['fea1_hash'] = features['fea1_hash'].map(lambda x: 0 if len(str(x)) > 16 else int(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_fea['version

In [9]:
import lightgbm as lgb

clf = lgb.LGBMClassifier()
clf.fit(features, labels, categorical_feature = cate_features)

a = pd.DataFrame(test1['sid'])
a['label'] = clf.predict(test_fea)

a.to_csv('v3.csv', index=False)

