In [1]:
import pandas as pd
import numpy as np
import gc
from joblib import Parallel, delayed
import os
import warnings
warnings.filterwarnings('ignore')

import time
from datetime import timedelta, datetime
from tqdm import tqdm
import re

feature_path = 'feature/'
model_path = 'model/'
submit_path = 'submit/'
#stacking feature data
metafeature_path = 'meta-feature/'
pic_path = 'pic/'
train_data_1_len = 1000000
train_data_2_len = 5000000
test_data_len = 1000000

def reduce_mem_usage(data):
    '''
    通过判断数据范围的上下限来选择最小能存储数据的类型
    注意:在存储feather前不要使用,因为feather不支持float16位类型
    data:输入dataframe
    return:返回优化后的dataframe
    '''
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = data.memory_usage().sum() / 1024**2    
    for col in data.columns:
        col_type = data[col].dtypes
        if col_type in numerics:
            c_min = data[col].min()
            c_max = data[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    data[col] = data[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    data[col] = data[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    data[col] = data[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    data[col] = data[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    #为避免feather不支持此类型
                    #data[col] = data[col].astype(np.float16)
                    pass
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    data[col] = data[col].astype(np.float32)
                else:
                    data[col] = data[col].astype(np.float64)    
    end_mem = data.memory_usage().sum() / 1024**2
    print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return data
        

def combine_feature_feather(feature_file_list=None):
    '''
    从feature文件夹中加载feature
    feature_file_list:默认加载feature_path下所有feature文件(除了整个文件)；可传入列表选择文件
    '''
    if feature_file_list != None and type(feature_file_list) != list:
        raise ValueError("Unable to load features from {0}".format(str(feature_file_list)))
    
    if feature_file_list == None:
        #遍历feature文件
        feature_file_list = os.listdir(feature_path)
    
    if len(feature_file_list) < 1:
        print('Empty feature file list')
        return None
    
    # 无法读取混合类型的文件
    # 请将feature_path目录下文件统一格式存储
    #file_type_num = len(set([x.split('.') for x in feature_file_list]))
    #if file_type_num > 1:
     #   raise ValueError("Unable to load features from mix type feature file : file type number-{0}".format(str(file_type_num)))
    
    #并行读取文件
    def df_append(file):
        return pd.read_feather(file)
    
    feature_df_list = Parallel(n_jobs=-1,verbose=10)(delayed(df_append)(feature_path+file_name) for file_name in feature_file_list if file_name != 'combine.bin')
    
    print(len(feature_df_list))
    data = pd.concat(feature_df_list,axis=1).reset_index(drop=True)
    del feature_df_list
    gc.collect()
    data.to_feather(feature_path+'combine.bin')
    return data

def create_screen(data):
    cols = ['h','w','ppi','h_w_ratio','screen_area','size','orientation','creative_dpi','px']
    #高 除以 宽
    #data['w'][data['w']==-1.0] = 0.0
    #data['h'][data['h']==-1.0] = 0.0
    data['ppi'] = data['ppi'].fillna(0)
    
    group_col = 'model'
    for col in ['h','w','ppi']:
        tmp = data[[group_col,col]]
        tmp = tmp.groupby(by=[group_col])[col].mean()  #以'model'为划分求平均值
        tmp = tmp.reset_index().rename(columns={col: 'tmp'})   #划分序号并重命名为tmp

        #merge小结：on = ''为主键， how= 'outer'为并集，没有的取nan，'inner'为交集，'left'为全取左边，右边有就填充， 'right'相反
        data = data.merge(tmp, on=[group_col], how='left',copy=False,sort=False) 

        data['tmp'][data[col]!=0] = 0
        data[col]+=data['tmp']
        data.drop(['tmp'],axis=1,inplace=True)
    
    data['h_w_ratio'] = data['h']/data['w']
    
    # 屏幕尺寸 合并成宽和高
    data['screen_area'] = data['w'] * data['h']
    
    #英寸尺寸
    data['size'] = (np.sqrt(data['h'] ** 2 + data['w'] ** 2) / 2.54) / 1000
    data['px'] = data['ppi'] * data['size']
    
    data['creative_dpi'] = data['h'].astype(str) + "_" + data['w'].astype(str)

    # orientation 出现异常值 90度和2 归为 0
    data['orientation'][(data['orientation'] == 90) | (data['orientation'] == 2)] = 0
    data['orientation'] = data['orientation'].fillna(0)
    data['orientation'] = data['orientation'].astype(int)
    
    data[cols] = create_category_encoder(data[cols])
    data[cols].to_feather(feature_path+'screen_ft.bin')
    print('create screen_feature done')
    return data

def create_time(data):
    cols = ['hour','minute','period','nginxtime-begin_time']
    # 处理时间
    data['datetime'] = pd.to_datetime(data['nginxtime'] / 1000, unit='s') + timedelta(hours=8)
    data['hour'] = data['datetime'].dt.hour
    
    data['minute'] = data['datetime'].dt.hour*60 + data['datetime'].dt.minute

    data['period'] = pd.Series(np.zeros(len(data)))-1
    data.period[(data.hour>=22) | (data.hour<2)] = 1 #半夜
    data.period[(data.hour>=2) & (data.hour<6)] = 2 #凌晨
    data.period[(data.hour>=6) & (data.hour<10)] = 3 #早上
    data.period[(data.hour>=10) & (data.hour<14)] = 4 #中午
    data.period[(data.hour>=14) & (data.hour<18)] = 5 #下午
    data.period[(data.hour>=18) & (data.hour<22)] = 6 #晚上
    
    data['period'] = data['period'].astype(int)

    #请求会话时间，使用lambda函数，将取sid最后的一串数字作为开始时间
    data['begin_time'] = data['sid'].apply(lambda x:int(x.split('-')[-1])) 
    #请求会话时间 与 请求到达服务时间的差
    data['nginxtime-begin_time'] = data['nginxtime']-data['begin_time']
    '''
    df = data[['sid','ver','adunitshowid','nginxtime']].copy()
    df = df.sort_values(by='nginxtime')
    df['ver&adunitshowid_nextClick'] = (df.groupby(['ver','adunitshowid']).nginxtime.shift(-1)/1000 - df.nginxtime/1000).astype(np.float32)
    df['ver&adunitshowid_lastClick'] = (df.nginxtime/1000 - df.groupby(['ver','adunitshowid']).nginxtime.shift(+1)/1000).astype(np.float32)
    df['adunitshowid_nextClick'] = (df.groupby(['adunitshowid']).nginxtime.shift(-1)/1000 - df.nginxtime/1000).astype(np.float32)
    df['adunitshowid_lastClick'] = (df.nginxtime/1000 - df.groupby(['adunitshowid']).nginxtime.shift(+1)/1000).astype(np.float32)
    time_shift_cols = ['ver&adunitshowid_nextClick','ver&adunitshowid_lastClick','adunitshowid_nextClick','adunitshowid_lastClick']
    %time data = data.merge(df[['sid']+time_shift_cols],how='left',on='sid',copy=False,sort=False)
    print(data.columns)
    del df
    gc.collect()
    data[time_shift_cols].to_feather(feature_path+'time_shift_ft.bin')
    '''
    data = data.drop(['nginxtime','datetime','begin_time'],axis=1)
    
    #data['hour'] = data['hour'].astype(str)
    #data['minute'] = data['minute'].astype(str)
    #data['period'] = data['period'].astype(str)
    data[cols].to_feather(feature_path+'time_ft.bin')
    print('create time done')
    return data

def create_region(data):
    cols = ['city','province']
    
    city_list_raw = ['云浮市', '平顶山市', '莱芜市', '齐齐哈尔市', '黑河市', '巴中市', '达州市', '烟台市', '滨州市', '江门市', '内江市', '盐城市', '青岛市', '梅州市', '张家界市', '鞍山市', '锡林郭勒盟', '汕头市', '庆阳市', '九江市', '东莞市', '凉山彝族自治州', '日照市', '广元市', '昆明市', '周口市', '海南藏族自治州', '白城市', '巴音郭楞蒙古自治州', '攀枝花市', '克孜勒苏柯尔克孜自治州', '秦皇岛市', '惠州市', '红河哈尼族彝族自治州', '宁波市', '巴彦淖尔市', '泰州市', '驻马店市', '南昌市', '普洱市', '南阳市', '丽水市', '乐山市', '株洲市', '泸州市', '保定市', '三门峡市', '乌鲁木齐市', '营口市', '石嘴山市', '怒江傈僳族自治州', '长春市', '镇江市', '南京市', '阳江市', '晋中市', '洛阳市', '宜春市', '曲靖市', '包头市', '滁州市', '唐山市', '无锡市', '广州市', '鹤岗市', '孝感市', '双鸭山市', '漯河市', '银川市', '聊城市', '延边朝鲜族自治州', '南通市', '成都市', '赤峰市', '桂林市', '深圳市', '眉山市', '河源市', '丽江市', '宿迁市', '本溪市', '临沧市', '黔东南苗族侗族自治州', '绥化市', '芜湖市', '伊春市', '池州市', '西宁市', '楚雄彝族自治州', '厦门市', '济宁市', '呼伦贝尔市', '安阳市', '抚州市', '咸阳市', '崇左市', '延安市', '鄂尔多斯市', '台州市', '临夏回族自治州', '丹东市', '中山市', '武威市', '安庆市', '天水市', '合肥市', '福州市', '新乡市', '苏州市', '潮州市', '安康市', '定西市', '陇南市', '葫芦岛市', '枣庄市', '昭通市', '鹤壁市', '和田地区', '重庆市', '辽源市', '文山壮族苗族自治州', '大兴安岭地区', '塔城地区', '西双版纳傣族自治州', '襄阳市', '连云港市', '淮安市', '阿克苏地区', '迪庆藏族自治州', '海西蒙古族藏族自治州', '固原市', '杭州市', '常州市', '衡水市', '邢台市', '吉安市', '佳木斯市', '宁德市', '德阳市', '阜阳市', '岳阳市', '临沂市', '舟山市', '阳泉市', '东营市', '儋州市', '荆州市', '怀化市', '海北藏族自治州', '许昌市', '通辽市', '呼和浩特市', '阜新市', '铁岭市', '铜陵市', '兰州市', '扬州市', '绵阳市', '中卫市', '宜宾市', '遂宁市', '长治市', '济南市', '蚌埠市', '武汉市', '张掖市', '三亚市', '吕梁市', '嘉兴市', '雅安市', '来宾市', '四平市', '拉萨市', '汕尾市', '运城市', '乌海市', '南平市', '上饶市', '阿坝藏族羌族自治州', '果洛藏族自治州', '阿勒泰地区', '德州市', '濮阳市', '保山市', '湖州市', '沧州市', '商丘市', '佛山市', '荆门市', '哈尔滨市', '安顺市', '亳州市', '沈阳市', '晋城市', '淮北市', '鹰潭市', '汉中市', '鄂州市', '梧州市', '辽阳市', '松原市', '自治区直辖县级行政区划', '衢州市', '乌兰察布市', '玉树藏族自治州', '承德市', '嘉峪关市', '澳门', '伊犁哈萨克自治州', '泉州市', '兴安盟', '宿州市', '临汾市', '景德镇市', '邯郸市', '大同市', '西安市', '信阳市', '石家庄市', '阿拉善盟', '吉林市', '十堰市', '张家口市', '玉林市', '酒泉市', '资阳市', '金华市', '黄山市', '绍兴市', '淄博市', '衡阳市', '黄石市', '廊坊市', '克拉玛依市', '甘南藏族自治州', '锦州市', '香港', '通化市', '揭阳市', '郑州市', '海口市', '钦州市', '清远市', '白山市', '焦作市', '六盘水市', '昌吉回族自治州', '长沙市', '黄南藏族自治州', '广安市', '喀什地区', '南宁市', '韶关市', '阿里地区', '天津市', '大连市', '菏泽市', '萍乡市', '新余市', '太原市', '商洛市', '六安市', '大理白族自治州', '漳州市', '甘孜藏族自治州', '吴忠市', '茂名市', '忻州市', '赣州市', '德宏傣族景颇族自治州', '遵义市', '马鞍山市', '铜川市', '宣城市', '宜昌市', '盘锦市', '随州市', '贵阳市', '北海市', '贺州市', '温州市', '黔西南布依族苗族自治州', '白银市', '娄底市', '榆林市', '黔南布依族苗族自治州', '开封市', '金昌市', '台湾', '防城港市', '肇庆市', '柳州市', '龙岩市', '咸宁市', '朔州市', '三明市', '南充市', '湘西土家族苗族自治州', '潍坊市', '北京市', '徐州市', '玉溪市', '百色市', '朝阳市', '大庆市', '黄冈市', '湘潭市', '渭南市', '威海市', '上海市', '莆田市', '泰安市', '博尔塔拉蒙古自治州', '常德市', '珠海市', '恩施土家族苗族自治州', '淮南市', '贵港市', '永州市', '宝鸡市', '湛江市', '自贡市', '益阳市', '牡丹江市', '河池市', '鸡西市', '抚顺市', '七台河市', '郴州市', '邵阳市', '平凉市']
    city_to_province = ['广东省','河南省','山东省','黑龙江省','黑龙江省','四川省','四川省','山东省','山东省','广东省','四川省','江苏省','山东省','广东省','湖南省','辽宁省','内蒙古省','广东省','甘肃省','江西省','广东省','四川省','山东省','四川省','云南省','河南省','青海省','吉林省','新疆维吾尔省','四川省','新疆维吾尔省','河北省','广东省','云南省','浙江省','内蒙古省','江苏省','河南省','江西省','云南省','河南省','浙江省','四川省','湖南省','四川省','河北省','河南省','新疆维吾尔省','辽宁省','宁夏省','云南省','吉林省','江苏省','江苏省','广东省','山西省','河南省','江西省','云南省','内蒙古省','安徽省','河北省','江苏省','广东省','黑龙江省','湖北省','黑龙江省','河南省','宁夏省','山东省','吉林省','江苏省','四川省','内蒙古省','广西省','广东省','四川省','广东省','云南省','江苏省','辽宁省','云南省','贵州省','黑龙江省','安徽省','黑龙江省','安徽省','青海省','云南省','福建省','山东省','内蒙古省','河南省','江西省','陕西省','广西省','陕西省','内蒙古省','浙江省','甘肃省','辽宁省','广东省','甘肃省','安徽省','甘肃省','安徽省','福建省','河南省','江苏省','广东省','陕西省','甘肃省','甘肃省','辽宁省','山东省','云南省','河南省','新疆维吾尔省','重庆市','吉林省','云南省','黑龙江省','新疆维吾尔省','云南省','湖北省','江苏省','江苏省','新疆维吾尔省','云南省','青海省','宁夏省','浙江省','江苏省','河北省','河北省','江西省','黑龙江省','福建省','四川省','安徽省','湖南省','山东省','浙江省','山西省','山东省','海南省','湖北省','湖南省','青海省','河南省','内蒙古省','内蒙古省','辽宁省','辽宁省','安徽省','甘肃省','江苏省','四川省','宁夏省','四川省','四川省','山西省','山东省','安徽省','湖北省','甘肃省','海南省','山西省','浙江省','四川省','广西省','吉林省','西藏省','广东省','山西省','内蒙古省','福建省','江西省','四川省','青海省','新疆维吾尔省','山东省','河南省','云南省','浙江省','河北省','河南省','广东省','湖北省','黑龙江省','贵州省','安徽省','辽宁省','山西省','安徽省','江西省','陕西省','湖北省','广西省','辽宁省','吉林省','other_province','浙江省','内蒙古省','青海省','河北省','甘肃省','澳门','新疆维吾尔省','福建省','内蒙古省','安徽省','山西省','江西省','河北省','山西省','陕西省','河南省','河北省','内蒙古省','吉林省','湖北省','河北省','广西省','甘肃省','四川省','浙江省','安徽省','浙江省','山东省','湖南省','湖北省','河北省','新疆维吾尔省','甘肃省','辽宁省','香港','吉林省','广东省','河南省','海南省','广西省','广东省','吉林省','河南省','贵州省','新疆维吾尔省','湖南省','青海省','四川省','新疆维吾尔省','广西省','广东省','西藏省','天津省','辽宁省','山东省','江西省','江西省','山西省','陕西省','安徽省','云南省','福建省','四川省','宁夏省','广东省','山西省','江西省','云南省','贵州省','安徽省','陕西省','安徽省','湖北省','辽宁省','湖北省','贵州省','广西省','广西省','浙江省','贵州省','甘肃省','湖南省','陕西省','贵州省','河南省','甘肃省','台湾省','广西省','广东省','广西省','福建省','湖北省','山西省','福建省','四川省','湖南省','山东省','北京市','江苏省','云南省','广西省','辽宁省','黑龙江省','湖北省','湖南省','陕西省','山东省','上海市','福建省','山东省','新疆维吾尔省','湖南省','广东省','湖北省','安徽省','广西省','湖南省','陕西省','广东省','四川省','湖南省','黑龙江省','广西省','黑龙江省','辽宁省','黑龙江省','湖南省','湖南省','甘肃省']
    city_list_raw = pd.Series(city_list_raw)
    city_to_province = pd.Series(city_to_province)
    temp = pd.concat([city_list_raw,city_to_province],axis=1).reset_index(drop=True)
    temp.columns=['city','province']
    #使用merge函数合并,比遍历快
    data.merge(temp ,on = 'city',how = 'left',copy=False,sort=False)
    
    province_in_city = ['安徽','广东','云南','陕西','江苏','辽宁','山东','湖北','河北','内蒙古','湖南','浙江','四川','河南']
    for p in province_in_city:
        data['province'][data['city']==p] = p+'省'
    
    data['city'][data['city'].isnull()]='empty'
    data['province'][data['city'].isnull()]='empty'
    '''
    for i,city in tqdm(enumerate(city_list_raw),mininterval=1.0):
        try:
            data['province'][data['city']==city] = city_to_province[i]
        except:
            data['province'][data['city']==city] = 'empty'
            print(city)
    '''
    data['province'][data['province']=='other_province'] = 'empty'
    
    data[cols] = create_category_encoder(data[cols])
    data[cols].to_feather(feature_path+'region_ft.bin')
    
    print('create region done')
    return data

def create_ip(data):
    cols = ['ip','ip1','reqrealip','ip2','ip3','reqrealip1','reqrealip2','reqrealip3','ip_equal']
    # 对ip地址和reqrealip地址进行分割
    data['ip'] = data['ip'].astype(str)
    data['reqrealip'] = data['reqrealip'].astype(str)
    data['ip1'] = data['ip'].map(lambda x:'.'.join(x.split('.')[:1]))
    data['ip2'] = data['ip'].apply(lambda x: '.'.join(x.split('.')[0:2]))
    data['ip3'] = data['ip'].apply(lambda x: '.'.join(x.split('.')[0:3]))
    data['reqrealip1'] = data['reqrealip'].map(lambda x:'.'.join(x.split('.')[:1]))
    data['reqrealip2'] = data['reqrealip'].apply(lambda x: '.'.join(x.split('.')[0:2]))
    data['reqrealip3'] = data['reqrealip'].apply(lambda x: '.'.join(x.split('.')[0:3]))
    data['ip_equal'] = (data['ip'] == data['reqrealip']).astype(int)
    
    data[cols] = create_category_encoder(data[cols])
    data[cols].to_feather(feature_path+'ip_ft.bin')
    return data

#检查一些特征是空值或是重复，作为特征
#先处理地理位置
def create_empty(data):
    cols = ['empty_count']
    all_col=[]
    for col in [c for c in data.columns if c not in ['sid','label','orentation']]:
        data['is_{0}_empty'.format(col)] = pd.DataFrame(np.zeros(len(data)))
        data['is_{0}_empty'.format(col)][data[col]=='empty'] = 1
        data['is_{0}_empty'.format(col)][data[col].isnull()] = 1
        data['is_{0}_empty'.format(col)][(data[col]==0.0)|(data[col]==-1.0)] = 1
        all_col.append('is_{0}_empty'.format(col))
    data['empty_count'] = pd.DataFrame(np.zeros(len(data)))
    for col in all_col:
        data['empty_count']+=data[col]
    data = data.drop(all_col,axis=1)
    
    #data[cols] = create_category_encoder(data[cols])
    data[cols].to_feather(feature_path+'empty_ft.bin')
    print('create empty feature done')
    return data

#清洗系统版本
def osv_summary(x):
    idx = x.find('.')
    s='empty'
    if idx==-1 and len(x)>1:
        return s
    if len(x)==1:
        return x
    try:
        s = x[idx-1]
    except:
        pass
    return s

#先匹配'xx.xx.xx,如果不是再匹配xx.xx'
pt1 = re.compile('[0-9]+\.[0-9]+\.[0-9]+')
pt2 = re.compile('[0-9]+\.[0-9]+')
def clean_osv(x):
    result = re.search(pt1,x)
    if result == None:
        result = re.search(pt2,x)
    if result == None:
        x = x.strip('android_')
        return x.strip('android')
    return result.group()

#创建软件版本
def create_software(data):
    cols = ['dvctype','apptype','carrier','lan','ntt','new_ntt','osv','osv_summary','ver','pkgname']
    data['dvctype'] = data['dvctype'].astype(str)
    
    data['apptype'][data['apptype']==-1.0] = 0.0
    data['apptype'] = data['apptype'].astype(str)
    data['carrier'][data['carrier']==-1.0] = 0.0
    data['carrier'] = data['carrier'].astype(str)
    
    data['lan'] = data['lan'].fillna('empty')
    data['lan'] = data['lan'].apply(lambda x: x.lower()).apply(lambda x: x.replace('_','-'))
    data['lan'][data['lan']=='zh'] = 'zh-cn'
    data['lan'][data['lan']=='zh-'] = 'zh-cn'
    data['lan'][data['lan']=='_cn'] = 'zh-cn'
    
    #data['ntt'][data['ntt']==7.0] = 0.0
    
    # 处理 ntt 的数据特征 但是不删除之前的特征 将其归为新的一列数据
    data['new_ntt'] = data['ntt']
    data.new_ntt[(data.new_ntt == 0) | (data.new_ntt == 7)] = 0
    data.new_ntt[(data.new_ntt == 1) | (data.new_ntt == 2)] = 1
    data.new_ntt[data.new_ntt == 3] = 2
    data.new_ntt[(data.new_ntt >= 4) & (data.new_ntt <= 6)] = 3
    data['ntt'] = data['ntt'].astype(str)
    data['new_ntt'] = data['new_ntt'].astype(str)
    
    data['osv'] = data['osv'].astype(str)
    data['osv'][data['osv']=='unknown'] = 'empty'
    data['osv'] = data['osv'].fillna('empty')
    data['osv'] = data['osv'].apply(lambda x: str(x).lower()).apply(lambda x: x.replace(',','.')).apply(clean_osv)
    #data['osv'] = data['osv'].apply(lambda x: str(x).lower()).apply(lambda x: x.replace(',','.'))
    data['osv_summary'] = data['osv'].apply(osv_summary)
    
    data['ver'] = data['ver'].fillna('empty')
    
    data[cols] = create_category_encoder(data[cols])
    data[cols].to_feather(feature_path+'software_ft.bin')
    
    print("create software done")
    return data

#清洗厂商
def clean_make(x):
    if 'lenovo' in x or '联想' in x:
        return 'lenovo'
    if 'zte' in x or '中兴' in x:
        return 'zte'
    if 'meitu' in x or '美图' in x:
        return 'meitu'
    if '360' in x or '奇酷' in x or 'qiku' in x:
        return '360'
    if 'smartisan' in x or '锤子' in x:
        return 'smartisan'
    if 'sony' in x or '索尼' in x:
        return 'smartisan'
    if 'gree' in x or '格力' in x:
        return 'gree'
    if 'bbk' in x or '步步高' in x:
        return 'bbk'
    if 'google' in x or 'nexus' in x or '谷歌' in x:
        return 'google'
    if 'motorola' in x or '摩托罗拉' in x:
        return 'motorola'
    if 'changhong' in x or '长虹' in x:
        return 'changhong'
    if 'coolpad' in x or '酷派' in x:
        return 'coolpad'
    if 'hisense' in x or '海信' in x:
        return 'hisense'
    if 'letv' in x or 'lemobile' in x or '乐视' in x:
        return 'letv'
    if 'cmcc' in x or '移动' in x or 'cmdc' in x:
        return 'cmcc'
    if 'doov' in x or '朵唯' in x:
        return 'doov'
    if 'xiaolajiao' in x or '小辣椒' in x:
        return 'xiaolajiao'
    if '4g' in x:
        return '4g'
    if 'iphone' in x or 'apple' in x or '苹果' in x or 'ipad' in x:
        return 'apple'
    if '华为' in x or 'huawei' in x or "荣耀" in x or 'honor' in x or '-al' in x or '-tl' in x:
        return 'huawei'
    if "魅族" in x or 'meizu' in x:
        return 'meizu'
    if "金立" in x or 'gionee' in x:
        return 'gionee'
    if "三星" in x or 'samsung' in x or 'sm' in x:
        return 'samsung'
    if 'xiaomi' in x or '小米' in x or 'mi' in x or 'mix' in x:
        return 'xiaomi'
    if 'redmi' in x or '红米' in x:
        return 'redmi'
    if 'oppo' in x or 'm00' in x or 'm10' in x or 'm20' in x or 'm30' in x:
        return 'oppo'
    if 'vivo' in x:
        return 'vivo'
    if 'oneplus' in x or '一加' in x:
        return 'oneplus'
    if 'nubia' in x or '努比亚' in x:
        return 'nubia'
    if 'nokia' in x or '诺基亚' in x or 'hme' in x:
        return 'nokia'
    return x

#清理硬件特征
def create_hardware(data):
    cols = ['make','model','big_model','model_equal_make']
    
    data['make'] = data['make'].fillna('empty')
    
    data['make'] = data['make'].apply(lambda x: x.lower()).apply(clean_make)
    #data['make'] = data['make'].apply(lambda x: x.lower())
    
    data['make'][data['make']=='unknown'] = 'empty'
    data['make'][data['make']=='-'] = 'empty'
    data['make'][data['make']=='0'] = 'empty'
    
    #data['os'][data['make']=='apple'] = 'ios'
    
    data['model'] = data['model'].fillna('empty')
    data['model'][data['model']=='unknown'] = 'empty'
    data['model'] = data['model'].apply(lambda x: x.lower())
    
    data['big_model'] = data['model'].map(lambda x: x.split(' ')[0])
    data['model_equal_make'] = (data['big_model'] == data['make']).astype(int)
    
    data[cols] = create_category_encoder(data[cols])
    data[cols].to_feather(feature_path+'hardware_ft.bin')
    print("create hardware done")
    return data

def create_media(data):
    #无清理,只是模块化存储
    cols = ['adunitshowid','mediashowid']
    data[cols] = create_category_encoder(data[cols])
    data[cols].to_feather(feature_path+'media_ft.bin')
    return data

def create_md5(data):
    #无清理,只是模块化存储
    cols = ['adidmd5','imeimd5','macmd5']
    data[cols] = create_category_encoder(data[cols])
    data[cols].to_feather(feature_path+'md5_ft.bin')
    return data

def create_single_col_count(data,cols):
    # 单特征不同取值计数
    col_name = []
    for col in tqdm(cols):
        data[col + "_count"] = data.groupby([col])[col].transform('count')
        col_name.append(col + "_count")
    data[col_name].to_feather(feature_path+'single_col_count_ft.bin')
    print('create single column counts done')
    return data

def create_category_combine(data,pair_left,pair_right):
    '''
    
    '''
    cols=[]
    for idx,right in tqdm(enumerate(pair_right)):
        for left in pair_left[idx]:
            data[left+'&'+right] = data[left].astype(str)+'_'+data[right].astype(str)
            cols.append(left+'&'+right)
    data[cols] = create_category_encoder(data[cols])
    data[cols].to_feather(feature_path+'category_combine_ft.bin')
    return data


def create_gragh(data):
    UIDs = ['pkgname','adunitshowid','ver','osv','make']
    data['pkgname&adunitshowid&ver&osv&make'] = data['pkgname'].astype(str)+'&'+data['adunitshowid'].astype(str)+'&'+data['ver'].astype(str)+'&'+data['osv'].astype(str)+'&'+data['make'].astype(str)
    
    data['adunitshowid&ver&osv&make'] = data['adunitshowid'].astype(str)+'&'+data['ver'].astype(str)+'&'+data['osv'].astype(str)+'&'+data['make'].astype(str)
    data['pkgname&ver&osv&make'] = data['pkgname'].astype(str)+'&'+data['ver'].astype(str)+'&'+data['osv'].astype(str)+'&'+data['make'].astype(str)
    data['pkgname&adunitshowid&osv&make'] = data['pkgname'].astype(str)+'&'+data['adunitshowid'].astype(str)+'&'+data['osv'].astype(str)+'&'+data['make'].astype(str)
    data['pkgname&adunitshowid&ver&make'] = data['pkgname'].astype(str)+'&'+data['adunitshowid'].astype(str)+'&'+data['ver'].astype(str)+'&'+'&'+data['make'].astype(str)
    data['pkgname&adunitshowid&ver&osv'] = data['pkgname'].astype(str)+'&'+data['adunitshowid'].astype(str)+'&'+data['ver'].astype(str)+'&'+data['osv'].astype(str)
    
    gan = ['pkgname&adunitshowid&ver&osv&make','adunitshowid&ver&osv&make','pkgname&ver&osv&make','pkgname&adunitshowid&osv&make',
          'pkgname&adunitshowid&ver&make','pkgname&adunitshowid&ver&osv']
    data[gan] = create_category_encoder(data[gan]) 
    
    ids = ['imeimd5','macmd5','adidmd5']
    ips = ['ip','ip3','reqrealip']
    device = ['model','make','creative_dpi']
    app = ['pkgname','adunitshowid','ver','osv','apptype','mediashowid']
    
    all_sub_ft = ids+ips+device+app
    
    data.drop([col for col in data.columns if col not in UIDs+all_sub_ft+gan+['sid','label']],axis=1,inplace=True)
    
    UID = 'pkgname&adunitshowid&ver&osv&make'
    #一度关联
    for col in tqdm(all_sub_ft):
        data = data.merge(data.groupby([UID])[col].nunique().rename('UID_'+col+'_nunique').reset_index(),on=UID,how='left',copy=False,sort=False)
        data = data.merge(data.groupby([UID])[col].count().rename('UID_'+col+'_count').reset_index(),on=UID,how='left',copy=False,sort=False)
        data = reduce_mem_usage(data)
        print(len(data.columns))
    #二度关联
    for col in tqdm(all_sub_ft):
        data = data.merge(data.groupby([col])[UID].nunique().rename(col+'_UID'+'_nunique').reset_index(),on=col,how='left',copy=False,sort=False)
        data = data.merge(data.groupby([col])[UID].count().rename(col+'_UID'+'_count').reset_index(),on=col,how='left',copy=False,sort=False)
        data = reduce_mem_usage(data)
        print(len(data.columns))
    for col in tqdm(all_sub_ft):
        cname = col+'_UID'
        temp = data.groupby(UID)[cname+'_nunique'].agg(['sum', 'max','mean']).add_prefix(cname+'_nunique_').reset_index()
        data = data.merge(temp,on=UID,how='left',copy=False,sort=False)
        temp = data.groupby(UID)[cname+'_count'].agg(['sum']).add_prefix(cname+'_nunique_').reset_index()
        data = data.merge(temp,on=UID,how='left',copy=False,sort=False)
        data = reduce_mem_usage(data)
        print(len(data.columns))
        
    data = create_category_encoder(data)    
    data.to_feather(feature_path+'connection_gragh.bin')
    
    return data
    
def create_combine_gragh(data):
    from itertools import combinations
    device = ['make','model','creative_dpi','dvctype','ntt','carrier','h_w_ratio','h','w','size','px']
    other = ['dvctype','ntt','carrier','osv_summary','lan','province','empty_count']
    media = ['mediashowid','adunitshowid']
    app = ['pkgname','ver','apptype','osv','lan','osv_summary']
    normal_cate = device+media+app
    ####
    ip = ['ip','ip3','reqrealip']
    ids = ['imeimd5','macmd5','adidmd5']
    # 'make','model','creative_dpi' ,'mediashowid','adunitshowid','pkgname','ver','osv',apptype','device_info'
    #device_info - 'dvctype','ntt','carrier','osv_summary','lan','province'
    data.drop([col for col in data.columns if col not in normal_cate+ip+ids+other+['sid','label']],axis=1,inplace=True)
    
    data['device_info'] = data['dvctype'].astype(str)+'&'+data['ntt'].astype(str)+'&'+data['carrier'].astype(str)+'&'+data['osv_summary'].astype(str)+'&'+data['lan'].astype(str)+'&'+data['province'].astype(str)
    
    comb_ft = ['make','model','creative_dpi' ,'mediashowid','adunitshowid','pkgname','ver','osv','apptype','device_info']
    comb_2 = list(combinations(comb_ft,2))
    
    combed_ft = []
    for comb in tqdm(comb_2):
        data[comb[0]+'&'+comb[1]] = data[comb[0]].astype(str)+'&'+data[comb[1]].astype(str)
        combed_ft.append(comb[0]+'&'+comb[1])
        #print(comb[0]+'&'+comb[1])
    print(len(data.columns))
    
    data = create_category_encoder(data)
    data = reduce_mem_usage(data)
    
    comb_4 = list(combinations(comb_ft,4))
    for i,comb in tqdm(enumerate(comb_4)):
        if 'make' in comb and 'model' in comb:
            continue
        if 'mediashowid' in comb and 'adunitshowid' in comb:
            continue
        data[comb[0]+'&'+comb[1]+'&'+comb[2]+'&'+comb[3]] = data[comb[0]].astype(str)+'&'+data[comb[1]].astype(str)+'&'+data[comb[2]].astype(str)+'&'+data[comb[3]].astype(str)
        
        if (i+1)%30 == 0:
            data = create_category_encoder(data)
            data = reduce_mem_usage(data)
    data = create_category_encoder(data)
    data = reduce_mem_usage(data)
    #data = create_single_col_count(data,ip+ids+combed_ft)
    #data = reduce_mem_usage(data)
    #count_cols = [col for col in data.columns if col not in ['sid','label','ntt','dvctype','carrier','apptype','lan']+ip+ids+normal_cate]
    #count_cols = ['pkgname&ver&adunitshowid&model&creative_dpi','osv&lan&adunitshowid&make&dvctype&ntt&carrier','pkgname&ver&apptype&osv&lan']
    '''
    for cate in tqdm(count_cols):
        for col in ip+ids:
            prefix = cate+'_'+col+'_'
            temp = data[[cate,col]].groupby(cate)[col].agg(['nunique','count']).add_prefix(prefix).reset_index()
            data = data.merge(temp,on=cate,how='left',copy=False,sort=False)
            %time data[[prefix+'nunique',prefix+'count']] = reduce_mem_usage(data[[prefix+'nunique',prefix+'count']])
            print(len(data.columns))
    '''
    data.to_feather(feature_path+'combine_gragh.bin')
    return data
    
def create_category_encoder(data,encode_type='label',object_cols=None):
    from sklearn import preprocessing
    #返回内容为字符串的列名
    if object_cols == None:
        object_cols = list(data.dtypes[data.dtypes == np.object].index)
        
    encode_type = encode_type.lower()
    if encode_type == 'label':
        for col in object_cols:
            if col!= 'sid':
                #标签标准化，将object（字符串列）处理
                #例如["paris", "paris", "tokyo", "amsterdam"]；里面不同的标签数目是3个，则标准化标签之后就是0，1，2，并且根据字典排序
                lbl = preprocessing.LabelEncoder()
                #再一次使用transform是因为labelencoder返回的是LabelEncoder对象
                data[col] = lbl.fit_transform(data[col].astype(str))
    elif encode_type == 'catboost':
        from category_encoders.cat_boost import CatBoostEncoder
        train_data = data[:5000000]
        test_data=data[5000000:].reset_index(drop=True)
        train_y = train_data['label']
        # catboost encode  需要保证标签随机排列
        perm = np.random.permutation(len(train_data))
        train_data = train_data.iloc[perm].reset_index(drop=True)
        train_y = train_y.iloc[perm].reset_index(drop=True)

        cb_encoder = CatBoostEncoder(verbose=1,
                                     cols=object_cols,
                                     return_df=True,
                                     handle_unknown='value')
        train_data = cb_encoder.fit_transform(train_data,train_y)
        test_data = cb_encoder.transform(test_data)
        data = pd.concat([train_data,test_data],axis=0).reset_index(drop=True)
    elif encode_type == 'woe':
        from category_encoders.woe import WOEEncoder
        train_data = data[:5000000]
        test_data=data[5000000:].reset_index(drop=True)
        train_y = train_data['label']

        woe_encoder = WOEEncoder(verbose=1,
                                     cols=object_cols,
                                     return_df=True,
                                     handle_unknown='value',
                                random_state=20190920,
                                randomized=True)
        train_data = woe_encoder.fit_transform(train_data,train_y)
        test_data = woe_encoder.transform(test_data)
        data = pd.concat([train_data,test_data],axis=0).reset_index(drop=True)
    else:
        raise ValueError('Unknown encode type:',encode_type)
    print('category encode done')
    return data

True

In [2]:
train_data_1 = pd.read_csv('round1_train.txt','\t')
train_data_2 = pd.read_csv('round2_train.txt','\t')
test_A_data = pd.read_csv('round2_test_A.txt','\t')
test_B_data = pd.read_csv('round2_test_B.txt','\t')
data = pd.concat([train_data_1,train_data_2,test_A_data,test_B_data],axis=0,sort=False).reset_index(drop=True)
del train_data_2,test_A_data,test_B_data
data[['sid','label']].to_feather(feature_path+'sid&label.bin')
gc.collect()

7

In [3]:
%time data = create_region(data)

category encode done
create region done
Wall time: 47.6 s


In [4]:
%time data = create_empty(data)

create empty feature done
Wall time: 56.1 s


In [5]:
%time data = create_ip(data)

category encode done
Wall time: 1min 15s


In [6]:
%time data = create_software(data)

category encode done
create software done
Wall time: 1min 32s


In [7]:
%time data = create_time(data)

create time done
Wall time: 1min 1s


In [7]:
%time data = create_hardware(data)

category encode done
create hardware done
Wall time: 32 s


In [8]:
%time data = create_screen(data)

category encode done
create screen_feature done
Wall time: 1min 2s


In [9]:
%time data = create_media(data)
%time data = create_md5(data)

category encode done
Wall time: 5.66 s
category encode done
Wall time: 35.9 s


In [10]:
%time data = reduce_mem_usage(data)

Mem. usage decreased to 1312.26 Mb (30.6% reduction)
Wall time: 5.34 s


In [12]:
data['pkgname&ver'] = data['pkgname'].astype(str)+'&'+data['ver'].astype(str)
data['pkgname&ver&apptype'] = data['pkgname'].astype(str)+'&'+data['ver'].astype(str)+'&'+data['apptype'].astype(str)
data['pkgname&ver&apptype&osv'] = data['pkgname'].astype(str)+'&'+data['ver'].astype(str)+'&'+data['apptype'].astype(str)+'&'+data['osv'].astype(str)
data['pkgname&ver&apptype&osv&lan'] = data['pkgname'].astype(str)+'&'+data['ver'].astype(str)+'&'+data['apptype'].astype(str)+'&'+data['osv'].astype(str)+'&'+data['lan'].astype(str)

data['adunitshowid&reqrealip'] = data['adunitshowid'].astype(str)+'&'+data['reqrealip'].astype(str)
data['adunitshowid&reqrealip&pkgname'] = data['adunitshowid'].astype(str)+'&'+data['reqrealip'].astype(str)+'&'+data['pkgname'].astype(str)
data['adunitshowid&reqrealip&pkgname&ver'] = data['adunitshowid'].astype(str)+'&'+data['reqrealip'].astype(str)+'&'+data['pkgname'].astype(str)+'&'+data['ver'].astype(str)
data['adunitshowid&reqrealip&pkgname&ver&apptype'] = data['adunitshowid'].astype(str)+'&'+data['reqrealip'].astype(str)+'&'+data['pkgname'].astype(str)+'&'+data['ver'].astype(str)+'&'+data['apptype'].astype(str)

In [14]:
data.columns

Index(['sid', 'label', 'pkgname', 'ver', 'adunitshowid', 'mediashowid',
       'apptype', 'ip', 'city', 'province', 'reqrealip', 'adidmd5', 'imeimd5',
       'idfamd5', 'openudidmd5', 'macmd5', 'dvctype', 'model', 'make', 'ntt',
       'carrier', 'os', 'osv', 'orientation', 'lan', 'h', 'w', 'ppi',
       'empty_count', 'ip1', 'ip2', 'ip3', 'reqrealip1', 'reqrealip2',
       'reqrealip3', 'ip_equal', 'new_ntt', 'osv_summary', 'hour', 'minute',
       'period', 'nginxtime-begin_time', 'big_model', 'model_equal_make',
       'h_w_ratio', 'screen_area', 'size', 'px', 'creative_dpi'],
      dtype='object')

In [14]:
data = create_category_encoder(data)
data.to_feather(feature_path+'experiment.bin')

category encode done


In [12]:
data.columns

Index(['sid', 'label', 'pkgname', 'ver', 'adunitshowid', 'mediashowid',
       'apptype', 'nginxtime', 'ip', 'city', 'province', 'reqrealip',
       'adidmd5', 'imeimd5', 'idfamd5', 'openudidmd5', 'macmd5', 'dvctype',
       'model', 'make', 'ntt', 'carrier', 'os', 'osv', 'orientation', 'lan',
       'h', 'w', 'ppi', 'ip1', 'ip2', 'ip3', 'reqrealip1', 'reqrealip2',
       'reqrealip3', 'ip_equal', 'new_ntt', 'osv_summary', 'big_model',
       'model_equal_make', 'h_w_ratio', 'screen_area', 'size', 'px',
       'creative_dpi'],
      dtype='object')

In [None]:
pair_left = [['pkgname','ver','mediashowid'],['apptype','ver','osv'],['reqrealip','pkgname'],['pkgname&adunitshowid'],['pkgname&adunitshowid&ver','lan','make']]
pair_right = ['adunitshowid','pkgname','mediashowid','ver','osv']
%time data = create_category_combine(data,pair_left,pair_right)

In [16]:
counts_cols = ['imeimd5','macmd5','adidmd5','ip','reqrealip','ip3','make','model','apptype','big_model']
counts_cols

['imeimd5',
 'macmd5',
 'adidmd5',
 'ip',
 'reqrealip',
 'ip3',
 'make',
 'model',
 'apptype',
 'big_model']

In [17]:
%time data = create_single_col_count(data,counts_cols)

100%|████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:13<00:00,  1.33s/it]


create single column counts done
Wall time: 21.3 s


In [None]:
%time data = create_gragh(data)

In [11]:
%time data = create_combine_gragh(data)

100%|████████████████████████████████████████████████████████████████████████████████████████| 45/45 [05:57<00:00,  7.94s/it]


75
category encode done
Mem. usage decreased to 1884.46 Mb (17.4% reduction)


0it [00:00, ?it/s]

category encode done
Mem. usage decreased to 1914.98 Mb (0.0% reduction)


39it [03:10, 17.74s/it]

category encode done
Mem. usage decreased to 2220.15 Mb (0.0% reduction)


59it [08:27, 17.71s/it]

category encode done
Mem. usage decreased to 2662.66 Mb (0.6% reduction)


69it [12:27, 19.18s/it]

category encode done
Mem. usage decreased to 2967.83 Mb (0.0% reduction)


79it [16:07, 18.94s/it]

category encode done
Mem. usage decreased to 3273.01 Mb (0.0% reduction)


89it [19:34, 18.30s/it]

category encode done
Mem. usage decreased to 3547.67 Mb (0.0% reduction)


99it [23:17, 18.82s/it]

category encode done
Mem. usage decreased to 3852.84 Mb (0.0% reduction)


119it [28:50, 18.18s/it]

category encode done
Mem. usage decreased to 4310.61 Mb (0.0% reduction)


129it [33:17, 19.69s/it]

category encode done
Mem. usage decreased to 4615.78 Mb (0.0% reduction)


139it [37:29, 19.44s/it]

category encode done
Mem. usage decreased to 4920.96 Mb (0.0% reduction)


149it [40:04, 21.50s/it]

category encode done
Mem. usage decreased to 5073.55 Mb (0.0% reduction)


159it [43:33, 18.87s/it]

category encode done
Mem. usage decreased to 5378.72 Mb (0.0% reduction)


169it [47:25, 19.04s/it]

category encode done
Mem. usage decreased to 5683.90 Mb (0.0% reduction)


189it [51:20, 17.36s/it]

category encode done
Mem. usage decreased to 5973.82 Mb (0.3% reduction)


199it [55:11, 19.23s/it]

category encode done
Mem. usage decreased to 6263.73 Mb (0.2% reduction)


209it [58:55, 18.84s/it]

category encode done
Mem. usage decreased to 6568.91 Mb (0.0% reduction)


210it [59:57, 17.13s/it]


category encode done
Mem. usage decreased to 6568.91 Mb (0.0% reduction)
Wall time: 1h 9min 58s


In [12]:
data.head().T

Unnamed: 0,0,1,2,3,4
sid,d7460126-e071-4979-9ee8-42f72777a28a-156009070...,b660d559-db97-4b5f-9bd2-2450cb89ce77-156005074...,f49a740e-66c3-4605-9b67-4d3079fe69cb-156008914...,fd60d096-f168-4540-b782-729d64d0fcc6-156006253...,a037b032-a5c7-40ea-9161-26b118b12406-156007938...
label,1,1,0,0,1
pkgname,1303,6813,680,6813,5917
ver,460,6826,1235,3612,1032
adunitshowid,555,63,514,606,159
mediashowid,276,71,327,328,110
apptype,42,62,38,33,71
ip,2932717,255434,3435819,3444555,3269012
province,0,0,0,0,0
reqrealip,20025,32515,31652,32464,31666


In [None]:
%time data = create_category_encoder(data,'catboost',cate_cols)

In [None]:
data.to_feather(feature_path+'combine_gragh.bin')

In [18]:
combine_gragh_file_list = ['combine_gragh.bin','single_col_count_ft.bin']
basic_file_list = ['empty_ft.bin','hardware_ft.bin','ip_ft.bin','md5_ft.bin','media_ft.bin','region_ft.bin',
                   'screen_ft.bin','sid&label.bin','single_col_count_ft.bin','software_ft.bin','time_ft.bin']

In [19]:
del data
gc.collect()
%time data = combine_feature_feather(basic_file_list)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of  11 | elapsed:    2.3s remaining:   10.6s
[Parallel(n_jobs=-1)]: Done   4 out of  11 | elapsed:    2.8s remaining:    5.0s
[Parallel(n_jobs=-1)]: Done   6 out of  11 | elapsed:    3.9s remaining:    3.2s
[Parallel(n_jobs=-1)]: Done   8 out of  11 | elapsed:    5.1s remaining:    1.8s
[Parallel(n_jobs=-1)]: Done  11 out of  11 | elapsed:   11.7s finished


11
Wall time: 45.5 s


In [20]:
%time data = reduce_mem_usage(data)

Mem. usage decreased to 1403.81 Mb (41.8% reduction)
Wall time: 7.2 s
