In [1]:
import numpy as np
import pandas as pd
import time
import ast
from tqdm import tqdm
import datetime
from multiprocessing import Pool, cpu_count
from itertools import zip_longest
from collections import defaultdict, OrderedDict, Counter
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn import metrics
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import chi2,SelectKBest
from gensim.models import FastText
from sklearn.utils import shuffle
%matplotlib inline

In [2]:
# Directory that holds the input files and receives the exported feature tables.
# The trailing slash is required: later cells build paths by plain string
# concatenation (base_dir + 'file.csv').
base_dir = './dataset/'

### 1. 读取基本特征数据集

In [3]:
# Load the labelled training ids, the unlabelled test ids and the per-user
# device profile. The files are read with header=None, so the column names
# are supplied here.
train_id_set = pd.read_csv(f'{base_dir}age_train.csv', names=['uId', 'age_group'], header=None)
test_id_set = pd.read_csv(f'{base_dir}age_test.csv', names=['uId'], header=None)
user_basic_info = pd.read_csv(f'{base_dir}user_basic_info.csv', header=None)
user_basic_info.columns = [
    'uId', 'gender', 'city', 'prodName',
    'ramCapacity', 'ramLeftRation', 'romCapacity', 'romLeftRation',
    'color', 'fontSize', 'ct', 'carrier', 'os',
]
# Show the three shapes on one line, separated by spaces.
print(*(frame.shape for frame in (train_id_set, test_id_set, user_basic_info)))

(4000000, 2) (1000000, 1) (5000000, 13)


### 2. 数据清理与基本特征提取
#### 2.1 根据相同手机型号的众数（或其他统计值）填充user_basic_info表中的空缺值

In [4]:
def fill_missing_by_product(frame, column, stat):
    """Return ``frame[column]`` with NaNs filled from rows sharing the same ``prodName``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table containing ``prodName`` and ``column``.
    column : str
        Name of the column to impute.
    stat : {'mode', 'mean'}
        Statistic used as the fill value: the most frequent value
        (``mode()[0]``) or the arithmetic mean.

    Returns
    -------
    pandas.Series
        Result of the group-wise transform. Missing entries take the statistic
        of their ``prodName`` group; when a group has no observed value at all,
        the statistic of the whole column is used instead.

    Raises
    ------
    ValueError
        If ``stat`` is not 'mode' or 'mean'.
    """
    summaries = {
        'mode': lambda values: values.mode()[0],
        'mean': lambda values: values.mean(),
    }
    if stat not in summaries:
        raise ValueError("stat must be 'mode' or 'mean', got %r" % (stat,))
    summarize = summaries[stat]

    whole_column = frame[column]
    # Compute the column-wide fallback once instead of once per empty group.
    # Guard against a column with no observed values, where mode()[0] would raise.
    global_default = summarize(whole_column) if whole_column.notna().any() else np.nan

    def fill_group(values):
        # A group without any observed value cannot provide its own statistic.
        fill_value = summarize(values) if values.notna().any() else global_default
        return values.fillna(fill_value)

    return frame.groupby('prodName')[column].transform(fill_group)


# city is not tied to the phone model, so it is filled with the overall mode.
user_basic_info['city'] = user_basic_info['city'].fillna(user_basic_info['city'].mode()[0])

# Remaining columns are imputed per phone model: mode for the capacity/os/ct
# columns, mean for the ratio and font-size columns.
for column, stat in [
    ('ramCapacity', 'mode'),
    ('ramLeftRation', 'mean'),
    ('romCapacity', 'mode'),
    ('romLeftRation', 'mean'),
    ('fontSize', 'mean'),
    ('os', 'mode'),
    ('ct', 'mode'),
]:
    user_basic_info[column] = fill_missing_by_product(user_basic_info, column, stat)

In [5]:
# Attach the imputed device profile to the training and test id sets.
# Left joins keep every id from the id sets.
train_user_info_set = pd.merge(train_id_set, user_basic_info, how='left', on=['uId'])
test_user_info_set = pd.merge(test_id_set, user_basic_info, how='left', on=['uId'])
print(*(frame.shape for frame in (train_user_info_set, test_user_info_set)))

(4000000, 14) (1000000, 13)


In [6]:
# Sanity check: number of missing values per column in the training table.
train_user_info_set.isna().sum(axis=0)

uId              0
age_group        0
gender           0
city             0
prodName         0
ramCapacity      0
ramLeftRation    0
romCapacity      0
romLeftRation    0
color            0
fontSize         0
ct               0
carrier          0
os               0
dtype: int64

In [7]:
# Sanity check: number of missing values per column in the test table.
test_user_info_set.isna().sum(axis=0)

uId              0
gender           0
city             0
prodName         0
ramCapacity      0
ramLeftRation    0
romCapacity      0
romLeftRation    0
color            0
fontSize         0
ct               0
carrier          0
os               0
dtype: int64

#### 2.2 统计一共使用的手机特性数

In [8]:
# Per-user usage counters; the file is read with header=None, so names are set here.
user_behavior_info = pd.read_csv(base_dir+'user_behavior_info.csv', header=None)
user_behavior_info.columns = ['uId','bootTimes','AFuncTimes','BFuncTimes','CFuncTimes',
                             'DFuncTimes','EFuncTimes','FFuncTimes','GFuncSum']

# Columns that describe individual phone functions (bootTimes is not one of them).
func_usage_columns = ['AFuncTimes','BFuncTimes','CFuncTimes','DFuncTimes',
                      'EFuncTimes','FFuncTimes','GFuncSum']

# Take an explicit copy: adding a column to a slice of user_behavior_info
# triggers SettingWithCopyWarning.
user_behavior_stat = user_behavior_info[['uId'] + func_usage_columns].copy()

# Number of functions with a non-zero counter for each user. Only the function
# columns are counted, so the result does not depend on the value of uId
# (previously uId was counted too and 1 was subtracted, which is wrong when uId == 0).
user_behavior_stat['all_used_features'] = (user_behavior_stat[func_usage_columns] != 0).sum(axis=1)

In [9]:
# Add the raw behaviour counters to both tables; left joins keep every row
# of the user-info tables.
train_user_set = pd.merge(train_user_info_set, user_behavior_info, how='left', on=['uId'])
test_user_set = pd.merge(test_user_info_set, user_behavior_info, how='left', on=['uId'])
print(*(frame.shape for frame in (train_user_set, test_user_set)))

(4000000, 22) (1000000, 21)


In [17]:
# Sanity check: number of missing values per column in the training table.
# NOTE(review): the saved output of this cell carries execution count 17 and
# lists app-category columns that are only merged in section 2.3, so it was
# produced by a later re-run and does not reflect the table at this point.
train_user_set.isna().sum(axis=0)

uId              0
age_group        0
gender           0
city             0
prodName         0
ramCapacity      0
ramLeftRation    0
romCapacity      0
romLeftRation    0
color            0
fontSize         0
ct               0
carrier          0
os               0
bootTimes        0
AFuncTimes       0
BFuncTimes       0
CFuncTimes       0
DFuncTimes       0
EFuncTimes       0
FFuncTimes       0
GFuncSum         0
运动健康             0
实用工具             0
新闻阅读             0
图书阅读             0
金融理财             0
社交通讯             0
便捷生活             0
休闲益智             0
拍摄美化             0
经营策略             0
儿童               0
汽车               0
教育               0
主题个性             0
影音娱乐             0
棋牌桌游             0
购物比价             0
旅游住宿             0
出行导航             0
商务               0
角色扮演             0
动作射击             0
体育竞速             0
美食               0
休闲娱乐             0
表盘个性             0
学习办公             0
网络游戏             0
主题铃声             0
动漫               0
休闲游戏        

In [11]:
# Sanity check: number of missing values per column in the test table.
test_user_set.isna().sum(axis=0)

uId              0
gender           0
city             0
prodName         0
ramCapacity      0
ramLeftRation    0
romCapacity      0
romLeftRation    0
color            0
fontSize         0
ct               0
carrier          0
os               0
bootTimes        0
AFuncTimes       0
BFuncTimes       0
CFuncTimes       0
DFuncTimes       0
EFuncTimes       0
FFuncTimes       0
GFuncSum         0
dtype: int64

#### 2.3 统计每个app分类下安装（激活）的app个数

In [12]:
# Load the per-user table of activated-app counts per app category, stored
# under the key 'data' of the HDF5 file, and display its shape.
app_actived_category = pd.read_hdf(f'{base_dir}app_actived_category.h5', 'data')
app_actived_category.shape

(4999341, 34)

In [13]:
# Sanity check: number of missing values per column in the app-category table.
app_actived_category.isna().sum(axis=0)

uId        0
运动健康       0
实用工具       0
新闻阅读       0
图书阅读       0
金融理财       0
社交通讯       0
便捷生活       0
休闲益智       0
拍摄美化       0
经营策略       0
儿童         0
汽车         0
教育         0
主题个性       0
影音娱乐       0
棋牌桌游       0
购物比价       0
旅游住宿       0
出行导航       0
商务         0
角色扮演       0
动作射击       0
体育竞速       0
美食         0
休闲娱乐       0
表盘个性       0
学习办公       0
网络游戏       0
主题铃声       0
动漫         0
休闲游戏       0
资讯生活       0
appNums    0
dtype: int64

In [14]:
# Columns contributed by the app-category table (everything except the join key).
app_feature_columns = [name for name in app_actived_category.columns if name != 'uId']

# Left-join the per-category app counts. Users without a row in
# app_actived_category get NaN in the new columns, which is replaced by 0.
# The fill is limited to the app columns so that an unexpected gap in any other
# column (e.g. a string column such as color) is not silently turned into 0.
train_user_set = train_user_set.merge(app_actived_category, on=['uId'], how='left')
train_user_set[app_feature_columns] = train_user_set[app_feature_columns].fillna(0)
test_user_set = test_user_set.merge(app_actived_category, on=['uId'], how='left')
test_user_set[app_feature_columns] = test_user_set[app_feature_columns].fillna(0)
print(train_user_set.shape,test_user_set.shape)

(4000000, 55) (1000000, 54)


In [16]:
# List the column labels of the merged training table.
train_user_set.columns

Index(['uId', 'age_group', 'gender', 'city', 'prodName', 'ramCapacity',
       'ramLeftRation', 'romCapacity', 'romLeftRation', 'color', 'fontSize',
       'ct', 'carrier', 'os', 'bootTimes', 'AFuncTimes', 'BFuncTimes',
       'CFuncTimes', 'DFuncTimes', 'EFuncTimes', 'FFuncTimes', 'GFuncSum',
       '运动健康', '实用工具', '新闻阅读', '图书阅读', '金融理财', '社交通讯', '便捷生活', '休闲益智', '拍摄美化',
       '经营策略', '儿童', '汽车', '教育', '主题个性', '影音娱乐', '棋牌桌游', '购物比价', '旅游住宿',
       '出行导航', '商务', '角色扮演', '动作射击', '体育竞速', '美食', '休闲娱乐', '表盘个性', '学习办公',
       '网络游戏', '主题铃声', '动漫', '休闲游戏', '资讯生活', 'appNums'],
      dtype='object')

#### 2.4 构造其他统计特征

In [18]:
# Aggregate features for the training table, built by summing groups of
# app-category count columns.
game_columns = ['动作射击', '休闲益智', '经营策略', '体育竞速', '角色扮演',
                '棋牌桌游', '网络游戏', '休闲游戏', '休闲娱乐']
# young_feature uses '运动健康', matching the test-set cell. This cell previously
# summed '汽车' instead, so the feature was defined differently for train and test.
young_columns = ['实用工具', '影音娱乐', '运动健康', '学习办公', '图书阅读']
business_columns = ['商务', '金融理财', '汽车', '学习办公', '新闻阅读']
middle_columns = ['教育', '便捷生活', '购物比价', '儿童', '拍摄美化', '美食']

train_user_set['totalGame'] = train_user_set[game_columns].sum(axis=1)
train_user_set['young_feature'] = train_user_set[young_columns].sum(axis=1)
train_user_set['business_feature'] = train_user_set[business_columns].sum(axis=1)
train_user_set['middle_feature'] = train_user_set[middle_columns].sum(axis=1)

# Column arithmetic replaces the row-wise apply(axis=1) calls: same values,
# without calling a Python function once per row.
train_user_set['used_rom'] = train_user_set['romCapacity'] * (1 - train_user_set['romLeftRation'])
train_user_set['used_ram'] = train_user_set['ramCapacity'] * (1 - train_user_set['ramLeftRation'])
# Crossed categorical feature "<gender>_<color>".
train_user_set['gender_color'] = train_user_set['gender'].astype(str) + '_' + train_user_set['color']

# The left-ratio columns are dropped once used_rom / used_ram are derived.
# drop() avoids assigning to `_`, which IPython uses for the last cell output.
train_user_set = train_user_set.drop(columns=['ramLeftRation', 'romLeftRation'])

In [19]:
# Aggregate features for the test table, built by summing groups of
# app-category count columns.
game_columns = ['动作射击', '休闲益智', '经营策略', '体育竞速', '角色扮演',
                '棋牌桌游', '网络游戏', '休闲游戏', '休闲娱乐']
young_columns = ['实用工具', '影音娱乐', '运动健康', '学习办公', '图书阅读']
business_columns = ['商务', '金融理财', '汽车', '学习办公', '新闻阅读']
middle_columns = ['教育', '便捷生活', '购物比价', '儿童', '拍摄美化', '美食']

test_user_set['totalGame'] = test_user_set[game_columns].sum(axis=1)
test_user_set['young_feature'] = test_user_set[young_columns].sum(axis=1)
test_user_set['business_feature'] = test_user_set[business_columns].sum(axis=1)
test_user_set['middle_feature'] = test_user_set[middle_columns].sum(axis=1)

# Column arithmetic replaces the row-wise apply(axis=1) calls: same values,
# without calling a Python function once per row.
test_user_set['used_rom'] = test_user_set['romCapacity'] * (1 - test_user_set['romLeftRation'])
test_user_set['used_ram'] = test_user_set['ramCapacity'] * (1 - test_user_set['ramLeftRation'])
# Crossed categorical feature "<gender>_<color>".
test_user_set['gender_color'] = test_user_set['gender'].astype(str) + '_' + test_user_set['color']

# The left-ratio columns are dropped once used_rom / used_ram are derived.
# drop() avoids assigning to `_`, which IPython uses for the last cell output.
test_user_set = test_user_set.drop(columns=['ramLeftRation', 'romLeftRation'])

In [20]:
# Bring the per-user count of used phone functions into both tables.
# Only the join key and the count column are taken from user_behavior_stat.
train_user_set = pd.merge(
    train_user_set,
    user_behavior_stat[['uId', 'all_used_features']],
    how='left',
    on=['uId'],
)
test_user_set = pd.merge(
    test_user_set,
    user_behavior_stat[['uId', 'all_used_features']],
    how='left',
    on=['uId'],
)
# Test shape is printed first, then train.
print(*(frame.shape for frame in (test_user_set, train_user_set)))

(1000000, 60) (4000000, 61)


#### 2.5 保存训练集和测试集基本特征

In [21]:
# Persist both feature tables as CSV next to the input data. The row index is
# not written.
train_user_set.to_csv(f'{base_dir}train_user_set.csv', index=False)
test_user_set.to_csv(f'{base_dir}test_user_set.csv', index=False)
print(*(frame.shape for frame in (train_user_set, test_user_set)))

(4000000, 61) (1000000, 60)
