#### 导入包

In [93]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import os
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer
import gc
from tqdm import tqdm
import pickle
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder

#### 列出所有文件

In [94]:
# All input data lives in the ./Demo directory; print the name of every entry in it.
file_lists = os.listdir('./Demo/')
for entry_name in file_lists:
    print(entry_name)
    

deviceid_brand.tsv
deviceid_packages.tsv
deviceid_package_start_close.tsv
deviceid_test.tsv
deviceid_train.tsv
package_label.tsv
pack_tfidf_age.csv
test_statistic_feat.csv
tfidf_classfiy.csv
train_statistic_feat.csv
user_behavior.csv
xgb_feat_chizhu.csv


#### 读取数据

In [95]:
# Device metadata: brand and model for each device id.
deviced_brand = pd.read_csv(
    './Demo/deviceid_brand.tsv',
    sep='\t',
    names=['device_id', 'brand', 'model'],
)

# App metadata: the two category levels each app belongs to.
package_label = pd.read_csv(
    './Demo/package_label.tsv',
    sep='\t',
    names=['app', 'class1', 'class2'],
)

# Training set: device id together with the sex and age labels.
deviceid_train = pd.read_csv(
    './Demo/deviceid_train.tsv',
    sep='\t',
    names=['device_id', 'sex', 'age'],
)

# Test set: device ids only, no labels.
deviceid_test = pd.read_csv(
    './Demo/deviceid_test.tsv',
    sep='\t',
    names=['device_id'],
)

# Same file as `package_label`, but read without `names`, so pandas treats the
# first row of the file as the header.
_package_label = pd.read_csv('./Demo/package_label.tsv', sep='\t')

# Apps installed on each device; `apps` is still the raw string at this point.
deviceid_packages = pd.read_csv(
    './Demo/deviceid_packages.tsv',
    sep='\t',
    names=['device_id', 'apps'],
)

In [96]:
# The raw `apps` field is one comma-separated string per device; turn it into
# a Python list of app ids and record how many apps each device has.
deviceid_packages['apps'] = deviceid_packages['apps'].map(
    lambda raw_apps: raw_apps.split(',')
)
deviceid_packages['app_nums'] = deviceid_packages['apps'].map(len)
deviceid_packages.head()

Unnamed: 0,device_id,apps,app_nums
0,00009270c4ec26e1d76f5d86847009c9,"[1896072db9ce6406febfc17f681c2086, 90cb852cf34...",3
1,000189ef5d5b951841d416a8c6c5b995,"[1896072db9ce6406febfc17f681c2086, 97d0422a331...",19
2,00026d79a6f0955fc860947724e24765,[c33b35d6254ad9c0c238233eb97a6c60],1
3,0002e3afb8146bc08e40575e45f0eca6,"[1896072db9ce6406febfc17f681c2086, 07e967d75aa...",3
4,0004709a296f9b925ae283efe2f043e7,"[4538778ad75aa8ce61c9d13fb9cb661b, 86f9f299cdb...",16


In [97]:
# Stack the labelled training devices on top of the unlabelled test devices so
# that feature engineering runs once over both. Test rows get NaN for the
# `sex` and `age` columns they do not have.
# sort=False keeps the columns in their original order and silences the pandas
# FutureWarning about the default sorting behaviour changing.
# ignore_index=True gives the combined frame unique row labels instead of
# repeating 0..n-1 from each input.
deviceid_all = pd.concat(
    [deviceid_train, deviceid_test],
    sort=False,
    ignore_index=True,
)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [98]:
# Turn each device's app list into one space-separated "document".
apps = deviceid_packages['apps'].apply(lambda x: ' '.join(x)).tolist()
vectorizer = CountVectorizer()
transformer = TfidfTransformer()
# Sparse count matrix: one row per device, one column per distinct app token.
cntTf = vectorizer.fit_transform(apps)
# TF-IDF weighting of the count matrix.
tfidf = transformer.fit_transform(cntTf)
# Vocabulary (the list of app tokens), in column order.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` when it exists and fall back otherwise.
# The result is converted to a plain list so `word` keeps its original type.
if hasattr(vectorizer, 'get_feature_names_out'):
    word = vectorizer.get_feature_names_out().tolist()
else:
    word = vectorizer.get_feature_names()

In [99]:
# Display the dimensions of the count matrix: (number of app documents, vocabulary size).
cntTf.shape

(72727, 35000)

In [100]:
# Sum of the TF-IDF weights of each row (one row per device).
# A single sparse row-sum replaces the original Python loop, which converted
# every row to a dense array and wrote it back with `.loc` one cell at a time
# (slow, and it assigned floats into a column initialised with the int 0).
# `np.asarray(...).ravel()` flattens the result to 1-D whether scipy returns
# an (n, 1) matrix or a 1-D array. Values are assigned by position, which
# matches the original because `deviceid_packages` has the default 0..n-1 index.
deviceid_packages['tfidf_sum'] = np.asarray(tfidf.sum(axis=1)).ravel()

In [101]:
# Fit an LDA topic model with 5 topics. LDA is fitted on the raw count matrix,
# not on the TF-IDF matrix.
lda = LatentDirichletAllocation(
    n_components=5,
    learning_offset=50.0,
    random_state=666,
)
# Per-document topic output of the fitted model (one row per input row).
docres = lda.fit_transform(cntTf)

In [102]:
# Append the LDA output columns (labelled 0..n-1 by the default DataFrame
# constructor) next to the per-device features, then drop the raw app list,
# which is no longer needed.
topic_frame = pd.DataFrame(docres)
deviceid_packages = pd.concat([deviceid_packages, topic_frame], axis=1).drop('apps', axis=1)
# Left join keeps every train/test device, in the order of `deviceid_all`.
data_all = deviceid_all.merge(deviceid_packages, on='device_id', how='left')

In [103]:
# Convert a float label to its string form.
def transfer(x):
    """Return `x` as an integer string, or the string 'nan' when `x` is NaN."""
    if not np.isnan(x):
        # x arrives as a float (e.g. 1.0); go through int() first so the
        # result is '1' rather than '1.0'.
        return str(int(x))
    return 'nan'
# Render the numeric `sex` and `age` codes as strings; missing values
# (the test rows) become the string 'nan'.
for label_col in ('sex', 'age'):
    data_all[label_col] = data_all[label_col].map(transfer)
# Join sex and age into a single combined class label such as '1-3'.
data_all['sex_age'] = data_all['sex'] + '-' + data_all['age']

In [104]:
# Turn the placeholder strings 'nan' and 'nan-nan' back into real missing
# values so labelled (train) and unlabelled (test) rows can be told apart.
# `np.nan` is used instead of the `np.NaN` alias, which was removed in NumPy 2.0.
data_all = data_all.replace({'nan': np.nan, 'nan-nan': np.nan})
# Drop the separate label parts and the key; only features and `sex_age` remain.
data_all = data_all.drop(['sex', 'age', 'device_id'], axis=1)
# Rows with a label are training data; rows without one are the test set.
train = data_all[data_all['sex_age'].notnull()]
test = data_all[data_all['sex_age'].isnull()]

In [105]:
# Features of the labelled rows.
X = train.drop(['sex_age'], axis=1)
# Labels of the labelled rows (strings such as '1-3').
Y = train['sex_age']
le = LabelEncoder()
# LightGBM needs numeric labels, so encode the string classes as integers.
Y = le.fit_transform(Y)
# Hold out 30% of the labelled rows as a validation set.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=666)
# LightGBM training set.
lgb_train = lgb.Dataset(X_train, label=y_train)
# LightGBM validation set, built against the training set's binning.
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
# Training parameters.
params = {
    'boosting_type': 'gbdt',
    'max_depth': 3,
    'metric': {'multi_logloss'},
    # Derived from the fitted encoder instead of a hard-coded 22, so the
    # class count always matches the labels actually present in the data.
    'num_class': len(le.classes_),
    'objective': 'multiclass',
    'random_state': 666,
}

In [106]:
# Train the model, evaluating on the held-out validation set each round.
# Early stopping is passed as a callback: the `early_stopping_rounds` keyword
# of `lgb.train` was removed in LightGBM 4.0, while the callback form works
# on both older and newer releases. Training stops once the validation
# metric has not improved for 300 consecutive rounds.
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=1000,
    valid_sets=[lgb_eval],
    callbacks=[lgb.early_stopping(stopping_rounds=300)],
)

[1]	valid_0's multi_logloss: 2.87391
Training until validation scores don't improve for 300 rounds.
[2]	valid_0's multi_logloss: 2.85884
[3]	valid_0's multi_logloss: 2.84558
[4]	valid_0's multi_logloss: 2.834
[5]	valid_0's multi_logloss: 2.82376
[6]	valid_0's multi_logloss: 2.81428
[7]	valid_0's multi_logloss: 2.80606
[8]	valid_0's multi_logloss: 2.79883
[9]	valid_0's multi_logloss: 2.79219
[10]	valid_0's multi_logloss: 2.78624
[11]	valid_0's multi_logloss: 2.78096
[12]	valid_0's multi_logloss: 2.776
[13]	valid_0's multi_logloss: 2.77149
[14]	valid_0's multi_logloss: 2.76738
[15]	valid_0's multi_logloss: 2.76367
[16]	valid_0's multi_logloss: 2.76013
[17]	valid_0's multi_logloss: 2.75706
[18]	valid_0's multi_logloss: 2.75405
[19]	valid_0's multi_logloss: 2.75152
[20]	valid_0's multi_logloss: 2.7492
[21]	valid_0's multi_logloss: 2.74708
[22]	valid_0's multi_logloss: 2.74499
[23]	valid_0's multi_logloss: 2.74313
[24]	valid_0's multi_logloss: 2.74143
[25]	valid_0's multi_logloss: 2.73984
[

[212]	valid_0's multi_logloss: 2.73513
[213]	valid_0's multi_logloss: 2.73517
[214]	valid_0's multi_logloss: 2.73529
[215]	valid_0's multi_logloss: 2.73535
[216]	valid_0's multi_logloss: 2.73541
[217]	valid_0's multi_logloss: 2.73565
[218]	valid_0's multi_logloss: 2.73575
[219]	valid_0's multi_logloss: 2.73583
[220]	valid_0's multi_logloss: 2.73598
[221]	valid_0's multi_logloss: 2.73609
[222]	valid_0's multi_logloss: 2.73617
[223]	valid_0's multi_logloss: 2.73628
[224]	valid_0's multi_logloss: 2.7364
[225]	valid_0's multi_logloss: 2.7365
[226]	valid_0's multi_logloss: 2.73667
[227]	valid_0's multi_logloss: 2.73672
[228]	valid_0's multi_logloss: 2.73688
[229]	valid_0's multi_logloss: 2.73698
[230]	valid_0's multi_logloss: 2.73711
[231]	valid_0's multi_logloss: 2.73718
[232]	valid_0's multi_logloss: 2.73736
[233]	valid_0's multi_logloss: 2.73742
[234]	valid_0's multi_logloss: 2.73747
[235]	valid_0's multi_logloss: 2.7376
[236]	valid_0's multi_logloss: 2.73763
[237]	valid_0's multi_loglos

In [107]:
# Predict on the unlabelled rows, using the best iteration recorded during training.
pre_x = test.drop(columns=['sex_age'])
pred_y = gbm.predict(
    pre_x.values,
    num_iteration=gbm.best_iteration,
)

In [108]:
# Name the prediction columns after the LabelEncoder classes, then append the
# DeviceID column taken positionally from the test set.
result = pd.DataFrame(pred_y, columns=le.classes_).assign(
    DeviceID=deviceid_test['device_id'].values
)

In [109]:
# Select the submission columns in their required order: DeviceID first, then
# every sex-age class. Raises KeyError if any expected column is missing.
submission_columns = [
    'DeviceID',
    '1-0', '1-1', '1-2', '1-3', '1-4', '1-5', '1-6', '1-7', '1-8', '1-9', '1-10',
    '2-0', '2-1', '2-2', '2-3', '2-4', '2-5', '2-6', '2-7', '2-8', '2-9', '2-10',
]
result = result[submission_columns]


In [110]:
# Write the predictions to a CSV file, without the row index.
result.to_csv(path_or_buf='baseline.csv', index=False)