## 处理结构化数据代码

In [29]:
# coding=utf8
import pandas as pd
from sklearn.model_selection import train_test_split
import random
import numpy as np
import pickle

In [3]:
# Load the labelled training table and the ids held out for validation.
train_data = pd.read_csv('task3/task3/train.csv')
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
# The context manager closes the file handle, which the bare open() call leaked.
with open('data/val_ids_%d.pkl' % 18, 'rb') as val_ids_file:
    val_ids = pickle.load(val_ids_file)
# Every training id that is not reserved for validation.
train_ids = list(set(train_data['id']) - set(val_ids))
# De-duplicate identical rows and index by id (the id column itself is kept).
train_val_data_unique = train_data.drop_duplicates().set_index('id', drop=False)
# Test table, indexed by id in the same way.
test_data = pd.read_csv('task3_test_stage1_new.csv').set_index('id', drop=False)
test_ids = list(test_data['id'])

In [3]:
# Ids that appear in the test set and also among the train/validation ids.
train_val_id_set = set(train_ids).union(val_ids)
same_id = train_val_id_set.intersection(test_ids)
len(same_id), len(test_ids)

(64, 3902)

In [4]:
# Split the label off the de-duplicated train/validation rows.
target = train_val_data_unique['label']
train_val_data_unique = train_val_data_unique.drop(['label'], axis=1)
# Stack train/validation rows on top of the test rows so that feature encoding
# is computed over both. pd.concat replaces DataFrame.append, which was
# deprecated in pandas 1.4 and removed in pandas 2.0.
all_data = pd.concat([train_val_data_unique, test_data])

In [5]:
# Use -1 as the sentinel for every missing value; the feature code below
# tests for this sentinel explicitly.
all_data = all_data.fillna(-1)

# Region names that get_province searches for inside userLocation strings.
province_list = [
    '北京', '天津', '河北', '山西', '内蒙古', '辽宁', '吉林', '黑龙江',
    '上海', '江苏', '浙江', '安徽', '福建', '江西', '山东',
    '河南', '湖北', '湖南', '广东', '广西', '海南', '重庆', '四川',
    '贵州', '云南', '西藏', '陕西', '甘肃', '青海', '宁夏',
    '新疆', '台湾', '香港', '澳门',
]
def get_province(x, provinces=None):
    """Map a free-text user location to a province-level bucket.

    Parameters
    ----------
    x : str or -1
        Location text, or the missing-value sentinel -1. Any other
        non-string value (e.g. a NaN that was not filled) is treated as
        missing instead of raising ``TypeError``.
    provinces : iterable of str, optional
        Province names to search for, in priority order. Defaults to the
        module-level ``province_list``.

    Returns
    -------
    -1 when the location is missing; the unchanged input when it contains
    '其他' or '海外'; the first province name contained in the input;
    ``None`` when nothing matches.
    """
    if not isinstance(x, str):
        return -1
    # "Other" / "overseas" locations are kept verbatim as their own buckets.
    if '其他' in x or '海外' in x:
        return x
    candidates = province_list if provinces is None else provinces
    for name in candidates:
        if name in x:
            return name
    # Explicit fall-through (the original returned None implicitly).
    return None
# Province bucket for every row, derived from the userLocation text.
province = all_data.userLocation.apply(get_province)

# Names of the columns that are passed to the model.
feature = [
    'userFollowCount',
    'userFansCount',
    'userWeiboCount',
    'feature_len_piclist',
    'feature_userGender',
    'feature_userLocation',
    'feature_userProvince',
    'feature_len_userDescription',
    'feature_category',
]

# Gender encoding; -1 is the missing-value sentinel. Any other value raises KeyError.
gender_codes = {'男': 0, '女': 1, -1: -1}

# Number of tab-separated entries in piclist (-1 when missing).
all_data['feature_len_piclist'] = all_data.piclist.apply(
    lambda pics: -1 if pics == -1 else len(pics.split('\t'))
)
all_data['feature_userGender'] = all_data.userGender.apply(lambda gender: gender_codes[gender])
# Integer codes for the categorical text columns.
all_data['feature_userLocation'] = pd.factorize(all_data.userLocation)[0]
all_data['feature_userProvince'] = pd.factorize(province)[0]
# Character length of the user description (-1 when missing).
all_data['feature_len_userDescription'] = all_data.userDescription.apply(
    lambda text: -1 if text == -1 else len(text)
)
all_data['feature_category'] = pd.factorize(all_data.category)[0]

In [6]:
import lightgbm as lgb
# LightGBM training parameters (keys are LightGBM parameter names or aliases).
param = dict(
    # Row / column subsampling.
    bagging_freq=5,
    bagging_fraction=0.335,
    feature_fraction=0.041,
    # Boosting setup ('boost' is an alias of 'boosting').
    boost_from_average='false',
    boost='gbdt',
    learning_rate=0.0083,
    # Tree shape; max_depth=-1 places no limit on depth.
    max_depth=-1,
    num_leaves=13,
    min_data_in_leaf=80,
    min_sum_hessian_in_leaf=10.0,
    # Task and evaluation metric.
    objective='binary',
    metric='auc',
    # Runtime.
    num_threads=8,
    tree_learner='serial',
    verbosity=-1,
)

In [15]:
# all_data holds the train/validation rows followed by the test rows, so the
# labelled part is the first n_train_val rows. Counting from the front avoids
# the `[:-0]` pitfall (an empty slice) when test_data has no rows.
# The positional slice is taken before the .loc lookup because some ids occur
# in both the test set and the train/validation set.
n_train_val = len(all_data) - len(test_data)
train_val_features = all_data.iloc[:n_train_val]
train = train_val_features.loc[train_ids]
val = train_val_features.loc[val_ids]

In [16]:
# Sanity check: training feature rows and training labels should have equal length.
train.shape[0], target.loc[train_ids].shape[0]

(27250, 27250)

In [17]:
# Wrap the train / validation feature matrices and their labels for LightGBM.
train_labels = target.loc[train_ids]
val_labels = target.loc[val_ids]
trn_data = lgb.Dataset(train[feature], label=train_labels)
val_data = lgb.Dataset(val[feature], label=val_labels)

# Train with a very high round cap and rely on early stopping to end training.
# NOTE(review): the `verbose_eval` and `early_stopping_rounds` keyword arguments
# were removed in LightGBM 4.0; on newer versions use
# callbacks=[lgb.early_stopping(4000), lgb.log_evaluation(5000)] instead.
clf = lgb.train(
    param,
    trn_data,
    num_boost_round=1000000,
    valid_sets=[trn_data, val_data],
    verbose_eval=5000,
    early_stopping_rounds=4000,
)

Training until validation scores don't improve for 4000 rounds.
[5000]	training's auc: 0.977394	valid_1's auc: 0.974985
[10000]	training's auc: 0.980208	valid_1's auc: 0.977126
[15000]	training's auc: 0.981504	valid_1's auc: 0.978044
[20000]	training's auc: 0.982287	valid_1's auc: 0.978424
[25000]	training's auc: 0.982836	valid_1's auc: 0.978616
Early stopping, best iteration is:
[23394]	training's auc: 0.982685	valid_1's auc: 0.978707


In [19]:
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_fscore_support

# Score the validation rows with the best iteration found by early stopping,
# then threshold the scores at 0.5 to obtain class predictions.
val_truth = target.loc[val_ids]
val_score = clf.predict(val[feature], num_iteration=clf.best_iteration)
pre = val_score > 0.5

# Per-class precision / recall / F1 / support, followed by macro-averaged F1.
print(precision_recall_fscore_support(val_truth, pre))
print(f1_score(val_truth, pre, average='macro'))

(array([0.91363502, 0.911739  ]), array([0.91067217, 0.91466978]), array([0.91215119, 0.91320204]), array([3392, 3422], dtype=int64))
0.9126766154238151


In [20]:
from sklearn.metrics import accuracy_score

# Ids stored in data/unconfidence_ids.pkl; they are looked up in the validation frame.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
# The context manager closes the file handle, which the bare open() call leaked.
with open('data/unconfidence_ids.pkl', 'rb') as ids_file:
    uncofidence_ids = pickle.load(ids_file)
uncofidence_val = val.loc[uncofidence_ids]
unconfidence_truth = target.loc[uncofidence_ids]

# Score this subset with the best iteration and threshold at 0.5.
unconfidence_val_score = clf.predict(uncofidence_val[feature], num_iteration=clf.best_iteration)
unconfidence_val_pre = unconfidence_val_score > 0.5

print(precision_recall_fscore_support(unconfidence_truth, unconfidence_val_pre))
# Arguments are passed as (y_true, y_pred), consistent with the call above;
# accuracy is symmetric, so the printed value is unchanged.
print(accuracy_score(unconfidence_truth, unconfidence_val_pre))

(array([0.94514768, 0.64583333]), array([0.86821705, 0.82666667]), array([0.90505051, 0.7251462 ]), array([258,  75], dtype=int64))
0.8588588588588588


In [21]:
# The test rows are the last len(test_data) rows of all_data. Slicing from an
# explicit start offset avoids `[-0:]`, which would select every row when
# test_data is empty.
test_start = len(all_data) - len(test_data)
test_features = all_data.iloc[test_start:][feature]
# num_iteration is passed explicitly so the test prediction visibly uses the
# same best iteration as the validation prediction.
test_score = clf.predict(test_features, num_iteration=clf.best_iteration)
# Threshold at 0.5 and store the labels as floats (0.0 / 1.0).
test_data['label'] = (test_score > 0.5).astype('float')
test_data[['id', 'label']].to_csv('submit_user_info.csv', index=False)

In [22]:
# Distribution of the predicted labels in the submission.
test_data.loc[:, 'label'].value_counts()

0.0    2088
1.0    1814
Name: label, dtype: int64

In [47]:
# Persist the trained booster to a text file.
clf.save_model(filename='lgb_model.txt')

<lightgbm.basic.Booster at 0x2610d5fba58>

In [4]:
# Preview the first five rows of the raw training table.
train_data.head(n=5)

Unnamed: 0,id,text,piclist,userGender,userFollowCount,userFansCount,userWeiboCount,userLocation,userDescription,category,label
0,84cdcfed1aeb7047ad168be7bb9b559e,回复新浪网友对博文【国家文物局限制鉴宝节目现场估价转】的评论：;;查看原文：,,男,1728.0,748.0,30884.0,北京朝阳区,我们虚度的今天恰是。。昨天去世人渴望的明天。,文体娱乐,0
1,93ebc056c547618b5b00ab35270c9fad,//分享网易新闻:《发生在昆明的火锅店老板“辱滇门”，云南人该愤怒还是羞愧》|发生在昆明.....,63ad082a189566eed7c4bb3e4bc55012.jpg,男,423.0,112.0,792.0,云南楚雄,用心生活,社会生活,0
2,eefcba3b5856fe8f55213e036ee463ca,西宁城管围殴民警扬言要把警察打死|西宁城管围...,4986dc2a5f09a87c7af5dfc57d7775cd.jpg,,,,,,,社会生活,0
3,8e09b1b13477f62139b5cd7a7a7dcb8f,【川航航班因驾驶舱风挡破裂安全备降成都】今天上午6:26从重庆江北国际机场出发前往拉萨的四川...,dcfccfc69e90a0007afd6aafa1385e56.jpg,女,1668.0,7470000.0,57256.0,北京东城区,中国青年报•中青在线微博,社会生活,0
4,9ffea4d9573c6e723e8f178a789888dc,支持郑强！！！//【贵州大学校长回应空姐言论:常给她们写感谢信】,,男,267.0,61.0,1098.0,江苏盐城,能烧得全烧了，只剩下石头。,社会生活,0


In [12]:
# piclist value of the first training row.
train_data['piclist'].iat[0]

nan

In [20]:
# Total rows vs. rows with a non-null piclist.
train_data.shape[0], train_data['piclist'].count()

(38471, 21832)

In [21]:
# Number of training rows with no piclist value, computed from the data rather
# than from two numbers copied by hand out of the previous cell's output.
int(len(train_data) - train_data.piclist.count())

16639

In [23]:
# Reload the stored ids and look them up in the de-duplicated train/validation
# frame. This rebinds `uncofidence_val`, which an earlier cell set to the
# corresponding rows of `val`.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
# The context manager closes the file handle, which the bare open() call leaked.
with open('data/unconfidence_ids.pkl', 'rb') as ids_file:
    uncofidence_ids = pickle.load(ids_file)
uncofidence_val = train_val_data_unique.loc[uncofidence_ids]

In [24]:
# Rows in the subset that have a non-null piclist.
uncofidence_val['piclist'].count()

212

In [25]:
# Total number of rows in the subset.
uncofidence_val.shape[0]

333