In [22]:
import math
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.decomposition import LatentDirichletAllocation, NMF, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import f1_score, fbeta_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from datetime import datetime
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns',None)

In [23]:
train = pd.read_csv('data/train.txt', header=None, names=['pid', 'label', 'gender', 'age', 'tagid', 'time', 'province', 'city', 'make', 'model'])
test = pd.read_csv('data/test.txt', header=None, names=['pid', 'gender', 'age', 'tagid', 'time', 'province', 'city', 'make', 'model'])
data = pd.concat([train, test]).reset_index(drop=True)

In [24]:
# 统计特征
for col in ['gender', 'age', 'province', 'city']:
    data['{}_count'.format(col)] = data.groupby(col)[col].transform('count')
    
corss_feature = ['gender', 'age', 'province', 'city']
# 交叉组合统计，就是组合特征的共现频次
while len(corss_feature) != 0:
    f = corss_feature.pop()
    for col in corss_feature:
        data['{}_{}_count'.format(f, col)] = data.groupby([f, col])[col].transform('count')
        
# 特征unique count特征
for index, col1 in enumerate(['age', 'province', 'city']):
    for col2 in ['age', 'province', 'city'][index:]:
        data['{}_in_{}_count'.format(col1, col2)] = data.groupby(col1)[col2].transform('count')
        data['{}_in_{}_nunique'.format(col1, col2)] = data.groupby(col1)[col2].transform('nunique')
        data['{}_in_{}_nunique/{}_in_{}_count'.format(col1, col2, col1, col2)] = data['{}_in_{}_nunique'.format(col1,
                                                                                                                col2)] / \
                                                                                 data['{}_in_{}_count'.format(col1,
                                                                                                              col2)]

        data['{}_in_{}_count'.format(col2, col1)] = data.groupby(col2)[col1].transform('count')
        data['{}_in_{}_nunique'.format(col2, col1)] = data.groupby(col2)[col1].transform('nunique')
        data['{}_in_{}_nunique/{}_in_{}_count'.format(col2, col1, col2, col1)] = data['{}_in_{}_nunique'.format(col2,
                                                                                                                col1)] / \
                                                                                 data['{}_in_{}_count'.format(col2,
                                                                                                              col1)]

In [25]:
# tagid word2vec特征
data['tagid'] = data['tagid'].apply(lambda x: eval(x))
sentences = data['tagid'].values.tolist()
for i in range(len(sentences)):
    sentences[i] = [str(x) for x in sentences[i]]
emb_size = 32
model = Word2Vec(sentences, vector_size=emb_size, window=6, min_count=5, sg=0, hs=0, seed=1, epochs=5)
emb_matrix = []
for seq in tqdm(sentences):
    vec = []
    for w in seq:
        if w in model.wv.key_to_index:
            vec.append(model.wv[w])
    if len(vec) > 0:
        emb_matrix.append(np.mean(vec, axis=0))
    else:
        emb_matrix.append([0] * emb_size)
emb_matrix = np.array(emb_matrix)
for i in range(emb_size):
    data['tag_emb_{}'.format(i)] = emb_matrix[:, i]
    
# tagid tfidf特征
data['tagid'] = data['tagid'].apply(lambda x: ' '.join(map(str,x)))
clf_tfidf = TfidfVectorizer(max_features=30)
tfidf=clf_tfidf.fit_transform(data['tagid'])
tfidf = pd.DataFrame(tfidf.toarray())
tfidf.columns = ['tagid_tfidf_' + str(x) for x in range(30)]
data = pd.concat([data, tfidf], axis=1)

100%|██████████| 400000/400000 [01:05<00:00, 6078.95it/s]


In [27]:
cat_cols = ['gender', 'age', 'province', 'city']
features = [i for i in data.columns if i not in ['pid', 'label', 'tagid', 'time', 'model', 'make']]
data[cat_cols] = data[cat_cols].astype('category')
X_train = data[~data['label'].isna()]
X_test = data[data['label'].isna()]

y = X_train['label']
KF = StratifiedKFold(n_splits=5, random_state=2020, shuffle=True)
params = {
          'objective':'binary',
          'metric':'binary_error',
          'learning_rate':0.05,
          'subsample':0.8,
          'subsample_freq':3,
          'colsample_btree':0.8,
          'num_iterations': 10000,
          'verbose':-1
}
oof_lgb = np.zeros(len(X_train))
predictions_lgb = np.zeros((len(X_test)))
# 特征重要性
feat_imp_df = pd.DataFrame({'feat': features, 'imp': 0})
# 五折交叉验证
for fold_, (trn_idx, val_idx) in enumerate(KF.split(X_train.values, y.values)):
    print("fold n°{}".format(fold_))
    print('trn_idx:',trn_idx)
    print('val_idx:',val_idx)
    trn_data = lgb.Dataset(X_train.iloc[trn_idx][features],label=y.iloc[trn_idx])
    val_data = lgb.Dataset(X_train.iloc[val_idx][features],label=y.iloc[val_idx])
    num_round = 10000
    clf = lgb.train(
        params,
        trn_data,
        num_round,
        valid_sets = [trn_data, val_data],
        verbose_eval=100,
        early_stopping_rounds=50,
        categorical_feature=cat_cols,
    )
    feat_imp_df['imp'] += clf.feature_importance() / 5
    oof_lgb[val_idx] = clf.predict(X_train.iloc[val_idx][features], num_iteration=clf.best_iteration)
    predictions_lgb[:] += clf.predict(X_test[features], num_iteration=clf.best_iteration)
print("AUC score: {}".format(roc_auc_score(y, oof_lgb)))
print("F1 score: {}".format(f1_score(y, [1 if i >= 0.5 else 0 for i in oof_lgb])))
print("Precision score: {}".format(precision_score(y, [1 if i >= 0.5 else 0 for i in oof_lgb])))
print("Recall score: {}".format(recall_score(y, [1 if i >= 0.5 else 0 for i in oof_lgb])))

fold n°0
trn_idx: [     0      1      2 ... 299997 299998 299999]
val_idx: [     3      9     14 ... 299992 299993 299994]
Training until validation scores don't improve for 50 rounds
[100]	training's binary_error: 0.288733	valid_1's binary_error: 0.308517
[200]	training's binary_error: 0.270492	valid_1's binary_error: 0.3062
Early stopping, best iteration is:
[200]	training's binary_error: 0.270492	valid_1's binary_error: 0.3062
fold n°1
trn_idx: [     0      1      2 ... 299996 299998 299999]
val_idx: [    10     13     25 ... 299986 299988 299997]
Training until validation scores don't improve for 50 rounds
[100]	training's binary_error: 0.288637	valid_1's binary_error: 0.309817
[200]	training's binary_error: 0.270087	valid_1's binary_error: 0.305983
[300]	training's binary_error: 0.259467	valid_1's binary_error: 0.305533
Early stopping, best iteration is:
[324]	training's binary_error: 0.257283	valid_1's binary_error: 0.305217
fold n°2
trn_idx: [     0      3      5 ... 299997 2999

In [28]:
X_test['category_id'] = [1 if i >= 2.5 else 0 for i in predictions_lgb]
X_test['user_id'] = X_test['pid']
X_test[['user_id', 'category_id']].to_csv('base3_sub.csv', index=False)

In [15]:
train = pd.read_csv('data/train.txt', header=None, names=['pid', 'label', 'gender', 'age', 'tagid', 'time', 'province', 'city', 'make', 'model'])
test = pd.read_csv('data/test.txt', header=None, names=['pid', 'gender', 'age', 'tagid', 'time', 'province', 'city', 'make', 'model'])
data = pd.concat([train, test]).reset_index(drop=True)