In [None]:
ls

In [None]:
cd /content/drive/MyDrive/DeepLearningRec/XunFei

In [None]:
ls

In [None]:
!pip install category_encoders

In [None]:
import os
import math
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.decomposition import LatentDirichletAllocation, NMF, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import f1_score, fbeta_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from datetime import datetime
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
# Silence every warning and let pandas display all columns of a frame.
warnings.simplefilter('ignore')
pd.options.display.max_columns = None

In [None]:
# Folders that hold the train / test feather files (relative to the working directory).
train_path, test_path = './Dataset/train', './Dataset/test'

In [None]:
# Column-name lists for the two splits (not referenced again in the visible notebook);
# TEST_COLUMNS is TRAIN_COLUMNS without the 'label' column.
TRAIN_COLUMNS = ['pid', 'label', 'gender', 'age', 'tagid', 'time', 'province', 'city', 'model', 'make']
TEST_COLUMNS = [name for name in TRAIN_COLUMNS if name != 'label']

# Read both splits from their feather files.
train_data, test_data = [
    pd.read_feather(os.path.join(folder, filename))
    for folder, filename in ((train_path, 'train.feather'), (test_path, 'test.feather'))
]

In [None]:
# Stack train and test rows into one frame with a fresh 0..n-1 index.
data = pd.concat([train_data, test_data], ignore_index=True)

### 1.基础统计特征和交叉特征
    对 province、city、gender、age 这四个基本特征进行统计，主要为 count 特征
    对 province、city、gender、age 的交叉特征进行统计，主要为 count、nunique、rate 特征。

In [None]:
# Frequency features: for each base attribute, the number of rows sharing the row's value.
for col in ('gender', 'age', 'province', 'city'):
  same_value_groups = data.groupby(col)[col]
  data['{}_count'.format(col)] = same_value_groups.transform('count')

In [None]:
# Pairwise co-occurrence features (how often two attribute values appear together).
# Columns are popped from the end of the list, so each popped column is paired with
# every column still left in the list; the list is empty when the loop finishes.
cross_feature = ['gender', 'age', 'province', 'city']
while cross_feature:
  f = cross_feature.pop()
  for col in cross_feature:
    pair_count_col = '{}_{}_count'.format(f, col)
    data[pair_count_col] = data.groupby([f, col])[col].transform('count')
    # Pair count relative to the single-column count of each member of the pair.
    data['{}_{}/{}_rate'.format(f, col, f)] = data[pair_count_col] / data['{}_count'.format(f)]
    data['{}_{}/{}_rate'.format(f, col, col)] = data[pair_count_col] / data['{}_count'.format(col)]

In [None]:
# Cross-column cardinality features: for every unordered pair (col1, col2), the number of
# distinct col2 values seen within each col1 group, and vice versa.
nunique_cols = ['age', 'province', 'city']
for index, col1 in enumerate(nunique_cols):
    # Start at index + 1 so a column is never paired with itself: a group keyed by a
    # column holds exactly one distinct value of that same column, which only produced
    # constant features (and wrote each of them twice).
    for col2 in nunique_cols[index + 1:]:
        data['{}_in_{}_nunique'.format(col1, col2)] = data.groupby(col1)[col2].transform('nunique')
        data['{}_in_{}_nunique'.format(col2, col1)] = data.groupby(col2)[col1].transform('nunique')

### 2.与CTR有关的统计特征
    将 gender、age、city、province 作为类别特征；
    age_city、age_province、age_gender、gender_province、gender_city作为交叉类别特征；
    上述类别特征用一种带平滑的 TargetEncoder 作为编码；

In [None]:
# Temporarily turn the numeric attributes into strings so they can be concatenated
# with other columns when the cross features are built.
for numeric_col in ('gender', 'age'):
  data[numeric_col] = data[numeric_col].map(str)

In [None]:
# Build the interaction columns: each one is "<left value>_<right value>" of the two
# columns named in the feature name.
cross_features = ['age_city', 'age_province', 'age_gender', 'gender_province', 'gender_city']
for feature in cross_features:
  left_col, right_col = feature.split('_')
  data[feature] = data[left_col] + '_' + data[right_col]

In [None]:
# Restore the numeric attributes to floats now that the cross features exist.
for numeric_col in ('gender', 'age'):
  data[numeric_col] = data[numeric_col].map(float)

In [None]:
import category_encoders as ce

# Target-encode the categorical columns with an m-estimate encoder.
# The encoder is fitted on the labelled rows only and then applied to the unlabelled rows.
is_labelled = data['label'].notna()
X_train, X_test = data[is_labelled], data[~is_labelled]
for feat in ['gender', 'age', 'province', 'city'] + cross_features:
  target_encoder = ce.MEstimateEncoder(cols=[feat], sigma=0.1, m=200.0)
  train_feat_info = target_encoder.fit_transform(X_train[[feat]], X_train['label'])
  # Fix: the former chained assignment `data[300000:][...] = ...` wrote into a temporary
  # slice (so it never reached `data`) and hard-coded the number of training rows.
  test_feat_info = target_encoder.transform(X_test[[feat]])
  # Both parts keep the row index of `data`, so the assignment aligns by index.
  data[feat + '_m_estimate_encoding'] = pd.concat([train_feat_info, test_feat_info])[feat]

In [None]:
# A Wilson confidence-interval estimate reduces the bias caused by categories that
# occur only a few times.
# Author's note: using the Wilson estimate turned out to bring a large improvement.
def wilson_score(pos, total, p_z=1.96):
    """
    Lower bound of the Wilson score interval for a positive rate.

    :param pos: number of positive examples
    :param total: total number of examples
    :param p_z: quantile of the normal distribution (1.96 by default)
    :return: the Wilson score
    """
    z_squared = np.square(p_z)
    pos_rat = pos * 1. / total * 1.  # observed positive rate
    centre = pos_rat + (z_squared / (2. * total))
    margin = (p_z / (2. * total)) * np.sqrt(4. * total * (1. - pos_rat) * pos_rat + z_squared)
    denominator = 1. + z_squared / total
    return (centre - margin) / denominator

In [None]:
# Wilson-score feature per categorical column: the score is computed from the labelled
# rows for every category value and then looked up for each row of `data`
# (values never seen in the labelled rows map to NaN).
for feature in ['gender', 'age', 'province', 'city'] + cross_features:
  label_by_value = X_train.groupby(feature)['label']
  dict_sum = dict(label_by_value.count())
  dict_pos = dict(label_by_value.sum())
  dict_wilson_score = {
      key: wilson_score(dict_pos[key], total) for key, total in dict_sum.items()
  }
  data['{}_wilsion_score'.format(feature)] = data[feature].map(
      lambda value: dict_wilson_score.get(value, np.nan)
  )

### 3.标签特征的处理

    对于标签的处理，基本思想还是基于协同过滤；
    基于标签的协同过滤：利用历史标签和 CTR 之间的关系，以及当前用户的标签和历史标签之间的关系，得到当前用户可能的 CTR 特征；
    基于用户的协同过滤：希望在历史用户中找到和当前用户最相似的用户，可以选择将用户表达成向量，也可以选择将标签作为 item，利用基于邻域的协同过滤算法。

#### 3.1 统计特征（模型结果显示这是个强特征）
    根据训练集清洗出的严格有用的标签以及每个用户过滤后的有用标签进行处理;
    计算每个用户的所有有用标签 pos_rate 和 wilson 置信区间的 min、max、mean、std 特征，以及有多少有用的标签数目；

In [None]:
# Load the per-tag statistics table.
tag_info = pd.read_feather('./Dataset/tag_info.feather')

In [None]:
# Lookup tables (tag -> click rate, tag -> Wilson score) restricted to the rows of
# tag_info flagged by 'is_stricted_filtered'.
strict_tag_info = tag_info[tag_info['is_stricted_filtered']]
useful_tag_pos_rate_dict = dict(strict_tag_info[['index', 'click_rate']].values)
useful_tag_wilson_score_dict = dict(strict_tag_info[['index', 'wilson_score']].values)

In [None]:
# Mean Wilson score over the strictly filtered tags; used below as the fallback value
# for rows whose tag field is not an array.
wilson_score_mean = np.mean(list(useful_tag_wilson_score_dict.values()))

In [None]:
def _strict_tag_pos_rates(tags):
  """Click rates of a row's strictly filtered tags; [0.5] when the field is not an ndarray."""
  if type(tags) is not np.ndarray:
    return [0.5]
  return [useful_tag_pos_rate_dict[tag] for tag in tags]

data['strict_filtered_tagid_pos_rate'] = data['strict_filtered_tagid'].apply(_strict_tag_pos_rates)

In [None]:
def _strict_tag_wilson_scores(tags):
  """Wilson scores of a row's strictly filtered tags; [wilson_score_mean] when the field is not an ndarray."""
  if type(tags) is not np.ndarray:
    return [wilson_score_mean]
  return [useful_tag_wilson_score_dict[tag] for tag in tags]

data['strict_filtered_tagid_wilsion_score'] = data['strict_filtered_tagid'].apply(_strict_tag_wilson_scores)

In [None]:
# Number of strictly filtered tags per row (0 when the field is not an ndarray).
data['useful_tag_len'] = data['strict_filtered_tagid'].map(
    lambda tags: len(tags) if type(tags) is np.ndarray else 0
)

In [None]:
# Summary statistics (min, max, mean, std) over the pos_rate and wilson_score values
# of each row's useful tags.
def indicator_func(func, x):
  """Reduce ``x`` with the statistic named by ``func``.

  'min', 'max' and 'mean' select the matching NumPy reduction; any other name
  falls back to the standard deviation (``np.std``).
  """
  named_reducers = {'min': np.min, 'max': np.max, 'mean': np.mean}
  reducer = named_reducers.get(func, np.std)
  return reducer(x)

# One column per (value list, statistic) combination.
for feat in ['strict_filtered_tagid_pos_rate', 'strict_filtered_tagid_wilsion_score']:
  for indicator in ['min', 'max', 'mean', 'std']:
    stat_col = '{}_{}'.format(feat, indicator)
    data[stat_col] = data[feat].map(lambda values: indicator_func(indicator, values))

In [None]:
# The per-row value lists are no longer needed once their statistics are stored.
data = data.drop(columns=['strict_filtered_tagid_pos_rate', 'strict_filtered_tagid_wilsion_score'])

#### 3.2 用户标签的 tf-idf 特征和 countvectorizer 特征

    在清洗过的、对用户购买行为有比较强指示作用的标签上，计算用户标签的 tf-idf 特征和 countvectorizer 特征；
    然后用 LSA 模型得到用户-主题矩阵；

In [None]:
# Rows whose tag field is not an ndarray get the single placeholder token 'Unknown'.
data['strict_filtered_tagid'] = data['strict_filtered_tagid'].map(
    lambda tags: tags if type(tags) is np.ndarray else ['Unknown']
)

In [None]:
# TF-IDF and raw-count representations of each row's strictly filtered tags
# (LSA is applied to both afterwards).
# NOTE(review): the next cell rebuilds the same four variables with max_features=1000
# on both vectorizers, so the results computed here are overwritten.
corpus = data['strict_filtered_tagid'].apply(' '.join)
tfv = TfidfVectorizer(max_features=1000)
tf_idf_features = tfv.fit_transform(corpus)
cntv = CountVectorizer()
cnt_features = cntv.fit_transform(corpus)

In [None]:
# Each row's tags are joined into one space-separated document, then vectorized with
# TF-IDF and with raw counts; both vocabularies are capped at 1000 terms.
corpus = data['strict_filtered_tagid'].apply(' '.join)
tfv = TfidfVectorizer(max_features=1000)
tf_idf_features = tfv.fit_transform(corpus)
cntv = CountVectorizer(max_features=1000)
cnt_features = cntv.fit_transform(corpus)

In [None]:
# LSA: truncated SVD of the TF-IDF matrix (64 components) and of the count matrix (128 components).
tf_idf_svd, cnt_svd = TruncatedSVD(n_components=64), TruncatedSVD(n_components=128)
tf_idf_svd_feats = tf_idf_svd.fit_transform(tf_idf_features)
cnt_svd_feats = cnt_svd.fit_transform(cnt_features)

In [None]:
# Copy every SVD component into its own column of `data` (64 TF-IDF + 128 count components).
for prefix, component_count, svd_feats in (
    ('tf_idf_feats', 64, tf_idf_svd_feats),
    ('cnt_feats', 128, cnt_svd_feats),
):
  for i in range(component_count):
    data['{}_{}'.format(prefix, i)] = svd_feats[:, i]

#### 3.3 使用 LDA 主题模型
    使用 LDA 主题模型来对用户的标签进行预处理；
    LDA特征也可以作为神经网络的线性输入；
    将每个用户的标签表达成主题的形式，然后可以根据主题进行聚类，得到新的类别特征；

In [None]:
from gensim.models import LdaModel
from gensim.corpora import Dictionary

# One "document" per row: the row's gently filtered tags.
lines = list(data['gentle_filtered_tagid'].values)

# Bag-of-words corpus -> 32-topic LDA -> per-row topic distribution.
dictionary = Dictionary(lines)
corpus = [dictionary.doc2bow(text) for text in lines]
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=32)
gentle_filtered_tag_lda = [lda[bow] for bow in corpus]

In [None]:
from gensim.models import LdaModel
from gensim.corpora import Dictionary

# One "document" per row: the row's strictly filtered tags.
lines = list(data['strict_filtered_tagid'].values)

# Bag-of-words corpus -> 16-topic LDA -> per-row topic distribution.
dictionary = Dictionary(lines)
corpus = [dictionary.doc2bow(text) for text in lines]
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=16)
strict_filtered_tag_lda = [lda[bow] for bow in corpus]

#### 3.3 word2vec预训练标签的序列表示
    应用 word2vec 预训练得到标签的embedding表示；
    可以用来初始化神经网络层；
    分别尝试对精细清洗的标签和粗略清洗的标签都做一次预训练；
    也可以直接通过平均池化或者加权池化得到用户的向量表示（二者择一试试）；


In [None]:
# Pre-train Word2Vec on the gently filtered tag sequences; the tag vectors are averaged
# into a user vector afterwards. Every tag is converted to its string form first.
sentences = [
    [str(x) for x in tags] for tags in data['gentle_filtered_tagid'].values.tolist()
]
gentle_embed_size = 32
model = Word2Vec(sentences, size=gentle_embed_size, window=10, min_count=4, sg=0, hs=0, seed=1, iter=10)

In [None]:
# Mean-pool the Word2Vec vectors of each row's in-vocabulary tags; rows without any
# in-vocabulary tag get an all-zero vector.
gentle_emb_matrix = []
for seq in tqdm(sentences):
  vec = [model.wv[w] for w in seq if w in model.wv.vocab.keys()]
  pooled = np.mean(vec, axis=0) if len(vec) > 0 else [0] * gentle_embed_size
  gentle_emb_matrix.append(pooled)

gentle_emb_matrix = np.array(gentle_emb_matrix)

In [None]:
# Store the 32 pooled embedding dimensions as individual columns.
for dim in range(32):
  data['gentle_w2v_emb_{}'.format(dim)] = gentle_emb_matrix[:, dim]

In [None]:
# Pre-train Word2Vec on the strictly filtered tag sequences, then mean-pool the tag
# vectors of each row into a user vector.
sentences = data['strict_filtered_tagid'].values.tolist()
for i in  range(len(sentences)):
  sentences[i] = [str(x) for x in sentences[i]]
strict_embed_size = 16
model = Word2Vec(sentences, size = strict_embed_size, window=6, min_count=4, sg=0, hs=0, seed=1, iter=10)
strict_emb_matrix = []
for seq in tqdm(sentences):
  # Only tags that made it into the Word2Vec vocabulary (min_count=4) have a vector.
  vec = [model.wv[w] for w in seq if w in model.wv.vocab.keys()]
  if len(vec) > 0:
    strict_emb_matrix.append(np.mean(vec, axis=0))
  else:
    # Fix: this used the undefined name `embed_size`, raising NameError for the first
    # row without any in-vocabulary tag; the zero vector must match strict_embed_size.
    strict_emb_matrix.append([0] * strict_embed_size)

strict_emb_matrix = np.array(strict_emb_matrix)

In [None]:
# Store the 16 pooled embedding dimensions as individual columns.
for dim in range(16):
  data['strict_w2v_emb_{}'.format(dim)] = strict_emb_matrix[:, dim]

#### 3.4 Doc2Vec 直接得到用户的向量表示（运行时间太久，并没有跑完）
    根据用户的标签直接得到用户的向量表示；
    可以作为神经网络的输入；
    可以直接作为树模型的输入；

In [None]:
# Pre-train Doc2Vec on the gently filtered tags: one tagged document per row,
# identified by the row's pid.
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

gentle_embed_size = 32
gentle_documents = []
for pid, tags in zip(data['pid'], data['gentle_filtered_tagid']):
  gentle_documents.append(TaggedDocument(tags, [pid]))
gentle_doc2vec_model = Doc2Vec(
    gentle_documents,
    vector_size=gentle_embed_size,
    window=6,
    min_count=1,
    workers=4,
    epochs=20,
)

In [None]:
# Placeholder for the Doc2Vec user embeddings; nothing in the visible notebook fills it.
gentle_doc2vec_emb_matrix = []

In [None]:
# Pre-train Doc2Vec on the strictly filtered tags: one tagged document per row,
# identified by the row's pid.
strict_embed_size = 32
strict_documents = [TaggedDocument(tags, [pid]) for pid, tags in zip(data['pid'], data['strict_filtered_tagid'])]
# Fix: the model was trained on `gentle_documents`, leaving `strict_documents` unused.
strict_doc2vec_model = Doc2Vec(strict_documents, vector_size=strict_embed_size, window=6, min_count=1, workers=4, epochs=20)

#### 3.5 根据用户的向量表示对用户进行聚类

    根据前面LDA得到的用户向量，对用户进行聚类，得到新的类别型特征；
    直接根据 LDA 计算每个用户权重最高的主题也可以；
    再对整个类别型特征做类别型特征的处理；

In [None]:
# 1. Use the LDA output directly: build a dense row-by-topic matrix and take the
#    highest-weighted topic of each row as a categorical feature.
GENTLE_LDA_TOPICS = 32  # must equal num_topics of the LDA model fitted on the gentle tags
# Fix: the row count was hard-coded as 400000; it now follows the LDA output, which
# holds one entry per row of `data`.
gentle_lda_matrix = np.zeros((len(gentle_filtered_tag_lda), GENTLE_LDA_TOPICS))
for row, lda_feat in enumerate(gentle_filtered_tag_lda):
  # Each entry is a list of (topic index, weight) pairs; topics not listed stay 0.
  for col, val in lda_feat:
    gentle_lda_matrix[row, col] = val

data['gentle_lda_max'] = np.argmax(gentle_lda_matrix, axis=1)
for i in range(GENTLE_LDA_TOPICS):
  data['gentle_lda_emb_{}'.format(i)] = gentle_lda_matrix[:, i]

In [None]:
# Dense row-by-topic matrix for the strict-tag LDA model, plus the highest-weighted
# topic of each row as a categorical feature.
STRICT_LDA_TOPICS = 16  # must equal num_topics of the LDA model fitted on the strict tags
# Fix: the row count was hard-coded as 400000; it now follows the LDA output, which
# holds one entry per row of `data`.
strict_lda_matrix = np.zeros((len(strict_filtered_tag_lda), STRICT_LDA_TOPICS))
for row, lda_feat in enumerate(strict_filtered_tag_lda):
  # Each entry is a list of (topic index, weight) pairs; topics not listed stay 0.
  for col, val in lda_feat:
    strict_lda_matrix[row, col] = val

data['strict_lda_max'] = np.argmax(strict_lda_matrix, axis=1)
for i in range(STRICT_LDA_TOPICS):
  data['strict_lda_emb_{}'.format(i)] = strict_lda_matrix[:, i]

In [None]:
# 2. Cluster the LDA topic vectors; the cluster id becomes a new categorical feature.
from sklearn.cluster import KMeans

kms = KMeans(n_clusters=32).fit(gentle_lda_matrix)
data['gentle_lda_kmeans'] = kms.predict(gentle_lda_matrix)

In [None]:
# Same clustering for the strict-tag LDA vectors (32 clusters).
kms = KMeans(n_clusters=32)
strict_cluster_ids = kms.fit_predict(strict_lda_matrix)
data['strict_lda_kmeans'] = strict_cluster_ids

In [None]:
# Target-encode the topic / cluster features with an m-estimate encoder.
# The encoder is fitted on the labelled rows only and then applied to the unlabelled rows.
is_labelled = data['label'].notna()
X_train, X_test = data[is_labelled], data[~is_labelled]
for feat in ['gentle_lda_max', 'strict_lda_max', 'gentle_lda_kmeans', 'strict_lda_kmeans']:
  target_encoder = ce.MEstimateEncoder(cols=[feat], sigma=0.1, m=100.0)
  train_feat_info = target_encoder.fit_transform(X_train[[feat]], X_train['label'])
  # Fix: the former chained assignment `data[300000:][...] = ...` wrote into a temporary
  # slice (so it never reached `data`) and hard-coded the number of training rows.
  test_feat_info = target_encoder.transform(X_test[[feat]])
  # Both parts keep the row index of `data`, so the assignment aligns by index.
  data[feat + '_m_estimate_encoding'] = pd.concat([train_feat_info, test_feat_info])[feat]

#### 3.6 挖掘用户共现特征找相似用户（时间关系还没有完成）
    利用协同过滤的思想；
    根据用户的历史标签，找到与用户最相似的 topn 个用户，然后利用这 topn 个最相似用户的 pos_rate；
    基于邻域的协同过滤存在计算比较复杂的问题，所以选择使用 Faiss 库、利用用户向量聚类来做；
    其实前面的聚类思想也是希望利用与当前用户相似的用户的 pos_rate 统计特征；

In [None]:
# Inspect the first 50 column names.
data.columns[:50]

In [None]:
# Categorical columns (raw attributes, cross features, topic / cluster ids). They are
# excluded from the model inputs together with the identifier, target and raw tag/time columns.
cat_cols = ['gender', 'age', 'province', 'city'] + cross_features + ['gentle_lda_max', 'strict_lda_max', 'gentle_lda_kmeans', 'strict_lda_kmeans']
excluded_cols = set(cat_cols + [
    'pid', 'label', 'tagid', 'time', 'make', 'model',
    'strict_filtered_tagid', 'strict_filtered_day',
    'gentle_filtered_tagid', 'gentle_filtered_day',
])
features = [name for name in data.columns if name not in excluded_cols]
all_feature = list(features)

data[cat_cols] = data[cat_cols].astype('category')

# Labelled rows form the training set, unlabelled rows the test set.
has_label = data['label'].notna()
X_train, X_test = data[has_label], data[~has_label]

y = X_train['label']

In [None]:
# Feature-importance accumulator: one row per model feature, starting at 0.
feat_imp_df = pd.DataFrame({'feat': all_feature, 'imp': 0})

In [None]:
# Stratified K-fold cross-validation with LightGBM.
N_SPLITS = 5
KF = StratifiedKFold(n_splits=N_SPLITS, random_state=2021, shuffle=True)
# NOTE(review): 'n_thread' does not appear among LightGBM's documented aliases of
# num_threads — confirm it is actually picked up.
params = {
  'objective':'binary',
  'metric':'binary_error',
  'learning_rate':0.02,
  'subsample':0.8,
  'subsample_freq':3,
  'colsample_bytree':0.8,
  'num_iterations':10000,
  'verbose':-1,
  'n_thread':-1
}

oof_lgb = np.zeros(len(X_train))          # out-of-fold predictions for the training rows
predictions_lgb = np.zeros((len(X_test)))  # fold-averaged predictions for the test rows

# The test matrix is the same for every fold, so build it once.
test_x = X_test[features]

# Feature importance is accumulated over the folds as well.
for fold_, (trn_idx, val_idx) in enumerate(KF.split(X_train, y)):
    print("fold n°{}".format(fold_))
    print('trn_idx:',trn_idx)
    print('val_idx:',val_idx)
    train_x, val_x = X_train.iloc[trn_idx][features], X_train.iloc[val_idx][features]
    train_y, val_y = y.iloc[trn_idx], y.iloc[val_idx]
    trn_data = lgb.Dataset(train_x, label=train_y)
    val_data = lgb.Dataset(val_x, label=val_y)
    num_round = 10000
    clf = lgb.train(
        params,
        trn_data,
        num_round,
        valid_sets = [trn_data, val_data],
        verbose_eval=100,
        early_stopping_rounds=50,
        # Author's note: passing the categorical features made the model perform worse.
        # categorical_feature=cat_cols,
    )
    feat_imp_df['imp'] += clf.feature_importance() / N_SPLITS
    oof_lgb[val_idx] = clf.predict(val_x, num_iteration=clf.best_iteration)
    # Fix: average the fold predictions instead of summing them, so the result stays
    # on the probability scale (consistent with the averaged feature importance).
    predictions_lgb[:] += clf.predict(test_x, num_iteration=clf.best_iteration) / N_SPLITS

# Hard labels at the 0.5 threshold, computed once for all three metrics.
oof_labels = [1 if i >= 0.5 else 0 for i in oof_lgb]
print("AUC score: {}".format(roc_auc_score(y, oof_lgb)))
print("F1 score: {}".format(f1_score(y, oof_labels)))
print("Precision score: {}".format(precision_score(y, oof_labels)))
print("Recall score: {}".format(recall_score(y, oof_labels)))

In [None]:
# Label test rows whose score is at or above the median score as 1, the rest as 0,
# and write the submission file.
median = np.median(predictions_lgb)
# Fix: build a separate frame instead of adding columns to X_test, which is a
# filtered slice of `data` (chained-assignment / SettingWithCopy hazard).
submission = pd.DataFrame({
    'user_id': X_test['pid'].values,
    'category_id': [1 if i >= median else 0 for i in predictions_lgb],
})
submission.to_csv('lgb_731.csv', index=False)

In [None]:
# NOTE: disabled experiment — fits a single model on the whole training set
# (no cross-validation, 800 iterations) and scores it on the same training rows.
# params = {
#   'objective':'binary',
#   'metric':'binary_error',
#   'learning_rate':0.02,
#   'subsample':0.8,
#   'subsample_freq':3,
#   'colsample_bytree':0.8,
#   'num_iterations': 800,
#   'verbose':1,
#   'n_thread':-1
# }

# train_x, test_x = X_train[features], X_test[features]
# # compute the pos_rate of each categorical feature separately
# trn_data = lgb.Dataset(train_x, label=y)
# clf = lgb.train(
#     params,
#     trn_data,
#     valid_sets = [trn_data],
#     verbose_eval=100,
#     # categorical_feature=cat_cols,
# )
# feat_imp_df['imp'] += clf.feature_importance()
# oof_lgb = clf.predict(train_x)
# predictions_lgb[:] = clf.predict(test_x, num_iteration=clf.best_iteration)
# print("AUC score: {}".format(roc_auc_score(y, oof_lgb)))
# print("F1 score: {}".format(f1_score(y, [1 if i >= 0.5 else 0 for i in oof_lgb])))
# print("Precision score: {}".format(precision_score(y, [1 if i >= 0.5 else 0 for i in oof_lgb])))
# print("Recall score: {}".format(recall_score(y, [1 if i >= 0.5 else 0 for i in oof_lgb])))