## 任务1：报名比赛，下载比赛数据集并完成读取

- 步骤1 ：登录&报名比赛：https://aistudio.baidu.com/aistudio/competition/detail/45/0/task-definition
- 步骤2 ：下载比赛数据集
- 步骤3 ：使用Pandas完成数据读取。

## 任务2：对句子对提取TFIDF以及统计特征，训练和预测

参考代码：https://www.kaggle.com/anokas/data-analysis-xgboost-starter-0-35460-lb
- 步骤1 ：对句子对（句子A和句子B统计）如下特征：
  - 句子A包含的字符个数、句子B包含的字符个数
  - 句子A与句子B的编辑距离
  - 句子A与句子B共有单词的个数
  - 句子A与句子B共有字符的个数
  - 句子A与句子B共有单词的个数 / 句子A字符个数
  - 句子A与句子B共有单词的个数 / 句子B字符个数
- 步骤2 ：计算TFIDF，并对句子A和句子B进行特征转换
- 步骤3 ：计算句子A与句子B的TFIDF向量的内积距离
- 步骤4 ：将上述特征送入分类模型，训练并预测，将结果预测提交到比赛网站。

In [19]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import jieba
import distance 
import seaborn as sns
from collections import Counter
import warnings
warnings.filterwarnings('ignore')
import lightgbm as lgb
import xgboost as xgb
import re
from collections import Counter
from sklearn.metrics import roc_auc_score, auc, roc_curve, accuracy_score, f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, QuantileTransformer, KBinsDiscretizer, LabelEncoder, MinMaxScaler, PowerTransformer

In [20]:
## jieba分词 
# cut_all=True，全模式，“我来到北京清华大学”-->“ 我/ 来到/ 北京/ 清华/ 清华大学/ 华大/ 大学”
def jieba_cut(sentence,method=True):
    word_list = jieba.lcut(sentence,cut_all=method) 
    return word_list 

   
# 分词后，两句子相同词占所有词（去重）的比例
def percent(q1_cut,q2_cut):
    inter_num = len(set(q1_cut) & set(q2_cut))
    percent = inter_num/len(set(q1_cut))
    return percent

## 获取停词 
# https://github.com/goto456/stopwords
def stopwords():
    stop_words =[]
    with open('./cn_stopwords.txt','r',encoding='UTF-8') as f:
        for i in f.readlines():
            i = i.replace('\n','')
            stop_words.append(i)
    return stop_words

# 词共享 比例
def word_match_share(row,stops):
    q1words = {}
    q2words = {}
    # 剔除停词
    for word in str(row['q1_cut']):
        if word not in stops:
            q1words[word] = 1
    for word in str(row['q2_cut']):
        if word not in stops:
            q2words[word] = 1
    if len(q1words) == 0 or len(q2words) == 0:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0
    shared_words_in_q1 = [w for w in q1words.keys() if w in q2words]
    shared_words_in_q2 = [w for w in q2words.keys() if w in q1words]
    R = (len(shared_words_in_q1) + len(shared_words_in_q2))/(len(q1words) + len(q2words))
    return R

# 准备语料
def all_words(train):
    corpus = []
    # 遍历每行，q1分词，q2分词，合并
    for row_id in range(len(train)):
        row = train.iloc[row_id]
        all_words = list()
        all_words.extend([word for word in row['q1_cut'] if word not in stopwords()])
        all_words.extend([word for word in row['q2_cut'] if word not in stopwords()])
        corpus.append(' '.join(all_words))
    return corpus

# 定义权重
# 词个数为1的，权重为0，大于1的，权重为 1/(count+10000)
def get_weight(cnt, eps=10000, min_count=2):
    if cnt < min_count:
        return 0
    else:
        return 1 / (cnt + eps)


def tfidf_word_match_share(row,weight):
    q1words = {word:1 for word in row['q1_cut'] if word not in stopwords()}
    q2words = {word:1 for word in row['q2_cut'] if word not in stopwords()}
    if len(q1words)==0 or len(q2words)==0:
        return 0 
    
    # 获取共享词的权重
    shared_weights = [weight.get(w,0) for w in q1words.keys() if w in q2words] + [weight.get(w,0) for w in q2words.keys() if w in q1words]
    # 总权重
    total_weights = [weight.get(w, 0) for w in q1words] + [weight.get(w, 0) for w in q2words]
    # 共享词权重比例
    R = np.sum(shared_weights) / np.sum(total_weights)
    return R

In [None]:
# 特征处理

def handle_feature(train):
    # 字符个数
    train['q1_len'] = train['q1'].apply(len)
    train['q2_len'] = train['q2'].apply(len)  

    train['q1_cut'] = train['q1'].apply(lambda x:jieba_cut(x)) 
    train['q2_cut'] = train['q2'].apply(lambda x:jieba_cut(x)) 
    # 分词后的词个数
    train['q1_cut_len'] = train['q1_cut'].apply(len)
    train['q2_cut_len'] = train['q2_cut'].apply(len)  

    # 编辑距离 
    # Levenshtein Distance 被称为编辑距离（Edit Distance），一个度量两个字符序列之间差异的字符串度量标准
    train['Lev_distance'] = train.apply(lambda x:distance.levenshtein(x['q1'],x['q2']),axis=1)
    train['q1_cut_percent'] = train.apply(lambda x: percent(x['q1_cut'],x['q2_cut']),axis=1)
    train['q2_cut_percent'] = train.apply(lambda x: percent(x['q2_cut'],x['q1_cut']),axis=1)

    train['word_match'] = train.apply(lambda x: word_match_share(x,stopwords()),axis=1)

    corpus = all_words(train)
    corpus = (' '.join(corpus).split())
    word_cnt = Counter(corpus)

    # 生成词权重，词--权重
    weight =  { word:get_weight(cnt) for word ,cnt in word_cnt.items()}
    train['tfidf_word_match'] = train.apply(lambda x:tfidf_word_match_share(x,weight),axis=1)
    return train



In [None]:
# LigthGMB    
          
def train_lgb_kfold(X_train, y_train, X_test, n_fold=5):
    '''train lightgbm with k-fold split'''
    gbms = []
    kfold = StratifiedKFold(n_splits=n_fold, random_state=2021, shuffle=True)
    oof_preds = np.zeros((X_train.shape[0],))
    test_preds = np.zeros((X_test.shape[0],))

    for fold, (train_index, val_index) in enumerate(kfold.split(X_train, y_train)):
        X_tr, X_val, y_tr, y_val = X_train.iloc[train_index], X_train.iloc[val_index], y_train[train_index], y_train[val_index]
        dtrain = lgb.Dataset(X_tr, y_tr)
        dvalid = lgb.Dataset(X_val, y_val, reference=dtrain)

        params = {
            'objective': 'binary',
            'metric': 'auc',
            'num_leaves': 300,
            'boosting_type':'gbdt',
            'subsample_freq':1,
            'reg_alpha':0.5,
            'reg_lambda':0.5,
            'n_estimators':2000,
            'learning_rate': 0.05,
            'min_data_in_leaf': 150,
            'feature_fraction': 0.8,
            'bagging_fraction': 0.8,
            'n_jobs': -1,
            'seed': 2022,
            'verbose':-1
        }

        gbm = lgb.train(params,
                        dtrain,
                        num_boost_round=100,
                        valid_sets=[dtrain, dvalid],
                        verbose_eval=50,
                        early_stopping_rounds=20)

        oof_preds[val_index] = gbm.predict(X_val, num_iteration=gbm.best_iteration)
        test_preds += gbm.predict(X_test, num_iteration=gbm.best_iteration) / kfold.n_splits
        gbms.append(gbm)

    return gbms, oof_preds, test_preds


def train_lgb(train, test, feat_cols, label_col, n_fold=10):
    '''训练lightgbm'''
    X_train = train[feat_cols]
    y_train = train[label_col]
    X_test = test[feat_cols]
    gbms_lgb, oof_preds_lgb, test_preds_lgb = train_lgb_kfold(X_train, y_train, X_test, n_fold=n_fold)
    
    return gbms_lgb, oof_preds_lgb, test_preds_lgb


def model(data):
    train = pd.read_csv(data_dir+'/'+data+'/train.tsv',sep='\t',error_bad_lines=False,names=['q1','q2','label']).dropna()
    test = pd.read_csv(data_dir+'/'+data+'/test.tsv',sep='\t',error_bad_lines=False,names=['q1','q2']).dropna()
    test['label'] = -1 
    dev = pd.read_csv(data_dir+'/'+data+'/dev.tsv',sep='\t',error_bad_lines=False,names=['q1','q2','label']).dropna()
    print(f'开始处理{data}数据')
    
    # label非[0,1]的不处理
    if len(set(train.label))>2:
        train = train[train['label'].isin(['0', '1'])]
        train['label'] = train['label'].astype('int')
    print('数据读取完成~')
    
    print('特征处理~')
    train = handle_feature(train)
    test = handle_feature(test)
    print('特征处理完成~')
    
    feat_cols = ['Lev_distance','word_match', 'tfidf_word_match']
    
    # 因为有drop，注意训练集的index的连续性
    train.index = train.reset_index(drop=True).index
    
    print('模型训练中~')
    gbms_lgb, oof_preds_lgb, test_preds_lgb = train_lgb(train, test,
                                                    feat_cols=feat_cols,
                                                    label_col='label')
    
    
    # 测试集预测结果
    df_test_submit = pd.DataFrame({'index': test.index.to_list(),
                               'prediction': test_preds_lgb})
    ## 直接按 0.5 划分
    df_test_submit['prediction'] =  np.where(df_test_submit['prediction']>0.5,1,0)
    df_test_submit.to_csv('./submit_result/' + data + '.tsv', index=False, sep='\t')
    
    print('预测输出成功~')

In [31]:
data_dir = 'D:/study_hard/statistic/千言数据集'
data_list = ['bq_corpus','lcqmc','paws-x-zh']
# data_dir = 'E:/学习/千言数据集/'

for data in data_list:
    model(data)

开始处理bq_corpus数据
数据读取完成~
特征处理~
特征处理完成~
模型训练中~
[LightGBM] [Info] Number of positive: 38832, number of negative: 38742
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 503
[LightGBM] [Info] Number of data points in the train set: 77574, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500580 -> initscore=0.002320
[LightGBM] [Info] Start training from score 0.002320
Training until validation scores don't improve for 20 rounds
[50]	training's auc: 0.741194	valid_1's auc: 0.718882
[100]	training's auc: 0.743538	valid_1's auc: 0.719686
[150]	training's auc: 0.745008	valid_1's auc: 0.720369
Early stopping, best iteration is:
[159]	training's auc: 0.745212	valid_1's auc: 0.720489
[LightGBM] [Info] Number of positive: 38832, number of negative: 38742
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true

## 任务3：加载中文词向量，自己训练中文词向量

- 步骤1 ：使用jieba对中文句子进行分词
- 步骤2 ：使用gensim中Word2Vec训练分词后的句子，得到词向量。

参考：https://zhuanlan.zhihu.com/p/114538417#/

https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html#sphx-glr-auto-examples-tutorials-run-word2vec-py

![20220124224400](https://cdn.jsdelivr.net/gh/xihuishawpy/PicBad@main/blogs/pictures/20220124224400.png)

Word2Vec是轻量级的神经网络，其模型仅仅包括输入层、隐藏层和输出层，模型框架根据输入输出的不同，主要包括CBOW和Skip-gram模型。 

- CBOW（Continuous-bag-of-words）的方式是在知道词的上下文的情况下预测当前词；

- Skip-gram是在知道了词的情况下,对词的上下文进行预测；

Word2Vec相对于其他词袋模型来说，解决了2个问题：
1. 词序信息，n-gram捕获局部词序，但会受到高维、稀疏矩阵影响；
2. 潜在单词的含义；
   
但是，无法推断不熟悉的词的向量，FastText可解决这个问题。


In [68]:
data_dir = 'D:/study_hard/statistic/千言数据集'
# data_dir = 'E:/学习/千言数据集/'


train = pd.read_csv(data_dir+'/'+'bq_corpus'+'/train.tsv',sep='\t',error_bad_lines=False,names=['q1','q2','label']).dropna()
train.index = train.reset_index(drop=True).index
test = pd.read_csv(data_dir+'/'+'bq_corpus'+'/test.tsv',sep='\t',error_bad_lines=False,names=['q1','q2']).dropna()
test['label'] = -1 
dev = pd.read_csv(data_dir+'/'+'bq_corpus'+'/dev.tsv',sep='\t',error_bad_lines=False,names=['q1','q2','label']).dropna()

In [69]:
train['q1'].head()

0               用微信都6年，微信没有微粒贷功能
1                         微信消费算吗
2           交易密码忘记了找回密码绑定的手机卡也掉了
3    你好我昨天晚上申请的没有打电话给我今天之内一定会打吗？
4                        “微粒贷开通"
Name: q1, dtype: object

In [70]:
# 分词
train['q1_cut'] = train['q1'].apply(lambda x:jieba_cut(x,method=False)) 
train['q2_cut'] = train['q2'].apply(lambda x:jieba_cut(x,method=False)) 

# 剔除标点
remove_char_list = list('[·’!"#$%&\'()*+,-./:;<=>?@，。?★、…【】《》？“”‘’！[\\]^_`{|}~]+')
def remove_chars(row):
    return [word for word in row if word not in remove_char_list]

train['q1_cut'] = train['q1_cut'].apply(lambda x:remove_chars(x)) 
train['q2_cut'] = train['q2_cut'].apply(lambda x:remove_chars(x)) 

# 删除停词
# def remove_stop_word(row):
#     l = [word for word in row if word not in stopwords()]
#     return l

# train['q1_cut'] = train['q1_cut'].map(lambda row:remove_stop_word(row))
# train['q2_cut'] = train['q2_cut'].map(lambda row:remove_stop_word(row))

In [71]:
train['q1_cut'].head()

0                  [用, 微信, 都, 6, 年, 微信, 没有, 微粒, 贷, 功能]
1                                       [微信, 消费, 算, 吗]
2         [交易, 密码, 忘记, 了, 找回, 密码, 绑定, 的, 手机卡, 也, 掉, 了]
3    [你好, 我, 昨天晚上, 申请, 的, 没有, 打电话, 给, 我, 今天, 之内, 一定...
4                                          [微粒, 贷, 开通]
Name: q1_cut, dtype: object

In [72]:
# 准备语料 
# 一句话一个list

def prepare_words(train):
    corpus = []
    # 遍历每行，q1分词，q2分词，合并
    for row_id in range(len(train)):
        row = train.iloc[row_id]
        all_words = []
        all_words.append([word for word in row['q1_cut']])
        all_words.append([word for word in row['q2_cut']])
        corpus.extend(all_words)
    return corpus


sentences = prepare_words(train)
sentences[:5]

[['用', '微信', '都', '6', '年', '微信', '没有', '微粒', '贷', '功能'],
 ['4', '号码', '来', '微粒', '贷'],
 ['微信', '消费', '算', '吗'],
 ['还有', '多少', '钱', '没', '还'],
 ['交易', '密码', '忘记', '了', '找回', '密码', '绑定', '的', '手机卡', '也', '掉', '了']]

In [73]:
train.iloc[6215]

q1          财务报告
q2            账务
label          1
q1_cut    [财务报告]
q2_cut      [账务]
Name: 6215, dtype: object

In [74]:
# 使用gensim训练词向量
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

def w2v(data):
    n_dim = 100
    # 初始化模型,sg=1:skip-gram
    w2v_model = Word2Vec(sentences=data,vector_size=n_dim,window=5,min_count=5,sg=1,workers=4,epochs=5) 
    # 保存模型
    w2v_model.save('./w2v_model.pkl')
    # 仅保存词+embeddings
    word_vec = w2v_model.wv
    word_vec.save("word2vec.wordvectors") # 保存word vectors
    return w2v_model


w2v_model = w2v(sentences)

collecting all words and their counts
PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
PROGRESS: at sentence #10000, processed 65540 words, keeping 3195 word types
PROGRESS: at sentence #20000, processed 174643 words, keeping 4369 word types
PROGRESS: at sentence #30000, processed 244552 words, keeping 4817 word types
PROGRESS: at sentence #40000, processed 313983 words, keeping 5116 word types
PROGRESS: at sentence #50000, processed 413072 words, keeping 5358 word types
PROGRESS: at sentence #60000, processed 489678 words, keeping 5489 word types
PROGRESS: at sentence #70000, processed 554928 words, keeping 5565 word types
PROGRESS: at sentence #80000, processed 620694 words, keeping 5617 word types
PROGRESS: at sentence #90000, processed 685771 words, keeping 5680 word types
PROGRESS: at sentence #100000, processed 751360 words, keeping 5713 word types
PROGRESS: at sentence #110000, processed 833694 words, keeping 5745 word types
PROGRESS: at sentence #120000, proces

In [83]:
# 删除占用大内存的model
# del w2v_model 

# 加载保存的word vectors
loaded_wv = KeyedVectors.load('word2vec.wordvectors', mmap='r')
loaded_wv['信']

# w2v_model.wv.similarity('微信','密码')
# w2v_model.wv.most_similar('微信')

loading KeyedVectors object from word2vec.wordvectors
KeyedVectors lifecycle event {'fname': 'word2vec.wordvectors', 'datetime': '2022-02-10T10:57:39.234887', 'gensim': '4.1.2', 'python': '3.8.3 (default, Jul  2 2020, 17:30:36) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'loaded'}


array([-1.8727331e-01,  1.0888859e-02, -1.3659933e-01,  1.3600546e-01,
       -2.1996497e-01,  2.0197120e-02,  1.2180665e-01,  2.1304755e-01,
       -2.9821113e-01,  1.5497589e-01, -1.5830395e-01, -4.4661757e-01,
        2.0237821e-01, -3.9499825e-01, -2.3537318e-01, -4.3659130e-01,
        2.7773699e-01,  1.4444243e-02, -2.3975772e-01, -4.5161733e-01,
        1.3033195e-01, -4.0630344e-03, -1.7682567e-01, -3.9634975e-03,
       -2.0297852e-01,  2.7562166e-02, -1.7153351e-01,  1.4266585e-01,
       -5.4410762e-01,  3.9102435e-02, -2.0235352e-02,  2.4826068e-01,
        2.1430512e-01, -7.5782321e-02, -2.1690451e-02,  2.7817345e-01,
       -3.8981200e-03, -1.1847605e-01, -2.8717902e-01, -7.2406012e-01,
       -1.5943624e-02, -5.9930223e-01,  8.2759850e-02, -9.1456905e-02,
        2.4171572e-02, -1.6210064e-01, -4.8542049e-02, -8.6396746e-02,
       -6.1020032e-02,  1.9139996e-01, -2.6346058e-01, -6.3354933e-01,
        1.5281633e-01,  1.1539430e-02,  2.9598472e-01,  8.1363946e-02,
      

## 任务4：使用中文词向量完成mean/max/sif句子编码

- 步骤1 ：单词通过word2vec编码为100维向量，则句子编码为N∗100的矩阵，N为句子单词个数。
- 步骤2 ：将N*100的矩阵进行max-pooling编码，转为100维度。
- 步骤3 ：将N*100的矩阵进行mean-pooling编码，转为100维度。
- 步骤4 ：将N*100的矩阵与单词的IDF进行矩阵相乘，即按照单词的词频进行加权，进行tfidf-pooling编码，转为100维度。
- 步骤5 ：学习SIF编码的原理，进行sif编码，转为100维度。
  - https://github.com/PrincetonML/SIF/blob/master/src/SIF_embedding.py#L30
  - https://openreview.net/pdf?id=SyK00v5xx
- 步骤6（可选） ：通过上述步骤2-步骤5的编码，计算相似句子的相似度 vs 不相似句子的相似度， 绘制得到分布图，哪一种编码最优？

参考：
https://blog.csdn.net/asialee_bird/article/details/100124565

**平均词向量**：

将句子中所有词的word embedding相加取平均，得到的向量当做最终的sentence embedding。该方法缺点是认为句子中的所有词对于表达句子含义同样重要。

![20220126003459](https://cdn.jsdelivr.net/gh/xihuishawpy/PicBad@main/blogs/pictures/20220126003459.png)

**TFIDF加权平均词向量**：

对每个词按照tf-idf进行打分，然后进行加权平均，得到最终的句子表示。

![20220126003755](https://cdn.jsdelivr.net/gh/xihuishawpy/PicBad@main/blogs/pictures/20220126003755.png)

mean-pooling

In [76]:
# mean-pooling: 对每个句子的所有词向量取均值，生成句子的vector

def build_sentence_mean_vector(sentences,w2v_model):
    arr = []
    for word in sentences:
        try:
            arr.append(w2v_model.wv[word])
        except KeyError:
            continue
    if len(arr) == 0:
        return np.zeros(shape=(1,100)).mean(0)
    else:
        return np.vstack(arr).mean(0) 


train['q1_sen_mean_vec'] = train['q1_cut'].map(lambda sentence:build_sentence_mean_vector(sentence,w2v_model))
train['q2_sen_mean_vec'] = train['q2_cut'].map(lambda sentence:build_sentence_mean_vector(sentence,w2v_model))

max-pooling

In [77]:
def build_sentence_max_vector(sentences,w2v_model):
    arr = []
    for word in sentences:
        try:
            arr.append(w2v_model.wv[word])
        except KeyError:
            continue
    if len(arr) == 0:
        return np.zeros(shape=(1,100)).max(0)
    else:
        return np.vstack(arr).max(0) 


train['q1_sen_max_vec'] = train['q1_cut'].map(lambda sentence:build_sentence_max_vector(sentence,w2v_model))
train['q2_sen_max_vec'] = train['q2_cut'].map(lambda sentence:build_sentence_max_vector(sentence,w2v_model))

tfidf-pooling

In [78]:
from sklearn.feature_extraction.text import TfidfVectorizer
import math 

# 对分词计算tf-idf 
s= []
for sentence in sentences:
    s.append(' '.join(sentence))
    
tfidf_vec = TfidfVectorizer()
tf_idf = tfidf_vec.fit_transform(s) # 返回样本、特征矩阵

# 输出tf-idf字典
word_idf_dict ={k:v for k,v in zip(tfidf_vec.get_feature_names(),tfidf_vec.idf_)}
word_idf_dict.get('微信')

4.498836297423477

In [79]:
# 对每个句子的所有词向量取加权(idf)均值，生成句子vector

def build_sentence_weight_vector(sentence,w2v_model,word_idf_dict):
    # key_words_list=list(key_weight)
    arr = []
    for word in sentence:
        try:
            if word in word_idf_dict.keys():
                arr.append(np.dot(w2v_model.wv[word],math.exp(word_idf_dict.get(word))))
            else:
                arr.append(w2v_model.wv[word].reshape(1,100))
        except KeyError:
            continue
        
        if len(arr) == 0:
            return np.zeros(shape=(1,100)).mean(0)
        else:
            return np.vstack(arr).mean(0)


train['q1_sen_idf_vec'] = train['q1_cut'].map(lambda sentence:build_sentence_weight_vector(sentence,w2v_model,word_idf_dict))
train['q2_sen_idf_vec'] = train['q2_cut'].map(lambda sentence:build_sentence_weight_vector(sentence,w2v_model,word_idf_dict))

**SIF编码**（smooth inverse frequency，平滑逆词频）

参考：https://zhuanlan.zhihu.com/p/111710604

原理：计算句子中词向量的加权平均，然后这些词向量分别减去他们各自在句向量矩阵（有词向量组合而成）的第一主向量上的投影即可。这可以理解为删除词向量的“共有部分”，保留每个词向量各自拥有的特征


![20220208142640](https://cdn.jsdelivr.net/gh/xihuishawpy/PicBad@main/blogs/pictures/20220208142640.png)
，a为常数，p(w)代表词频

该方法在各种**文本相似度任务**上的性能显著优于未加权平均，其中大多数任务的性能甚至超过了一些复杂的监督方法

![20220208150301](https://cdn.jsdelivr.net/gh/xihuishawpy/PicBad@main/blogs/pictures/20220208150301.png)


In [80]:
# 统计词频,返回词列表、句子条数
def count_word(data):
    sen_num = 0
    word_list =list()
    for i in range(len(data)):
        word_list.extend(list(set(data.iloc[i]['q1_cut'])))
        word_list.extend(list(set(data.iloc[i]['q2_cut'])))
        sen_num+=2
    return word_list,sen_num

word_list,sen_num = count_word(train)
word_cnt = Counter(word_list)

# 计算sif权重
def sif_weight(word_cnt, a=3e-5):
    # 统计词频p(w)
    word_num = 0
    for k,v in dict(word_cnt).items():
        word_num += v
    # 计算sif权重
    sif = {}
    for k,v in dict(word_cnt).items():
        sif[k] = a / (a + v/word_num)
    return sif

word_sif_dict = sif_weight(word_cnt)

In [81]:
# https://github.com/PrincetonML/SIF/blob/master/src/SIF_embedding.py

from sklearn.decomposition import TruncatedSVD

# 计算主成分
def compute_pc(X,npc=1):
    """
    Compute the principal components. DO NOT MAKE THE DATA ZERO MEAN!
    :param X: X[i,:] is a data point
    :param npc: number of principal components to remove
    :return: component_[i,:] is the i-th pc
    """
    svd = TruncatedSVD(n_components=npc, n_iter=7, random_state=0)
    svd.fit(X)
    return svd.components_

# 移除主成分
def remove_pc(X, npc=1):
    """
    Remove the projection on the principal components
    :param X: X[i,:] is a data point
    :param npc: number of principal components to remove
    :return: XX[i, :] is the data point after removing its projection
    """
    pc = compute_pc(X, npc)
    if npc==1:
        XX = X - X.dot(pc.transpose()) * pc
    else:
        XX = X - X.dot(pc.transpose()).dot(pc)
    return XX

In [82]:
# sif加权词向量，生成句向量

def build_sentence_sif_vector(sentence,w2v_model,word_sif_dict):
    arr = []
    for word in sentence:
        try:
            if word in word_sif_dict.keys():
                arr.append(np.dot(w2v_model.wv[word],math.exp(word_sif_dict[word])))
            else:
                arr.append(w2v_model.wv[word].reshape(1,100))
        except KeyError:
            continue
        
    if len(arr) == 0:
        return np.zeros(shape=(1,100)).mean(0)
    else:
        return np.vstack(arr).mean(0)


train['q1_sen_sif_vec'] = train['q1_cut'].map(lambda sentence:remove_pc(build_sentence_sif_vector(sentence,w2v_model,word_sif_dict).reshape((1,100))))
train['q2_sen_sif_vec'] = train['q2_cut'].map(lambda sentence:remove_pc(build_sentence_sif_vector(sentence,w2v_model,word_sif_dict).reshape((1,100))))


## 任务5：搭建SiamCNN/LSTM模型，训练和预测

- 步骤1 ：将训练好的word2vex作为深度学习embeeding层的初始化参数。
- 步骤2 ：搭建SiamCNN（Word2Vec句子编码 + 1D CNN +FC）的孪生网络结构，完成训练和预测，提交测试集预测结果。
- 步骤3 ：搭建SiamLSTM（Word2Vec句子编码 + LSTM + FC）的孪生网络结构，完成训练和预测，提交测试集预测结果。


## 任务6：搭建InferSent模型，训练和预测

- 步骤1 ：将训练好的word2vex作为深度学习embeeding层的初始化参数。
- 步骤2 ：搭建InferSent模型，尝试不同的交叉方法。
- 步骤3 ：训练InferSent模型，提交测试集预测结果。

## 任务7：搭建ESIM模型，训练和预测

- 步骤1 ：将训练好的word2vex作为深度学习embeeding层的初始化参数。
- 步骤2 ：搭建ESIM模型，尝试不同的交叉方法。
- 步骤3 ：训练ESIM模型，提交测试集预测结果。


## 任务8：使用BERT或ERNIE完成NSP任务

参考代码：
https://aistudio.baidu.com/aistudio/projectdetail/3168859
bert-nsp代码
- 步骤1 ：学习Bert模型的使用。
- 步骤2 ：使用Bert完成NSP任务的训练和预测，提交测试集预测结果。

## 任务8：Bert-flow、Bert-white、SimCSE（可选，不参与积分）

- 步骤1 ：学习Bert-white原理和实现
- 步骤2 ：学习SimCSE原理和实现