## 任务1：报名比赛，下载比赛数据集并完成读取

- 步骤1 ：登录&报名比赛：https://aistudio.baidu.com/aistudio/competition/detail/45/0/task-definition
- 步骤2 ：下载比赛数据集
- 步骤3 ：使用Pandas完成数据读取。

## 任务2：对句子对提取TFIDF以及统计特征，训练和预测

参考代码：https://www.kaggle.com/anokas/data-analysis-xgboost-starter-0-35460-lb
- 步骤1 ：对句子对（句子A和句子B统计）如下特征：
  - 句子A包含的字符个数、句子B包含的字符个数
  - 句子A与句子B的编辑距离
  - 句子A与句子B共有单词的个数
  - 句子A与句子B共有字符的个数
  - 句子A与句子B共有单词的个数 / 句子A字符个数
  - 句子A与句子B共有单词的个数 / 句子B字符个数
- 步骤2 ：计算TFIDF，并对句子A和句子B进行特征转换
- 步骤3 ：计算句子A与句子B的TFIDF向量的内积距离
- 步骤4 ：将上述特征送入分类模型，训练并预测，将结果预测提交到比赛网站。

In [3]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import jieba
import distance 
import seaborn as sns
from collections import Counter
import warnings
warnings.filterwarnings('ignore')
import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import roc_auc_score, auc, roc_curve, accuracy_score, f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, QuantileTransformer, KBinsDiscretizer, LabelEncoder, MinMaxScaler, PowerTransformer

In [4]:
## jieba分词 
# cut_all=True，全模式，“我来到北京清华大学”-->“ 我/ 来到/ 北京/ 清华/ 清华大学/ 华大/ 大学”
def jieba_cut(sentence):
    word_list = jieba.lcut(sentence,cut_all=True) 
    return word_list 

   
# 分词后，两句子相同词占所有词（去重）的比例
def percent(q1_cut,q2_cut):
    inter_num = len(set(q1_cut) & set(q2_cut))
    percent = inter_num/len(set(q1_cut))
    return percent

## 获取停词 
# https://github.com/goto456/stopwords
def stopwords():
    stop_words =[]
    with open('./cn_stopwords.txt','r',encoding='UTF-8') as f:
        for i in f.readlines():
            i = i.replace('\n','')
            stop_words.append(i)
    return stop_words

# 词共享 比例
def word_match_share(row,stops):
    q1words = {}
    q2words = {}
    # 剔除停词
    for word in str(row['q1_cut']):
        if word not in stops:
            q1words[word] = 1
    for word in str(row['q2_cut']):
        if word not in stops:
            q2words[word] = 1
    if len(q1words) == 0 or len(q2words) == 0:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0
    shared_words_in_q1 = [w for w in q1words.keys() if w in q2words]
    shared_words_in_q2 = [w for w in q2words.keys() if w in q1words]
    R = (len(shared_words_in_q1) + len(shared_words_in_q2))/(len(q1words) + len(q2words))
    return R

# 准备语料
def all_words(train):
    corpus = []
    # 遍历每行，q1分词，q2分词，合并
    for row_id in range(len(train)):
        row = train.iloc[row_id]
        all_words = list()
        all_words.extend([word for word in row['q1_cut'] if word not in stopwords()])
        all_words.extend([word for word in row['q2_cut'] if word not in stopwords()])
        corpus.append(' '.join(all_words))
    return corpus

# 定义权重
# 词个数为1的，权重为0，大于1的，权重为 1/(count+10000)
def get_weight(cnt, eps=10000, min_count=2):
    if cnt < min_count:
        return 0
    else:
        return 1 / (cnt + eps)


def tfidf_word_match_share(row,weight):
    q1words = {word:1 for word in row['q1_cut'] if word not in stopwords()}
    q2words = {word:1 for word in row['q2_cut'] if word not in stopwords()}
    if len(q1words)==0 or len(q2words)==0:
        return 0 
    
    # 获取共享词的权重
    shared_weights = [weight.get(w,0) for w in q1words.keys() if w in q2words] + [weight.get(w,0) for w in q2words.keys() if w in q1words]
    # 总权重
    total_weights = [weight.get(w, 0) for w in q1words] + [weight.get(w, 0) for w in q2words]
    # 共享词权重比例
    R = np.sum(shared_weights) / np.sum(total_weights)
    return R

In [None]:
# 特征处理

def handle_feature(train):
    # 字符个数
    train['q1_len'] = train['q1'].apply(len)
    train['q2_len'] = train['q2'].apply(len)  

    train['q1_cut'] = train['q1'].apply(lambda x:jieba_cut(x)) 
    train['q2_cut'] = train['q2'].apply(lambda x:jieba_cut(x)) 
    # 分词后的词个数
    train['q1_cut_len'] = train['q1_cut'].apply(len)
    train['q2_cut_len'] = train['q2_cut'].apply(len)  

    # 编辑距离 
    # Levenshtein Distance 被称为编辑距离（Edit Distance），一个度量两个字符序列之间差异的字符串度量标准
    train['Lev_distance'] = train.apply(lambda x:distance.levenshtein(x['q1'],x['q2']),axis=1)
    train['q1_cut_percent'] = train.apply(lambda x: percent(x['q1_cut'],x['q2_cut']),axis=1)
    train['q2_cut_percent'] = train.apply(lambda x: percent(x['q2_cut'],x['q1_cut']),axis=1)

    train['word_match'] = train.apply(lambda x: word_match_share(x,stopwords()),axis=1)

    corpus = all_words(train)
    corpus = (' '.join(corpus).split())
    word_cnt = Counter(corpus)

    # 生成词权重，词--权重
    weight =  { word:get_weight(cnt) for word ,cnt in word_cnt.items()}
    train['tfidf_word_match'] = train.apply(lambda x:tfidf_word_match_share(x,weight),axis=1)
    return train



In [None]:
# LigthGMB    
          
def train_lgb_kfold(X_train, y_train, X_test, n_fold=5):
    '''train lightgbm with k-fold split'''
    gbms = []
    kfold = StratifiedKFold(n_splits=n_fold, random_state=2021, shuffle=True)
    oof_preds = np.zeros((X_train.shape[0],))
    test_preds = np.zeros((X_test.shape[0],))

    for fold, (train_index, val_index) in enumerate(kfold.split(X_train, y_train)):
        X_tr, X_val, y_tr, y_val = X_train.iloc[train_index], X_train.iloc[val_index], y_train[train_index], y_train[val_index]
        dtrain = lgb.Dataset(X_tr, y_tr)
        dvalid = lgb.Dataset(X_val, y_val, reference=dtrain)

        params = {
            'objective': 'binary',
            'metric': 'auc',
            'num_leaves': 300,
            'boosting_type':'gbdt',
            'subsample_freq':1,
            'reg_alpha':0.5,
            'reg_lambda':0.5,
            'n_estimators':2000,
            'learning_rate': 0.05,
            'min_data_in_leaf': 150,
            'feature_fraction': 0.8,
            'bagging_fraction': 0.8,
            'n_jobs': -1,
            'seed': 2022,
            'verbose':-1
        }

        gbm = lgb.train(params,
                        dtrain,
                        num_boost_round=100,
                        valid_sets=[dtrain, dvalid],
                        verbose_eval=50,
                        early_stopping_rounds=20)

        oof_preds[val_index] = gbm.predict(X_val, num_iteration=gbm.best_iteration)
        test_preds += gbm.predict(X_test, num_iteration=gbm.best_iteration) / kfold.n_splits
        gbms.append(gbm)

    return gbms, oof_preds, test_preds


def train_lgb(train, test, feat_cols, label_col, n_fold=10):
    '''训练lightgbm'''
    X_train = train[feat_cols]
    y_train = train[label_col]
    X_test = test[feat_cols]
    gbms_lgb, oof_preds_lgb, test_preds_lgb = train_lgb_kfold(X_train, y_train, X_test, n_fold=n_fold)
    
    return gbms_lgb, oof_preds_lgb, test_preds_lgb


def model(data):
    train = pd.read_csv(data_dir+'/'+data+'/train.tsv',sep='\t',error_bad_lines=False,names=['q1','q2','label']).dropna()
    test = pd.read_csv(data_dir+'/'+data+'/test.tsv',sep='\t',error_bad_lines=False,names=['q1','q2']).dropna()
    test['label'] = -1 
    dev = pd.read_csv(data_dir+'/'+data+'/dev.tsv',sep='\t',error_bad_lines=False,names=['q1','q2','label']).dropna()
    print(f'开始处理{data}数据')
    
    # label非[0,1]的不处理
    if len(set(train.label))>2:
        train = train[train['label'].isin(['0', '1'])]
        train['label'] = train['label'].astype('int')
    print('数据读取完成~')
    
    print('特征处理~')
    train = handle_feature(train)
    test = handle_feature(test)
    print('特征处理完成~')
    
    feat_cols = ['Lev_distance','word_match', 'tfidf_word_match']
    
    # 因为有drop，注意训练集的index的连续性
    train.index = train.reset_index(drop=True).index
    
    print('模型训练中~')
    gbms_lgb, oof_preds_lgb, test_preds_lgb = train_lgb(train, test,
                                                    feat_cols=feat_cols,
                                                    label_col='label')
    
    
    # 测试集预测结果
    df_test_submit = pd.DataFrame({'index': test.index.to_list(),
                               'prediction': test_preds_lgb})
    ## 直接按 0.5 划分
    df_test_submit['prediction'] =  np.where(df_test_submit['prediction']>0.5,1,0)
    df_test_submit.to_csv('./submit_result/' + data + '.tsv', index=False, sep='\t')
    
    print('预测输出成功~')

In [31]:
data_dir = 'D:/study_hard/statistic/千言数据集'
data_list = ['bq_corpus','lcqmc','paws-x-zh']
# data_dir = 'E:/学习/千言数据集/'

for data in data_list:
    model(data)

开始处理bq_corpus数据
数据读取完成~
特征处理~
特征处理完成~
模型训练中~
[LightGBM] [Info] Number of positive: 38832, number of negative: 38742
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 503
[LightGBM] [Info] Number of data points in the train set: 77574, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500580 -> initscore=0.002320
[LightGBM] [Info] Start training from score 0.002320
Training until validation scores don't improve for 20 rounds
[50]	training's auc: 0.741194	valid_1's auc: 0.718882
[100]	training's auc: 0.743538	valid_1's auc: 0.719686
[150]	training's auc: 0.745008	valid_1's auc: 0.720369
Early stopping, best iteration is:
[159]	training's auc: 0.745212	valid_1's auc: 0.720489
[LightGBM] [Info] Number of positive: 38832, number of negative: 38742
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true

## 任务3：加载中文词向量，自己训练中文词向量

- 步骤1 ：使用jieba对中文句子进行分词
- 步骤2 ：使用gensim中Word2Vec训练分词后的句子，得到词向量。

参考：https://zhuanlan.zhihu.com/p/114538417#/

https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html#sphx-glr-auto-examples-tutorials-run-word2vec-py

![20220124224400](https://cdn.jsdelivr.net/gh/xihuishawpy/PicBad@main/blogs/pictures/20220124224400.png)

Word2Vec是轻量级的神经网络，其模型仅仅包括输入层、隐藏层和输出层，模型框架根据输入输出的不同，主要包括CBOW和Skip-gram模型。 

- CBOW（Continuous-bag-of-words）的方式是在知道词的上下文的情况下预测当前词；

- Skip-gram是在知道了词的情况下,对词的上下文进行预测；

Word2Vec相对于其他词袋模型来说，解决了2个问题：
1. 词序信息，n-gram捕获局部词序，但会受到高维、稀疏矩阵影响；
2. 潜在单词的含义；
   
但是，无法推断不熟悉的词的向量，FastText可解决这个问题。


In [9]:
data_dir = 'D:/study_hard/statistic/千言数据集'
# data_dir = 'E:/学习/千言数据集/'

train = pd.read_csv(data_dir+'/'+'bq_corpus'+'/train.tsv',sep='\t',error_bad_lines=False,names=['q1','q2','label']).dropna()
train.index = train.reset_index(drop=True).index
test = pd.read_csv(data_dir+'/'+'bq_corpus'+'/test.tsv',sep='\t',error_bad_lines=False,names=['q1','q2']).dropna()
test['label'] = -1 
dev = pd.read_csv(data_dir+'/'+'bq_corpus'+'/dev.tsv',sep='\t',error_bad_lines=False,names=['q1','q2','label']).dropna()

In [10]:
train['q1']

0                   用微信都6年，微信没有微粒贷功能
1                             微信消费算吗
2               交易密码忘记了找回密码绑定的手机卡也掉了
3        你好我昨天晚上申请的没有打电话给我今天之内一定会打吗？
4                            “微粒贷开通"
                    ...             
86193                      申请的额度能取现吗
86194                      利息与罚息如何计算
86195                         如何申请货款
86196                         多久才有贷款
86197                  你好我要换卡怎么换我卡掉了
Name: q1, Length: 86198, dtype: object

In [11]:
# 分词
train['q1_cut'] = train['q1'].apply(lambda x:jieba_cut(x)) 
train['q2_cut'] = train['q2'].apply(lambda x:jieba_cut(x)) 

# 删除停词
def remove_stop_word(row):
    l = [word for word in row if word not in stopwords()]
    return l

train['q1_cut'] = train['q1_cut'].map(lambda row:remove_stop_word(row))
train['q2_cut'] = train['q2_cut'].map(lambda row:remove_stop_word(row))

In [21]:
# 准备预料
sentences = all_words(train)
sentences = (' '.join(sentences).split())

In [32]:
sentences

['微',
 '信',
 '年',
 '微',
 '信',
 '没有',
 '微粒',
 '贷',
 '功能',
 '号码',
 '微粒',
 '贷',
 '微',
 '信',
 '消费',
 '算',
 '多少钱',
 '没',
 '交易',
 '密码',
 '忘记',
 '找回',
 '密码',
 '绑定',
 '手机',
 '手机卡',
 '掉',
 '最近',
 '安全',
 '老',
 '改',
 '密码',
 '麻烦',
 '你好',
 '昨天',
 '昨天晚上',
 '晚上',
 '申请',
 '没有',
 '打电话',
 '电话',
 '今天',
 '之内',
 '一定',
 '定会',
 '账',
 '微粒',
 '贷',
 '开通',
 '"',
 '你好',
 '微粒',
 '贷',
 '没有',
 '开通',
 '借款',
 '一直',
 '没有',
 '回拨',
 '拨电话',
 '电话',
 '申请',
 '借款',
 '没有',
 '打电话',
 '电话',
 '过来',
 '每次',
 '提前',
 '还款',
 '最后',
 '贷款',
 '30',
 '号',
 '一次',
 '一次性',
 '还清',
 '请问',
 '一天',
 '是否',
 '限定',
 '只能',
 '转入',
 '转出',
 '五万',
 '微',
 '众多',
 '赎回',
 '短期',
 '理财',
 '微粒',
 '咨询',
 '咨询电话',
 '电话',
 '电话号码',
 '号码',
 '人工',
 '客服',
 '电话',
 '已经',
 '银行',
 '换',
 '新',
 '预留',
 '号码',
 '现在',
 '换',
 '电话',
 '电话号码',
 '号码',
 '需要',
 '更换',
 '下周',
 '产品',
 '元月',
 '元月份',
 '月份',
 '理财',
 '理财产品',
 '财产',
 '产品',
 '第一',
 '第一次',
 '一次',
 '使用',
 '额度',
 '额度',
 '多少钱',
 '微粒',
 '贷借',
 '借钱',
 '提前',
 '还清',
 '贷款',
 '借款',
 '多长',
 '多长时间',
 '长时间',
 '时间',
 '打电话',
 '电话',
 '借款',
 '多久',

In [25]:
# 使用gensim训练词向量

from gensim.models import Word2Vec

model = Word2Vec(sentences=sentences,size=100,window=5,min_count=3,sg=1)

collecting all words and their counts
Each 'sentences' item should be a list of words (usually unicode strings). First item here is instead plain <class 'str'>.
PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
PROGRESS: at sentence #10000, processed 19166 words, keeping 738 word types
PROGRESS: at sentence #20000, processed 38436 words, keeping 918 word types
PROGRESS: at sentence #30000, processed 57795 words, keeping 987 word types
PROGRESS: at sentence #40000, processed 77044 words, keeping 1052 word types
PROGRESS: at sentence #50000, processed 96348 words, keeping 1109 word types
PROGRESS: at sentence #60000, processed 115687 words, keeping 1165 word types
PROGRESS: at sentence #70000, processed 135075 words, keeping 1190 word types
PROGRESS: at sentence #80000, processed 154410 words, keeping 1226 word types
PROGRESS: at sentence #90000, processed 173641 words, keeping 1249 word types
PROGRESS: at sentence #100000, processed 192876 words, keeping 1263 word types


In [26]:
# pickle 保存模型

import pickle
def pkl_save(filename,file):
    output = open(filename, 'wb')
    pickle.dump(file, output)
    output.close()

def pkl_load(filename):
    pkl_file = open(filename, 'rb')
    file = pickle.load(pkl_file) 
    pkl_file.close()
    return file

## 模型保存
pkl_save('./word2vec.pkl',model)

In [36]:

## 加载模型
model = pkl_load('./word2vec.pkl')

## 预测
# model.wv["手机"]