## 任务1：报名比赛，下载比赛数据集并完成读取

- 步骤1 ：登录&报名比赛：https://aistudio.baidu.com/aistudio/competition/detail/45/0/task-definition
- 步骤2 ：下载比赛数据集
- 步骤3 ：使用Pandas完成数据读取。

In [9]:
import numpy as np 
import pandas as pd 
import os
import gc
import matplotlib.pyplot as plt
import jieba
import distance 
import seaborn as sns
%matplotlib inline
plt.rcParams['font.sans-serif']=['SimHei'] #用来正常显示中文标签
plt.rcParams['axes.unicode_minus'] = False #用来正常显示负号

import lightgbm as lgb
from sklearn.metrics import roc_auc_score, auc, roc_curve, accuracy_score, f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, QuantileTransformer, KBinsDiscretizer, LabelEncoder, MinMaxScaler, PowerTransformer

pal = sns.color_palette()
pd.__version__

'1.2.5'

In [2]:
data_dir = 'D:/study_hard/statistic/千言数据集'
data_list = ['bq_corpus','lcqmc','paws-x-zh']
# data_dir = 'E:/学习/千言数据集/'

In [4]:
# 先读一个数据集，后面统一读

train = pd.read_csv(data_dir+'/bq_corpus/train.tsv',sep='\t',error_bad_lines=False,names=['q1','q2','label']).dropna()
test = pd.read_csv(data_dir+'/bq_corpus/test.tsv',sep='\t',error_bad_lines=False,names=['q1','q2']).dropna()
test['label'] = -1 
dev = pd.read_csv(data_dir+'/bq_corpus/dev.tsv',sep='\t',error_bad_lines=False,names=['q1','q2','label']).dropna()

# label非[0,1]的不处理
if len(set(train.label))>2:
    train = train[train['label'].isin(['0', '1'])]
    train['label'] = train['label'].astype('int')

## 任务2：对句子对提取TFIDF以及统计特征，训练和预测

参考代码：https://www.kaggle.com/anokas/data-analysis-xgboost-starter-0-35460-lb
- 步骤1 ：对句子对（句子A和句子B统计）如下特征：
    - 句子A包含的字符个数、句子B包含的字符个数
    - 句子A与句子B的编辑距离
    - 句子A与句子B共有单词的个数
    - 句子A与句子B共有字符的个数
    - 句子A与句子B共有单词的个数 / 句子A字符个数
    - 句子A与句子B共有单词的个数 / 句子B字符个数
- 步骤2 ：计算TFIDF，并对句子A和句子B进行特征转换，并进行
- 步骤3 ：计算句子A与句子B的TFIDF向量的内积距离
- 步骤4 ：将上述特征送入分类模型，训练并预测，将结果预测提交到比赛网站。

In [5]:
# 句子对特征

# 字符个数
train['q1_len'] = train['q1'].apply(len)
train['q2_len'] = train['q2'].apply(len)  

# 编辑距离 
# Levenshtein Distance 被称为编辑距离（Edit Distance），一个度量两个字符序列之间差异的字符串度量标准
train['Lev_distance'] = train.apply(lambda x:distance.levenshtein(x['q1'],x['q2']),axis=1)

## jieba分词 
# cut_all=True，全模式，“我来到北京清华大学”-->“ 我/ 来到/ 北京/ 清华/ 清华大学/ 华大/ 大学”
def jieba_cut(sentence):
    word_list = jieba.lcut(sentence,cut_all=True) 
    return word_list 

train['q1_cut'] = train['q1'].apply(lambda x:jieba_cut(x)) 
train['q2_cut'] = train['q2'].apply(lambda x:jieba_cut(x)) 

# 分词后的词个数
train['q1_cut_len'] = train['q1_cut'].apply(len)
train['q2_cut_len'] = train['q2_cut'].apply(len)  
   
# 分词后，两句子相同词占所有词（去重）的比例
def percent(q1_cut,q2_cut):
    inter_num = len(set(q1_cut) & set(q2_cut))
    percent = inter_num/len(set(q1_cut))
    return percent

train['q1_cut_percent'] = train.apply(lambda x: percent(x['q1_cut'],x['q2_cut']),axis=1)
train['q2_cut_percent'] = train.apply(lambda x: percent(x['q2_cut'],x['q1_cut']),axis=1)


Building prefix dict from the default dictionary ...
Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\CEALLA~1\AppData\Local\Temp\jieba.cache
Loading model from cache C:\Users\CEALLA~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.535 seconds.
Loading model cost 0.535 seconds.
Prefix dict has been built successfully.
Prefix dict has been built successfully.


In [6]:
## 获取停词 
# https://github.com/goto456/stopwords

def stopwords():
    stop_words =[]
    with open('cn_stopwords.txt','r',encoding='UTF-8') as f:
        for i in f.readlines():
            i = i.replace('\n','')
            stop_words.append(i)
    return stop_words

# 词共享 比例
def word_match_share(row,stops):
    q1words = {}
    q2words = {}
    # 剔除停词
    for word in str(row['q1_cut']):
        if word not in stops:
            q1words[word] = 1
    for word in str(row['q2_cut']):
        if word not in stops:
            q2words[word] = 1
    if len(q1words) == 0 or len(q2words) == 0:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0
    shared_words_in_q1 = [w for w in q1words.keys() if w in q2words]
    shared_words_in_q2 = [w for w in q2words.keys() if w in q1words]
    R = (len(shared_words_in_q1) + len(shared_words_in_q2))/(len(q1words) + len(q2words))
    return R



train['word_match'] = train.apply(lambda x: word_match_share(x,stopwords()),axis=1)

In [8]:
# tf-idf 相似度

from collections import Counter

# 准备语料
def all_words(train):
    corpus = []
    # 遍历每行，q1分词，q2分词，合并
    for row_id in range(len(train)):
        row = train.iloc[row_id]
        all_words = list()
        all_words.extend([word for word in row['q1_cut'] if word not in stopwords()])
        all_words.extend([word for word in row['q2_cut'] if word not in stopwords()])
        corpus.append(' '.join(all_words))
    return corpus

corpus = all_words(train)            

In [10]:
corpus = (' '.join(corpus).split())
word_cnt = Counter(corpus)

# 定义权重
# 词个数为1的，权重为0，大于1的，权重为 1/(count+10000)
def get_weight(cnt, eps=10000, min_count=2):
    if cnt < min_count:
        return 0
    else:
        return 1 / (cnt + eps)
    
# 生成词权重，词--权重
weight =  { word:get_weight(cnt) for word ,cnt in word_cnt.items()}

print('Most common words and weights: \n')
print(sorted(weight.items(), key=lambda x: x[1] if x[1] > 0 else 9999)[:10])
print('\nLeast common words and weights: ')
(sorted(weight.items(), key=lambda x: x[1], reverse=True)[:10])

Most common words and weights: 

[('还款', 2.7687792452307778e-05), ('微粒', 2.9992202027472856e-05), ('贷', 3.0422878004259203e-05), ('借款', 3.2968482131082684e-05), ('电话', 3.743075310675251e-05), ('没有', 3.885003885003885e-05), ('银行', 3.979465955668749e-05), ('微', 4.773953310736621e-05), ('额度', 4.9089391782435816e-05), ('你好', 5.270092226613966e-05)]

Least common words and weights: 


[('手机卡', 9.998000399920016e-05),
 ('门', 9.998000399920016e-05),
 ('装傻', 9.998000399920016e-05),
 ('私聊', 9.998000399920016e-05),
 ('看开', 9.998000399920016e-05),
 ('网点', 9.998000399920016e-05),
 ('退钱', 9.998000399920016e-05),
 ('察看', 9.998000399920016e-05),
 ('6624', 9.998000399920016e-05),
 ('算下来', 9.998000399920016e-05)]

In [11]:
def tfidf_word_match_share(row,weight):
    q1words = {word:1 for word in row['q1_cut'] if word not in stopwords()}
    q2words = {word:1 for word in row['q2_cut'] if word not in stopwords()}
    if len(q1words)==0 or len(q2words)==0:
        return 0 
    
    # 获取共享词的权重
    shared_weights = [weight.get(w,0) for w in q1words.keys() if w in q2words] + [weight.get(w,0) for w in q2words.keys() if w in q1words]
    # 总权重
    total_weights = [weight.get(w, 0) for w in q1words] + [weight.get(w, 0) for w in q2words]
    # 共享词权重比例
    R = np.sum(shared_weights) / np.sum(total_weights)
    return R

train['tfidf_word_match'] = train.apply(lambda x:tfidf_word_match_share(x,weight),axis=1)

In [19]:
test['q1_len'] = test['q1'].apply(len)
test['q2_len'] = test['q2'].apply(len)  

# 编辑距离 
# Levenshtein Distance 被称为编辑距离（Edit Distance），一个度量两个字符序列之间差异的字符串度量标准
test['Lev_distance'] = test.apply(lambda x:distance.levenshtein(x['q1'],x['q2']),axis=1)

# 分词
test['q1_cut'] = test['q1'].apply(lambda x:jieba_cut(x)) 
test['q2_cut'] = test['q2'].apply(lambda x:jieba_cut(x)) 

# 分词后的词个数
test['q1_cut_len'] = test['q1_cut'].apply(len)
test['q2_cut_len'] = test['q2_cut'].apply(len)  
   
test['q1_cut_percent'] = test.apply(lambda x: percent(x['q1_cut'],x['q2_cut']),axis=1)
test['q2_cut_percent'] = test.apply(lambda x: percent(x['q2_cut'],x['q1_cut']),axis=1)

test['word_match'] = test.apply(lambda x: word_match_share(x,stopwords()),axis=1)

# tf-idf 相似度
corpus = all_words(test)            
corpus = (' '.join(corpus).split())
word_cnt = Counter(corpus)

# 定义权重
# 生成词权重，词--权重
weight =  { word:get_weight(cnt) for word ,cnt in word_cnt.items()}
test['tfidf_word_match'] = test.apply(lambda x:tfidf_word_match_share(x,weight),axis=1)

In [20]:
# model

def train_lgb_kfold(X_train, y_train, X_test, n_fold=5):
    '''train lightgbm with k-fold split'''
    gbms = []
    kfold = StratifiedKFold(n_splits=n_fold, random_state=2021, shuffle=True)
    oof_preds = np.zeros((X_train.shape[0],))
    test_preds = np.zeros((X_test.shape[0],))

    for fold, (train_index, val_index) in enumerate(kfold.split(X_train, y_train)):
        X_tr, X_val, y_tr, y_val = X_train.iloc[train_index], X_train.iloc[val_index], y_train[train_index], y_train[val_index]
        dtrain = lgb.Dataset(X_tr, y_tr)
        dvalid = lgb.Dataset(X_val, y_val, reference=dtrain)

        params = {
            'objective': 'binary',
            'metric': 'auc',
            'num_leaves': 512,
            'boosting_type': 'gbdt',
            'subsample_freq': 1,
            'reg_alpha': 0.5,
            'reg_lambda': 0.5,
            'n_estimators': 5000,
            'learning_rate': 0.005,
            'min_data_in_leaf': 150,
            'feature_fraction': 0.8,
            'bagging_fraction': 0.7,
            'n_jobs': -1,
            'seed': 2021
        }

        gbm = lgb.train(params,
                        dtrain,
                        num_boost_round=100,
                        valid_sets=[dtrain, dvalid],
                        verbose_eval=50,
                        early_stopping_rounds=20)

        oof_preds[val_index] = gbm.predict(X_val, num_iteration=gbm.best_iteration)
        test_preds += gbm.predict(X_test, num_iteration=gbm.best_iteration) / kfold.n_splits
        gbms.append(gbm)

    return gbms, oof_preds, test_preds


def train_lgb(train, test, feat_cols, label_col, n_fold=5):
    '''训练lightgbm'''
    X_train = train[feat_cols]
    y_train = train[label_col]
    X_test = test[feat_cols]
    gbms_lgb, oof_preds_lgb, test_preds_lgb = train_lgb_kfold(X_train, y_train, X_test, n_fold=n_fold)
    
    return gbms_lgb, oof_preds_lgb, test_preds_lgb


RangeIndex(start=0, stop=86194, step=1)

In [27]:
feat_cols = ['Lev_distance','word_match', 'tfidf_word_match']

train.index = train.reset_index(drop=True).index

gbms_lgb, oof_preds_lgb, test_preds_lgb = train_lgb(train, test,
                                                    feat_cols=feat_cols,
                                                    label_col='label')



[LightGBM] [Info] Number of positive: 34518, number of negative: 34437
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 501
[LightGBM] [Info] Number of data points in the train set: 68955, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500587 -> initscore=0.002349
[LightGBM] [Info] Start training from score 0.002349
Training until validation scores don't improve for 20 rounds
[50]	training's auc: 0.722226	valid_1's auc: 0.70973
Early stopping, best iteration is:
[35]	training's auc: 0.722114	valid_1's auc: 0.709784
[LightGBM] [Info] Number of positive: 34518, number of negative: 34437
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 504
[LightGBM] [Info] Number of data points in the train set: 68955, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500587 -> initscore=0.002349
[LightGBM] [In