## 任务1：报名比赛，下载比赛数据集并完成读取

- 步骤1 ：登录&报名比赛：https://aistudio.baidu.com/aistudio/competition/detail/45/0/task-definition
- 步骤2 ：下载比赛数据集
- 步骤3 ：使用Pandas完成数据读取。

In [1]:
import numpy as np 
import pandas as pd 
import os
import gc
import matplotlib.pyplot as plt
import jieba
import distance 
import seaborn as sns
%matplotlib inline
# Matplotlib setup so that Chinese labels and minus signs display correctly.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],   # font used to display Chinese labels
    'axes.unicode_minus': False,     # lets the minus sign display correctly
})

pal = sns.color_palette()
pd.__version__

error uploading: check_hostname requires server_hostname


'1.2.5'

In [2]:
# Root folder that holds one sub-folder per corpus. The location is
# machine-specific, so it can be overridden without editing the notebook by
# setting the QIANYAN_DATA_DIR environment variable
# (e.g. QIANYAN_DATA_DIR=E:/学习/千言数据集/ on another machine).
data_dir = os.environ.get('QIANYAN_DATA_DIR', 'D:/study_hard/statistic/千言数据集')
# Names of the corpus sub-folders under data_dir.
data_list = ['bq_corpus','lcqmc','paws-x-zh']

In [3]:
# Read a single corpus (bq_corpus) first; the others are read together later.

def _read_split(path, columns):
    """Read one header-less, tab-separated split and drop incomplete rows.

    Parameters
    ----------
    path : str
        Location of the ``.tsv`` file.
    columns : list of str
        Column names to assign, in file order.

    Returns
    -------
    pandas.DataFrame
        The parsed rows with every row containing a missing value removed.
    """
    # `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0 in
    # favour of `on_bad_lines`. `error_bad_lines=False` together with the
    # default `warn_bad_lines=True` corresponds to `on_bad_lines='warn'`.
    major, minor = (int(part) for part in pd.__version__.split('.')[:2])
    if (major, minor) >= (1, 3):
        bad_line_kwargs = {'on_bad_lines': 'warn'}
    else:
        bad_line_kwargs = {'error_bad_lines': False}
    return pd.read_csv(path, sep='\t', names=columns, **bad_line_kwargs).dropna()


train = _read_split(data_dir + '/bq_corpus/train.tsv', ['q1', 'q2', 'label'])
test = _read_split(data_dir + '/bq_corpus/test.tsv', ['q1', 'q2'])
test['label'] = -1  # the test split is read without a label column; -1 is a placeholder
dev = _read_split(data_dir + '/bq_corpus/dev.tsv', ['q1', 'q2', 'label'])

train.head(),train.shape

Note: NumExpr detected 16 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
NumExpr defaulting to 8 threads.


(                            q1                q2 label
 0             用微信都6年，微信没有微粒贷功能          4。号码来微粒贷     0
 1                       微信消费算吗           还有多少钱没还     0
 2         交易密码忘记了找回密码绑定的手机卡也掉了  怎么最近安全老是要改密码呢好麻烦     0
 3  你好我昨天晚上申请的没有打电话给我今天之内一定会打吗？          什么时候可以到账     0
 4                      “微粒贷开通"   你好，我的微粒贷怎么没有开通呢     0,
 (86198, 3))

## 任务2：对句子对提取TFIDF以及统计特征，训练和预测

参考代码：https://www.kaggle.com/anokas/data-analysis-xgboost-starter-0-35460-lb
- 步骤1 ：对句子对（句子A和句子B）统计如下特征：
    - 句子A包含的字符个数、句子B包含的字符个数
    - 句子A与句子B的编辑距离
    - 句子A与句子B共有单词的个数
    - 句子A与句子B共有字符的个数
    - 句子A与句子B共有单词的个数 / 句子A字符个数
    - 句子A与句子B共有单词的个数 / 句子B字符个数
- 步骤2 ：计算TFIDF，并对句子A和句子B进行特征转换
- 步骤3 ：计算句子A与句子B的TFIDF向量的内积距离
- 步骤4 ：将上述特征送入分类模型，训练并预测，将结果预测提交到比赛网站。

In [4]:
# Sentence-pair features

# Number of characters in each sentence
train['q1_len'] = train['q1'].map(len)
train['q2_len'] = train['q2'].map(len)

# Edit distance
# The Levenshtein distance (a.k.a. edit distance) is a string metric that
# measures how different two character sequences are.
train['Lev_distance'] = train.apply(
    lambda pair: distance.levenshtein(pair['q1'], pair['q2']),
    axis=1,
)

## jieba word segmentation
# cut_all=True selects full mode, e.g. “我来到北京清华大学”-->“ 我/ 来到/ 北京/ 清华/ 清华大学/ 华大/ 大学”
def jieba_cut(sentence):
    """Segment ``sentence`` with jieba in full mode and return the token list."""
    return jieba.lcut(sentence, cut_all=True)

# Segment both questions; every cell of q1_cut / q2_cut holds a token list.
train['q1_cut'] = train['q1'].map(jieba_cut)
train['q2_cut'] = train['q2'].map(jieba_cut)

# Number of tokens after segmentation
train['q1_cut_len'] = train['q1_cut'].map(len)
train['q2_cut_len'] = train['q2_cut'].map(len)
   
# After segmentation: share of the first sentence's distinct tokens that also
# occur in the second sentence.
def percent(q1_cut,q2_cut):
    """Return the fraction of distinct tokens in ``q1_cut`` also found in ``q2_cut``.

    Parameters
    ----------
    q1_cut : iterable of hashable
        Tokens of the sentence used as the denominator.
    q2_cut : iterable of hashable
        Tokens of the sentence compared against.

    Returns
    -------
    float
        ``len(set(q1_cut) & set(q2_cut)) / len(set(q1_cut))``, or ``0.0`` when
        ``q1_cut`` has no tokens (the original raised ``ZeroDivisionError``).
    """
    first_vocab = set(q1_cut)
    if not first_vocab:
        # Nothing to share: avoid dividing by zero.
        return 0.0
    return len(first_vocab & set(q2_cut)) / len(first_vocab)

# Overlap ratio relative to q1's distinct tokens, then relative to q2's.
train['q1_cut_percent'] = train.apply(
    lambda pair: percent(pair['q1_cut'], pair['q2_cut']),
    axis=1,
)
train['q2_cut_percent'] = train.apply(
    lambda pair: percent(pair['q2_cut'], pair['q1_cut']),
    axis=1,
)


Building prefix dict from the default dictionary ...
Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\CEALLA~1\AppData\Local\Temp\jieba.cache
Loading model from cache C:\Users\CEALLA~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.551 seconds.
Loading model cost 0.551 seconds.
Prefix dict has been built successfully.
Prefix dict has been built successfully.


In [5]:
## Load the stop words
# https://github.com/goto456/stopwords

# path -> stop-word list, filled the first time a path is read.
_STOPWORDS_CACHE = {}


def stopwords(path='cn_stopwords.txt'):
    """Return the stop words stored one per line in ``path``.

    The file is read only on the first call for a given ``path``; later calls
    are served from an in-memory cache, so calling this inside a per-row or
    per-word loop no longer re-opens the file each time.

    Parameters
    ----------
    path : str, optional
        UTF-8 text file with one stop word per line. Defaults to
        ``'cn_stopwords.txt'`` in the current working directory.

    Returns
    -------
    list of str
        The lines of the file with newline characters removed (a blank line
        yields an empty string). A fresh list is returned on every call.

    Raises
    ------
    OSError
        If the file cannot be opened on the first read.
    """
    if path not in _STOPWORDS_CACHE:
        with open(path, 'r', encoding='UTF-8') as f:
            _STOPWORDS_CACHE[path] = [line.replace('\n', '') for line in f]
    # Hand out a copy so callers cannot modify the cached list.
    return list(_STOPWORDS_CACHE[path])

# Word-sharing ratio
def word_match_share(row,stops):
    """Return the share of non-stop-word tokens the two sentences have in common.

    Parameters
    ----------
    row : mapping
        Must provide ``row['q1_cut']`` and ``row['q2_cut']``, each an iterable
        of tokens (for example a DataFrame row holding jieba token lists).
    stops : collection of str
        Stop words to ignore.

    Returns
    -------
    float or int
        ``2 * |A ∩ B| / (|A| + |B|)`` where ``A`` and ``B`` are the distinct
        non-stop-word tokens of each sentence; ``0`` when either sentence has
        no tokens left after stop-word removal.
    """
    # A set gives O(1) membership tests; reuse it if the caller passed one.
    stop_set = stops if isinstance(stops, (set, frozenset)) else set(stops)

    # Iterate over the tokens themselves. The previous version iterated over
    # str(token_list), i.e. over the characters of the list's repr (brackets,
    # quotes, commas and spaces included), which made the ratio meaningless.
    q1words = {word for word in row['q1_cut'] if word not in stop_set}
    q2words = {word for word in row['q2_cut'] if word not in stop_set}

    if not q1words or not q2words:
        # One of the sentences consists of stop words only (or is empty).
        return 0

    # Every shared token is counted once for each sentence.
    shared = len(q1words & q2words)
    return (shared + shared) / (len(q1words) + len(q2words))



# Load the stop words once; calling stopwords() inside the lambda evaluated it
# again for every row.
stop_words = stopwords()
train['word_match'] = train.apply(lambda x: word_match_share(x, stop_words), axis=1)

In [32]:
# tf-idf 相似度

from collections import Counter

# Build the corpus
def all_words(train, stops=None):
    """Build one space-joined token string per sentence pair.

    Parameters
    ----------
    train : pandas.DataFrame or mapping of column name -> sequence
        Must provide the ``'q1_cut'`` and ``'q2_cut'`` columns, each holding a
        list of tokens per row.
    stops : collection of str, optional
        Stop words to drop. When omitted they are loaded with ``stopwords()``.

    Returns
    -------
    list of str
        For every row, the non-stop-word tokens of ``q1_cut`` followed by those
        of ``q2_cut``, joined with single spaces.
    """
    # Resolve the stop words once; previously stopwords() was called for
    # every single token inside the comprehensions.
    stop_set = set(stopwords() if stops is None else stops)

    corpus = []
    # Walk both token columns in lock-step instead of fetching rows with iloc.
    for q1_tokens, q2_tokens in zip(train['q1_cut'], train['q2_cut']):
        kept = [word for word in q1_tokens if word not in stop_set]
        kept.extend(word for word in q2_tokens if word not in stop_set)
        corpus.append(' '.join(kept))
    return corpus

# One space-joined string of stop-word-filtered tokens per sentence pair.
corpus = all_words(train)                 

In [44]:
# Flatten the per-pair strings into a single token list, then count how often
# each token occurs.
corpus = [token for sentence in corpus for token in sentence.split()]
word_cnt = Counter(corpus)

# Define the weights:
# with the defaults, a word seen only once gets weight 0 and any other word
# gets 1 / (count + 10000).
def get_weight(cnt, eps=10000, min_count=2):
    """Return the weight of a word that occurs ``cnt`` times.

    Words with ``cnt`` below ``min_count`` get ``0``; all others get
    ``1 / (cnt + eps)``, so more frequent words receive smaller weights.
    """
    return 0 if cnt < min_count else 1 / (cnt + eps)
    
# Build the word -> weight lookup
weight = dict((token, get_weight(freq)) for token, freq in word_cnt.items())

print('Most common words and weights: \n')
# Smallest positive weight first (= most frequent); zero-weight words sort last.
print(sorted(
    weight.items(),
    key=lambda item: 9999 if not item[1] > 0 else item[1],
)[:10])
print('\nLeast common words and weights: ')
# Largest weight first (= least frequent among the weighted words).
(sorted(weight.items(), key=lambda item: -item[1])[:10])

Most common words and weights: 

[('还款', 2.6867275658248254e-05), ('微粒', 2.915026963999417e-05), ('贷', 2.9583173091145757e-05), ('借款', 3.2057446944925306e-05), ('电话', 3.6471060213720413e-05), ('没有', 3.787878787878788e-05), ('银行', 3.8825904643578196e-05), ('微', 4.675081813931744e-05), ('额度', 4.805151122002787e-05), ('你好', 5.1698288786641165e-05)]

Least common words and weights: 


[('手机卡', 9.998000399920016e-05),
 ('门', 9.998000399920016e-05),
 ('装傻', 9.998000399920016e-05),
 ('私聊', 9.998000399920016e-05),
 ('看开', 9.998000399920016e-05),
 ('网点', 9.998000399920016e-05),
 ('退钱', 9.998000399920016e-05),
 ('察看', 9.998000399920016e-05),
 ('6624', 9.998000399920016e-05),
 ('算下来', 9.998000399920016e-05)]

In [49]:
def tfidf_word_match_share(row, weight, stops=None):
    """Return the weighted share of tokens the two sentences have in common.

    Parameters
    ----------
    row : mapping
        Must provide ``row['q1_cut']`` and ``row['q2_cut']``, each an iterable
        of tokens.
    weight : mapping of str -> number
        Weight per token; tokens missing from the mapping count as ``0``.
    stops : collection of str, optional
        Stop words to ignore. When omitted they are loaded with ``stopwords()``.

    Returns
    -------
    float or int
        Sum of the weights of the shared tokens (counted once per sentence)
        divided by the sum of the weights of all distinct tokens of both
        sentences. ``0`` when either sentence has no tokens left after
        stop-word removal, or when the total weight is zero (the original
        produced NaN from 0/0 in that case).
    """
    # Resolve the stop words once per call; previously stopwords() was called
    # for every token inside the comprehensions.
    stop_set = set(stopwords() if stops is None else stops)

    # dict.fromkeys de-duplicates while keeping first-seen order.
    q1words = dict.fromkeys(word for word in row['q1_cut'] if word not in stop_set)
    q2words = dict.fromkeys(word for word in row['q2_cut'] if word not in stop_set)
    if len(q1words) == 0 or len(q2words) == 0:
        return 0

    # Weights of the shared tokens, listed once for each sentence.
    shared_weights = [weight.get(w, 0) for w in q1words if w in q2words] + [weight.get(w, 0) for w in q2words if w in q1words]
    # Weights of every distinct token of both sentences.
    total_weights = [weight.get(w, 0) for w in q1words] + [weight.get(w, 0) for w in q2words]

    total = np.sum(total_weights)
    if total == 0:
        # Every token has zero weight; avoid 0/0.
        return 0
    # Weighted share of the common tokens.
    return np.sum(shared_weights) / total

# Row-wise weighted overlap; `weight` is forwarded as the second positional argument.
train['tfidf_word_match'] = train.apply(tfidf_word_match_share, axis=1, args=(weight,))

In [None]:
# model

