In [1]:
'''
Kernel: Preprocessing when using embeddings.
Two golden rules:
1. Don't use standard preprocessing steps like stemming or stopword removal 
   when you have pre-trained embeddings.
   (Some of you might have used standard preprocessing steps when doing word 
   count based feature extraction (e.g. TFIDF) such as removing stopwords, 
   stemming etc. The reason is simple: You lose valuable information, 
   which would help your NN to figure things out.)
2. Get your vocabulary as close to the embeddings as possible.
'''

"\nKernel: Preprocessing when using embeddings.\nTwo golden rules:\n1. Don't use standard preprocessing steps like stemming or stopword removal \n   when you have pre-trained embeddings.\n   (Some of you might have used standard preprocessing steps when doing word \n   count based feature extraction (e.g. TFIDF) such as removing stopwords, \n   stemming etc. The reason is simple: You lose valuable information, \n   which would help your NN to figure things out.)\n2. Get your vocabulary as close to the embeddings as possible.\n"

In [2]:
import re
import string
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

In [3]:
# Input CSV locations, relative to the notebook's working directory.
train_data_path = "./data/train.csv"
test_data_path = "./data/test.csv"

In [4]:
# Load the labelled training set and the unlabelled test set into DataFrames.
train_df = pd.read_csv(train_data_path)
test_df = pd.read_csv(test_data_path)

In [5]:
train_df.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


In [6]:
test_df.head()

Unnamed: 0,qid,question_text
0,00014894849d00ba98a9,My voice range is A2-C5. My chest voice goes u...
1,000156468431f09b3cae,How much does a tutor earn in Bangalore?
2,000227734433360e1aae,What are the best made pocket knives under $20...
3,0005e06fbe3045bd2a92,Why would they add a hypothetical scenario tha...
4,00068a0f7f41f50fc399,What is the dresscode for Techmahindra freshers?


In [7]:
print("train data shape: ", train_df.shape)
print("test data shape: ", test_df.shape)

train data shape:  (1306122, 3)
test data shape:  (56370, 2)


In [8]:
# Split the labelled data into a training set (90%) and a validation set (10%).
# `random_state` is fixed so the split is reproducible.
# NOTE: this rebinds `train_df` to the 90% split, so re-running this cell
# without reloading the CSV first splits the already-reduced frame again.
train_df, val_df = train_test_split(train_df, 
                                    test_size=0.1, 
                                    train_size=0.9,
                                    random_state=2018)

In [9]:
print("train data shape: ", train_df.shape)
print("val data shape: ", val_df.shape)
print("test data shape: ", test_df.shape)

train data shape:  (1175509, 3)
val data shape:  (130613, 3)
test data shape:  (56370, 2)


In [10]:
# Fill missing question text with the placeholder token "_na_".
# `df[col].fillna(..., inplace=True)` mutates a column object obtained through
# chained indexing; recent pandas versions warn about this and, with
# Copy-on-Write enabled, the parent frame is not updated at all. `assign`
# builds a new frame with the filled column (same column order), which also
# avoids writing into the slices returned by `train_test_split`.
train_df = train_df.assign(question_text=train_df["question_text"].fillna("_na_"))
val_df = val_df.assign(question_text=val_df["question_text"].fillna("_na_"))
test_df = test_df.assign(question_text=test_df["question_text"].fillna("_na_"))

In [11]:
# Whole training corpus: question text of the train and validation splits,
# stacked row-wise into a single Series.
all_data_text = pd.concat(
    [train_df["question_text"], val_df["question_text"]],
    axis=0,
)
all_data_text.shape

(1306122,)

In [12]:
# Build the vocabulary
def build_vocab(sentences):
    """Assign an integer id to every distinct word, in first-seen order.

    Args:
        sentences: iterable of tokenised sentences (each an iterable of words).

    Returns:
        dict mapping word -> id, where ids are 0, 1, 2, ... in the order the
        words are first encountered.
    """
    word_to_id = {}
    for tokens in sentences:
        for token in tokens:
            # The default is evaluated before insertion, so a new word gets
            # the current vocabulary size as its id; known words are untouched.
            word_to_id.setdefault(token, len(word_to_id))
    return word_to_id
# Count word frequencies over the corpus
def build_vocab_count(sentences):
    """Count how many times each word occurs across all sentences.

    Args:
        sentences: iterable of tokenised sentences (each an iterable of words).

    Returns:
        dict mapping word -> number of occurrences, keyed in first-seen order.
    """
    frequencies = {}
    for tokens in sentences:
        for token in tokens:
            frequencies[token] = frequencies.get(token, 0) + 1
    return frequencies

In [13]:
# Tokenise on whitespace only (no cleaning yet), then build the vocabulary
# dict and the matching word -> frequency dict.
sentences = all_data_text.apply(str.split).values
vocab = build_vocab(sentences)
vocab_count = build_vocab_count(sentences)

In [14]:
print({k: vocab_count[k] for k in list(vocab_count)[:10]})

{'intereview': 1, 'posioned?': 1, 'Genji': 6, 'hourly.': 1, 'Anaheim': 3, '"நான்': 1, 'Statups': 1, 'directing': 35, "Doll's": 4, 'opposite)': 1}


In [15]:
# Load Google's pre-trained word2vec vectors.
from gensim.models import KeyedVectors
word_embedding_path = './word embedding/GoogleNews-vectors-negative300.bin'
# `binary=True`: the file is read as the binary word2vec format.
# Despite its name, `embedding_dict` is the object returned by gensim, not a
# plain dict; the notebook only relies on `word in embedding_dict` and
# `embedding_dict[word]`.
embedding_dict = KeyedVectors.load_word2vec_format(word_embedding_path, binary=True)

In [16]:
"##V" in embedding_dict

True

In [17]:
# Report embedding coverage and collect out-of-vocabulary (OOV) words
def check_out_of_vocab(vocab_count, embedding_dict):
    """Print embedding coverage statistics and return the OOV words.

    Two ratios are printed: the share of distinct vocabulary words that have
    an embedding, and the share of all word occurrences that have one.

    Args:
        vocab_count: dict mapping word -> occurrence count in the corpus.
        embedding_dict: any container supporting ``word in embedding_dict``.

    Returns:
        list of ``(word, count)`` tuples for words without an embedding,
        ordered by count from highest to lowest.
    """
    oov_word = {}
    exist_word_num = 0  # number of distinct words that have an embedding
    exist_count = 0     # total occurrences of words that have an embedding
    oov_count = 0       # total occurrences of words without an embedding
    for word, count in vocab_count.items():
        if word in embedding_dict:
            # Only a tally is needed for the report; the vectors themselves
            # are not looked up or kept.
            exist_word_num += 1
            exist_count += count
        else:
            oov_word[word] = count
            oov_count += count

    # Guard both ratios so an empty vocabulary reports 0% instead of raising
    # ZeroDivisionError.
    total_count = exist_count + oov_count
    vocab_ratio = exist_word_num / len(vocab_count) if vocab_count else 0.0
    text_ratio = exist_count / total_count if total_count else 0.0
    print('{:.2%} words of vocab founded in word2vec.'.format(vocab_ratio))
    print('{:.2%} words of all text founded in word2vec.'.format(text_ratio))

    # Ascending stable sort followed by a reversal: highest count first, and
    # words with equal counts come out in reverse insertion order.
    sorted_oov_word = sorted(oov_word.items(), key=lambda x: x[1])[::-1]
    return sorted_oov_word

In [18]:
oov_word = check_out_of_vocab(vocab_count, embedding_dict)

24.31% words of vocab founded in word2vec.
78.75% words of all text founded in word2vec.


In [19]:
oov_word[:10]

[('to', 403183),
 ('a', 402682),
 ('of', 330825),
 ('and', 251973),
 ('India?', 16384),
 ('it?', 12900),
 ('do?', 8753),
 ('life?', 7753),
 ('you?', 6295),
 ('me?', 6202)]

In [20]:
'''
直接按空格符进行分词，未经过任何处理时，只有24%的词有预训练的词向量，
然后，根据以上打印出的oov_word中的词, 说明：
很多句子结尾的词，与问号之间没有空格符隔开，被当做一个词了，所以在预训练的word2vec中找不到。
'''

'\n直接按空格符进行分词，未经过任何处理时，只有24%的词有预训练的词向量，\n然后，根据以上打印出的oov_word中的词, 说明：\n很多句子结尾的词，与问号之间没有空格符隔开，被当做一个词了，所以在预训练的word2vec中找不到。\n'

In [21]:
'''
在处理punctuation时，如果预训练的词向量中有这个符号，则保留，如果没有则去除该符号.
'''

'\n在处理punctuation时，如果预训练的词向量中有这个符号，则保留，如果没有则去除该符号.\n'

In [22]:
print('?' in embedding_dict)    # '?'在word2vec中吗？不在
print('.' in embedding_dict)
print('&' in embedding_dict)

False
False
True


In [23]:
# Partition ASCII punctuation into marks that have a pre-trained vector
# (`exist_punc`) and marks that do not (`oov_punc`), keeping the original
# order of `string.punctuation` in both strings.
all_punctuation = string.punctuation
print('all punctuations: ', all_punctuation)
exist_punc = "".join(mark for mark in all_punctuation if mark in embedding_dict)
oov_punc = "".join(mark for mark in all_punctuation if mark not in embedding_dict)
print('exist punctuations: ', exist_punc)
print('oov punctuations: ', oov_punc)

all punctuations:  !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
exist punctuations:  #$%&*+=>@^_`~
oov punctuations:  !"'(),-./:;<?[\]{|}


In [24]:
# Normalise punctuation in the corpus
def clean_punctuation(sent):
    """Return `sent` with punctuation normalised for embedding lookup.

    Applied in this order:
      1. '/' and '-' become a space, so the words they join are separated.
      2. Every mark in the global `exist_punc` is padded with spaces so it
         becomes its own token.
      3. Every mark in the global `oov_punc`, plus the curly quotes, is removed.

    Args:
        sent: the text to clean; it is converted with `str()` first.

    Returns:
        The cleaned string.
    """
    text = str(sent)
    # One ordered list of (mark, replacement) pairs; the order is significant
    # because '/' and '-' must be turned into spaces before the removal step.
    substitutions = [(mark, ' ') for mark in "/-"]
    substitutions += [(mark, ' ' + mark + ' ') for mark in exist_punc]
    substitutions += [(mark, '') for mark in oov_punc + "”“‘’"]
    for mark, replacement in substitutions:
        text = text.replace(mark, replacement)
    return text

In [25]:
# Clean punctuation across the corpus, re-tokenise on whitespace, then rebuild
# the vocabulary and re-measure embedding coverage.
all_data_text = all_data_text.apply(clean_punctuation)
sentences = all_data_text.apply(str.split)
vocab = build_vocab(sentences)
vocab_count = build_vocab_count(sentences)
oov_word = check_out_of_vocab(vocab_count, embedding_dict)

57.29% words of vocab founded in word2vec.
89.80% words of all text founded in word2vec.


In [26]:
'''
处理完标点符号之后，现在词表中约有57%的词有预训练的词向量了
'''

'\n处理完标点符号之后，现在词表中约有57%的词有预训练的词向量了\n'

In [27]:
oov_word[:10]

[('to', 406304),
 ('a', 404284),
 ('of', 332972),
 ('and', 254088),
 ('2017', 8789),
 ('2018', 7372),
 ('10', 6852),
 ('doesnt', 6780),
 ('didnt', 3879),
 ('12', 3741)]

In [28]:
'''
再次打印出oov word，可以看出，词表中存在大量数字，这些数字全都没有预训练的词向量，
词表中数字的处理：
在word2vec中，只包含0-9十个数字的词向量，其他所有大于9的数字都被特殊字符替代，
例如，15 ——> ##， 123 ——> ###,  15.80$ --> ##.##$
所以我们对语料也进行相同的处理.
'''

'\n再次打印出oov word，可以看出，词表中存在大量数字，这些数字全都没有预训练的词向量，\n词表中数字的处理：\n在word2vec中，只包含0-9十个数字的词向量，其他所有大于9的数字都被特殊字符替代，\n例如，15 ——> ##， 123 ——> ###,  15.80$ --> ##.##$\n所以我们对语料也进行相同的处理.\n'

In [29]:
def clean_number(sent):
    """Mask runs of two or more digits with '#' characters.

    A run of 2, 3 or 4 digits becomes the same number of '#'; a run of 5 or
    more digits becomes exactly '#####'. Single digits are left unchanged.

    Args:
        sent: the input string.

    Returns:
        The string with multi-digit runs masked.
    """
    # Longest runs first, so a long number is masked as a whole instead of
    # being split into several shorter masks.
    digit_rules = (
        ('[0-9]{5,}', '#####'),
        ('[0-9]{4}', '####'),
        ('[0-9]{3}', '###'),
        ('[0-9]{2}', '##'),
    )
    for pattern, mask in digit_rules:
        sent = re.sub(pattern, mask, sent)
    return sent

In [30]:
# Mask multi-digit numbers across the corpus, re-tokenise on whitespace, then
# rebuild the vocabulary and re-measure embedding coverage.
all_data_text = all_data_text.apply(clean_number)
sentences = all_data_text.apply(str.split)
vocab = build_vocab(sentences)
vocab_count = build_vocab_count(sentences)
oov_word = check_out_of_vocab(vocab_count, embedding_dict)

60.19% words of vocab founded in word2vec.
90.58% words of all text founded in word2vec.


In [31]:
oov_word[:20]

[('to', 406304),
 ('a', 404284),
 ('of', 332972),
 ('and', 254088),
 ('doesnt', 6780),
 ('didnt', 3879),
 ('isnt', 2790),
 ('Isnt', 1429),
 ('favourite', 1246),
 ('bitcoin', 980),
 ('colour', 976),
 ('centre', 884),
 ('Quorans', 879),
 ('cryptocurrency', 820),
 ('shouldnt', 797),
 ('Snapchat', 785),
 ('hasnt', 784),
 ('wasnt', 743),
 ('travelling', 705),
 ('btech', 634)]

In [32]:
'''
根据现在的oov word，我们可以考虑去掉'to', 'a', 'of', 'and'这四个单词，
然后进行一些常见的错别字替换。
'''

"\n根据现在的oov word，我们可以考虑去掉'to', 'a', 'of', 'and'这四个单词，\n然后进行一些常见的错别字替换。\n"

In [33]:
# Hand-written dictionary of common misspellings / spelling variants and the
# replacement text to use instead. `clean_mis_spell` applies each entry with
# `str.replace`, i.e. as a case-sensitive substring match, so all-lowercase
# keys do not touch capitalised forms.
mis_spell_dict = {'colour':'color',
                  'centre':'center',
                  'didnt':'did not',
                  'doesnt':'does not',
                  'isnt':'is not',
                  'shouldnt':'should not',
                  'favourite':'favorite',
                  'travelling':'traveling',
                  'counselling':'counseling',
                  'theatre':'theater',
                  'cancelled':'canceled',
                  'labour':'labor',
                  'organisation':'organization',
                  'wwii':'world war 2',
                  # NOTE(review): this key may have been meant as 'criticise' — confirm.
                  'citicise':'criticize',
                  'instagram': 'social medium',
                  'whatsapp': 'social medium',
                  'snapchat': 'social medium'
                 }

# Hand-picked words to drop from the tokenised sentences; these four top the
# out-of-vocabulary list printed earlier in the notebook.
need_remove_words = ['a', 'to', 'of', 'and']

In [34]:
def clean_mis_spell(sent, mapping=None):
    """Replace known misspellings / spelling variants in `sent`.

    Each key of the mapping is replaced by its value with `str.replace`, in
    the mapping's iteration order. Matching is a case-sensitive substring
    match: a key also matches inside longer words (e.g. 'colour' inside
    'colours'), and capitalised forms of a lowercase key are left unchanged.

    Args:
        sent: the input string.
        mapping: optional dict of ``{text_to_replace: replacement}``. When
            omitted, the notebook-level `mis_spell_dict` is used, which keeps
            existing one-argument calls working exactly as before.

    Returns:
        The string with all replacements applied.
    """
    if mapping is None:
        # Resolved at call time so later edits to the global dict are seen.
        mapping = mis_spell_dict
    for wrong, right in mapping.items():
        sent = sent.replace(wrong, right)
    return sent

In [35]:
# Apply the spelling replacements, re-tokenise on whitespace, and drop the
# hand-picked words in `need_remove_words` before rebuilding the vocabulary
# and re-measuring embedding coverage.
all_data_text = all_data_text.apply(clean_mis_spell)
sentences = [
    [token for token in tokens if token not in need_remove_words]
    for tokens in all_data_text.apply(str.split)
]
vocab = build_vocab(sentences)
vocab_count = build_vocab_count(sentences)
oov_word = check_out_of_vocab(vocab_count, embedding_dict)

60.20% words of vocab founded in word2vec.
98.90% words of all text founded in word2vec.


In [36]:
'''
现在词典中约有60%的词有预训练的词向量，
整个训练文本中约99%的词都有预训练的词向量。
'''

'\n现在词典中约有60%的词有预训练的词向量，\n整个训练文本中约99%的词都有预训练的词向量。\n'

In [37]:
oov_word[:20]

[('Isnt', 1429),
 ('bitcoin', 980),
 ('Quorans', 879),
 ('cryptocurrency', 820),
 ('Snapchat', 785),
 ('hasnt', 784),
 ('wasnt', 743),
 ('btech', 634),
 ('Brexit', 492),
 ('cryptocurrencies', 481),
 ('Shouldnt', 477),
 ('blockchain', 474),
 ('behaviour', 468),
 ('upvotes', 433),
 ('programme', 401),
 ('Doesnt', 381),
 ('Redmi', 378),
 ('realise', 371),
 ('defence', 364),
 ('KVPY', 349)]

In [38]:
len(vocab)

245551