In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import tqdm
import time
tqdm.pandas()

# Input file locations; relative paths, so they resolve against the
# notebook's current working directory.
train_path = '../data/train.csv'  # loaded into train_df with pd.read_csv
test_path = '../data/test.csv'  # loaded into test_df with pd.read_csv
embedding_path = '../data/glove.840B.300d.txt'  # text file scanned line by line for its vocabulary

In [2]:
# Read both CSV files and lower-case the question text of each as it is loaded.
train_df = pd.read_csv(train_path).assign(
    question_text=lambda frame: frame['question_text'].str.lower()
)
test_df = pd.read_csv(test_path).assign(
    question_text=lambda frame: frame['question_text'].str.lower()
)

In [3]:
# Whitespace-tokenise every training question (one list of tokens per row).
sentences = train_df['question_text'].progress_apply(
    lambda question: question.split()
)

100%|██████████| 1306122/1306122 [00:04<00:00, 263042.38it/s]


In [4]:
def get_word_count(sentences):
    """Tally how often each token occurs across all sentences.

    Args:
        sentences: iterable of token sequences (e.g. a Series of lists).

    Returns:
        dict mapping token -> number of occurrences, with keys in order of
        first appearance.
    """
    tally = {}
    for tokens in tqdm(sentences):
        for token in tokens:
            # A token seen for the first time starts from zero.
            tally[token] = tally.get(token, 0) + 1
    return tally

In [5]:
# Build the token frequency table and preview its first five entries.
word_count1 = get_word_count(sentences)
print(dict(list(word_count1.items())[:5]))

100%|██████████| 1306122/1306122 [00:04<00:00, 319294.63it/s]

{'how': 287779, 'did': 41109, 'quebec': 102, 'nationalists': 105, 'see': 9085}





In [6]:
# Collect the vocabulary of the embedding file: the first space-separated
# field of every line. Only the words are kept; the vectors are not parsed.
# time.perf_counter() is used because time.clock() was removed in Python 3.8.
time1 = time.perf_counter()
# Left empty on purpose: this cell gathers the vocabulary only. To load the
# vectors too, store np.asarray(<remaining fields>, dtype='float32') per word.
embeddings_index = {}
embeddings_set = set()
# Explicit encoding so decoding does not depend on the platform default
# (assumes the embedding file is UTF-8 -- TODO confirm).
with open(embedding_path, encoding='utf-8') as f:
    # Stream the file instead of calling readlines(), which would hold every
    # line in memory at once. tqdm then shows a running count without a total.
    for line in tqdm(f):
        # Must split on a literal ' ' and not bare split() (translated from
        # the original note; presumably some tokens contain other whitespace
        # characters -- TODO confirm).
        word = line.split(' ', 1)[0]
        embeddings_set.add(word)
print(len(embeddings_set), time.perf_counter() - time1)

100%|██████████| 2196017/2196017 [01:28<00:00, 24813.62it/s]


2196016 85.67989299999999


In [None]:
def get_coverage(word_count1, embeddings_index):
    """Measure how much of a vocabulary is present in an embedding vocabulary.

    Args:
        word_count1: dict mapping token -> occurrence count.
        embeddings_index: any container supporting ``in`` (dict or set) that
            holds the tokens known to the embedding.

    Returns:
        A 3-tuple ``(covered_tokens, covered_occurrences, missing)`` where
        ``missing`` is a list of ``[token, count]`` pairs for tokens not found,
        sorted by count in descending order.
    """
    covered_tokens = 0
    covered_occurrences = 0
    missing = []
    for token in tqdm(word_count1):
        occurrences = word_count1[token]
        if token not in embeddings_index:
            missing.append([token, occurrences])
            continue
        covered_tokens += 1
        covered_occurrences += occurrences
    # Most frequent missing tokens first; ties keep their original order.
    missing = sorted(missing, key=lambda pair: pair[1], reverse=True)
    return covered_tokens, covered_occurrences, missing

# Coverage of the raw (whitespace-tokenised) vocabulary by the embedding words.
(coverage_word,
 coverage_word_count,
 uncoverage) = get_coverage(word_count1, embeddings_set)

In [None]:
# First figure: share of distinct tokens found; second: share of all token
# occurrences found.
print(
    'word coverage rate:{}\nword count coverage rate:{}'.format(
        coverage_word / len(word_count1),
        coverage_word_count / sum(word_count1.values()),
    )
)

In [10]:
# Ten most frequent tokens with no embedding entry (get_coverage returns the
# list sorted by count, descending).
# NOTE(review): the saved output below contains mixed-case tokens although
# question_text is lower-cased when loaded -- the output looks stale; re-run.
uncoverage[:10]

[['India?', 16384],
 ['it?', 12900],
 ["What's", 12425],
 ['do?', 8753],
 ['life?', 7753],
 ['you?', 6295],
 ['me?', 6202],
 ['them?', 6140],
 ['time?', 5716],
 ['world?', 5386]]

In [11]:
# Characters that clean_text() isolates as separate whitespace-delimited tokens.
puncts = [
    ',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '*', '+', '\\', '•', '~', '@', '£',
    '·', '_', '{', '}', '©', '^', '®', '`', '<', '→', '°', '€', '™', '›', '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…',
    '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─',
    '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞',
    '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√',
]


def clean_text(x):
    """Pad every character listed in ``puncts`` with a space on each side.

    Args:
        x: value to clean; it is converted with ``str()`` first.

    Returns:
        The text with each listed character replaced by ``' <char> '``, so a
        later ``split()`` yields that character as its own token.
    """
    text = str(x)
    for mark in puncts:
        text = text.replace(mark, ' ' + mark + ' ')
    return text

In [12]:
# Space-pad punctuation in every training question (overwrites the column).
train_df['question_text'] = (
    train_df['question_text']
    .progress_apply(clean_text)
)

100%|██████████| 1306122/1306122 [00:42<00:00, 30937.49it/s]


In [13]:
# Re-tokenise the cleaned questions, rebuild the frequency table and preview
# its first five entries.
sentences = train_df['question_text'].progress_apply(
    lambda question: question.split()
)
word_count1 = get_word_count(sentences)
print(dict(list(word_count1.items())[:5]))

100%|██████████| 1306122/1306122 [02:58<00:00, 7307.51it/s]  
100%|██████████| 1306122/1306122 [00:04<00:00, 278186.78it/s]


{'How': 263281, 'did': 34080, 'Quebec': 161, 'nationalists': 131, 'see': 9564}


In [14]:
# Re-measure coverage after punctuation has been split off.
# The lookup must be embeddings_set: embeddings_index is created as an empty
# dict by the embedding-loading cell and never filled (the vector parsing there
# is commented out), so passing it would report every token as uncovered on a
# fresh Restart & Run All.
coverage_word, coverage_word_count, uncoverage = get_coverage(word_count1, embeddings_set)
print('word coverage rate:{}\nword count coverage rate:{}'.format(coverage_word / len(word_count1), coverage_word_count / sum(word_count1.values())))

100%|██████████| 239137/239137 [00:02<00:00, 102737.70it/s]


word coverage rate:0.7496623274524645
word count coverage rate:0.9958335729525793


In [15]:
# Ten most frequent tokens still missing from the embedding vocabulary after
# punctuation cleaning (list is sorted by count, descending).
uncoverage[:10]

[['Quorans', 856],
 ['Brexit', 492],
 ['cryptocurrencies', 481],
 ['Redmi', 379],
 ['C#', 231],
 ['OnePlus', 125],
 ['UCEED', 123],
 ['Blockchain', 112],
 ['GDPR', 106],
 ['demonetisation', 106]]

In [18]:
# Wider view of the same list: the 100 most frequent uncovered tokens.
uncoverage[:100]

[['Quorans', 856],
 ['Brexit', 492],
 ['cryptocurrencies', 481],
 ['Redmi', 379],
 ['C#', 231],
 ['OnePlus', 125],
 ['UCEED', 123],
 ['Blockchain', 112],
 ['GDPR', 106],
 ['demonetisation', 106],
 ['Coinbase', 105],
 ['Machedo', 99],
 ['Adityanath', 99],
 ['BNBR', 99],
 ['Boruto', 93],
 ['DCEU', 89],
 ['ethereum', 89],
 ['IIEST', 85],
 ['SJWs', 79],
 ['Qoura', 79],
 ['Upwork', 70],
 ['LNMIIT', 67],
 ['Kavalireddi', 65],
 ['Zerodha', 65],
 ['bhakts', 63],
 ['Doklam', 62],
 ['Vajiram', 59],
 ['NICMAR', 59],
 ['Unacademy', 58],
 ['MUOET', 56],
 ['chsl', 55],
 ['AlShamsi', 52],
 ['HackerRank', 52],
 ['Bhakts', 51],
 ['Awdhesh', 48],
 ['Litecoin', 48],
 ['eLitmus', 47],
 ['Jiren', 47],
 ['Cryptocurrency', 47],
 ['#1', 46],
 ['Ryzen', 45],
 ['altcoins', 45],
 ['altcoin', 45],
 ['Baahubali', 44],
 ['coinbase', 44],
 ['SRMJEE', 43],
 ['Beerus', 41],
 ['SGSITS', 40],
 ['Skripal', 40],
 ['bahubali', 38],
 ['BMSCE', 37],
 ['Zebpay', 37],
 ['Binance', 37],
 ['Gurugram', 36],
 ['Alshamsi', 36],
 ['