In [3]:
import pandas as pd
import numpy as np
import operator 
import re
import os

In [4]:
# Directory containing train.csv and test.csv. Later cells also use paths
# relative to this directory, so the working directory is changed to it.
# Set the QUORA_DATA_DIR environment variable to run the notebook on another
# machine; the original author's location remains the default.
DATA_DIR = os.environ.get(
    "QUORA_DATA_DIR",
    "F:/talentsprintproject/quora-insincere-questions-classification",
)
os.chdir(DATA_DIR)

# The label column is dropped from the training frame before it is stacked
# with the test frame.
train = pd.read_csv("train.csv").drop('target', axis=1)
test = pd.read_csv("test.csv")
# NOTE: concat keeps each frame's own index labels (ignore_index is not set),
# so index values repeat in `df`.
df = pd.concat([train ,test])

print("Number of texts: ", df.shape[0])

Number of texts:  1681928


In [11]:
def load_embed(file):
    """Load a space-delimited word-embedding text file into a dictionary.

    Every parsed line is split on single spaces; the first field is used as
    the word and the remaining fields are converted to a float32 vector.

    Parameters
    ----------
    file : str
        Path of the embedding file. When it is exactly
        'embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec' the file is read
        as UTF-8 and lines of 100 characters or fewer are skipped (presumably
        to drop the short header line of the .vec format -- TODO confirm).
        Any other path is read as Latin-1 and every line is parsed.

    Returns
    -------
    dict
        Maps each word to a 1-D numpy array of dtype float32. If a word
        occurs on several lines, the last occurrence wins.
    """
    def get_coefs(word, *arr):
        # First field is the token, the rest are the vector components.
        return word, np.asarray(arr, dtype='float32')

    is_wiki_news = file == 'embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec'
    encoding = "utf8" if is_wiki_news else 'latin'

    # Use a context manager so the file handle is closed deterministically,
    # even if parsing a line raises.
    with open(file, encoding=encoding) as handle:
        if is_wiki_news:
            rows = (line for line in handle if len(line) > 100)
        else:
            rows = handle
        embeddings_index = dict(get_coefs(*row.split(" ")) for row in rows)

    return embeddings_index

In [12]:
# Locations of the three embedding files, given relative to the current
# working directory.
glove = 'embeddings/glove.840B.300d/glove.840B.300d.txt'
paragram =  'embeddings/paragram_300_sl999/paragram_300_sl999.txt'
# NOTE: load_embed compares its argument against this exact string to decide
# on UTF-8 decoding and short-line skipping, so the two must be kept in sync.
wiki_news = 'embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec'

In [13]:
# Load each embedding file into its own word -> vector dictionary.
# A progress message is printed before each load so the cell output shows
# which file is currently being read.
print("Extracting GloVe embedding")
embed_glove = load_embed(glove)
print("Extracting Paragram embedding")
embed_paragram = load_embed(paragram)
# The wiki-news .vec file is the one reported as "FastText" here.
print("Extracting FastText embedding")
embed_fasttext = load_embed(wiki_news)

Extracting GloVe embedding
Extracting Paragram embedding
Extracting FastText embedding


In [14]:
def build_vocab(texts):
    """Count how often each whitespace-separated token occurs in `texts`.

    Parameters
    ----------
    texts : pandas.Series
        Series of strings; each entry is tokenised with ``str.split()``.

    Returns
    -------
    dict
        Maps every distinct token to its total number of occurrences, with
        keys in order of first appearance.
    """
    counts = {}
    tokenised = texts.apply(lambda text: text.split()).values
    for tokens in tokenised:
        for token in tokens:
            counts[token] = counts.get(token, 0) + 1
    return counts

In [15]:
def check_coverage(vocab, embeddings_index):
    """Report how much of a vocabulary is present in an embedding table.

    Prints two percentages: the share of distinct words that have an
    embedding, and the share of all word occurrences that they account for.

    Parameters
    ----------
    vocab : dict
        Maps each word to its occurrence count.
    embeddings_index : dict
        Maps words to embedding vectors; only key membership is used.

    Returns
    -------
    list of (word, count) tuples
        The words of `vocab` missing from `embeddings_index`, ordered by
        descending count. An empty `vocab` yields an empty list and both
        percentages are reported as 0.00%.
    """
    unknown_words = {}
    nb_known_types = 0
    nb_known_words = 0
    nb_unknown_words = 0
    for word, count in vocab.items():
        # Explicit membership test instead of a bare `except:` so that
        # unrelated errors (and KeyboardInterrupt) are not swallowed.
        if word in embeddings_index:
            nb_known_types += 1
            nb_known_words += count
        else:
            unknown_words[word] = count
            nb_unknown_words += count

    # Guard both ratios so an empty vocabulary does not divide by zero.
    nb_total_words = nb_known_words + nb_unknown_words
    vocab_ratio = nb_known_types / len(vocab) if vocab else 0.0
    text_ratio = nb_known_words / nb_total_words if nb_total_words else 0.0

    print('Found embeddings for {:.2%} of vocab'.format(vocab_ratio))
    print('Found embeddings for  {:.2%} of all text'.format(text_ratio))

    # Ascending stable sort followed by a reversal: words with equal counts
    # come out in reverse insertion order. `reverse=True` would order ties
    # differently, so the reversal is kept deliberately.
    unknown_words = sorted(unknown_words.items(), key=operator.itemgetter(1))[::-1]

    return unknown_words

In [16]:
# Token -> occurrence count over the 'question_text' column of the combined
# train + test frame.
vocab = build_vocab(df['question_text'])

In [17]:
# Compare the vocabulary against each embedding table in turn.
# check_coverage prints the two coverage percentages and returns the
# out-of-vocabulary (word, count) pairs, most frequent first; these are kept
# in the oov_* variables.
print("Glove : ")
oov_glove = check_coverage(vocab, embed_glove)
print("Paragram : ")
oov_paragram = check_coverage(vocab, embed_paragram)
print("FastText : ")
oov_fasttext = check_coverage(vocab, embed_fasttext)

Glove : 
Found embeddings for 31.54% of vocab
Found embeddings for  88.16% of all text
Paragram : 
Found embeddings for 18.54% of vocab
Found embeddings for  72.21% of all text
FastText : 
Found embeddings for 28.40% of vocab
Found embeddings for  87.66% of all text
