In [None]:
#Importing the required packages
#Ignoring warnings
import warnings
warnings.filterwarnings('ignore') 
import numpy as np
import pandas as pd
from time import time
import operator
import string
import re
import os

import matplotlib.pyplot as plt
from wordcloud import WordCloud,STOPWORDS

import sklearn
from sklearn import utils
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics
from sklearn.metrics import f1_score

import nltk
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords

import tqdm
from tqdm import tqdm, tqdm_notebook
tqdm_notebook().pandas()

import tensorflow as tf
import keras.preprocessing
import keras.layers
import keras.models
from keras import backend as K
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential,Model
from keras.engine.topology import Layer
from keras.layers import Activation,  Wrapper
from keras.layers import Dense, Input, CuDNNLSTM, Embedding, Dropout, Bidirectional, Flatten, SpatialDropout1D, LSTM
from keras.layers import BatchNormalization
from keras.layers import Concatenate
from keras import initializers, regularizers, constraints
from keras.callbacks import (EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, TensorBoard)

In [None]:
# Restore the variables that other notebooks persisted with %store
%store -r train_df1
%store -r test_df
%store -r tk
%store -r max_features
%store -r max_len
%store -r word_index
%store -r train_X
%store -r test_X
%store -r train_y

In [None]:
# Function for loading a word embedding file; returns a dictionary mapping each word to its vector
def load_embed(file):
    """Load a text-format word embedding file into a dictionary.

    Every non-blank line is split on single spaces: the first field is the
    word and the remaining fields are parsed as float32 coefficients. If a
    word appears on several lines, the last occurrence wins.

    Parameters
    ----------
    file : str or path-like
        Path of the embedding file; it is read as UTF-8 text.

    Returns
    -------
    dict
        Maps each word to a 1-D ``numpy.ndarray`` of dtype float32.
    """
    embeddings_index = {}
    # The context manager guarantees the handle is closed, even if a line
    # fails to parse (the previous generator-based version never closed it).
    with open(file, encoding='utf8') as handle:
        for line in handle:
            line = line.rstrip("\n")
            # Blank lines carry no word/vector pair; skipping them avoids
            # storing a bogus key with an empty vector.
            if not line.strip():
                continue
            # Split on a literal space (not on arbitrary whitespace), exactly
            # as before, so words containing other whitespace stay intact.
            word, *coefs = line.split(" ")
            embeddings_index[word] = np.asarray(coefs, dtype='float32')

    return embeddings_index

In [None]:
# Path of the GloVe embedding file (a relative path, so it is resolved against the current working directory)
glove = 'glove.840B.300d.txt'

In [None]:
# Load the GloVe file into embed_glove, a dictionary mapping each word to its embedding vector
print("Extracting GloVe embedding ...")
embed_glove = load_embed(glove)

In [None]:
# Function for building the vocabulary of a text column; returns a dictionary mapping each word to its occurrence count
def build_vocab(texts):
    """Count how often each whitespace-separated token occurs in ``texts``.

    Parameters
    ----------
    texts : pandas.Series
        Series of strings; every entry is tokenised with ``str.split()``.

    Returns
    -------
    dict
        Maps each token to the number of times it appears across all entries,
        with keys in order of first appearance.
    """
    tokenized = texts.apply(lambda text: text.split()).values
    counts = {}
    for tokens in tokenized:
        for token in tokens:
            # Start unseen tokens at zero, then add this occurrence
            counts[token] = counts.get(token, 0) + 1
    return counts


In [None]:
# Count word occurrences over the 'question_text' column of the training data
vocab = build_vocab(train_df1['question_text'])

In [None]:
# Function for calculating the coverage of embeddings for the vocabulary; returns the unknown words
def check_coverage(vocab, embeddings_index):
    """Report how much of a vocabulary is covered by an embedding index.

    Prints two percentages: the share of distinct vocabulary words that have
    an embedding, and the share of all word occurrences (weighted by count)
    that have one. Both are reported as 0.00% when there is nothing to
    measure (empty vocabulary or all counts zero) instead of raising
    ``ZeroDivisionError``.

    Parameters
    ----------
    vocab : dict
        Maps each word to its occurrence count.
    embeddings_index : mapping
        Container of known words; only membership (``word in ...``) is used.

    Returns
    -------
    list of (word, count) tuples
        Words of ``vocab`` missing from ``embeddings_index``, ordered by
        descending count.
    """
    unknown_words = {}
    nb_known_vocab = 0      # distinct words that have an embedding
    nb_known_words = 0      # occurrences of words that have an embedding
    nb_unknown_words = 0    # occurrences of words without an embedding
    for word, count in vocab.items():
        # Explicit membership test instead of a bare ``except`` so unrelated
        # errors are no longer silently counted as "unknown word".
        if word in embeddings_index:
            nb_known_vocab += 1
            nb_known_words += count
        else:
            unknown_words[word] = count
            nb_unknown_words += count

    total_words = nb_known_words + nb_unknown_words
    vocab_ratio = nb_known_vocab / len(vocab) if vocab else 0.0
    text_ratio = nb_known_words / total_words if total_words else 0.0
    print('Found embeddings for {:.2%} of vocab'.format(vocab_ratio))
    print('Found embeddings for  {:.2%} of all text'.format(text_ratio))

    # Ascending stable sort followed by a reversal: this keeps the original
    # ordering of equal counts (reverse=True would order ties differently).
    unknown_words = sorted(unknown_words.items(), key=operator.itemgetter(1))[::-1]

    return unknown_words

In [None]:
# Print the GloVe coverage of the vocabulary; oov_glove receives the (word, count) pairs
# that have no GloVe embedding, sorted by descending count
print("Glove : ")
oov_glove = check_coverage(vocab, embed_glove)

In [None]:
# Function for creating embedding weights; returns an array with one embedding row per word index
def create_embedding_weights(embeddings_index, word_index, max_features, seed=None):
    """Build an embedding weight matrix for the words of ``word_index``.

    Every row is first drawn from a normal distribution whose mean and
    standard deviation are computed over all values of the pretrained
    vectors. Rows whose word has a pretrained vector are then overwritten
    with that vector.

    Parameters
    ----------
    embeddings_index : dict
        Maps each word to its embedding vector; all vectors must have the
        same length, and the dictionary must not be empty.
    word_index : dict
        Maps each word to its integer row index.
    max_features : int
        Number of rows of the returned matrix; entries of ``word_index``
        with an index ``>= max_features`` are ignored.
    seed : int, optional
        When given, the random rows are drawn from a dedicated
        ``numpy.random.RandomState(seed)`` so the result is reproducible.
        When ``None`` (default) the global NumPy random state is used, as
        before.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(max_features, embed_size)``.
    """
    # np.stack requires a real sequence; a dict_values view is rejected by
    # current NumPy versions, so materialise it as a list first.
    all_embs = np.stack(list(embeddings_index.values()))
    emb_mean, emb_std = all_embs.mean(), all_embs.std()
    embed_size = all_embs.shape[1]

    rng = np.random if seed is None else np.random.RandomState(seed)
    embedding_weights = rng.normal(emb_mean, emb_std, (max_features, embed_size))

    for word, i in word_index.items():
        # Indices beyond the matrix have no row to fill
        if i >= max_features:
            continue
        embedding_vector = embeddings_index.get(word)
        # Words without a pretrained vector keep their random initialisation
        if embedding_vector is not None:
            embedding_weights[i] = embedding_vector

    return embedding_weights

In [None]:
# Build the embedding weight matrix from the GloVe vectors, store it in glove_weights,
# and show it as the cell output
glove_weights = create_embedding_weights(embed_glove, word_index, max_features)
glove_weights

In [None]:
# Persist glove_weights with %store so that another notebook can restore it with `%store -r glove_weights`.
%store glove_weights