# Using GloVe - Part 1

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer

import pickle

Using TensorFlow backend.


In [2]:
# Load the company data set and keep only rows that have a business description.
df = pd.read_excel('../ML/Data_ML.xlsx')
has_description = df['Business Description'].notnull()
df = df[has_description]

# Encode every economic sector name as an integer id; sort=True makes the ids
# follow the sorted order of the sector names.
sector_codes = df['TRBC Economic Sector Name'].factorize(sort=True)[0]
df['Sector ID'] = sector_codes

# Features are the free-text descriptions, targets are the integer sector ids.
X = df['Business Description']
y = df['Sector ID']
X.shape, y.shape

((23022,), (23022,))

In [3]:
# Preview the first few business descriptions (the model inputs).
X.head()

0    Cummins Inc. designs, manufactures, distribute...
1    Rio Tinto plc is a mining and metals company. ...
2    Rio Tinto Limited (Rio Tinto) is a mining comp...
3    The Royal Dutch Shell plc explores for crude o...
4    BHP Billiton Plc is a global resources company...
Name: Business Description, dtype: object

In [4]:
# Preview the integer sector ids (the model targets) for the same leading rows.
y.head()

0    1
1    0
2    0
3    3
4    0
Name: Sector ID, dtype: int64

In [5]:
# Hold out 20% of the samples for testing; random_state fixes the shuffle so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

splits = [X_train, X_test, y_train, y_test]
# The loop variable gets a real name: `_` is IPython's "last output" variable,
# and rebinding it makes IPython stop maintaining `_`, `__` and `___`.
for split in splits:
    print(split.shape)

(18417,)
(4605,)
(18417,)
(4605,)


In [6]:
# Vocabulary cap handed to the tokenizer and reused later to size the embedding matrix.
max_words = 10000
tokenizer = Tokenizer(num_words=max_words)
# Build the vocabulary from the training texts only, so the test set stays unseen.
tokenizer.fit_on_texts(X_train)
# word -> integer index mapping. Its length (shown below) is not limited by
# num_words, so later cells apply the max_words cut-off themselves.
w2i = tokenizer.word_index
len(w2i)

70673

In [7]:
# Invert the vocabulary so an integer index can be mapped back to its word.
i2w = {index: word for word, index in w2i.items()}

# Peek at the first ten keys of each mapping.
print(list(w2i)[:10])
print(list(i2w)[:10])

['and', 'the', 'of', 'in', 'company', 'is', 'a', 'as', 'products', 'its']
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]


In [8]:
%%time
embeddings = {}
with open('../GloVe/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings[word] = coefs

print('Found %s word vectors.' % len(embeddings))

Found 400000 word vectors.
Wall time: 26.6 s


In [9]:
# Inspect the vector that was loaded for one common word.
embeddings['the']

array([-0.038194, -0.24487 ,  0.72812 , -0.39961 ,  0.083172,  0.043953,
       -0.39141 ,  0.3344  , -0.57545 ,  0.087459,  0.28787 , -0.06731 ,
        0.30906 , -0.26384 , -0.13231 , -0.20757 ,  0.33395 , -0.33848 ,
       -0.31743 , -0.48336 ,  0.1464  , -0.37304 ,  0.34577 ,  0.052041,
        0.44946 , -0.46971 ,  0.02628 , -0.54155 , -0.15518 , -0.14107 ,
       -0.039722,  0.28277 ,  0.14393 ,  0.23464 , -0.31021 ,  0.086173,
        0.20397 ,  0.52624 ,  0.17164 , -0.082378, -0.71787 , -0.41531 ,
        0.20335 , -0.12763 ,  0.41367 ,  0.55187 ,  0.57908 , -0.33477 ,
       -0.36559 , -0.54857 , -0.062892,  0.26584 ,  0.30205 ,  0.99775 ,
       -0.80481 , -3.0243  ,  0.01254 , -0.36942 ,  2.2167  ,  0.72201 ,
       -0.24978 ,  0.92136 ,  0.034514,  0.46745 ,  1.1079  , -0.19358 ,
       -0.074575,  0.23353 , -0.052062, -0.22044 ,  0.057162, -0.15806 ,
       -0.30798 , -0.41625 ,  0.37972 ,  0.15006 , -0.53212 , -0.2055  ,
       -1.2526  ,  0.071624,  0.70565 ,  0.49744 , 

In [10]:
# Check the length of a single word vector; embedding_dim below has to match it.
embeddings['the'].shape

(100,)

In [11]:
# Width of each word vector; matches the 100-component vectors loaded from the GloVe file.
embedding_dim = 100

# One row per vocabulary slot and one column per vector component, all zeros to start.
matrix_shape = (max_words, embedding_dim)
embedding_matrix = np.zeros(matrix_shape)
embedding_matrix.shape

(10000, 100)

In [12]:
# Look at one row of the freshly allocated matrix: all zeros before any vectors are copied in.
embedding_matrix[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [13]:
# Copy the GloVe vector of each vocabulary word into the matrix row given by
# the word's index. Words without a GloVe vector keep their all-zero row.
found_counter = 0
for word, i in w2i.items():
    # Skip, rather than stop at, indices outside the matrix: `break` is only
    # correct when w2i happens to iterate in ascending index order, whereas
    # `continue` gives the same result for any iteration order.
    if i >= max_words:
        continue
    embedding = embeddings.get(word)
    if embedding is not None:
        embedding_matrix[i] = embedding
        found_counter += 1
print(found_counter, 'words are found with embeddings.')

9773 words are found with embeddings.


In [14]:
# Spot-check one row after the fill loop: it should now hold a word vector instead of zeros.
embedding_matrix[50]

array([ 0.35995999,  0.45570999,  1.14170003,  0.45611   ,  0.9601    ,
       -0.95994997, -0.92540002,  0.127     ,  0.51292002, -0.039701  ,
       -0.26548001, -0.013654  ,  1.08679998,  0.30419001, -0.77057999,
       -0.054124  ,  0.4386    , -0.51727003, -0.61157   , -0.78044999,
        1.10800004,  0.17912   ,  0.13787   ,  1.05019999, -0.10599   ,
        0.28804001,  0.084783  ,  0.74449998, -0.072019  , -0.30362001,
       -1.19029999,  1.33949995,  0.80093002, -0.11032   ,  0.0036099 ,
        0.075959  ,  0.38789001, -0.19721   , -1.27600002, -0.59604001,
       -1.42470002, -0.61493999,  0.26129001, -0.46810001,  0.4763    ,
       -0.12842999,  0.24786   ,  0.1086    ,  0.36115   , -1.41849995,
        0.27347001,  0.050184  , -0.21788   ,  0.90376002, -0.29337999,
       -1.96060002,  0.16602001, -0.10826   ,  2.15840006, -0.024248  ,
       -0.82154   , -0.01291   , -0.33662   ,  0.47957   ,  0.054286  ,
        0.54413003, -0.31564999,  0.79236001,  0.579     ,  0.06

In [15]:
# Look up which word owns index 50, i.e. whose vector the row inspected above holds.
i2w[50]

'china'

In [16]:
# Serialise the filled embedding matrix to disk in the current working directory.
with open('embeddings_glove.pkl', 'wb') as pickle_file:
    pickle.dump(embedding_matrix, pickle_file)
print('Embeddings pickled, ready to be uploaded to Google Drive')

Embeddings pickled, ready to be uploaded to Google Drive
