# Using Gensim - Part 1

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer

import pickle

import warnings
# Suppress UserWarning messages attributed to modules whose name starts with
# 'gensim'. The filter is installed before gensim is imported further down in
# this cell, so warnings raised while importing it are covered as well.
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')

from gensim.models.keyedvectors import KeyedVectors

Using TensorFlow backend.


In [2]:
# Read the spreadsheet and discard rows that have no business description.
df = pd.read_excel('../ML/Data_ML.xlsx')
df = df[df['Business Description'].notnull()]

# Encode each sector name as an integer code; sort=True numbers the codes in
# sorted order of the distinct sector names.
df['Sector ID'] = pd.factorize(df['TRBC Economic Sector Name'], sort=True)[0]

# Features are the free-text descriptions, targets are the integer sector codes.
X, y = df['Business Description'], df['Sector ID']
X.shape, y.shape

((23022,), (23022,))

In [3]:
# Preview the first few business descriptions (the model's text input).
X.head()

0    Cummins Inc. designs, manufactures, distribute...
1    Rio Tinto plc is a mining and metals company. ...
2    Rio Tinto Limited (Rio Tinto) is a mining comp...
3    The Royal Dutch Shell plc explores for crude o...
4    BHP Billiton Plc is a global resources company...
Name: Business Description, dtype: object

In [4]:
# Preview the first few integer sector codes (the classification target).
y.head()

0    1
1    0
2    0
3    3
4    0
Name: Sector ID, dtype: int64

In [5]:
# Hold out 20% of the samples for testing; random_state is fixed so the split
# is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Sanity check: print the shape of each split.
# The loop variable is deliberately NOT named `_`: in IPython/Jupyter `_` holds
# the previous cell's output, and assigning to it in the notebook namespace
# stops IPython from updating it.
splits = [X_train, X_test, y_train, y_test]
for split in splits:
    print(split.shape)

(18417,)
(4605,)
(18417,)
(4605,)


In [6]:
# Vocabulary cap handed to the Keras tokenizer.
max_words = 10_000

# Fit on the training texts only, so the vocabulary is not built from test data.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

# word -> integer index mapping learned from the training texts. Its size
# (shown below) is not limited to max_words.
w2i = tokenizer.word_index
len(w2i)

70673

In [7]:
# Invert the word -> index mapping so an index can be turned back into its word.
i2w = {index: word for word, index in w2i.items()}

# Show the first ten keys of each mapping (words, then their indices).
print(list(w2i)[:10])
print(list(i2w)[:10])

['and', 'the', 'of', 'in', 'company', 'is', 'a', 'as', 'products', 'its']
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]


In [8]:
%%time

# Load the pretrained GoogleNews word2vec vectors from the binary word2vec
# file format. This is slow (the recorded run below took about 11 minutes of
# wall time), hence the %%time cell magic and the progress message.
word2vec_path = '../GoogleNews-vectors-negative300.bin'
print('Loading word2vec')
word2vec = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

Loading word2vec
Wall time: 10min 51s


In [9]:
# Build the embedding matrix: row i holds the pretrained vector of the word
# whose tokenizer index is i. Rows for words without a pretrained vector (and
# any index never assigned) stay all-zero.
# The shape is derived from max_words and the loaded model's vector size
# instead of being hard-coded, so changing either keeps the matrix consistent.
embedding_matrix = np.zeros((max_words, word2vec.vector_size))
print('embed_matrix.shape', embedding_matrix.shape)
found_counter = 0
for word, idx in w2i.items():
    # Only indices below max_words have a row in the matrix. `continue` (rather
    # than `break`) makes this correct regardless of the dict's iteration order.
    if idx >= max_words:
        continue
    # Query the KeyedVectors object directly; the `.wv` attribute on
    # KeyedVectors is a deprecated alias of the object itself.
    if word in word2vec:
        embedding_matrix[idx] = word2vec[word]
        found_counter += 1
print(found_counter, 'words are found with embeddings.')

embed_matrix.shape (10000, 300)




8837 words are found with embeddings.


In [10]:
# Spot-check one row of the matrix: the vector stored for tokenizer index 50.
embedding_matrix[50]

array([-2.94921875e-01,  8.44726562e-02,  1.03149414e-02,  3.45703125e-01,
       -9.37500000e-02,  5.54199219e-02,  1.31835938e-01, -2.73437500e-01,
       -1.18164062e-01,  3.12500000e-01, -2.96875000e-01, -3.69140625e-01,
        1.00585938e-01, -1.79687500e-01, -1.68945312e-01,  1.12792969e-01,
       -1.22558594e-01,  2.75390625e-01, -2.15820312e-01, -4.53125000e-01,
        9.47265625e-02, -1.61132812e-01,  3.20312500e-01, -3.28125000e-01,
        1.73828125e-01,  5.97656250e-01, -2.20703125e-01,  1.93359375e-01,
       -4.46777344e-02, -6.25000000e-02, -1.34765625e-01, -1.07910156e-01,
       -3.14453125e-01, -5.15136719e-02, -9.61914062e-02, -1.56250000e-01,
       -2.38281250e-01,  1.61132812e-02,  6.13403320e-03,  8.20312500e-02,
       -6.78710938e-02,  1.63085938e-01,  2.04101562e-01, -8.83789062e-02,
        2.05078125e-01,  1.46484375e-01, -4.29687500e-02,  2.07031250e-01,
       -2.34375000e-01, -1.04980469e-01,  1.11328125e-01,  2.00195312e-01,
       -4.73632812e-02,  

In [11]:
# Confirm the dimensionality of a single embedding row.
embedding_matrix[50].shape

(300,)

In [12]:
# Look up which word tokenizer index 50 refers to (the row inspected above).
i2w[50]

'china'

In [13]:
# Serialize the embedding matrix to disk so it can be reused elsewhere without
# reloading the word2vec model.
with open('embeddings_gensim.pkl', 'wb') as pkl_file:
    pickle.dump(embedding_matrix, pkl_file)

print('Embeddings pickled, ready to be uploaded to Google Drive')

Embeddings pickled, ready to be uploaded to Google Drive
