In [1]:
## Install glove-python module
!pip install -q glove-python

[?25l[K     |█▎                              | 10kB 19.6MB/s eta 0:00:01[K     |██▌                             | 20kB 1.7MB/s eta 0:00:01[K     |███▊                            | 30kB 2.5MB/s eta 0:00:01[K     |█████                           | 40kB 3.2MB/s eta 0:00:01[K     |██████▎                         | 51kB 2.0MB/s eta 0:00:01[K     |███████▌                        | 61kB 2.4MB/s eta 0:00:01[K     |████████▊                       | 71kB 2.8MB/s eta 0:00:01[K     |██████████                      | 81kB 3.1MB/s eta 0:00:01[K     |███████████▏                    | 92kB 3.5MB/s eta 0:00:01[K     |████████████▌                   | 102kB 2.7MB/s eta 0:00:01[K     |█████████████▊                  | 112kB 2.7MB/s eta 0:00:01[K     |███████████████                 | 122kB 2.7MB/s eta 0:00:01[K     |████████████████▏               | 133kB 2.7MB/s eta 0:00:01[K     |█████████████████▍              | 143kB 2.7MB/s eta 0:00:01[K     |██████████████████▊       

In [0]:
from glove import Corpus, Glove

import pandas as pd
import numpy as np

In [0]:
## Trains Glove embeddings for words in text present in column (colname) in file (filepath)
## Glove_Window = context length to be consider to find embedding of a word

def Glove_Embeddings_Train(filepath, colname, Glove_window=10, Glove_Vector_Size=10, Glove_learning_rate=0.05, Glove_epochs=30, Glove_no_threads=4):
  
  df = pd.read_csv(filepath)
  
  data = []
  for row in range(df.shape[0]):
    post_caption = df.iloc[row][colname]
    tokens = post_caption.split()
    data.append(tokens)

  data = np.array(data)
  
  corpus = Corpus()
  corpus.fit(data, window=Glove_window)

  glove = Glove(no_components=Glove_Vector_Size, learning_rate=Glove_learning_rate)
  glove.fit(corpus.matrix, epochs=Glove_epochs, no_threads=Glove_no_threads, verbose=True)

  glove.add_dictionary(corpus.dictionary)
  
  return glove
  

In [0]:
## Input: glove object (object of trained model), text
## Calculates word embedding for each word in text and returns mean of all vectors


def Calculate_Glove_Embedding(glove, text):
  
  words = text.split()
  text_embeddings = []
  
  for word in words:
    ## Ignore if word that is not present in vocabulary appears in the text
    if word in glove.dictionary:
      word_embedding = glove.word_vectors[glove.dictionary[word]]
      text_embeddings.append(word_embedding)
  
  text_embeddings = np.array(text_embeddings)
  text_avg_embedding = np.mean(text_embeddings, axis=0)
  
  return text_avg_embedding

In [6]:
## Train glove embeddings for given corpus and dump trained model object in pickle file 
## Load this pickle file in any other code, import this python module 

glove_obj = Glove_Embeddings_Train("/content/Filtered_Positive_Data.csv", "Caption_Tokens", Glove_Vector_Size=300)

import pickle

with open ("/content/Trained_Glove_Model.pkl", "wb") as file:
  pickle.dump(glove_obj, file)

Performing 30 training epochs with 4 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29


In [0]:
## Call this function in any python file where this module is imported 
## Input : GloveModel_filepath (trained model pickle file), Input_Data_filepath (Whose embeddings need to be calculated)
## Output_Data_filepath: optional (if given, embeddings will be dumped in it. Should be csv file)
## returns 2D dataframe with dimensions (No of samples in input, Vector size of embeddings)

def Get_Glove_Embedding(GloveModel_filepath, Input_Data_filepath, colname, Output_Data_filepath=""):
  
  with open(GloveModel_filepath, "rb") as ModelFile:
    glove_object = pickle.load(ModelFile)
    
  data = pd.read_csv(Input_Data_filepath)
  
  Data_Embeddings = []
  for row in range(data.shape[0]):
#     print(row)
    text = data.iloc[row][colname]
    text_avg_embedding = Calculate_Glove_Embedding(glove_object, text)
    Data_Embeddings.append(text_avg_embedding)
  
  Data_Text = data[colname]
  Data_Embeddings = pd.DataFrame(Data_Embeddings)
  
  Text_Embedding_Map = pd.concat([Data_Text,Data_Embeddings], axis=1)
  
  if Output_Data_filepath:
    Text_Embedding_Map.to_csv(Output_Data_filepath)
  
  return Text_Embedding_Map

In [8]:
## Sample call to the fuction

Text_Embedding_Map = Get_Glove_Embedding("/content/Trained_Glove_Model.pkl", "/content/Filtered_Positive_Data.csv", "Caption_Tokens", "/content/Positive_Glove_Embeddings.csv")
print(Text_Embedding_Map)

                                         Caption_Tokens  ...       299
0     walking feed sick new gear new brand partner f...  ...  0.008271
1     one literally one zero calorie sugar free low ...  ...  0.031334
2     thank listening article original post micro ar...  ...  0.023550
3     climbing brush name today sad annoying men cli...  ...  0.002718
4     since think told boring told high maintenance ...  ...  0.004441
5     theme year crazy ride let many people go makin...  ...  0.015899
6     long time recognize unpaid work housework bias...  ...  0.022081
7     ich job ich gut mir r w hat sie na n sind ich ...  ... -0.054868
8     original thank wait told place take wait peopl...  ...  0.010658
9     random men offering unsolicited advice right w...  ...  0.012759
10    globally unpaid work done spend three six per ...  ...  0.016958
11    time celebrate felt like forever article writi...  ...  0.014341
12    day combination day day think maybe warn peopl...  ...  0.021879
13    

In [0]:
## Sample call to the fuction

Text_Embedding_Map = Get_Glove_Embedding("/content/Trained_Glove_Model.pkl", "/content/Filtered_Positive_Data.csv", "Caption_Tokens", "/content/Positive_Glove_Embeddings.csv")
print(Text_Embedding_Map)