In [0]:
import numpy as np
import pandas as pd
import gensim
from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, Phraser
import json
from nltk import RegexpTokenizer
import time
from keras.preprocessing.text import Tokenizer
import random
import multiprocessing
from nltk import RegexpTokenizer
import copy
import pickle
multiprocessing.cpu_count()

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
image_path = '/content/drive/My Drive/ImageCap/flickr30k_images/'
text_path = '/content/drive/My Drive/ImageCap/captions.csv'

In [0]:
df = pd.read_csv(text_path ,delimiter='|', skipinitialspace=True) #in this csv, | is being used as delimiter

In [0]:
groups = [b for a,b in df.groupby('image_name')] #Seperating rows into groups as per image name
random.shuffle(groups) #applying shuffle operation group-wise
df = pd.concat(groups).reset_index(drop=True) 

In [0]:
image_name_list = list(set(df['image_name'])) #obtaining unique instance name of each image
image_path_list = list(map(lambda arg: image_path + arg, image_name_list))

In [0]:
df.loc[0]

image_name                                           3296226598.jpg
comment_number                                                    0
comment           A man in a black leather jacket walks next to ...
Name: 0, dtype: object

In [0]:
df['comment'] = df['comment'].apply(lambda arg: '<start> ' + str(arg) + ' <end>') #putting start and end tokens

In [0]:
df.loc[0]['comment']

'<start> A man in a black leather jacket walks next to a white brick building in a big city . <end>'

In [0]:
tokenizer = RegexpTokenizer(r'\w+|<start>|<end>') #tokenizing using only alphanumeric tokens and start, end tokens
df['tokens'] = df['comment'].apply(tokenizer.tokenize)

In [0]:
df.loc[0]['tokens']

['<start>',
 'A',
 'man',
 'in',
 'a',
 'black',
 'leather',
 'jacket',
 'walks',
 'next',
 'to',
 'a',
 'white',
 'brick',
 'building',
 'in',
 'a',
 'big',
 'city',
 '<end>']

In [0]:
#Finding max length for padding
len_ = df['tokens'].apply(lambda arg: len(arg))
max_len = max(len_) 

In [0]:
#padding to max_len
def pad(arg):
  len_ = len(arg)
  len_ = max_len - len_
  lst = copy.deepcopy(arg)
  for i in np.arange(0,len_):
    lst.append('_')
  return lst

df['tokens_pad'] = df['tokens'].apply(pad)

In [0]:
df.loc[0]['tokens_pad']

['<start>',
 'A',
 'man',
 'in',
 'a',
 'black',
 'leather',
 'jacket',
 'walks',
 'next',
 'to',
 'a',
 'white',
 'brick',
 'building',
 'in',
 'a',
 'big',
 'city',
 '<end>',
 '_',
 '_',
 '_',
 '_',
 '_',
 '_',
 '_',
 '_',
 '_',
 '_',
 '_',
 '_',
 '_',
 '_',
 '_',
 '_',
 '_',
 '_',
 '_',
 '_',
 '_',
 '_',
 '_',
 '_',
 '_',
 '_',
 '_',
 '_',
 '_',
 '_',
 '_',
 '_',
 '_',
 '_',
 '_',
 '_',
 '_',
 '_',
 '_',
 '_',
 '_',
 '_',
 '_',
 '_',
 '_',
 '_',
 '_',
 '_',
 '_',
 '_',
 '_',
 '_',
 '_',
 '_',
 '_',
 '_',
 '_',
 '_',
 '_',
 '_']

In [0]:
#Training our custom word2vec model
t=time.time()
obj_w2v=Word2Vec(sentences=df['tokens'],min_count=1,window=2,size=300,sample=6e-5, alpha=0.03, min_alpha=0.0007, negative=20,workers=2,iter=30)
print(time.time()-t)

131.58676886558533


In [0]:
#Loading Google's pretrainind w2v model
t=time.time()
word2vec_path = "/content/drive/My Drive/ImageCap/GoogleNews-vectors-negative300.bin.gz"
w2v_g = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)
print(time.time()-t)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


144.9913194179535


In [0]:
#updating our local model's vocab with google's w2v's vocab
t=time.time()
obj_w2v.build_vocab([list(w2v_g.vocab.keys())],update=True)
print(time.time()-t)

999.7349133491516


In [0]:
t=time.time()
obj_w2v.intersect_word2vec_format('/content/drive/My Drive/ImageCap/GoogleNews-vectors-negative300.bin.gz',lockf=1.0, binary=True)
#this function will look for words common in our vocab 
#and in google's vocab and change the weight of such words to that in google's weight vector. 
#Here we have set lockf=1 so that in further training weights of all the
#words are optimized

print(time.time()-t)

170.2057385444641


In [0]:
t=time.time()
obj_w2v.train(df['tokens'],total_examples=len(df),epochs=obj_w2v.iter)#we are training again so as to get weights adjusted
print(time.time()-t)

  


627.3765051364899


In [0]:
obj_w2v.save('/content/drive/My Drive/ImageCap/w2v_imageCap.kv') #This will save entire model

In [0]:
obj_w2v.wv.save_word2vec_format('/content/drive/My Drive/ImageCap/w2v_imageCap.bin', binary=True) #This will only save matrix / vocab of model

In [0]:
word2vec = gensim.models.KeyedVectors.load_word2vec_format('/content/drive/My Drive/ImageCap/w2v_imageCap.bin', binary=True,unicode_errors='ignore')

In [0]:
#Now we will create a new vocab of most used tokens and then later map them to their w2v embeddings
vec=Tokenizer()
vec.fit_on_texts(df['tokens'])
l=len((vec.word_index))
print(l)
print((vec.word_index))#dictionary

18289


In [0]:
#Here we are generating a matrix that contains embedding of each word in our vocabulary
r=0
M=(np.zeros((l,300)))
#d=pd.DataFrame()
for el in vec.word_index:
  try:
    M[r,:]=word2vec[str(el)]#put entire word vector for el in rth row
  except:
    M[r,:]=np.zeros((1,300)) #in case of an unknown word, simply put zero vector
    print('error for ',' ',el)
  #d=d.append(pd.DataFrame(data={'row':[r], 'word': [el]}))# storing the mapping of row to word
  r=r+1

print(r)


error for    jenga
error for    squarepants
error for    lynyrd
error for    skynyrd
error for    heineken
error for    quiznos
error for    odeon
error for    hollister
error for    vuitton
error for    aladdin
error for    penske
error for    atvs
error for    springsteen
error for    nesquik
error for    groucho
error for    kawasaki
error for    nypd
error for    atms
error for    starbuck
error for    giorgio
error for    stonehurst
error for    usain
error for    citroen
error for    sallie
error for    nescafe
error for    shoegasm
error for    hendrix
error for    gump
error for    drayton
error for    sugarland
error for    rockettes
error for    makerbot
error for    lexmark
error for    exxxotica
error for    dhl
error for    ginobili
error for    sva
error for    theatre
error for    khera
error for    hartman
error for    brigada
error for    wwii
error for    isuzu
error for    altima
error for    tabacchi
error for    bleecker
error for    tropicana
error for    olvido
e

error for    wheelpower
error for    skimply
error for    cuido
error for    weezer
error for    kayacker
error for    kfbk
error for    yukla
error for    livingstone
error for    rnli
error for    stai
error for    guardano
error for    pavitmento
error for    rawlings
error for    aztec
error for    modell
error for    sylvester
error for    stallone
error for    icee
error for    swac
error for    halpern
error for    genpac
error for    joao
error for    edwardo
error for    alvarado
error for    gladstone
error for    gillette
error for    mohegan
error for    whatcom
error for    cariboos
error for    leppard
error for    collen
error for    meijer
error for    cki
error for    skirvin
error for    mahmoud
error for    ahmadinejad
error for    tyrolean
error for    dupont
error for    luca
error for    kinshasa
error for    garros
error for    kimmel
error for    neptuno
error for    rachofsky
error for    fransisco
error for    ruiz
error for    yumyumbowls
error for    tambor


In [0]:
M.shape

(18289, 300)

In [0]:
#pad token '_' has been dealt seperately because it might have affected the word2vec training if used earlier
vec.word_index['_'] = 0 #adding 0 for padding
vec.index_word[0] = '_'
M = np.vstack((np.zeros((1,300)),M)) #putting zero vector at 0th row for padding
seq = vec.texts_to_sequences(df['tokens_pad']) #converting tokens to corrosponding indicies so that they can be
                                               #they can be used for mapping by embedding layer during training
seq_vec = np.array(seq).astype('int32')

In [0]:
df.to_csv(r'/content/drive/My Drive/ImageCap/captions_pros.csv',index=None,header=True) # all further mappings will be as per this csv
np.save('/content/drive/My Drive/ImageCap/embedB.npy',M)
np.save('/content/drive/My Drive/ImageCap/caption_vec.npy',seq_vec)

with open('/content/drive/My Drive/ImageCap/word_ind_map.pkl', 'wb') as f:
  pickle.dump(vec.word_index, f, pickle.HIGHEST_PROTOCOL)

with open('x/ind_word_map.pkl', 'wb') as f:
  pickle.dump(vec.index_word, f, pickle.HIGHEST_PROTOCOL)