In [32]:
import numpy as np
import pandas as pd
import os

import io
import re
import string
import tqdm
import pickle

from os import listdir
from os.path import isfile, join

import tensorflow as tf
from tensorflow.keras import layers
AUTOTUNE = tf.data.AUTOTUNE

from utils import *

### 1. Import Dataset That Contains the Input, Output

In [44]:
"""
The dataset should contain the target and context word. 
An example of the format is:
        input, output
        toad, amphibian
        frog, amphibian
        amphibian, toad
        amphibian, frog
        politics, paris
        paris, politics
        ...
"""

emb = pd.read_csv('df.csv').head(1000000) # Import your dataset
emb = emb.sample(frac=1) # Shuffling the data to be able to shuffle a smaller size when creating the tensorflow dataset
emb.head()

Unnamed: 0,input,output
714313,236,813
84432,693,288
49321,433,620
530769,562,336
749940,1343,1249


### 2. Data Processing

#### 2.1 Rare Word Pruning

Minimum number of words to learn a meaningful representation for the pair of context/target words

In [45]:
min_count_rare_word_pruning = 10 #100
emb2 = prune_rare_words(emb, min_count_rare_word_pruning)
print(len(emb2))
emb2.head()

Using a treshold of 10: 17.51% of observations were pruned
824904


Unnamed: 0,input,output
0,0,1006
1,0,1006
2,0,1006
3,0,1006
4,0,1006


#### 2.2 Subsampling
Certain pairs of words appear more often then others. 
These pairs of words should not be over-represented in our dataset otherwise we will be working with an unbalanced dataset.

In [46]:
subsampling_treshold = 0.02 #0.000002
emb3 = subsample(emb2, subsampling_treshold).sample(frac=1) # to make sure that we shuffle correctly
print(len(emb3))
emb3.head()

824904


Unnamed: 0,input,output
798026,1384.0,1340.0
604683,933.0,252.0
442230,693.0,1343.0
708453,1149.0,1307.0
70935,150.0,288.0


In [47]:
observed_items = emb3.values.flatten() # returns all observed items in a list 
vocab_size = len(set(observed_items)) # Will need to change that for number of distinct items in dataset
print(vocab_size)

740


In [48]:
vocab = create_vocab(emb3)

#### 2.3 Negative Sampling

In [49]:
k=5
pwr_val=1/4

neg_samp_df, targets, contexts, labels = generate_negative_samples(emb3.sample(frac=1), k, pwr_val)

## FREE UP MEMORY
del emb, emb2

In [50]:
targets_tf = tf.convert_to_tensor(targets)
contexts_tf = tf.convert_to_tensor(contexts)
labels_tf = tf.convert_to_tensor(labels)

### 3. Generate Tensorflow Tensors

In [51]:
# Just disables the warning, doesn't take advantage of AVX/FMA to run faster
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' # this is to ignore the error message: assuming you are running on GPU it is not necessary to see the error message

BATCH_SIZE = 30  ### --> HYPER-PARAMETER TO FINE TUNE
BUFFER_SIZE = 10000
TRAIN_PERC = .9 # Percentage of samples in train

dataset = tf.data.Dataset.from_tensor_slices(((targets_tf, contexts_tf), labels_tf))
n_samples = len(list(dataset))
train_size = int(n_samples * TRAIN_PERC)
test_size = n_samples - train_size

train_dataset = dataset.take(train_size)
test_dataset = dataset.skip(train_size)

print("Train Size: ", len(list(train_dataset)), "Test Size:", len(list(test_dataset)))

train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
test_dataset = test_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

Train Size:  742413 Test Size: 82491


In [52]:
filename_ = str(min_count_rare_word_pruning) + "_" + str(subsampling_treshold) + "_" + str(vocab_size) + "_" + str(k) + "_" + str(pwr_val)

### 4. Define the Hyperparameters of the Model

- LEARNING_RATES: The learning rate of the optimizer
- EMBEDDING_DIMS: The dimension of the embeddings

In [53]:
EMBEDDING_DIMS = [5,6,7] #[10]
LEARNING_RATES = [.001]

In [54]:
save_embeddings = True
for embedding_dim in EMBEDDING_DIMS: 
    for learning_rate in LEARNING_RATES: 

        train_loss_dict = {}
        train_accuracy_dict = {}
        test_results_dict = {}

        word2vec = Word2Vec(vocab_size, embedding_dim, 5)

        opt = tf.keras.optimizers.Adam(
                learning_rate=learning_rate,
                beta_1=0.9,
                beta_2=0.999,
                epsilon=1e-07,
                amsgrad=False,
                name='Adam',
            )
            
        word2vec.compile(optimizer=opt,
                loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                metrics=['accuracy'])
        
        # Callbacks
        tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")
        es = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy',
                    min_delta=0.001,
                    patience=2,
                    mode='max',
                    restore_best_weights=True) 
        
        results = word2vec.fit(train_dataset, 
                                epochs=10, 
                                callbacks=[tensorboard_callback, es],
                                validation_data=test_dataset
                            )                     
        
        model_name = f"{str(embedding_dim)}-{str(learning_rate)}"
        train_loss_dict[model_name] = results.history["loss"]
        train_accuracy_dict[model_name] = results.history["accuracy"]
        
        test_results = word2vec.evaluate(test_dataset, callbacks=[tensorboard_callback])
        test_results_dict[model_name] = test_results 

        model_id = filename_ + '_EMB_DIM_' + str(embedding_dim) + "_LR_" + str(learning_rate)

        logs_dict = {model_id:{'loss':train_loss_dict, 
                        'accuracy':train_accuracy_dict,
                        'test':test_results_dict
                                    }
                    }

        # Saving as a pickle file
        local_path = './skip_gram_training_logs/' + model_id + '/'

        # if the path doesn't exist
        if not os.path.exists(local_path):
            os.makedirs(local_path)
        
        with open(local_path + 'logs_dict.pickle', 'wb') as handle:
            pickle.dump(logs_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
        
        print(model_name, "Test Loss", test_results[0], "Test Accuracy", test_results[1])

        if save_embeddings:
            weights = word2vec.get_layer('w2v_embedding').get_weights()[0]
            df_embed = pd.DataFrame(weights, index=vocab.keys())

            with open(local_path + 'df_embed.pickle', 'wb') as handle:
                pickle.dump(df_embed, handle, protocol=pickle.HIGHEST_PROTOCOL)

Epoch 1/10


InvalidArgumentError: Graph execution error:

Detected at node 'word2_vec_2/w2v_embedding/embedding_lookup' defined at (most recent call last):
    File "C:\Users\AC226827\Anaconda3\lib\runpy.py", line 197, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "C:\Users\AC226827\Anaconda3\lib\runpy.py", line 87, in _run_code
      exec(code, run_globals)
    File "C:\Users\AC226827\Anaconda3\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
      app.launch_new_instance()
    File "C:\Users\AC226827\Anaconda3\lib\site-packages\traitlets\config\application.py", line 846, in launch_instance
      app.start()
    File "C:\Users\AC226827\Anaconda3\lib\site-packages\ipykernel\kernelapp.py", line 677, in start
      self.io_loop.start()
    File "C:\Users\AC226827\Anaconda3\lib\site-packages\tornado\platform\asyncio.py", line 199, in start
      self.asyncio_loop.run_forever()
    File "C:\Users\AC226827\Anaconda3\lib\asyncio\base_events.py", line 601, in run_forever
      self._run_once()
    File "C:\Users\AC226827\Anaconda3\lib\asyncio\base_events.py", line 1905, in _run_once
      handle._run()
    File "C:\Users\AC226827\Anaconda3\lib\asyncio\events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "C:\Users\AC226827\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 471, in dispatch_queue
      await self.process_one()
    File "C:\Users\AC226827\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 460, in process_one
      await dispatch(*args)
    File "C:\Users\AC226827\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 367, in dispatch_shell
      await result
    File "C:\Users\AC226827\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 662, in execute_request
      reply_content = await reply_content
    File "C:\Users\AC226827\Anaconda3\lib\site-packages\ipykernel\ipkernel.py", line 360, in do_execute
      res = shell.run_cell(code, store_history=store_history, silent=silent)
    File "C:\Users\AC226827\Anaconda3\lib\site-packages\ipykernel\zmqshell.py", line 532, in run_cell
      return super().run_cell(*args, **kwargs)
    File "C:\Users\AC226827\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2863, in run_cell
      result = self._run_cell(
    File "C:\Users\AC226827\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2909, in _run_cell
      return runner(coro)
    File "C:\Users\AC226827\Anaconda3\lib\site-packages\IPython\core\async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "C:\Users\AC226827\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3106, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "C:\Users\AC226827\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3309, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "C:\Users\AC226827\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3369, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "C:\Users\AC226827\AppData\Local\Temp\ipykernel_5876\1037852174.py", line 32, in <cell line: 2>
      results = word2vec.fit(train_dataset,
    File "C:\Users\AC226827\Anaconda3\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "C:\Users\AC226827\Anaconda3\lib\site-packages\keras\engine\training.py", line 1564, in fit
      tmp_logs = self.train_function(iterator)
    File "C:\Users\AC226827\Anaconda3\lib\site-packages\keras\engine\training.py", line 1160, in train_function
      return step_function(self, iterator)
    File "C:\Users\AC226827\Anaconda3\lib\site-packages\keras\engine\training.py", line 1146, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\AC226827\Anaconda3\lib\site-packages\keras\engine\training.py", line 1135, in run_step
      outputs = model.train_step(data)
    File "C:\Users\AC226827\Anaconda3\lib\site-packages\keras\engine\training.py", line 993, in train_step
      y_pred = self(x, training=True)
    File "C:\Users\AC226827\Anaconda3\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "C:\Users\AC226827\Anaconda3\lib\site-packages\keras\engine\training.py", line 557, in __call__
      return super().__call__(*args, **kwargs)
    File "C:\Users\AC226827\Anaconda3\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "C:\Users\AC226827\Anaconda3\lib\site-packages\keras\engine\base_layer.py", line 1097, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "C:\Users\AC226827\Anaconda3\lib\site-packages\keras\utils\traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "C:\Users\AC226827\OneDrive - Air Canada\Documents\ML AI Content\2022\Recommender Systems\to_load_gitub\utils.py", line 239, in call
      word_emb = self.target_embedding(target)
    File "C:\Users\AC226827\Anaconda3\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "C:\Users\AC226827\Anaconda3\lib\site-packages\keras\engine\base_layer.py", line 1097, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "C:\Users\AC226827\Anaconda3\lib\site-packages\keras\utils\traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "C:\Users\AC226827\Anaconda3\lib\site-packages\keras\layers\core\embedding.py", line 208, in call
      out = tf.nn.embedding_lookup(self.embeddings, inputs)
Node: 'word2_vec_2/w2v_embedding/embedding_lookup'
indices[0] = 900 is not in [0, 740)
	 [[{{node word2_vec_2/w2v_embedding/embedding_lookup}}]] [Op:__inference_train_function_4966398]

### 5. Visualization

In [32]:
weights = word2vec.get_layer('w2v_embedding').get_weights()[0]
df_embed = pd.DataFrame(weights, index = vocab.keys())
df_embed.to_csv("./df_embeddings.csv")

In [9]:
len(df_embed)

905

In [10]:
import scipy.spatial as sp

df_embed.sort_index(inplace=True)
#computing the cosine between the destinations
cosines = 1 - sp.distance.cdist(df_embed.values, df_embed.values, 'cosine')

#creating a dataframe from the cosine
df_cosines = pd.DataFrame(cosines, columns = df_embed.index, index = df_embed.index)

df_cosines.head()

Unnamed: 0,aae,aal,aar,abe,abj,abq,abv,abx,abz,aca,...,zih,zlo,znz,zqn,zrh,zsa,zth,zuh,zws,ıst
aae,1.0,-0.042086,-0.088361,0.122154,0.58981,0.147615,0.355155,-0.390729,-0.117424,0.096971,...,0.049498,0.195215,0.30274,-0.248104,0.479312,-0.037291,-0.019002,0.228834,0.084672,0.742451
aal,-0.042086,1.0,0.982955,0.283006,0.195596,-0.099365,-0.143011,0.288958,0.380987,-0.32401,...,-0.332027,-0.253191,0.20924,0.275961,0.415281,-0.210217,-0.018955,-0.092794,0.169026,0.024954
aar,-0.088361,0.982955,1.0,0.238523,0.138889,0.010415,-0.096053,0.332344,0.463156,-0.2552,...,-0.280595,-0.173365,0.136918,0.346667,0.343346,-0.250485,-0.055906,-0.042153,0.064756,-0.037553
abe,0.122154,0.283006,0.238523,1.0,0.404975,0.157857,-0.028761,0.057125,-0.238542,0.341483,...,0.19793,0.240018,-0.202823,-0.11317,0.113985,0.384829,-0.310915,0.18863,0.064168,-0.0203
abj,0.58981,0.195596,0.138889,0.404975,1.0,0.14014,0.414363,-0.034936,0.248225,0.293012,...,0.31009,0.244104,0.285773,-0.005069,0.648946,0.259281,0.317179,0.309338,0.344633,0.521967


In [11]:
#getting the top 10 most similar cities for each city
big_arr = []
for row in df_cosines.iterrows():
    
    top10 = row[1].nlargest(10)
    
    for i in range(0, len(top10.values)):
        big_arr.append([top10.name, top10.values[i], top10.index.values[i]])
    
    
df_city_similarity = pd.DataFrame(big_arr, columns=['city', 'cosine', 'similar_city'])
df_city_similarity.head()

Unnamed: 0,city,cosine,similar_city
0,aae,1.0,aae
1,aae,0.980399,bja
2,aae,0.977457,czl
3,aae,0.971211,orn
4,aae,0.845885,hah


In [15]:
df_city_similarity = pd.merge(df_city_similarity, ref_airpt[['airp_cd', 'city_name', 'COUNTRY_NAME']], how='left', left_on='similar_city', right_on='airp_cd')
df_city_similarity.head()

Unnamed: 0,city,cosine,similar_city,airp_cd,city_name,COUNTRY_NAME
0,aae,1.0,aae,aae,ANNABA,ALGERIA
1,aae,0.980399,bja,bja,BEJAIA,ALGERIA
2,aae,0.977457,czl,czl,CONSTANTINE,ALGERIA
3,aae,0.971211,orn,orn,ORAN,ALGERIA
4,aae,0.845885,hah,hah,MORONI,COMOROS


In [46]:
#this code can be used to find the exact city_filename for a city. For instance, for London
df_city_similarity[df_city_similarity.city.str.contains('mad')]

Unnamed: 0,city,cosine,similar_city,airp_cd,city_name,COUNTRY_NAME
6600,ptp,1.0,ptp,ptp,POINTE A PITRE,GUADELOUPE
6601,ptp,0.965177,fdf,fdf,FORT DE FRANCE,MARTINIQUE
6602,ptp,0.955561,pnr,pnr,POINTE NOIRE,CONGO
6603,ptp,0.926858,cur,cur,CURACAO,CURACAO
6604,ptp,0.904372,bon,bon,BONAIRE,"BONAIRE, SAINT EUSTATIUS & SABA"
6605,ptp,0.871076,aua,aua,ARUBA,ARUBA
6606,ptp,0.86381,dom,dom,DOMINICA,DOMINICA
6607,ptp,0.857145,sxm,sxm,ST. MAARTEN,SINT MAARTEN
6608,ptp,0.848064,ctg,ctg,CARTAGENA,COLOMBIA
6609,ptp,0.847514,anu,anu,ANTIGUA,ANTIGUA AND BARBUDA


In [27]:
# CANCUN
# MONTEGO_BAY
# LYON

city='psp'

print("___________________________________________________________a________________________________________________________________________________________________")
print("")
print(" FOR " + city + ' :')
print("")
#you can then use the exact city_filename to look for the top 10 most similar cities
print(" Destination Metadata returns:".upper())
print(df_city_similarity[df_city_similarity.city==city][['similar_city']])
print("")

___________________________________________________________a________________________________________________________________________________________________

 FOR psp :

 DESTINATION METADATA RETURNS:
     similar_city
6590          psp
6591          phx
6592          san
6593          ont
6594          tus
6595          bur
6596          sfo
6597          saf
6598          sna
6599          slc



In [None]:
weights = {}

for idx, row in df_embed2.iterrows():
    weights[idx] = row.values

In [None]:
out_v = io.open('vectors.tsv', 'w', encoding='utf-8')
out_m = io.open('metadata.tsv', 'w', encoding='utf-8')


In [None]:

for key, val in weights.items(): # for index, word in enumerate(vocab):
  vec = weights[key]
  out_v.write('\t'.join([str(x) for x in vec]) + "\n")
  out_m.write(key + "\n")
out_v.close()
out_m.close()