In [63]:
import sys

In [65]:
import numpy as np
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
from scipy.spatial.distance import cosine

In [66]:
# Load pre-trained BERT model and tokenizer
model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Calculating average number of tokens in a user input description

In [3]:
desc1="I feel like going to a place to do camping in the forset  with sky full of stars.  I want to  see huge big trees with a cold wind breezing on my face. I want to follow it up with a bonfire "

In [4]:
len(tokenizer.tokenize(desc1))

46

In [5]:
desc2="I feel like going to a place where there is a cloudy weather so that there's no discomfort in sitting on the grass and an open sky. A box of beer bottles a pack of cigarettes and slow music on speakers. Addition of waterfalls, mountains and a forest in the backdrop is an icing on the cake. And a must to this destination is it should be undiscovered by humans so that it is clean and peaceful"

In [6]:
len(tokenizer.tokenize(desc2))

87

In [7]:
desc3="I need to go to a place where I can spend time with family and kids in the evening with the breezy sea winds and night views of the sea. The place should be wonderful at night time with colorful lights. Also there should be some playing area for kids and seating for adults. I need the place to be located nearby a railway station or a bus terminus so that it could be easy to travel"

In [8]:
len(tokenizer.tokenize(desc3))

81

In [9]:
desc4='I want to do an adventure sport like Parasailing with my wife. So the activity should allow couple entry and also should have a guide for it. I am afraid of heights so I need a training session for parasailing before I try it out. The outdoor activity should be near a beach so that I can spend some quality time after the parasailing.'

In [10]:
len(tokenizer.tokenize(desc4))

74

In [11]:
desc5="I need to take my kids to a children’s park where they can have some fun. The park should have swings and good walk paths with fountains. The park should be nearby some main attractions so that I can also visit some places with my family. If the park has some aquarium then it would be an added advantage. "

In [12]:
len(tokenizer.tokenize(desc5))

65

In [13]:
desc6='I feel like going to a beach with crystal-clear waters. The place should have a great marine ecosystem with variety of fishes & coral reefs. The place should be an ideal location for water based activities like swimming, diving, snorkeling tec. It should also offer different adventure sports options like surfing & scuba diving.'

In [14]:
len(tokenizer.tokenize(desc6))

66

In [15]:
(len(tokenizer.tokenize(desc2))+len(tokenizer.tokenize(desc1))+len(tokenizer.tokenize(desc3))+len(tokenizer.tokenize(desc4))+len(tokenizer.tokenize(desc5))+len(tokenizer.tokenize(desc6)))/6

69.83333333333333

## GETTING BERT EMBEDDINGS USING SLIDING WINDOW AND PADDING TECHNIQUES

In [192]:
df=pd.read_csv("E:\\PBS\\Project\\Data\\try 3\\combining\\1266 Places with details(some missing).csv")

In [193]:
df.Description[0]

'Tirumala. No wonder people spend hours and hours in the queue, just to get a glimpse of Lord Venkateshwara for just a couple of seconds. Faith can do wonders, and the thousands and thousands of devotees, of all ages prove this every day, queuing up to get the blessing of lord Venkateshwara. The moment you enter through the doors, it\'s a blissful experience, some sort of positive vibes takes over and the chant of Govinda Govinda just lets one forget their tiredness and feel energized. I am visited number of times and every time I feel it\'s I am first time visit, with plan or with out plan visit tirumala. Everything is good only happen. In tirumala there is free buses is available to reach temple to rooms & cottages and seeing the beauty of tirumala in night time also. One of most femous temple in india .in this temple anytime crowd .high in weekendays .300 ruppess ticket want bookthrough online ttd website and to go.2 to 3 hours darshan completed heavy crowd 5 to 6 hours in queline .

In [18]:
# Define parameters
max_length = 70  # Avg number of tokens in user input
stride = 35  # number of tokens to move forward for each window
text = df.Description[0]

# Tokenize text
tokens = tokenizer.tokenize(text)

# Divide the tokens into windows
windows = []
start = 0
while start < len(tokens):
    end = min(start + max_length, len(tokens))
    windows.append(tokens[start:end])
    start += stride


In [19]:
max_length = max([len(window) for window in windows])

In [20]:
min([len(window) for window in windows])

9

In [21]:
max_length

70

In [22]:
# windows

In [23]:
windows=[window + ['[PAD]'] * (70 - len(window)) for window in windows]

In [24]:
len(windows[0])

70

In [25]:
# Convert tokenized windows to BERT embeddings
embeddings = []
for window in windows:
    input_ids = tokenizer.convert_tokens_to_ids(window)
    with torch.no_grad():
        outputs = model(torch.tensor([input_ids]))
        window_embedding = outputs[0][0]  # Use last layer for final embedding
    embeddings.append(window_embedding)

In [26]:
embeddings[0].reshape(1,-1)+embeddings[1].reshape(1,-1)

tensor([[ 0.9436, -0.1175,  0.9618,  ...,  0.0493,  0.7864,  0.5947]])

In [27]:
len(embeddings)

911

In [28]:
place_emb=0
for i in range(len(embeddings)):
    place_emb+=embeddings[i].reshape(1,-1)

In [29]:
(place_emb/len(embeddings)).shape

torch.Size([1, 53760])

In [72]:
place_emb/len(embeddings)

tensor([[ 0.2937, -0.2139,  0.3961,  ..., -0.2544,  0.2147, -0.1112]])

In [75]:
bert=pd.DataFrame(columns=np.linspace(0,53759,53760))

In [76]:
bert

Unnamed: 0,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,...,53750.0,53751.0,53752.0,53753.0,53754.0,53755.0,53756.0,53757.0,53758.0,53759.0


In [77]:
bert.loc[df.title[0],]=np.array(place_emb/len(embeddings))
bert

Unnamed: 0,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,...,53750.0,53751.0,53752.0,53753.0,53754.0,53755.0,53756.0,53757.0,53758.0,53759.0
Tirumala Temple,0.293709,-0.213915,0.396115,0.027876,0.112087,-0.330744,0.120736,0.178353,-0.057125,-0.390785,...,-0.102605,-0.13172,0.225843,-0.032427,0.110309,-0.099666,-0.386567,-0.254426,0.214728,-0.111248


In [63]:
bert=pd.DataFrame(np.array(place_emb/len(embeddings)))

In [64]:
list(np.array(place_emb/len(embeddings)))

[array([ 0.29370907, -0.21391462,  0.39611495, ..., -0.2544262 ,
         0.21472806, -0.1112484 ], dtype=float32)]

In [65]:
bert

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,53750,53751,53752,53753,53754,53755,53756,53757,53758,53759
0,0.293709,-0.213915,0.396115,0.027876,0.112087,-0.330744,0.120736,0.178353,-0.057125,-0.390785,...,-0.102605,-0.13172,0.225843,-0.032427,0.110309,-0.099666,-0.386567,-0.254426,0.214728,-0.111248


In [66]:
bert['Place']=""

In [67]:
bert['Place'][0]=df.title[0]
bert

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bert['Place'][0]=df.title[0]


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,53751,53752,53753,53754,53755,53756,53757,53758,53759,Place
0,0.293709,-0.213915,0.396115,0.027876,0.112087,-0.330744,0.120736,0.178353,-0.057125,-0.390785,...,-0.13172,0.225843,-0.032427,0.110309,-0.099666,-0.386567,-0.254426,0.214728,-0.111248,Tirumala Temple


In [70]:

bert.iloc[1,:-1] = np.array(place_emb/len(embeddings))

ValueError: setting an array element with a sequence.

In [71]:
bert

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,53751,53752,53753,53754,53755,53756,53757,53758,53759,Place
0,0.293709,-0.213915,0.396115,0.027876,0.112087,-0.330744,0.120736,0.178353,-0.057125,-0.390785,...,-0.13172,0.225843,-0.032427,0.110309,-0.099666,-0.386567,-0.254426,0.214728,-0.111248,Tirumala Temple
1,,,,,,,,,,,...,,,,,,,,,,


# Automating for full dataset

In [86]:
bert=pd.DataFrame(columns=np.linspace(0,53759,53760))
for i in range(len(df)):
    #run time log
#     sys.stdout.write('\r                                                         '+str(round((i+1)*100/2, 1))+'% : Overall Progress')
    # Define parameters
    max_length = 70  # Avg number of tokens in user input
    stride = 35  # number of tokens to move forward for each window
    text = df.Description[i]

    # Tokenize text
    tokens = tokenizer.tokenize(text)

    # Divide the tokens into windows
    windows = []
    start = 0
    while start < len(tokens):
        end = min(start + max_length, len(tokens))
        windows.append(tokens[start:end])
        start += stride
    # Padding
    windows=[window + ['[PAD]'] * (70 - len(window)) for window in windows]
    # Convert tokenized windows to BERT embeddings
    embeddings = []
    for j in range(len(windows)):
        sys.stdout.write('\r '+str(round((j+1)*100/len(windows), 1))+'% :'+df.title[i]+' Progress')
        input_ids = tokenizer.convert_tokens_to_ids(windows[j])
        with torch.no_grad():
            outputs = model(torch.tensor([input_ids]))
            window_embedding = outputs[0][0]  # Use last layer for final embedding
        embeddings.append(window_embedding)
    place_emb=0
    for k in range(len(embeddings)):
        place_emb+=embeddings[k].reshape(1,-1)
    # ber=pd.DataFrame(np.array(place_emb/len(embeddings)))
    bert.loc[df.title[i],]=np.array(place_emb/len(embeddings))
    sys.stdout.write('\r                                                         '+str(round((i+1)*100/len(df), 1))+'% : Overall Progress')

 4.5% :Travel Saga Holidays Progress                     50.9% : Overall Progress

KeyboardInterrupt: 

In [87]:
bert

Unnamed: 0,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,...,53750.0,53751.0,53752.0,53753.0,53754.0,53755.0,53756.0,53757.0,53758.0,53759.0
Tirumala Temple,0.293709,-0.213915,0.396115,0.027876,0.112087,-0.330744,0.120736,0.178353,-0.057125,-0.390785,...,-0.102605,-0.13172,0.225843,-0.032427,0.110309,-0.099666,-0.386567,-0.254426,0.214728,-0.111248
Diguva Ahobila Lakshmi Narasimha Swamy Temple,0.210103,-0.136586,0.346771,0.004666,0.132168,-0.336863,0.143052,0.138889,-0.124493,-0.369396,...,-0.099202,-0.116128,0.172284,-0.044598,0.120049,-0.043049,-0.382052,-0.246732,0.262474,-0.020433
INS Kurusura Submarine Museum,0.238594,-0.214718,0.444939,0.146867,0.114093,-0.41267,0.059302,0.250827,-0.06673,-0.351263,...,-0.141835,-0.155205,0.15008,-0.118689,0.129753,-0.205148,-0.297194,-0.253417,0.172327,0.051639
Borra Caves,0.304625,-0.150142,0.427049,0.111338,0.179687,-0.379798,0.204847,0.219696,-0.027474,-0.390733,...,-0.129571,-0.177891,0.040547,-0.115748,0.021918,-0.174307,-0.278685,-0.267228,0.200007,-0.000615
Kailasagiri,0.261823,-0.370101,0.399945,0.111545,0.195548,-0.445069,0.066371,0.258871,-0.074986,-0.361385,...,-0.137708,-0.160582,0.083451,-0.0798,0.001655,-0.16756,-0.340589,-0.312151,0.215478,-0.123034
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Essence Ayurveda Medispa,0.309522,-0.086484,0.472798,-0.056507,0.016567,-0.294875,-0.018535,0.267229,0.060707,-0.412773,...,-0.121103,-0.299882,-0.018399,-0.181554,0.121342,-0.159661,-0.419431,-0.31425,0.164253,0.037474
Munnar Trekking Adventure,0.304956,-0.146277,0.424193,0.023659,0.105072,-0.395199,-0.016522,0.259605,0.053818,-0.344752,...,-0.276259,-0.234464,-0.163136,-0.212344,0.177489,-0.182417,-0.347388,-0.201925,0.256606,-0.041395
Seasonz India,0.187256,-0.154621,0.492349,0.079463,0.008395,-0.383695,-0.011943,0.235165,0.004169,-0.397091,...,-0.170292,-0.261798,0.053417,-0.193235,0.160012,-0.215242,-0.35629,-0.327679,0.277734,-0.097825
Munnar Holidays,0.225638,-0.113563,0.41975,0.069958,0.114743,-0.369055,-0.045877,0.245455,0.011333,-0.368243,...,-0.195725,-0.233883,0.020563,-0.156613,0.146023,-0.144607,-0.414371,-0.371642,0.278495,-0.106961


In [88]:
bert.to_csv("BERT Embeddings-Sliding+padding.csv")

In [6]:
df1=pd.read_csv("E:\PBS\Project\Data\\try 3\\BERT Embeddings-Sliding+padding_Aswin.csv")

In [14]:
df2=pd.read_csv("E:\PBS\Project\Data\\try 3\\BERT Embeddings-Sliding+padding.csv")

In [15]:
df3=pd.read_csv("E:\PBS\Project\Data\\try 3\\BERT Embeddings-Sliding+padding-1.csv")

In [31]:
df2.rename({'Unnamed: 0':'Place'},axis=1).set_index("Place",inplace=True)

In [38]:
df3.rename({'Unnamed: 0':'Place'},axis=1).set_index("Place",inplace=True)

In [52]:
df1.rename({'Places':'Place'},axis=1,inplace=True)

In [54]:
df1.set_index("Place",inplace=True)

In [55]:
df1

Unnamed: 0_level_0,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,...,53750.0,53751.0,53752.0,53753.0,53754.0,53755.0,53756.0,53757.0,53758.0,53759.0
Place,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Tawang War Memorial,0.258663,-0.167171,0.309656,0.086236,-0.121107,-0.279518,0.069043,0.237783,-0.104324,-0.399753,...,-0.044787,-0.028978,0.265678,-0.088254,0.141890,-0.145662,-0.452453,-0.120055,0.185494,-0.081269
Nuranang Falls,0.286913,-0.333019,0.408509,0.156508,0.073830,-0.350591,0.184779,0.229595,-0.048747,-0.378610,...,-0.047157,-0.193837,-0.036714,-0.063599,0.032528,-0.109214,-0.320143,-0.251761,0.111250,-0.082165
Madhuri Lake,0.297287,-0.274898,0.368848,0.100509,0.122654,-0.386416,0.091595,0.237671,-0.041378,-0.364774,...,-0.081866,-0.168345,0.027702,-0.040837,0.030424,-0.168959,-0.328324,-0.314465,0.184484,-0.067366
Jaswant Garh,0.222298,-0.163456,0.234397,0.012882,-0.030309,-0.236603,0.127837,0.251081,-0.109253,-0.343250,...,-0.082127,0.047515,0.318217,-0.133888,0.242593,-0.056846,-0.514553,-0.074865,0.125103,-0.144011
Bomdila Monastery,0.322692,-0.140776,0.352150,0.029217,0.134435,-0.349131,0.116191,0.237616,-0.077243,-0.400015,...,-0.127308,-0.216476,0.034393,-0.135791,-0.034958,-0.182967,-0.310612,-0.243801,0.208286,-0.089349
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Rock Beach,0.386142,-0.410420,0.404106,0.133396,0.189416,-0.497422,-0.100312,0.327825,0.023200,-0.448724,...,-0.147945,-0.217541,-0.072182,-0.042931,0.033703,-0.279845,-0.402914,-0.349629,0.168862,-0.090299
Eglise de Notre Dame des Anges,0.284077,-0.211078,0.460519,0.012270,0.118196,-0.226854,0.024648,0.319265,-0.073325,-0.543087,...,0.019562,-0.280041,0.048337,-0.171045,0.071958,-0.397452,-0.292772,-0.260359,0.254789,0.028713
Serenity Beach,0.414268,-0.375668,0.438130,0.133326,0.197251,-0.450011,-0.079220,0.351714,-0.009887,-0.397697,...,-0.175230,-0.246424,-0.145199,-0.063258,0.057226,-0.237580,-0.333353,-0.324593,0.163275,-0.045228
Chunnambar Boat House,0.370063,-0.304058,0.490067,0.158804,0.226558,-0.440199,0.032706,0.249179,-0.049977,-0.383412,...,-0.132158,-0.232898,-0.004560,-0.079595,0.053416,-0.148362,-0.397432,-0.316212,0.213185,-0.107757


In [56]:
final_bert=pd.concat([df1,df2,df3],axis=0)
final_bert

Unnamed: 0_level_0,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,...,53750.0,53751.0,53752.0,53753.0,53754.0,53755.0,53756.0,53757.0,53758.0,53759.0
Place,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Tawang War Memorial,0.258663,-0.167171,0.309656,0.086236,-0.121107,-0.279518,0.069043,0.237783,-0.104324,-0.399753,...,-0.044787,-0.028978,0.265678,-0.088254,0.141890,-0.145662,-0.452453,-0.120055,0.185494,-0.081269
Nuranang Falls,0.286913,-0.333019,0.408509,0.156508,0.073830,-0.350591,0.184779,0.229595,-0.048747,-0.378610,...,-0.047157,-0.193837,-0.036714,-0.063599,0.032528,-0.109214,-0.320143,-0.251761,0.111250,-0.082165
Madhuri Lake,0.297287,-0.274898,0.368848,0.100509,0.122654,-0.386416,0.091595,0.237671,-0.041378,-0.364774,...,-0.081866,-0.168345,0.027702,-0.040837,0.030424,-0.168959,-0.328324,-0.314465,0.184484,-0.067366
Jaswant Garh,0.222298,-0.163456,0.234397,0.012882,-0.030309,-0.236603,0.127837,0.251081,-0.109253,-0.343250,...,-0.082127,0.047515,0.318217,-0.133888,0.242593,-0.056846,-0.514553,-0.074865,0.125103,-0.144011
Bomdila Monastery,0.322692,-0.140776,0.352150,0.029217,0.134435,-0.349131,0.116191,0.237616,-0.077243,-0.400015,...,-0.127308,-0.216476,0.034393,-0.135791,-0.034958,-0.182967,-0.310612,-0.243801,0.208286,-0.089349
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
City Centre Mall,0.439128,-0.399354,0.491822,0.176970,0.209121,-0.302848,-0.163024,0.437133,-0.011521,-0.419323,...,-0.297036,-0.241840,0.035031,0.010264,-0.044108,-0.174217,-0.260852,-0.391836,0.192677,-0.179995
Escape Time,0.291791,-0.266065,0.554195,0.216489,0.010779,-0.386322,0.051416,0.105693,0.052501,-0.344103,...,-0.264528,-0.187481,-0.059318,-0.228399,0.207959,-0.181912,-0.459770,-0.389389,0.112776,0.169699
Sela Pass,0.323109,-0.141662,0.398203,0.084149,0.085559,-0.321644,0.163110,0.263912,-0.002894,-0.440621,...,-0.145753,-0.179873,0.034375,-0.148791,0.140842,-0.218558,-0.308770,-0.270690,0.160799,-0.070214
Bumla Pass,0.277872,-0.090888,0.359992,0.090878,0.116932,-0.380124,0.135585,0.215688,-0.012688,-0.358676,...,-0.155622,-0.186232,0.095311,-0.169889,0.110939,-0.119627,-0.389535,-0.240192,0.162528,-0.039286


In [57]:
%store final_bert

Stored 'final_bert' (DataFrame)


In [58]:
%store -r

In [59]:
final_bert

Unnamed: 0_level_0,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,...,53750.0,53751.0,53752.0,53753.0,53754.0,53755.0,53756.0,53757.0,53758.0,53759.0
Place,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Tawang War Memorial,0.258663,-0.167171,0.309656,0.086236,-0.121107,-0.279518,0.069043,0.237783,-0.104324,-0.399753,...,-0.044787,-0.028978,0.265678,-0.088254,0.141890,-0.145662,-0.452453,-0.120055,0.185494,-0.081269
Nuranang Falls,0.286913,-0.333019,0.408509,0.156508,0.073830,-0.350591,0.184779,0.229595,-0.048747,-0.378610,...,-0.047157,-0.193837,-0.036714,-0.063599,0.032528,-0.109214,-0.320143,-0.251761,0.111250,-0.082165
Madhuri Lake,0.297287,-0.274898,0.368848,0.100509,0.122654,-0.386416,0.091595,0.237671,-0.041378,-0.364774,...,-0.081866,-0.168345,0.027702,-0.040837,0.030424,-0.168959,-0.328324,-0.314465,0.184484,-0.067366
Jaswant Garh,0.222298,-0.163456,0.234397,0.012882,-0.030309,-0.236603,0.127837,0.251081,-0.109253,-0.343250,...,-0.082127,0.047515,0.318217,-0.133888,0.242593,-0.056846,-0.514553,-0.074865,0.125103,-0.144011
Bomdila Monastery,0.322692,-0.140776,0.352150,0.029217,0.134435,-0.349131,0.116191,0.237616,-0.077243,-0.400015,...,-0.127308,-0.216476,0.034393,-0.135791,-0.034958,-0.182967,-0.310612,-0.243801,0.208286,-0.089349
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
City Centre Mall,0.439128,-0.399354,0.491822,0.176970,0.209121,-0.302848,-0.163024,0.437133,-0.011521,-0.419323,...,-0.297036,-0.241840,0.035031,0.010264,-0.044108,-0.174217,-0.260852,-0.391836,0.192677,-0.179995
Escape Time,0.291791,-0.266065,0.554195,0.216489,0.010779,-0.386322,0.051416,0.105693,0.052501,-0.344103,...,-0.264528,-0.187481,-0.059318,-0.228399,0.207959,-0.181912,-0.459770,-0.389389,0.112776,0.169699
Sela Pass,0.323109,-0.141662,0.398203,0.084149,0.085559,-0.321644,0.163110,0.263912,-0.002894,-0.440621,...,-0.145753,-0.179873,0.034375,-0.148791,0.140842,-0.218558,-0.308770,-0.270690,0.160799,-0.070214
Bumla Pass,0.277872,-0.090888,0.359992,0.090878,0.116932,-0.380124,0.135585,0.215688,-0.012688,-0.358676,...,-0.155622,-0.186232,0.095311,-0.169889,0.110939,-0.119627,-0.389535,-0.240192,0.162528,-0.039286


In [60]:
final_bert.to_csv("final_bert_embedding.csv",index=False)

In [89]:
def ask_question_func(question):
    
    # Define parameters
    max_length = 70  # Avg number of tokens in user input
    stride = 35  # number of tokens to move forward for each window
    text = question

    # Tokenize text
    tokens = tokenizer.tokenize(text)

    # Divide the tokens into windows
    windows = []
    start = 0
    while start < len(tokens):
        end = min(start + max_length, len(tokens))
        windows.append(tokens[start:end])
        start += stride
    # Padding
    windows=[window + ['[PAD]'] * (70 - len(window)) for window in windows]
    # Convert tokenized windows to BERT embeddings
    embeddings = []
    for j in range(len(windows)):
        sys.stdout.write('\r '+str(round((j+1)*100/len(windows), 1))+'% :'+'Question Progress')
        input_ids = tokenizer.convert_tokens_to_ids(windows[j])
        with torch.no_grad():
            outputs = model(torch.tensor([input_ids]))
            window_embedding = outputs[0][0]  # Use last layer for final embedding
        embeddings.append(window_embedding)
    place_emb=0
    for k in range(len(embeddings)):
        place_emb+=embeddings[k].reshape(1,-1)
    # ber=pd.DataFrame(np.array(place_emb/len(embeddings)))
    ques_vec=np.array(place_emb/len(embeddings))
#     sys.stdout.write('\r                                                         '+str(round((i+1)*100/len(df), 1))+'% : Overall Progress')
    
    distance_list = []
    
    for i in final_bert.index:
        print(ques_vec.shape,final_bert.loc[i,:].shape)
        distance_list.append({'place':i, 'cosine distance':cosine(final_bert.loc[i,:], ques_vec)})
    
    return_df = pd.DataFrame(sorted(distance_list, key=lambda d: d['cosine distance']))
    
    return return_df.reset_index().loc[:,['place','cosine distance']]

In [107]:
final_bert.drop(dup,axis=0,inplace=True)

In [153]:
question1 = 'I need to go to a place where I can spend time with family and kids in the evening with the breezy sea winds and \
night views of the sea. The place should be wonderful at night time with colorful lights. Also there should be some playing \
area for kids and seating for adults. I need the place to be located nearby a railway station or a bus terminus so that it could \
be easy to travel. '

question2 = 'I want to do an adventure sport like Parasailing with my wife. So the activity should allow couple entry and also \
should have a guide for it. I am afraid of heights so I need a training session for parasailing before I try it out. The outdoor \
activity should be near a beach so that I can spend some quality time after the parasailing.'

question3 = 'I need to take my kids to a children’s park where they can have some fun. The park should have swings and good \
walk paths with fountains. The park should be nearby some main attractions so that I can also visit some places with my \
family. If the park has some aquarium then it would be an added advantage. '

question4 = 'I feel like going to a beach with crystal-clear waters. The place should have a great marine ecosystem with variety \
of fishes & coral reefs. The place should be an ideal location for water based activities like swimming, diving, snorkeling tec. \
It should also offer different adventure sports options like surfing & scuba diving.'

question5 = 'I do feel like going to a beach with crystal-clear waters. The place should not have a great marine ecosystem with variety \
of fishes & coral reefs. The place should be an ideal location for water based activities like swimming, diving, snorkeling tec. \
It should also offer different adventure sports options like surfing & scuba diving.'

In [110]:
# Define parameters
max_length = 70  # Avg number of tokens in user input
stride = 35  # number of tokens to move forward for each window
text = question1

# Tokenize text
tokens = tokenizer.tokenize(text)

# Divide the tokens into windows
windows = []
start = 0
while start < len(tokens):
    end = min(start + max_length, len(tokens))
    windows.append(tokens[start:end])
    start += stride
# Padding
windows=[window + ['[PAD]'] * (70 - len(window)) for window in windows]
# Convert tokenized windows to BERT embeddings
embeddings = []
for j in range(len(windows)):
    sys.stdout.write('\r '+str(round((j+1)*100/len(windows), 1))+'% :'+'Question Progress')
    input_ids = tokenizer.convert_tokens_to_ids(windows[j])
    with torch.no_grad():
        outputs = model(torch.tensor([input_ids]))
        window_embedding = outputs[0][0]  # Use last layer for final embedding
    embeddings.append(window_embedding)
place_emb=0
for k in range(len(embeddings)):
    place_emb+=embeddings[k].reshape(1,-1)
# ber=pd.DataFrame(np.array(place_emb/len(embeddings)))
ques_vec=np.array(place_emb/len(embeddings))
#     sys.stdout.write('\r                                                         '+str(round((i+1)*100/len(df), 1))+'% : Overall Progress')

distance_list = []

for i in final_bert.index:
#     print(ques_vec.shape,final_bert.loc[i,:].shape)
    distance_list.append({'place':i, 'cosine distance':cosine(final_bert.loc[i,:], ques_vec)})

return_df = pd.DataFrame(sorted(distance_list, key=lambda d: d['cosine distance']))

return_df.reset_index().loc[:,['place','cosine distance']]

 100.0% :Question Progress

Unnamed: 0,place,cosine distance
0,Travel Saga Holidays,0.337858
1,Magic Planet Theme Park,0.339603
2,"Plaza Premium Lounge (Domestic Departures, Ter...",0.343711
3,Sanguine Holidays,0.344276
4,The Hobby Place,0.344736
...,...,...
1242,Sri Bedi Anjaneyaswami Temple,0.461145
1243,Amareswara Temple,0.461984
1244,Sri Raja Rajeswari Temple,0.463163
1245,Kodandarama Temple,0.471181


In [176]:
# Define parameters
max_length = 70  # Avg number of tokens in user input
stride = 35  # number of tokens to move forward for each window
text = question6

# Tokenize text
tokens = tokenizer.tokenize(text)

# Divide the tokens into windows
windows = []
start = 0
while start < len(tokens):
    end = min(start + max_length, len(tokens))
    windows.append(tokens[start:end])
    start += stride
# Padding
windows=[window + ['[PAD]'] * (70 - len(window)) for window in windows]
# Convert tokenized windows to BERT embeddings
embeddings = []
for j in range(len(windows)):
    sys.stdout.write('\r '+str(round((j+1)*100/len(windows), 1))+'% :'+'Question Progress')
    input_ids = tokenizer.convert_tokens_to_ids(windows[j])
    with torch.no_grad():
        outputs = model(torch.tensor([input_ids]))
        window_embedding = outputs[0][0]  # Use last layer for final embedding
    embeddings.append(window_embedding)
place_emb=0
for k in range(len(embeddings)):
    place_emb+=embeddings[k].reshape(1,-1)
# ber=pd.DataFrame(np.array(place_emb/len(embeddings)))
ques_vec=np.array(place_emb/len(embeddings))
#     sys.stdout.write('\r                                                         '+str(round((i+1)*100/len(df), 1))+'% : Overall Progress')

distance_list = []

for i in final_bert.index:
#     print(ques_vec.shape,final_bert.loc[i,:].shape)
    distance_list.append({'place':i, 'cosine distance':cosine(final_bert.loc[i,:], ques_vec)})

return_df = pd.DataFrame(sorted(distance_list, key=lambda d: d['cosine distance']))

'return_df.reset_index().loc[:,['place','cosine distance']]

 100.0% :Question Progress

Unnamed: 0,place,cosine distance
0,Omthara Kala Kuteera,0.177565
1,The Hobby Place,0.188364
2,Thonikadavu,0.188541
3,Mysore Palace,0.189461
4,Hasta Shilpa Heritage Village,0.190810
...,...,...
1242,Sri Raja Rajeswari Temple,0.268347
1243,Kodandarama Temple,0.268918
1244,Della Adventure Park,0.276882
1245,Imagica Water Park,0.281907


In [97]:
np.sum(final_bert.index.value_counts()==2)

8

In [105]:
dup=final_bert.loc[final_bert.index.value_counts()==2].index

Unnamed: 0_level_0,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,...,53750.0,53751.0,53752.0,53753.0,53754.0,53755.0,53756.0,53757.0,53758.0,53759.0
Place,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Tawang War Memorial,0.258663,-0.167171,0.309656,0.086236,-0.121107,-0.279518,0.069043,0.237783,-0.104324,-0.399753,...,-0.044787,-0.028978,0.265678,-0.088254,0.141890,-0.145662,-0.452453,-0.120055,0.185494,-0.081269
Nuranang Falls,0.286913,-0.333019,0.408509,0.156508,0.073830,-0.350591,0.184779,0.229595,-0.048747,-0.378610,...,-0.047157,-0.193837,-0.036714,-0.063599,0.032528,-0.109214,-0.320143,-0.251761,0.111250,-0.082165
Madhuri Lake,0.297287,-0.274898,0.368848,0.100509,0.122654,-0.386416,0.091595,0.237671,-0.041378,-0.364774,...,-0.081866,-0.168345,0.027702,-0.040837,0.030424,-0.168959,-0.328324,-0.314465,0.184484,-0.067366
Jaswant Garh,0.222298,-0.163456,0.234397,0.012882,-0.030309,-0.236603,0.127837,0.251081,-0.109253,-0.343250,...,-0.082127,0.047515,0.318217,-0.133888,0.242593,-0.056846,-0.514553,-0.074865,0.125103,-0.144011
Bomdila Monastery,0.322692,-0.140776,0.352150,0.029217,0.134435,-0.349131,0.116191,0.237616,-0.077243,-0.400015,...,-0.127308,-0.216476,0.034393,-0.135791,-0.034958,-0.182967,-0.310612,-0.243801,0.208286,-0.089349
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
City Centre Mall,0.439128,-0.399354,0.491822,0.176970,0.209121,-0.302848,-0.163024,0.437133,-0.011521,-0.419323,...,-0.297036,-0.241840,0.035031,0.010264,-0.044108,-0.174217,-0.260852,-0.391836,0.192677,-0.179995
Escape Time,0.291791,-0.266065,0.554195,0.216489,0.010779,-0.386322,0.051416,0.105693,0.052501,-0.344103,...,-0.264528,-0.187481,-0.059318,-0.228399,0.207959,-0.181912,-0.459770,-0.389389,0.112776,0.169699
Sela Pass,0.323109,-0.141662,0.398203,0.084149,0.085559,-0.321644,0.163110,0.263912,-0.002894,-0.440621,...,-0.145753,-0.179873,0.034375,-0.148791,0.140842,-0.218558,-0.308770,-0.270690,0.160799,-0.070214
Bumla Pass,0.277872,-0.090888,0.359992,0.090878,0.116932,-0.380124,0.135585,0.215688,-0.012688,-0.358676,...,-0.155622,-0.186232,0.095311,-0.169889,0.110939,-0.119627,-0.389535,-0.240192,0.162528,-0.039286


In [90]:
ask_question_func("elephant ride and jungle").head()

 100.0% :Question Progress(1, 53760) (53760,)
(1, 53760) (53760,)
(1, 53760) (53760,)
(1, 53760) (53760,)
(1, 53760) (53760,)
(1, 53760) (53760,)
(1, 53760) (53760,)
(1, 53760) (53760,)
(1, 53760) (53760,)
(1, 53760) (53760,)
(1, 53760) (53760,)
(1, 53760) (53760,)
(1, 53760) (53760,)
(1, 53760) (53760,)
(1, 53760) (53760,)
(1, 53760) (53760,)
(1, 53760) (53760,)
(1, 53760) (2, 53760)


ValueError: Input vector should be 1-D.

In [3]:
question6='Kerala is known for its lush green landscapes, serene backwaters, and beautiful beaches. It has a rich cultural heritage, which is reflected in its vibrant art, music, and dance forms. The state is also famous for its delicious cuisine, which is a blend of local spices and flavors.\
One of the main reasons people love Kerala is because of its peaceful and relaxing atmosphere. The state has a laid-back vibe that helps visitors unwind and de-stress. It is an ideal destination for those seeking a tranquil getaway from the hustle and bustle of city life.\
Kerala is also famous for its Ayurvedic treatments, which use natural herbs and oils to promote wellness and rejuvenation. Many tourists come to Kerala to experience these therapeutic treatments and to learn more about Ayurveda.Overall, Kerala natural beauty, cultural richness, and peaceful atmosphere make it a beloved destination for travelers from all over the world.'

In [2]:
question7='Kerala, also known as God\'s Own Country, is a state located in the southern region of India. It is known for its lush green landscapes, serene backwaters, beautiful beaches, and rich cultural heritage. The state has a diverse geography that ranges from the towering Western Ghats to the pristine beaches of the Arabian Sea, making it a perfect destination for nature lovers.\
Kerala is famous for its Ayurvedic treatments, which draw many people seeking holistic healing experiences. The state is also known for its delicious cuisine, which includes coconut-based curries, seafood, and traditional snacks.\
The people of Kerala are known for their warm hospitality and friendly nature, which makes visitors feel welcome and comfortable. Kerala has a rich cultural heritage, and visitors can experience it through its vibrant festivals, dance forms, and traditional art forms.\
Overall, Kerala is a must-visit destination for anyone looking to experience the beauty of nature and immerse themselves in a unique cultural experience.'

In [185]:
question8='West Bengal is a state with a rich cultural heritage, which is one of the main reasons people are drawn to it. From the bustling city of Kolkata to the serene Sunderbans, the state offers a unique blend of nature, art, and cuisine. The state\'s diverse geography, ranging from the Himalayan mountains to the Ganges delta, offers visitors the opportunity to experience different kinds of natural beauty.\
The state\'s cultural offerings, including art, music, and literature, are another reason why people like West Bengal. The works of Rabindranath Tagore, a Nobel Laureate who hails from the state, continue to inspire people across the globe. The state is also famous for its folk art, including the Baul music tradition and Chhau dance.\
West Bengal\'s cuisine is another reason why people may like the state. The fusion of various influences, including Bengali, British, and Mughal, offers visitors a wide variety of mouth-watering dishes, including fish curries, street food, and sweets like rasgulla and sandesh.\
Overall, West Bengal\'s rich cultural heritage, diverse geography, and delicious cuisine are some reasons why people may like the state.'

In [186]:
# Define parameters
max_length = 70  # Avg number of tokens in user input
stride = 35  # number of tokens to move forward for each window
text = question8

# Tokenize text
tokens = tokenizer.tokenize(text)

# Divide the tokens into windows
windows = []
start = 0
while start < len(tokens):
    end = min(start + max_length, len(tokens))
    windows.append(tokens[start:end])
    start += stride
# Padding
windows=[window + ['[PAD]'] * (70 - len(window)) for window in windows]
# Convert tokenized windows to BERT embeddings
embeddings = []
for j in range(len(windows)):
    sys.stdout.write('\r '+str(round((j+1)*100/len(windows), 1))+'% :'+'Question Progress')
    input_ids = tokenizer.convert_tokens_to_ids(windows[j])
    with torch.no_grad():
        outputs = model(torch.tensor([input_ids]))
        window_embedding = outputs[0][0]  # Use last layer for final embedding
    embeddings.append(window_embedding)
place_emb=0
for k in range(len(embeddings)):
    place_emb+=embeddings[k].reshape(1,-1)
# ber=pd.DataFrame(np.array(place_emb/len(embeddings)))
ques_vec2=np.array(place_emb/len(embeddings))

 100.0% :Question Progress

In [187]:
ques_vec2

array([[-0.00393546, -0.03130812,  0.03632314, ..., -0.335994  ,
         0.27196154,  0.02047628]], dtype=float32)

In [182]:
ques_vec1

array([[ 0.01487788, -0.09394608,  0.12555854, ..., -0.42535797,
         0.38513598, -0.126787  ]], dtype=float32)

In [188]:
cosine(ques_vec1,ques_vec2)

0.17120695114135742

In [202]:
## Universal Sentence Encoder EMBEDDINGS

In [6]:
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-large/5")

In [1]:
import numpy as np
import tensorflow_hub as hub
import time

In [7]:

# Load the USE model
model_load_start=time.time()
# embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-large/5")
model_load_end=time.time()
# Generate embeddings for a review and user input
review = question6
user_input = question7
emb_load_start=time.time()
review_embedding = embed([review])[0]
emb_load_end=time.time()
user_emb_load_start=time.time()
user_input_embedding = embed([user_input])[0]
user_emb_load_end=time.time()
# Pad the shorter embedding with zeros
if len(review_embedding) < len(user_input_embedding):
    review_embedding = np.pad(review_embedding, (0, len(user_input_embedding) - len(review_embedding)), mode="constant")
else:
    user_input_embedding = np.pad(user_input_embedding, (0, len(review_embedding) - len(user_input_embedding)), mode="constant")

# Compute the cosine similarity
similarity = np.dot(review_embedding, user_input_embedding) / (np.linalg.norm(review_embedding) * np.linalg.norm(user_input_embedding))
print(similarity)


0.9215545


In [8]:
model_load_end-model_load_start

0.0029985904693603516

In [9]:
emb_load_end-emb_load_start

19.79423213005066

In [10]:
user_emb_load_end-user_emb_load_start

0.23778057098388672

In [201]:
review = df.Description[0]
user_input = df.Description[1]
emb_load_start=time.time()
review_embedding = embed([review])[0]
emb_load_end=time.time()

ResourceExhaustedError: Graph execution error:

OOM when allocating tensor with shape[1,8,24160,24160] and type float on /job:localhost/replica:0/task:0/device:CPU:0 by allocator cpu
	 [[{{node EncoderTransformer/Transformer/SparseTransformerEncode/Layer_0/SelfAttention/SparseMultiheadAttention/DotProductAttention/MatMul}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
 [Op:__inference_restored_function_body_70305]

In [None]:
user_emb_load_start=time.time()
user_input_embedding = embed([user_input])[0]
user_emb_load_end=time.time()

In [None]:
emb_load_end-emb_load_start

In [None]:
user_emb_load_end-user_emb_load_start

In [11]:
(tokenizer.convert_tokens_to_ids(windows[0])),len(tokens)

([2023,
  2003,
  1037,
  2936,
  3793,
  2008,
  23651,
  1996,
  1012,
  4555,
  5537,
  3091,
  1997,
  1996,
  14324,
  2944,
  1012],
 17)

In [None]:
# Convert tokenized windows to BERT embeddings
embeddings = []
for window in windows:
    input_ids = tokenizer.convert_tokens_to_ids(window)
    with torch.no_grad():
        outputs = model(torch.tensor([input_ids]))
        window_embedding = outputs[0][0]  # Use last layer for final embedding
    embeddings.append(window_embedding)