In [1]:
# Embedding variant used by the rest of the notebook; the loading cell
# handles the names 'model0' through 'model8'.
model = "model0"

In [2]:
import os, re, sys
import numpy as np
import pandas as pd
import string
import warnings

In [3]:
#Text preprocessing
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

#distance
from scipy.spatial.distance import cosine

from gensim.models import KeyedVectors

In [4]:
# Shared module-level state used by the text-preprocessing helpers.

# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")

# ASCII letters: the only characters kept inside a token during preprocessing.
loweralphabets = string.ascii_lowercase
upperalphabets = string.ascii_uppercase
all_alphabets = "".join((loweralphabets, upperalphabets))

# Translation table that deletes every ASCII punctuation character.
p = string.punctuation
remv_punc = str.maketrans(dict.fromkeys(p))

# Stemmer applied to tokens for the models that were trained on stemmed text.
ps = PorterStemmer()

# Fetch the stopword corpus if needed, then load the English stopword list.
nltk.download('stopwords')
stop_words = stopwords.words("english")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Asking a question

In [5]:
# Place metadata; the 'title' and 'State/UT' columns are looked up row by row
# when ask_question_func builds its ranked result.
data = pd.read_csv('pre_processed_all_data.csv')

In [6]:
# Load the document-vector table (`w2v_df`) and the word embeddings
# (`embeddings`) that belong to the selected `model`.
#
#   model0 : google-news-300 embeddings
#   model1 : combined review as corpus
#   model2 : individual reviews as corpus
#   model3 : reviews split to sentence-level corpus
#   model4 : model 2, with available vectors replaced from google embeddings
#   model5 - model7 : not described in this notebook
#   model8 : loaded from the fasttext0 files
#
# The locally saved vectors are loaded back with memory-mapping = read-only,
# shared across processes.

# Location of the Google News binary. Override it with the
# GOOGLE_NEWS_VECTORS_PATH environment variable on other machines; the default
# is the path this notebook was originally written against.
GOOGLE_NEWS_PATH = os.environ.get(
    'GOOGLE_NEWS_VECTORS_PATH',
    r'C:\Users\asus\Documents\IPYNB Files\CAPP\GoogleNews-vectors-negative300\GoogleNews-vectors-negative300.bin',
)

# model name -> (document-vector CSV, saved KeyedVectors file).
# None means "use the Google News word2vec binary" instead of a saved file.
MODEL_FILES = {
    'model0': ('w2v_df_model0.csv', None),
    'model1': ('w2v_df_model1.csv', 'word2vec1.wordvectors'),
    'model2': ('w2v_df_model2.csv', 'word2vec2.wordvectors'),
    'model3': ('w2v_df_model3.csv', 'word2vec3.wordvectors'),
    'model4': ('w2v_df_model4.csv', 'word2vec4.wordvectors'),
    'model5': ('w2v_df_model5.csv', 'word2vec5.wordvectors'),
    'model6': ('w2v_df_model6.csv', 'word2vec6.wordvectors'),
    'model7': ('w2v_df_model7.csv', 'word2vec7.wordvectors'),
    'model8': ('fasttext_df0.csv', 'fasttext0.wordvectors'),
}

# Fail loudly on a typo in `model`; otherwise `w2v_df` / `embeddings` would be
# left undefined (or stale from an earlier run) and break much later.
if model not in MODEL_FILES:
    raise ValueError(
        'Unknown model %r; expected one of %s' % (model, sorted(MODEL_FILES))
    )

doc_vectors_csv, keyed_vectors_file = MODEL_FILES[model]

# The CSVs carry a saved index as the 'Unnamed: 0' column; drop it so that
# only the vector components remain.
w2v_df = pd.read_csv(doc_vectors_csv).drop('Unnamed: 0', axis=1)

if keyed_vectors_file is None:
    embeddings = KeyedVectors.load_word2vec_format(GOOGLE_NEWS_PATH, binary=True)
else:
    embeddings = KeyedVectors.load(keyed_vectors_file, mmap='r')

In [None]:
def remove_emojis(data):
    """Return `data` with every run of emoji/symbol characters deleted.

    The character class is deliberately broad: apart from the emoji blocks it
    spans U+24C2-U+1F251 and every code point from U+10000 upwards, so a lot
    of non-emoji, non-ASCII text (CJK characters for example) is removed too.
    """
    symbol_ranges = (
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # box drawing through misc. symbols and arrows
        u"\U00002702-\U000027B0"  # dingbats
        u"\U00002702-\U000027B0"  # (same range listed twice; harmless)
        u"\U000024C2-\U0001F251"  # very wide span, includes CJK
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"  # everything outside the basic plane
        u"\u2640-\u2642"
        u"\u2600-\u2B55"
        u"\u200d"  # zero-width joiner
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # variation selector-16
        u"\u3030"
    )
    symbol_run = re.compile("[" + symbol_ranges + "]+", re.UNICODE)
    return symbol_run.sub('', data)

def process_one_sentence(sentence):
    """Normalise one piece of text into space-separated ASCII-letter tokens.

    Steps, in order:
      1. lower-case the text;
      2. drop whitespace-separated words found in `stop_words`;
      3. delete ASCII punctuation;
      4. trim and collapse doubled spaces;
      5. remove emojis / symbols via `remove_emojis`;
      6. Porter-stem each word, unless `model` is model0, model4, model6 or
         model7;
      7. keep only the characters of each word that are ASCII letters.

    Words that have no ASCII letters left after step 7 (e.g. "123") are
    dropped, so the result never has leading, trailing or doubled spaces.

    Parameters
    ----------
    sentence : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ('' if nothing survives).
    """
    # 1. Converting the text into lower case
    sentence = sentence.lower()

    # 2. Removing stopwords. This runs before punctuation is stripped, so a
    #    stopword with punctuation attached ("the,") is not removed.
    #    NOTE(review): presumably this order mirrors how the corpus was
    #    preprocessed -- confirm before reordering the steps.
    sentence = " ".join(w for w in sentence.split() if w not in stop_words)

    # 3. Removing punctuation
    sentence = sentence.translate(remv_punc)

    # 4. Remove surplus white space
    sentence = sentence.replace("  ", " ").strip()

    # 5. Remove emojis
    sentence = remove_emojis(sentence)

    # 6. Stem, except for the models listed here
    if model not in ('model0', 'model4', 'model6', 'model7'):
        sentence = " ".join(ps.stem(w) for w in sentence.split())

    # 7. Keep ASCII letters only; skip words that end up empty
    tokens = []
    for word in sentence.split():
        letters = "".join(char for char in word if char in all_alphabets)
        if letters:
            tokens.append(letters)

    return " ".join(tokens)

# Ask a question and get back every place, ordered by cosine distance to it.

def ask_question_func(question):
    """Rank all places by the cosine distance between their vector and the question.

    The question is cleaned with `process_one_sentence`, the embedding vectors
    of its words are averaged, and that average is compared with every row of
    `w2v_df`.

    Parameters
    ----------
    question : str
        Free-text query.

    Returns
    -------
    pandas.DataFrame
        Columns 'place', 'State/UT' and 'cosine distance', sorted by ascending
        distance (NaN distances last) with a fresh 0..n-1 index.

    Notes
    -----
    Words that have no embedding are reported with a printed message and
    skipped. If no word of the question has an embedding, a message is printed
    and every distance is NaN, because a zero vector has no direction to
    compare against.
    """
    # process the question first
    doc_as_list = process_one_sentence(question).split()

    # Take the dimensionality from the loaded embeddings instead of a
    # per-model table, so the two can never disagree.
    doc_vector = np.zeros(embeddings.vector_size)
    matched_words = 0

    for word in doc_as_list:
        try:
            doc_vector = doc_vector + embeddings[word]
        except KeyError:
            # Only "word not in vocabulary" is tolerated; any other error
            # (e.g. a shape mismatch) should surface instead of being hidden.
            print('\'',word ,'\' word in question has no vector')
        else:
            matched_words += 1

    has_vector = matched_words > 0
    if has_vector:
        # Mean over the words that contributed. Cosine distance ignores the
        # vector's length, so this scaling does not affect the ranking.
        question_vector = doc_vector / matched_words
    else:
        question_vector = None
        print('No word in the question has a vector; cosine distances are NaN.')

    # Collect plain dicts and build the frame once; this is much faster than
    # appending to a DataFrame row by row.
    # NOTE(review): assumes `data` and `w2v_df` share the same row index -- confirm.
    distance_list = []
    for i, place_vector in w2v_df.iterrows():
        distance = cosine(place_vector, question_vector) if has_vector else np.nan
        distance_list.append({
            'place': data.loc[i, 'title'],
            'State/UT': data.loc[i, 'State/UT'],
            'cosine distance': distance,
        })

    # Explicit columns keep the result well-formed even when there are no rows.
    return_df = pd.DataFrame(distance_list, columns=['place', 'State/UT', 'cosine distance'])

    # Stable sort keeps ties in their original order; NaN distances go last.
    return return_df.sort_values('cosine distance', kind='mergesort').reset_index(drop=True)

## Testing

In [None]:
# Short keyword-style query; show only the first rows of the ranked result.
ask_question_func("elephant ride and jungle").head()

In [None]:
# Sentence-style query about a beach close to an airport.
ask_question_func("best beach in near airport where i can spend some time before flying").head()

In [None]:
# Sentence-style query combining several wishes (clean beach, driving, sunset).
ask_question_func("neat and clean beach where i can drive my car and watch sunset").head()

In [None]:
# user question: hills with forests
ask_question_func("i want to go to away to the hills with forests").head()

In [None]:
# user question: hill top park (no .head(), so the whole ranked table is the cell output)
ask_question_func("i want to go to hill top park")

In [None]:
# user question: two bare keywords
ask_question_func("lighthouse hills")

In [None]:
# a lighthouse-and-hills question again, this time also asking for a calm beach
ask_question_func("lighthouse on the hills and calm beach")

In [None]:
# user question that names a specific city
ask_question_func("best fort in jaipur")

In [None]:
# repeat of the "elephant ride and jungle" query from the first test cell

ask_question_func("elephant ride and jungle").head()

In [None]:
# user question that names a specific park
ask_question_func("wonderla park")

In [None]:
# Query for one kind of place (temple) near another named place (kovalam beach).
ask_question_func("i want to go to a temple near kovalam beach").head()

In [None]:
# Repeat of the earlier beach-near-airport query.
ask_question_func("best beach in near airport where i can spend some time before flying").head()

In [None]:
# Query that is just the name of a place.
ask_question_func("varkala beach").head()

In [None]:
# Four long, multi-sentence questions. Each is printed together with the first
# rows of its ranked places (the distance column is left out).
question1 = (
    'I need to go to a place where I can spend time with family and kids in the evening with the breezy sea winds and '
    'night views of the sea. The place should be wonderful at night time with colorful lights. Also there should be some playing '
    'area for kids and seating for adults. I need the place to be located nearby a railway station or a bus terminus so that it could '
    'be easy to travel. '
)

question2 = (
    'I want to do an adventure sport like Parasailing with my wife. So the activity should allow couple entry and also '
    'should have a guide for it. I am afraid of heights so I need a training session for parasailing before I try it out. The outdoor '
    'activity should be near a beach so that I can spend some quality time after the parasailing.'
)

question3 = (
    'I need to take my kids to a children’s park where they can have some fun. The park should have swings and good '
    'walk paths with fountains. The park should be nearby some main attractions so that I can also visit some places with my '
    'family. If the park has some aquarium then it would be an added advantage. '
)

question4 = (
    'I feel like going to a beach with crystal-clear waters. The place should have a great marine ecosystem with variety '
    'of fishes & coral reefs. The place should be an ideal location for water based activities like swimming, diving, snorkeling tec. '
    'It should also offer different adventure sports options like surfing & scuba diving.'
)

for q_number, q_text in enumerate((question1, question2, question3, question4), start=1):
    top_places = ask_question_func(q_text).drop('cosine distance', axis=1).head()
    print('\nQuestion %d result:\n' % q_number, q_text, '\n\n', top_places)

In [None]:
# Long request: hill destination with sea views, sea food and a resort nearby.
hill_sea_question = (
    'I am planning to go on a beautiful holiday destination specially hills. There should be Sea '
    'side restaurants and sea view from hill where I can capture some good photos. After that '
    'I need to stay at a Resort nearby with parking facility. I love sea foods, so restaurants with '
    'sea foods should be around.'
)
ask_question_func(hill_sea_question)

In [None]:
# Long request: cool hill station with tea gardens, fog, boating and gardens.
tea_garden_question = (
    'I want to experience a lovely cold breeze and beautiful green tea gardens, '
    'soothing views of distance hills. The place should be surrounded by mountains covered '
    'in dense fog so that I can feel like I am in midst of clouds. I also want to explore boating, '
    'echo point and gardens.'
)
ask_question_func(tea_garden_question)

## Validation

In [None]:
val_df = pd.rea