# Homework 3 - Find the perfect place to stay in Texas!
### Group 14 - PavanKumar Alikana, Matteo Cavalletti, Francesca Porcu

The homework consists of analyzing the text of Airbnb property listings and building a search engine on top of it.

In [1]:
#Import required libraries
import pandas as pd
# For displaying search results in a table
from IPython.display import HTML, display
from os.path import join as pjoin
import csv

# For persisting indexes in an external file
import pickle
import math
import heapq
from datetime import datetime
from pathlib import Path


import nltk
import csv
import re
import os

# For word tokenization
from nltk.tokenize import RegexpTokenizer
# For stop words list
from nltk.corpus import stopwords
# For word stemming
from nltk.stem.snowball import SnowballStemmer

# Make sure the NLTK stop-word corpus is available. It is downloaded only when
# it cannot be found locally, so a run with the corpus already installed does
# not have to wait for a network connection attempt.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# The tokenizer drops punctuation but keeps the dollar symbol ($) as a token of
# its own, because it may be used in some queries.
tokenizer = RegexpTokenizer(r'\w+|\$')

# Stemmer applied to every token that is not a stop word
ps = SnowballStemmer('english')

# Base directory used to build all the other file paths.
# NOTE: '__file__' is a quoted string here, not the __file__ variable, so it is
# resolved against the current working directory and my_path ends up being
# that directory.
my_path = os.path.dirname(os.path.realpath('__file__'))

[nltk_data] Error loading stopwords: <urlopen error [WinError 10060] A
[nltk_data]     connection attempt failed because the connected party
[nltk_data]     did not properly respond after a period of time, or
[nltk_data]     established connection failed because connected host
[nltk_data]     has failed to respond>


# Step 1: Data

In [2]:
# Read the main CSV file with the listings; doc_len is the number of rows read
m = pd.read_csv("Airbnb_Texas_Rentals.csv")
doc_len = len(m)

# The lines below are commented out because the listings were already exported to one individual file per row.
# We found literal sequences like '\\n' in the dataset, so we cleaned them
#m = m.replace({r'\s+$': '', r'^\s+': ''}, regex=True).replace(r'\\n',  ' ', regex=True)
#m = m.replace({r'\s+$': '', r'^\s+': ''}, regex=True).replace(r'\\t',  ' ', regex=True)
#m = m.replace({r'\s+$': '', r'^\s+': ''}, regex=True).replace(r'\\r',  ' ', regex=True)

# Before creating the tsv files we dropped the first column, which was a problem
#m = m.drop(['Unnamed: 0'], axis=1)

# Step 2: Create documents

In [3]:
# The lines below are commented out because the individual files were already
# generated the first time this section was executed, so they do not need to be created again.
# They write a separate tab-separated file for every row of the CSV file.
#for i in range(len(m)):
#    with open(os.path.join(my_path, 'docu_hw3/doc_' + str(i) + '.tsv'), 'w', newline='',encoding='utf-8') as output:
#        tsv_output = csv.writer(output, delimiter='\t')
#        tsv_output.writerow(m.iloc[i])

# Step 3: Search Engine

At this point we create the ***vocabulary***. We don't modify the TSV files directly, because we need them for the output of the search engines; instead, we build a dictionary that stores, for each file, the words it would contain if we had preprocessed it. In particular, we apply the following procedures to the words contained in each file:
- *Removing stopwords*
- *Removing punctuation*
- *Stemming*
- *Lower-case letters*

## 3.1) Conjunctive query

### 3.1.1) Create your index!

In [4]:
# Index structures built (or loaded) by this cell:
#   review_content_persist: document id -> space-joined stemmed words of the document
#   words_persist:          stemmed word -> integer word id
#   vocabulary_persist:     integer word id -> list of ids of the documents containing the word
review_content_persist = {}
vocabulary_persist = {}
words_persist = {}

# Files in which the structures above are persisted with pickle
content_file = Path(os.path.join(my_path, "indexes/review_content.pkl"))
vocabulary_file = Path(os.path.join(my_path, "indexes/vocabulary.pkl"))
words_file = Path(os.path.join(my_path, "indexes/words.pkl"))

# Create the 'indexes' directory if needed, so it does not have to be created by hand
content_file.parent.mkdir(parents=True, exist_ok=True)

# Load whatever was persisted by a previous run.
# NOTE: pickle files must only come from this notebook itself; unpickling
# data from an untrusted source can execute arbitrary code.
if content_file.is_file():
    with open(content_file, "rb") as review_content:
        review_content_persist = pickle.load(review_content)

if vocabulary_file.is_file():
    with open(vocabulary_file, "rb") as vocabulary:
        vocabulary_persist = pickle.load(vocabulary)

if words_file.is_file():
    with open(words_file, "rb") as words:
        words_persist = pickle.load(words)

# Rebuild everything when any of the three structures is missing, so that a
# partially deleted 'indexes' directory cannot leave the search with empty indexes.
if not (review_content_persist and vocabulary_persist and words_persist):

    print("Indexes are being created")

    review_content_persist = {}
    vocabulary_persist = {}
    words_persist = {}

    # stemmed word -> ids of the documents that contain it (each id listed once)
    review_word_map = {}

    # The documents themselves are left untouched because the search engines
    # display their original content.
    for i in range(doc_len):
        with open(os.path.join(my_path, 'docu_hw3/doc_' + str(i) + '.tsv'), encoding='utf8') as tsvfile:
            tsvreader = list(csv.reader(tsvfile, delimiter="\t"))

        # Index columns 4 and 7 together (the result cells display column 7
        # as the title and column 4 as the description)
        row = tsvreader[0]
        text = (row[4] + ' ' + row[7]).lower()

        # Stemmed, stop-word-free words of this document, repetitions included;
        # the term frequencies used for tf-idf are counted on this list later.
        file_words = []

        for token in tokenizer.tokenize(text):
            if token in stop_words:
                continue

            stem = ps.stem(token)
            file_words.append(stem)

            # Documents are visited in increasing id order, so comparing with
            # the last entry is enough to record each document only once per word.
            postings = review_word_map.setdefault(stem, [])
            if not postings or postings[-1] != i:
                postings.append(i)

        review_content_persist[i] = ' '.join(file_words)

    # Assign an integer id to every word and key the posting lists by that id
    for word_id, (stem, postings) in enumerate(review_word_map.items()):
        words_persist[stem] = word_id
        vocabulary_persist[word_id] = postings

    # Persist content, vocabulary and words for the next runs
    with open(content_file, "wb") as review_content:
        pickle.dump(review_content_persist, review_content)

    with open(vocabulary_file, "wb") as vocabulary:
        pickle.dump(vocabulary_persist, vocabulary)

    with open(words_file, "wb") as words:
        pickle.dump(words_persist, words)
    
    
                

            

### 3.1.2) Execute the query

In [5]:
# Raw search query typed by the user; it is cleaned further below
word = input('Enter a search query: ')

def clean_input(w):
    """Preprocess a query string the same way the indexed text was preprocessed.

    The text is lower-cased and tokenized, stop words are discarded and the
    remaining tokens are stemmed. Returns the list of distinct stems in order
    of first appearance.
    """
    stems = []
    # Check if we need to do any other preprocessing to improve the efficiency of search results
    for token in tokenizer.tokenize(w.lower()):
        if token in stop_words:
            continue
        stem = ps.stem(token)
        if stem in stems:
            continue
        stems.append(stem)
    return stems

def show_results(results, doc_list, isScore):
    """Display the search results as an HTML table.

    results:  list of rows; each row is a sequence of cell values in the order
              title, description, city, url (plus the score when isScore is true).
    doc_list: matching documents; when it is not empty its length is printed.
    isScore:  when true, a 'Score' column header is added to the table.

    When results is empty a 'No results found' message is displayed instead.
    """
    # Imported here so that the function does not depend on another cell's imports
    from html import escape

    if len(doc_list):
        print('Found ' + str(len(doc_list))  + ' matching reviews to your query')

    if not len(results):
        display(HTML('<h1>No results found. Please try a different query</h1>'))
        return

    headers = ['Title', 'Description', 'City', 'URL']
    if isScore:
        headers.append('Score')
    header_row = ''.join('<th>{}</th>'.format(name) for name in headers)

    # Cell values come straight from the listings, so they are escaped: a '<'
    # or '&' in a title or description would otherwise be interpreted as markup
    # and break the table.
    body_rows = ''.join(
        '<tr>{}</tr>'.format(
            ''.join('<td>{}</td>'.format(escape(str(cell))) for cell in row)
        )
        for row in results
    )

    display(HTML('<table border="1"><tr>{}</tr>{}</table>'.format(header_row, body_rows)))
    


word_list = clean_input(word)

print("Cleaned word: ", word_list)

# One list of document ids per query term. A term that is not in the
# vocabulary contributes an empty list, which makes the conjunctive result empty.
list_doc_list = [
    vocabulary_persist[words_persist[w]] if w in words_persist else []
    for w in word_list
]

# The intersection starts from the documents matching the first term.
# A query that is empty or made only of stop words has no terms at all:
# it matches nothing instead of raising an IndexError.
list_intersect = list_doc_list[0] if list_doc_list else []

def intersection(lst1, lst2):
    """Return the items of lst1 that also occur in lst2.

    The order of lst1 is preserved and repeated items of lst1 are kept.
    Membership is tested against a set built once from lst2, so the cost is
    linear in the sizes of the two lists instead of their product. Lists
    holding unhashable items fall back to plain list membership.
    """
    try:
        lookup = set(lst2)
        return [value for value in lst1 if value in lookup]
    except TypeError:
        # Some item cannot be hashed: compare against the list directly
        return [value for value in lst1 if value in lst2]
        
# Narrow the candidates one query term at a time: only the documents present
# in every term's list survive.
for postings in list_doc_list:
    list_intersect = intersection(list_intersect, postings)

# Keep each matching document id only once
list_intersect = list(set(list_intersect))
i_len = len(list_intersect)

results = []

if i_len > 0:

    # At most ten results are shown
    r_limit = min(i_len, 10)

    for doc in list_intersect[:r_limit]:

        # Read the document file that corresponds to the document id
        doc_path = os.path.join(my_path, 'docu_hw3/doc_' + str(doc) + '.tsv')
        with open(doc_path, encoding='utf8') as tsvfile:
            tsvreader = list(csv.reader(tsvfile, delimiter="\t"))

        record = tsvreader[0]
        title, description, city, url = record[7], record[4], record[2], record[8]

        results.append([title, description, city, url])

# Display the results (no score column for the plain conjunctive query)
show_results(results, list_intersect, False)

Enter a search query: roma
Cleaned word:  ['roma']


## 3.2) Conjunctive query & Ranking score

### 3.2.1) Inverted index

In [6]:
from collections import Counter

# Inverted index: stemmed word -> list of [document id, tf-idf score] pairs
iindex_tf_idf_persist = {}

# File in which the inverted index is persisted with pickle
index_file = Path(os.path.join(my_path, "indexes/iindex_tf_idf.pkl"))

# Load the index persisted by a previous run, if any.
# NOTE: pickle files must only come from this notebook itself; unpickling
# data from an untrusted source can execute arbitrary code.
if index_file.is_file():
    with open(index_file, "rb") as iindex_tf_idf:
        iindex_tf_idf_persist = pickle.load(iindex_tf_idf)

if not iindex_tf_idf_persist:

    print("Inverted Indexes are being calculated")

    docs_length = len(review_content_persist)

    # Term frequencies of every document, counted once per document instead of
    # re-splitting the document text for every (word, document) pair
    term_counts = {
        doc_id: Counter(content.split())
        for doc_id, content in review_content_persist.items()
    }

    for term, term_id in words_persist.items():
        # A document id may be repeated in the stored list when the word occurs
        # several times in the document. Each document must be considered once:
        # the idf is based on the number of documents containing the word, and
        # every document gets a single entry in the index.
        unique_docs = list(dict.fromkeys(vocabulary_persist[term_id]))

        # Relative importance of the term with respect to the number of documents
        idf = math.log10(docs_length / len(unique_docs))

        # Score of each document for this term: idf times the term frequency
        iindex_tf_idf_persist[term] = [
            [doc_id, idf * term_counts[doc_id][term]] for doc_id in unique_docs
        ]

    # Persist the index for the next runs (creating the directory if needed)
    index_file.parent.mkdir(parents=True, exist_ok=True)
    with open(index_file, "wb") as iindex_tf_idf:
        pickle.dump(iindex_tf_idf_persist, iindex_tf_idf)
        


### 3.2.2) Execute the query

In [7]:
# Cosine similarity between the query and the documents that matched it.
#   dict_qcos: document id -> similarity score (numerator before normalization)
#   dict_norm: document id -> Euclidean norm of the document's tf-idf scores
dict_qcos = {}
dict_norm = {}

# Only the first max_candidates matching documents are scored (same limit as before)
max_candidates = 100
candidates = list_intersect[:max_candidates]
candidate_set = set(candidates)

if candidate_set:
    dict_qcos = {doc: 0 for doc in candidates}
    norm_sq = {doc: 0 for doc in candidates}

    # Numerator: sum of the tf-idf scores of the query terms in each document
    for term in word_list:
        for doc_id, weight in iindex_tf_idf_persist[term]:
            if doc_id in candidate_set:
                dict_qcos[doc_id] += weight

    # Denominator: the squared scores of all the terms of each candidate are
    # accumulated in a single pass over the inverted index, instead of scanning
    # the whole index again for every document.
    for postings in iindex_tf_idf_persist.values():
        for doc_id, weight in postings:
            if doc_id in candidate_set:
                norm_sq[doc_id] += weight ** 2

    dict_norm = {doc: math.sqrt(total) for doc, total in norm_sq.items()}

    # Apply the cosine similarity formula: the query norm is the square root of
    # the number of query terms.
    query_norm = math.sqrt(len(word_list))
    for doc, num in dict_qcos.items():
        # A zero document norm would divide by zero: the score is left as it is
        if dict_norm[doc] != 0:
            dict_qcos[doc] = num / (query_norm * dict_norm[doc])

#print("Cosine similarity done")

In [8]:
# Select the top-k documents by cosine similarity with the heapq module

h = []

for doc, doc_score in dict_qcos.items():

    # Read the document to get the fields shown in the search results
    with open(os.path.join(my_path, 'docu_hw3/doc_' + str(doc) + '.tsv'), encoding='utf8') as tsvfile:
        tsvreader = list(csv.reader(tsvfile, delimiter="\t"))

    title = tsvreader[0][7]
    description = tsvreader[0][4]
    city = tsvreader[0][2]
    url = tsvreader[0][8]

    # The score comes first so that the tuples are ordered by score
    h.append((doc_score, title, description, city, url))

# At most ten results are shown. heapq.nlargest returns them from the highest
# to the lowest score using only the public heapq API. The number of documents
# is deliberately not stored in doc_len: that name holds the size of the
# dataset and is used to build the indexes.
limit = 10
results = [list(entry) for entry in heapq.nlargest(limit, h)]

results_formatted = []

for row in results:
    # Move the score from the first to the last column, rounded to 4 decimals
    row.append(round(row.pop(0), 4))
    results_formatted.append(row)


# Displaying the results
show_results(results_formatted, [], True)

# Step 4: Define a new score!

New score is:
- *average price per night (av_r_n_u)*
- *n° of bedrooms (bedr_c_u)*
- *zone (zone_u)*

**Average price per night**: we simply use the average price per night from the TSV files. From this variable a house can get at most 2 points and at least 0. The maximum number of points (2) is achieved if the price is lower than half of the price suggested by the user; if the price is at least two times the suggested price, the house gets 0. A house gets 1 point if its price is at least the suggested price but lower than 1.25 times the suggested price. Naturally, other intermediate marks can be achieved.

**n° of bedrooms**: if the house has at least as many bedrooms as requested, it gets 1 point; if it has fewer bedrooms, it gets a lower mark (0.75 with one bedroom less, 0.5 with two bedrooms less). If the house has three or more bedrooms fewer than requested, it gets 0 points.

**zone**: We found a set of coordinates (31.169621, -99.683617) that can be considered the center of Texas; it is near the city of Brady (County of McCulloch), so we divided Texas into four zones according to this center (like a Cartesian coordinate system). The zones are: North East (NE), North West (NW), South East (SE) and South West (SW). If the house is in the requested zone it gets 1 point, otherwise 0. It is important to clarify that we didn't use cities because there are cities with the same name in different counties (e.g. we have 3 Austin), so they could be confused with one another.

Finally, we calculate the score as sum/4, where sum is the sum of the scores accumulated from the individual fields based on the user's choices and 4 is the maximum number of points that can be accumulated.

In [9]:
# User preferences, with the defaults used when nothing valid is entered
av_r_n_u = 0
bedr_c_u = 1000
zone_u = 'NA'

try:
    av_r_n_u = int(input('please, enter maximum price that you can pay(e.g. 10): '))
except (ValueError, EOFError):
    print('No price suggested')
try:
    bedr_c_u = int(input('please, enter the number of bedrooms that you need(e.g 5): '))
except (ValueError, EOFError):
    print('No indication about the needed number of bedrooms')
try:
    # Normalized so that e.g. ' ne ' is accepted as 'NE'
    zone_u = input('please enter the zone you want to reside(e.g. NE): ').strip().upper()
except EOFError:
    zone_u = ''
if not zone_u:
    print('No zone was indicated')
    zone_u = 'NA'

# Unused; kept only because the name was defined by this cell before
list_ord = []

# (factor, points): a price of at least factor * requested price earns the
# points of the first matching step; a price below every step earns 2 points.
_PRICE_STEPS = (
    (2, 0),
    (1.75, 0.25),
    (1.5, 0.50),
    (1.25, 0.75),
    (1, 1),
    (0.75, 1.25),
    (0.5, 1.5),
)


def _price_points(price, wanted):
    """Return the points (0 to 2) earned by a price given the requested price."""
    for factor, points in _PRICE_STEPS:
        if price >= factor * wanted:
            return points
    return 2.


def _bedroom_points(bedrooms, wanted):
    """Return the points (0 to 1) earned by a number of bedrooms given the requested one."""
    missing = wanted - bedrooms
    if missing <= 0:
        return 1
    if missing == 1:
        return 0.75
    if missing == 2:
        return 0.5
    return 0


def _zone_of(latitude, longitude):
    """Return 'NE', 'NW', 'SE' or 'SW' for the coordinates, or None when they are not valid numbers."""
    try:
        la = float(latitude)
        lo = float(longitude)
    except (TypeError, ValueError):
        return None
    if math.isnan(la) or math.isnan(lo):
        return None
    north_south = 'S' if la <= 31.169621 else 'N'
    east_west = 'W' if lo <= -99.683617 else 'E'
    return north_south + east_west


n_h = []

for doc in list_intersect:
    # Same path construction as the other cells, so it works on every platform
    with open(os.path.join(my_path, 'docu_hw3/doc_' + str(doc) + '.tsv'), encoding='utf8') as tsvfile:
        tsvreader = list(csv.reader(tsvfile, delimiter="\t"))

    row = tsvreader[0]
    title = row[7]
    description = row[4]
    # The city is read from the current document; without this the value left
    # over from a previous cell would be shown for every result.
    city = row[2]
    url = row[8]

    # The first character of the price field is dropped before the conversion.
    # Price and bedrooms are parsed independently, so that an unreadable value
    # in one field does not discard the other one.
    try:
        av_r_n = int(row[0][1:])
    except ValueError:
        av_r_n = 1000000
    try:
        bedr_c = int(row[1])
    except ValueError:
        bedr_c = 1

    sum_values = _price_points(av_r_n, av_r_n_u) + _bedroom_points(bedr_c, bedr_c_u)

    # The zone is computed for every document, so a document with invalid
    # coordinates cannot inherit the zone of the previous one.
    if _zone_of(row[5], row[6]) == zone_u:
        sum_values += 1

    # 4 is the maximum number of points that can be accumulated
    score = round(sum_values / 4, 2)

    # The score comes first so that the tuples are ordered by score
    n_h.append((score, title, description, city, url))

# At most ten results are shown, from the highest to the lowest score
limit = 10
results = [list(entry) for entry in heapq.nlargest(limit, n_h)]

results_formatted = []

for row in results:
    # Move the score from the first to the last column
    row.append(round(row.pop(0), 4))
    results_formatted.append(row)


# Displaying the results
show_results(results_formatted, [], True)

please, enter maximum price that you can pay(e.g. 10): 500
please, enter the number of bedrooms that you need(e.g 5): 2
please enter the zone you want to reside(e.g. NE): ariosto


# Bonus Step: Make a nice visualization!

An important feature of Airbnb is the search on the map. 

Our tool works in the following way:

- Takes as input a set of coordinates and a maximum distance from those coordinates.
- Generates a map with a circle of the given radius, whose center is the set of coordinates given as input.
- Shows the houses that are inside the circle of the given radius.

In [10]:
# Drop the rows where the latitude and/or longitude column contains a missing value (NaN).
# Note that m is reassigned, so from here on it may hold fewer rows than were read from the CSV file.
m = m.dropna(subset=['latitude', 'longitude'])

In [11]:
# Ask for the latitude and longitude of the center and for the maximum distance,
# which is used as the radius of the circle. Each value is converted with float().
lat = float(input('Enter a latitude: '))
lon = float(input('Enter a longitude: '))
dis = float(input('Enter distance range (in km): '))

Enter a latitude: 5
Enter a longitude: 10
Enter distance range (in km): 10


In [14]:
# Uncomment the next line to install geopy if it is not available
#!pip install geopy
import folium
from geopy import distance

# Create the map centered on the given coordinates
mp = folium.Map(location = [lat, lon], zoom_start = 12)

In [15]:
from html import escape

# search is the set of coordinates given as input
search = (lat, lon)

# Marker for the input coordinates, labelled 'origin'
folium.Marker(location = [lat, lon], popup = 'origin', icon = folium.Icon(color = 'green', icon = 'home')).add_to(mp)

# Circle centered on the input coordinates; dis is in km, the radius is given in meters
folium.Circle(location = [lat, lon], radius = dis * 1000).add_to(mp)

# For each house within the given distance from the input coordinates, add a
# marker whose popup shows the price as a link to the house web page
for row in m.itertuples():
    if distance.distance(search, (row.latitude, row.longitude)).km <= dis:
        # str() keeps a missing (NaN) price or url from raising a TypeError;
        # the values are escaped and the href is quoted so that special
        # characters cannot break the popup markup.
        link = '<a href="{}">{}</a>'.format(
            escape(str(row.url), quote=True),
            escape(str(row.average_rate_per_night)),
        )
        folium.Marker(location = [row.latitude, row.longitude], popup = folium.Popup(link)).add_to(mp)

mp.save('map.html')