In [11]:
import pandas as pd
import csv
import nltk
import os
from pandas import DataFrame
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
from collections import defaultdict

In [2]:
# Load the Airbnb Texas rentals listing into a DataFrame
df = pd.read_csv(filepath_or_buffer="Airbnb_Texas_Rentals.csv")

In [3]:
# Number of missing values in each column
df.isna().sum()

Unnamed: 0                 0
average_rate_per_night    28
bedrooms_count             3
city                       0
date_of_listing            0
description                2
latitude                  34
longitude                 34
title                      3
url                        0
dtype: int64

In [4]:
# Keep only the rows that have both a description and a title;
# both text fields are concatenated later when the documents are processed.
df = df.dropna(subset=["description", "title"])

In [5]:
# Create a tsv file for each row (one single-row DataFrame per document).
# The output directory is created first so that a fresh checkout does not
# fail on the first to_csv call; exist_ok makes the cell safe to re-run.
os.makedirs('Files', exist_ok=True)
for i in range(len(df)):
    # Files are numbered by row position (0..len(df)-1), not by index label,
    # because later cells read them back with the same positional counter.
    pd.DataFrame(df.iloc[i]).transpose().to_csv('Files/doc_%s.tsv'%i, sep='\t')

In [6]:
# Porter stemmer used to reduce every token to its stem
ps = PorterStemmer()
# Tokenizer that returns the runs of word characters matched by the pattern,
# which drops punctuation and whitespace
tokenizer = RegexpTokenizer(pattern=r'\w+')

In [7]:
# voc_set: vocabulary (unique filtered words over all documents)
# d: maps each document number to the list of its filtered words
voc_set, d = set(), dict()

In [8]:
# Build, for every document file, the list of its unique filtered stems,
# store it in d (key: document number) and add the stems to the vocabulary.

# Loop-invariant: load the stopword list once and keep it as a set so the
# membership test below is O(1) instead of re-reading the list for every word.
stop_words = set(stopwords.words('english'))

# For every file...
for i in range(len(df)):
    doc = pd.read_csv('Files/doc_%s.tsv'%i, sep='\t')
    # Concatenate the description and title in a string. The separating space
    # keeps the last word of the description from being fused with the first
    # word of the title into a single bogus token.
    text = doc["description"][0] + " " + doc["title"][0]
    # Replace the literal "\n" sequences with a space and lowercase everything
    text = text.replace("\\n"," ").lower()
    # Tokenizing removes the punctuation
    tokens = tokenizer.tokenize(text)
    # Stemming; the set keeps only unique stems
    stems = {ps.stem(token) for token in tokens}
    # Remove stopwords and every token that starts with a digit (this also
    # covers pure integers, since tokens are never empty).
    # NOTE: the stopword filter runs on the stems, so a stopword whose stem
    # differs from its surface form (e.g. "this" -> "thi") stays in the list.
    words = [stem for stem in stems
             if stem not in stop_words and not stem[0].isdigit()]
    # Store the words in the vocabulary set
    voc_set.update(words)
    # Update the dictionary with key: document number, value: words
    d[i] = words

In [9]:
# At this point voc_set holds the unique filtered words of all documents and
# d maps each document number to the list of filtered words of that post.

# Number the vocabulary: voc_dict maps a running integer id to each word
voc_list = list(voc_set)
voc_dict = dict(enumerate(voc_list))

In [12]:
# Create the inverted index: word -> list of the document numbers containing it.
# The default factory must be list (not str) so that a missing word starts
# with an empty posting list that can be appended to directly.
index = defaultdict(list)

for key, words_in_doc in d.items():
    for word in words_in_doc:
        index[word].append(key)

In [49]:
# Preprocess the query with the same steps used for the documents
# (lowercase, tokenize, stem, drop stopwords and digit-leading tokens),
# plus one extra stopword pass before stemming.

query = input()

# Load the stopword list once, as a set, for fast membership tests
stop_words = set(stopwords.words('english'))

# Convert everything to lowercase and remove punctuation
tokens = tokenizer.tokenize(query.lower())
# Remove stopwords on the surface form first: the stemmer alters some of them
# (e.g. "this" -> "thi", "was" -> "wa"), and the altered form would no longer
# be recognised as a stopword and would become a mandatory search term.
tokens = [token for token in tokens if token not in stop_words]
# Stemming; the set keeps only unique stems
stems = {ps.stem(token) for token in tokens}
# Remove stopwords again on the stems and drop every token that starts with a
# digit (this also covers pure integers, since tokens are never empty)
query = [stem for stem in stems
         if stem not in stop_words and not stem[0].isdigit()]

a beautiful house with garden and beach


In [50]:
# Each match must contain ALL the words of the query, so the result is the
# intersection of the posting sets of every query word.

# One set of document numbers per query word. A word that is not in the index
# contributes an empty set, which correctly makes the intersection empty
# instead of silently ignoring that word. index.get never inserts a key.
querys_matches = [set(index.get(word, ())) for word in query]

# inter is always defined (empty when the query has no usable words), so the
# cell that displays the results never hits an undefined name.
if querys_matches:
    inter = set.intersection(*querys_matches)
else:
    inter = set()

In [51]:
# Show the matching listings (title, description, city and url) as one table.
if not inter:
    print("No results were found with those characteristics")
else:
    cols_of_interest = ["Title", "Description", "City", "Url"]
    # Read every matching document file; sorted() gives a stable row order
    frames = [pd.read_csv("Files/doc_%s.tsv"%int(doc_id), sep="\t")
              for doc_id in sorted(inter)]
    results = pd.concat(frames, ignore_index=True).rename(
        columns={'title': 'Title', 'description': 'Description',
                 'city': 'City', 'url': 'Url'})
    styler = results[cols_of_interest].style
    # Styler.hide_index() was removed in pandas 2.0 in favour of
    # Styler.hide(axis="index"); fall back to the old name on older pandas.
    if hasattr(styler, "hide"):
        styler = styler.hide(axis="index")
    else:
        styler = styler.hide_index()
    # An expression inside an if/else block is not rendered automatically, so
    # the table is shown explicitly. display() is made available by the
    # IPython kernel that runs the notebook.
    display(styler)

No results were found with those characteristics
