## This is a Q&A system built with the Levenshtein Python module

### Note: Install the Python libraries below before running this notebook
### pip install pandas
### pip install python-Levenshtein

In [1]:
import pandas as pd
from Levenshtein import ratio

In [2]:
# Read the housing data from the Redfin Dataset.
# The relative path uses a forward slash so it resolves on every OS; a backslash
# is only a separator on Windows, and '\A' is an invalid string escape sequence.
redfin_data = pd.read_csv('CountyData/AllCounties_Data.csv')
# Lower-case every column header so later cells can refer to columns in one consistent spelling.
redfin_data.columns = redfin_data.columns.str.lower()

  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
# Peek at the first two rows of the freshly loaded listings
redfin_data.head(n=2)

Unnamed: 0,unnamed: 0,unnamed: 0.1,sale type,sold date,property type,address,city,state or province,zip or postal code,price,...,status,next open house start time,next open house end time,url (see http://www.redfin.com/buy-a-home/comparative-market-analysis for info on pricing),source,mls#,favorite,interested,latitude,longitude
0,0,0,MLS Listing,,Single Family Residential,25205 Oak St,Lomita,CA,90717.0,999000.0,...,Pre On-Market,November-6-2021 12:00 AM,November-6-2021 11:59 PM,http://www.redfin.com/CA/Lomita/25205-Oak-St-9...,CRMLS,OC21238572,N,Y,33.795732,-118.317474
1,1,1,MLS Listing,,Single Family Residential,203 E Camino Real,Monrovia,CA,91016.0,899000.0,...,Active,November-6-2021 01:00 PM,November-6-2021 04:00 PM,http://www.redfin.com/CA/Monrovia/203-E-Camino...,CRMLS,PF21238553,N,Y,34.12203,-118.001185


In [4]:
# Check the data types of all the columns.
# The result shows which columns pandas parsed as float64/object and therefore
# need the clean-up applied in the next cell.
redfin_data.dtypes

unnamed: 0                                                                                      int64
unnamed: 0.1                                                                                    int64
sale type                                                                                      object
sold date                                                                                     float64
property type                                                                                  object
address                                                                                        object
city                                                                                           object
state or province                                                                              object
zip or postal code                                                                             object
price                                                                             

In [5]:
# Fix the data types for the redfin data

# Zip codes and bath counts are kept as text (missing values become the string "nan").
# Only a *trailing* ".0" is removed: a plain literal replace would also strip a ".0"
# found in the middle of a value (e.g. "1.05" -> "15").
for text_col in ('zip or postal code', 'baths'):
    redfin_data[text_col] = (
        redfin_data[text_col].astype(str).str.replace(r"\.0$", "", regex=True)
    )

# Whole-number columns: missing values are replaced by 0 first, because NaN
# cannot be cast to int.
for int_col in ('beds', 'price', 'square feet', 'lot size', 'year built', 'days on market'):
    redfin_data[int_col] = redfin_data[int_col].fillna(0).astype(int)

# These two columns are rendered as strings with two decimal places.
# NOTE: the format call expects numeric input, so re-running this cell without
# reloading the CSV raises on the already-formatted strings.
for money_col in ('$/square feet', 'hoa/month'):
    redfin_data[money_col] = redfin_data[money_col].map('{0:.2f}'.format)

In [6]:
# Show the first two rows again, now with the converted column types
redfin_data.head(n=2)

Unnamed: 0,unnamed: 0,unnamed: 0.1,sale type,sold date,property type,address,city,state or province,zip or postal code,price,...,status,next open house start time,next open house end time,url (see http://www.redfin.com/buy-a-home/comparative-market-analysis for info on pricing),source,mls#,favorite,interested,latitude,longitude
0,0,0,MLS Listing,,Single Family Residential,25205 Oak St,Lomita,CA,90717,999000,...,Pre On-Market,November-6-2021 12:00 AM,November-6-2021 11:59 PM,http://www.redfin.com/CA/Lomita/25205-Oak-St-9...,CRMLS,OC21238572,N,Y,33.795732,-118.317474
1,1,1,MLS Listing,,Single Family Residential,203 E Camino Real,Monrovia,CA,91016,899000,...,Active,November-6-2021 01:00 PM,November-6-2021 04:00 PM,http://www.redfin.com/CA/Monrovia/203-E-Camino...,CRMLS,PF21238553,N,Y,34.12203,-118.001185


In [7]:
# Select one house for testing.
# Note: this is not random — it always picks the row whose index label is 2.
house_info = redfin_data.loc[2]

In [8]:
# Read all the predefined questions for the housing data.
# The preview cell below shows a "Question" column and an "Answer" column that
# holds a redfin_data column name.
questions_data = pd.read_csv('AllQuestions.csv')

In [9]:
# Peek at the first two predefined question/answer pairs
questions_data.head(n=2)

Unnamed: 0,Question,Answer
0,What is the sale type of this property ?,sale type
1,What is the sold date of this property ?,sold date


In [10]:
# Define a sample question a user might ask.
# It is later passed to getResults() to be matched against the predefined questions.
test_data = 'What is the multi listing number of this house ?'

In [11]:
# Method to check a question's levenshtein ratio with other questions
def getResults(question):
    """Return the position of the predefined question most similar to `question`.

    Every entry in the "Question" column of the module-level `questions_data`
    frame is compared with `question` using Levenshtein `ratio`. The best score
    is printed, and the 0-based position of the first best-scoring row is
    returned, suitable for `questions_data.iloc[...]`.

    Args:
        question: the free-text question to match.

    Returns:
        int: positional index of the closest predefined question.

    Raises:
        ValueError: if `questions_data` contains no rows.
    """
    # Iterate the column directly; iterrows() would build a full Series per row
    # only to read a single field from it.
    scores = [ratio(known_question, question) for known_question in questions_data["Question"]]
    if not scores:
        raise ValueError("questions_data contains no questions to compare against")
    max_score = max(scores)
    # list.index returns the first occurrence, so ties resolve to the earliest row.
    max_index = scores.index(max_score)
    print('Levenshtein Ratio: ' + str(max_score))
    return max_index

In [12]:
# Find the closest predefined question, then read which data column it maps to
index = getResults(test_data)
column_name = questions_data.iloc[index]["Answer"]

Levenshtein Ratio: 0.8837209302325582


In [13]:
# Report the match and the requested value back to the user
matched_row = questions_data.iloc[index]
requested_value = str(house_info[column_name])
print('Asked Question: ' + test_data)
print('Related Question: ' + matched_row.Question)
print('Related Data requested: ' + matched_row.Answer)
print('Requested Details: ' + requested_value)

Asked Question: What is the multi listing number of this house ?
Related Question: What is the mls number of this house ?
Related Data requested: mls#
Requested Details: 21-100533
