## This is a Q&A system using a BERT model
#### Note: Install the below python libraries before running this notebook
#### pip install pandas
#### pip install tensorflow==1.13.0rc1
#### pip install statsmodels
#### pip install scikit-learn
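#### pip install bert-serving-server bert-serving-client
#### Before running the client code below, start the BERT server with a pre-trained model: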
#### Download a pre-trained BERT model and uncompress the zip file into a folder, e.g. cased_L-12_H-768_A-12/
#### Note: You will have to download the BERT model from https://storage.googleapis.com/bert_models/2018_10_18/cased_L-12_H-768_A-12.zip
#### bert-serving-start -model_dir cased_L-12_H-768_A-12 -num_worker=4 -max_seq_len=75

In [1]:
import pandas as pd
from bert_serving.client import BertClient
import numpy as np

In [2]:
# Read all the predefined questions for housing data.
# Later cells read the "Question" and "Answer" columns of this frame.
questions_data = pd.read_csv('AllQuestions.csv')

In [3]:
def encode_questions():
    """Encode every predefined question with BERT and cache the result on disk.

    Reads the "Question" column of the module-level ``questions_data`` frame,
    sends the questions to the bert-serving server through ``BertClient`` and
    writes two files into the current working directory:

    * ``questions.npy``     -- the encodings returned by ``BertClient.encode``
    * ``questions_len.npy`` -- the Euclidean (L2) norm of each encoding row,
      stored so the similarity computation at query time can reuse it.

    Returns:
        None. The results are only persisted through ``np.save``.
    """
    # The client is used as a context manager so that its connection is closed
    # when encoding finishes (or fails) instead of being leaked.
    with BertClient() as bc:
        questions = questions_data["Question"].values.tolist()
        print("Questions count", len(questions))
        print("Start to calculate encoder....")
        questions_encoder = bc.encode(questions)
    np.save("questions", questions_encoder)
    # Row-wise L2 norm, i.e. sqrt(sum(x * x)) along axis 1.
    questions_encoder_len = np.linalg.norm(questions_encoder, axis=1)
    np.save("questions_len", questions_encoder_len)
    print("Encoder ready")

In [4]:
%%time
# Needs the bert-serving server from the setup notes to be running.
# Writes questions.npy and questions_len.npy, which BertAnswer loads later.
encode_questions()

Questions count 90
Start to calculate encoder....
Encoder ready
Wall time: 11.9 s


In [5]:
def getResults(questions, fn):
    """Run ``fn`` on every question and tabulate the outcome for display.

    Args:
        questions: Iterable of question values passed to ``fn`` one by one.
        fn: Callable returning a 3-tuple ``(answer, score, prediction)``.

    Returns:
        A DataFrame with one row per question and the columns
        "Q" (the question), "Prediction", "A" (the answer) and "Score".
    """
    rows = []
    for question in questions:
        matched_answer, similarity, matched_question = fn(question)
        rows.append([question, matched_question, matched_answer, similarity])
    return pd.DataFrame(rows, columns=["Q", "Prediction", "A", "Score"])

In [6]:
class BertAnswer():
    """Answer free-text questions by similarity lookup over BERT encodings.

    The predefined questions and answers come from the module-level
    ``questions_data`` frame; their encodings and row norms are loaded from
    ``questions.npy`` and ``questions_len.npy``.
    """

    # Reply returned when no predefined question is similar enough.
    FALLBACK_ANSWER = "Sorry, I didn't get you."

    def __init__(self, threshold=0.5):
        """Load the cached question encodings and connect to the BERT server.

        Args:
            threshold: Cosine similarity the best match must exceed for its
                answer to be returned. Defaults to 0.5.

        Raises:
            ValueError: If the cached arrays do not have exactly one row per
                entry of ``questions_data`` (for example because the CSV
                changed after the ``.npy`` files were written), which would
                otherwise pair encodings with the wrong answers.
        """
        self.threshold = threshold
        self.q_data = questions_data["Question"].values.tolist()
        self.a_data = questions_data["Answer"].values.tolist()
        self.questions_encoder = np.load("questions.npy")
        self.questions_encoder_len = np.load("questions_len.npy")

        expected_rows = len(self.q_data)
        if (
            self.questions_encoder.shape[0] != expected_rows
            or self.questions_encoder_len.shape[0] != expected_rows
        ):
            raise ValueError(
                "Cached encodings do not match questions_data "
                "({} questions, {} encodings, {} norms); "
                "re-run encode_questions().".format(
                    expected_rows,
                    self.questions_encoder.shape[0],
                    self.questions_encoder_len.shape[0],
                )
            )

        # Connect only after the cache has been validated, so a stale cache
        # fails before a client connection is opened.
        self.bc = BertClient()

    def get(self, q):
        """Return the stored answer whose question is most similar to ``q``.

        Args:
            q: The question text to look up.

        Returns:
            A tuple ``(answer, score, matched_question)``. ``score`` is the
            cosine similarity between the encoding of ``q`` and the closest
            predefined question, and ``matched_question`` is that question's
            text. ``answer`` is the corresponding "Answer" entry, or
            ``FALLBACK_ANSWER`` when ``score`` does not exceed the threshold.
        """
        query_vector = self.bc.encode([q])[0]
        # Cosine similarity: dot(query, question) / (|question| * |query|).
        score = np.sum((query_vector * self.questions_encoder), axis=1) / (
            self.questions_encoder_len * (np.sum(query_vector * query_vector) ** 0.5)
        )
        # Only the single best match is needed, so argmax replaces a full sort.
        # On exact ties the first matching question wins.
        top_id = int(np.argmax(score))
        best_score = score[top_id]
        matched_question = self.q_data[top_id]
        if float(best_score) > self.threshold:
            return self.a_data[top_id], best_score, matched_question
        return self.FALLBACK_ANSWER, best_score, matched_question

# Shared module-level instance. Constructing it creates a BertClient and loads
# questions.npy / questions_len.npy, so those files must already exist.
bm = BertAnswer()

def getBertAnswer(q):
    """Look up question ``q`` with the shared ``bm`` instance.

    Returns whatever ``BertAnswer.get`` returns: a tuple of
    (answer, score, matched question).
    """
    return bm.get(q)

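# Example (pass the list of questions, not the whole DataFrame, which would iterate over column names):
# print(getResults(questions_data["Question"].tolist(), getBertAnswer))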

In [7]:
# Read the housing data from the Redfin Dataset.
# A forward slash is used as the path separator: it works on every OS
# (Windows included), whereas a backslash in a normal string literal formed
# the invalid escape sequence "\A" and only resolved as a path on Windows.
redfin_data = pd.read_csv('CountyData/AllCounties_Data.csv')
# Lower-case the column names so later cells can refer to them consistently.
redfin_data.columns = redfin_data.columns.str.lower()

  exec(code_obj, self.user_global_ns, self.user_ns)


In [8]:
# Check the data types of all the columns to see which ones need fixing
redfin_data.dtypes

unnamed: 0                                                                                      int64
unnamed: 0.1                                                                                    int64
sale type                                                                                      object
sold date                                                                                     float64
property type                                                                                  object
address                                                                                        object
city                                                                                           object
state or province                                                                              object
zip or postal code                                                                             object
price                                                                             

In [9]:
# Fix the data types for the redfin data.
# Every step below can be re-run safely on already converted columns.

# Whole-number columns: missing values become 0 so the cast to int cannot
# fail on NaN.
for column in ['beds', 'price', 'square feet', 'lot size', 'year built', 'days on market']:
    redfin_data[column] = redfin_data[column].fillna(0).astype(int)

# Columns shown as text: drop only a trailing ".0" (e.g. "95123.0" -> "95123").
# The pattern is anchored to the end of the string so that a ".0" occurring
# inside a value (e.g. "2.05") is left intact.
for column in ['zip or postal code', 'baths']:
    redfin_data[column] = (
        redfin_data[column].astype(str).str.replace(r'\.0$', '', regex=True)
    )

# Money columns rendered with two decimals. Converting through pd.to_numeric
# first lets this also accept values that were already formatted as strings
# by a previous run of this cell.
for column in ['$/square feet', 'hoa/month']:
    redfin_data[column] = pd.to_numeric(redfin_data[column]).map('{0:.2f}'.format)

In [10]:
# Select a sample house for testing (always the row labelled 2, not a random draw)
house_info = redfin_data.loc[2]

In [11]:
# Ask a paraphrased question; getBertAnswer returns (answer, score, matched question)
q = 'What is the multi listing number of this house ?'
question_answer = getBertAnswer(q)

In [12]:
# Display the information for the user.
# question_answer is (answer, score, matched question); the answer is used
# as the name of the house_info field to show.
requested_field, _, related_question = question_answer
print('Asked Question: '+q)
print('Related Question: '+related_question)
print('Related Data requested: '+requested_field)
# When no question matched well enough the answer is a fallback sentence, not
# a field name, so guard the lookup instead of raising KeyError.
if requested_field in house_info.index:
    print('Requested Details: ' + str(house_info[requested_field]))
else:
    print('Requested Details: not available for this house')

Asked Question: What is the multi listing number of this house ?
Related Question: What is the mls number of this house ?
Related Data requested: mls#
Requested Details: 21-100533
