# Property appraisal ML project.
## Phase 2: NLP processing the 'Public Remarks' Section

In [1]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt

from scipy.spatial import distance
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances

In [2]:
#reading one file
df = pd.read_csv('F20 P1.csv', index_col=None, header=0)

# reading part of the files from Detached June 2020-2021 folder

# data_files = ['F20 P1.csv', 'F20 P2.csv', 'F30 P1.csv', 'F30 P2.csv', 'F50 P1.csv', 'F50 P2.csv']
# data_list = []

# for filename in data_files:
#     df_current = pd.read_csv(filename, index_col=None, header=0)
#     data_list.append(df_current)

# df = pd.concat(data_list, axis=0, ignore_index=True)

# # deleting yellow columns
# df.drop(['Status', 'For Tax Year', 'Gross Taxes', 'Original Price', 'List Price', 'GST Incl'], axis = 1, inplace = True)

# # size of our dataset
# print('Our dataset has', len(df), 'data lines and', len(df.columns.tolist()), 'features:')
# print('\n')
# print(df.columns.tolist())

In [3]:
# dropping the columns with more than 90% NAs
moreThan = []

for feature in df:
    if df[feature].isna().sum() / df.shape[0] > 0.9:
        moreThan.append(feature)
        print("Dropping the feature:", feature)
df.drop(moreThan, axis = 1, inplace = True)

if moreThan == []:
    print('No features dropped.')
print('\n')

# dropping the columns that are not insightful: Days On Market, Public Remarks
# df1.drop(['Sold Date', 'Public Remarks'], axis=1, inplace = True)
# df.drop(['Public Remarks'], axis=1, inplace = True)

columns_names = df.columns.tolist()

print("Features left:")
print(columns_names)
print('\n')
print("Now we have", len(columns_names), "features and their types:")

# types of our columns
pd.DataFrame(df.dtypes, columns=['DataTypes'])

No features dropped.


Features left:
['Status', 'Address', 'S/A', 'Price', 'For Tax Year', 'Gross Taxes', 'Sold Date', 'Days On Market', 'Age', 'Area', 'Total Bedrooms', 'Total Baths', 'Lot Sz (Sq.Ft.)', 'Floor Area -Grand Total', 'Original Price', 'List Price', 'Driveway Finish', 'Floor Area - Unfinished', 'GST Incl', 'Foundation', 'Floor Area Fin - Basement', 'Zoning', 'Parking Places - Covered', '# Rms', 'No. Floor Levels', 'Frontage - Feet', 'Depth', 'Type', 'Public Remarks']


Now we have 29 features and their types:


Unnamed: 0,DataTypes
Status,object
Address,object
S/A,object
Price,object
For Tax Year,int64
Gross Taxes,object
Sold Date,object
Days On Market,int64
Age,int64
Area,object


In [4]:
# hereafter we're working only with the "Public Remarks" column

In [5]:
nlp_column = df['Public Remarks'].copy()

nlp_column

0      Investor's alert. 3 bedroom tenanted home with...
1      WHY RENT? Apartment size, 1 bedroom, modern, e...
2      INVESTORS and FIRST TIME HOME BUYERS ALERT! 2 ...
3      **LARGE 8255 sqft LOT****PERFECT FOR INVESTORS...
4      Tastefully renovated 2 bed 1bath house with de...
                             ...                        
553    6,500 SF of executive living. Exquisitely buil...
554    2.07 Acre Site Great Development Potential Lan...
555    Magnificently New Luxury home by SOOD DEVELOPM...
556    LOCATION, LOCATION!! Hobby Farm in South Pt. K...
557    Location! Location! Location! Port Kells futur...
Name: Public Remarks, Length: 558, dtype: object

In [6]:
# a bit of cleaning: filling with NaN's where not available, changing some words

nlp_column = nlp_column.fillna('NaN')
nlp_column = nlp_column.str.replace('&','and')

# replace all the digits with corresponding words: 5 -> five
import re
import num2words

nlp_column_prep_1 = [re.sub(r"(\d+)", lambda x: num2words.num2words(int(x.group(0))), paragraph) for paragraph in nlp_column]

# tokenize these comments
import nltk
from nltk.tokenize import word_tokenize
tokenizer = nltk.tokenize.WordPunctTokenizer()
preprocess = lambda text: ' '.join(tokenizer.tokenize(text.lower()))

nlp_column_prep_2 = [preprocess(paragraph) for paragraph in nlp_column_prep_1]

# the preprocessed output
nlp_column_prep_2[:2]

["investor ' s alert . three bedroom tenanted home with lots of potential for holding and rebuilding . this home sits on two lots with a bright south facing front yard . each lot is two thousand , five hundred sqft and adds up to a total of five thousand sqft . the back yard is fenced and quiet . perfect for holding and building one or two homes later . enquire at the city about your building options . location is very convenient : close to school , community centre and scott road skytrain station . short drive to get to patullo bridge and king george highway and south perimeter road . quiet home at the end of a no - thru road . home is tenanted , please drive by first before scheduling a showing .",
 "why rent ? apartment size , one bedroom , modern , energy efficient home with an energy rating , on a fifty ' x one hundred ' lot and fully fenced backyard . the house has been one hundred % professionally renovated including foundations within the last ten years . located on a no - thru

# Gensim Doc2Vec model

Now we build a Doc2Vec model which is one of the best NLP tools that gives opportunity to get the similarities between texts (exactly between paragraphs!).

In [7]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

tagged_data = [TaggedDocument(words=word_tokenize(paragraph), tags=[i]) for i, paragraph in enumerate(nlp_column_prep_2)]

vect_len = 50

model_d2v = Doc2Vec(vector_size=vect_len,alpha=0.025, min_count=1)
  
model_d2v.build_vocab(tagged_data)

for epoch in range(vect_len):
    model_d2v.train(tagged_data,
                total_examples=model_d2v.corpus_count,
                epochs=model_d2v.epochs)

In [8]:
paragraph_embeddings=np.zeros((np.shape(nlp_column_prep_2)[0],vect_len))

for i in range(len(paragraph_embeddings)):
    paragraph_embeddings[i]=model_d2v.dv[i]

In [9]:
# let's define a function to get the say 6 most similar paragraphs to the given one

def most_similar(doc_id, similarity_matrix, matrix):
    print (f'Similar Documents using {matrix}:')
    if matrix=='Cosine Similarity':
        similar_ix=np.argsort(similarity_matrix[doc_id])[::-1][:4]
    elif matrix=='Euclidean Distance':
        similar_ix=np.argsort(similarity_matrix[doc_id])[:4]
    for ix in similar_ix:
        if ix==doc_id:
            continue
        print('\n')
        print ({nlp_column[ix]})
        print (f'{matrix} Score : {similarity_matrix[doc_id][ix]}')
    print('\n')
    print('Similar paragraph indexes:', similar_ix[1:])
    print('\n')

In [10]:
pairwise_similarities = cosine_similarity(paragraph_embeddings)
pairwise_differences = euclidean_distances(paragraph_embeddings)

In [16]:
# idx = np.random.randint(len(nlp_column))
idx = 156
print("We are interested in this paragraph with index:", idx)
print(nlp_column[idx])
print('\n')

most_similar(idx, pairwise_similarities, 'Cosine Similarity')
most_similar(idx, pairwise_differences, 'Euclidean Distance')

We are interested in this paragraph with index: 156
INVESTOR and/or FIRST TIME BUYER SPECIAL!!! Well maintained basement entry 6 bdrm + 3 bath, over 2000 sq.ft built updated home on 8550 sq.ft (50'x 171') large lot in Royal Heights. Open 8 car/RV parking. Main floor offers; living rm, dining rm, 3 Bdrms + 2 Bath. Basement has 2 bdrm rental mortgage helper suite + extra bdrm for upstairs use. Steps to Royal Heights Daycare and Preschool, Elementary and L.A. Matheson Secondary School. Beautiful private big fenced backyard with HUGE Covered Sundeck , which is great for BBQ parties. Walk to park, Bus, shopping and 5 min drive to Scott road Skytrain station. Easy access to all major routes. Quick Possession Ok. Act Fast !!


Similar Documents using Cosine Similarity:


{"LOCATION LOCATION LOCATION 7 BED 5 BATH over 3600 sqft built house with Double Garage on  70 x 121 over 8400 sqft rectangular lot in beautiful Cedar Hills .4 bed 2.5 bath upstairs and 2 suites( 2+1) down . Updated kitchens 

# Insights

What here we do: take all the paragraphs, train a machine-learned NLP model which is able to represent paragraphs in terms of (in this case) 50-dimensional vector which is in fact giving the context of the paragraph. The model is called Doc2Vec, it is being trained on our Public Remarks and it is expected that it is able to catch similarity between texts (in fact one of the main tools used to check similarity between texts).
One big advantage of Doc2Vec model is that it is not very sensitive to bad non-correct words, noisy and not cleaned data. 

Similarity scores presented above are not very representative as this will be used jointly with the similarity ranking based on features of properties.