In [143]:
import numpy as np
import pandas as pd

import re
from nltk.corpus import stopwords
import nltk
import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel, LsiModel, LdaModel

from nltk.stem import WordNetLemmatizer
stop_words = stopwords.words('english')

In [144]:
df = pd.read_excel('RealEstate.xlsx')
df

Unnamed: 0,web_id,property_text_en
0,7023850,Smart Business Real Estate is proud to present...
1,6905802,My Island Real Estate is proud to present 1 Be...
2,6998469,"New on the market, amazing opportunity for inv..."
3,6766136,1 Bedroom Apartment in Marina Diamond 2\n\n- 1...
4,6940244,No penalty for early contract termination!<br ...
...,...,...
756,6203680,Castles Plaza Real Estate is pleased to bring ...
757,7090736,The Noble House Real Estate is proud to presen...
758,7081426,Better Homes would like to present this type 2...
759,7123539,Morgan‚Äôs International Realty is proud to pr...


#### Inspecting the first 8 entries

In [145]:
# Inspecting the first 8 entries
for entry in df.loc[0:8,'property_text_en']:
    print(entry)
    print()
    print()

Smart Business Real Estate is proud to present2Bedrooms for sale locatedat Dubai MarinaElite Residenceon mid floor over looking court yard.\n\nProperty currently Rentedat AED 110,000 till25 of June 2020.\n\nElite Residenceis ideally locatedat the middle of Dubai Marina area , close to the Tran stationas well Metro stationnear by .surrounding by lots of cafe and restaurants , Marina Walkpromenade , across streetfew5 *Hotels. Dubai Marinais very popularand most desirable place to live.\n\nAt the building available facilities:\nGym\nPool\nSecurity 24/7\nCovered parking .\n\nFor more information or viewing appointment, call:\nSmart Business Real Estate LLC\nLandline:+971 4 5808525 / 0503474100\nEmail: info@sbrealestate.ae\nBRN 11575\nORN 2104


My Island Real Estate is proud to present 1 Bedroom Apt. in Dubai Marina with Partial Sea View\n\n\nDubai Marina\nPrincess Tower\n1 Bedroom Apartment\nPartial Sea View\n922.68 sqft\nUnfurnished Apartment\n\n\n\nFor more details, please call 056-9576

#### Here we notice the text includes a lot of:
- html
- '\n' which means new line
- phone numbers starting with + sign 
    - many formats
    - +xxx xx xxx xxxx
    - +xxx x xxx xxxx
    - +xxxxxxxxxxxx
    - xxxxx (continuous block of numbers)
    - xxxx,xxxx
- special characters
- emails

So we need to preprocess the data and clean it

### Method for getting the relevant pos tag

In [146]:
from nltk.corpus import wordnet
# method used to get the correct pos tag for correct lemmatization since nltk assumes all words are nouns which results in wrong lemmatization without specifying the correct pos
def get_pos(word):
    # WordNet Lemmatizer needs a pos to correctly lemmatize words. 
    # WordNet accepts pos in formats: 'wordnet.NOUN' 'wordnet.VERB' 'wordnet.ADJ' 'wordnet.ADV' for noun, verb, adj, adverb, respectively
    # to map nltk pos tags to wordnet pos tags we get the first letter out of the nltk pos tag and map it to a wordnet tag

    # get the nltk pos tag 
    pos = nltk.pos_tag([word])
    #outputs: [('playing', 'VBG')]

    # get the first letter of the nltk pos tag
    pos = pos[0][1][0]

    pos_tags = {
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "J": wordnet.ADJ,
        "R": wordnet.ADV,
        "default": wordnet.NOUN
    }
    # if the pos tag doesnt match any key, return wordnet.NOUN as a default
    return pos_tags.get(pos, pos_tags['default'])
    


### Method for preprocessing each document

In [147]:

def preprocessing(document):
    document = document.lower()

    #removing the html
    document = re.sub(r'(\<[ ]?[a-z]+>|\<\/[a-z]+\>)', ' ', document)
    # noticed that '<' stayed for some reason, so removing them
    document = re.sub(r'<', ' ', document)
    
    # removing emails
    document = re.sub(r'\S+@\S+', ' ', document)
    # removing \n
    document = re.sub(r'(\\n)', " ", document)
    # removing numbers of this formats:
    # - xxxx,xxxx (numbers with commas)
    document = re.sub(r'(\d+),(\d+),?(\d*)', " ", document)
    # removing numbers of all other formats
    document = re.sub(r'(\+)?\d+[ ]?\d*[ ]?\d*[ ]?\d*', ' ', document)
    # removing characters like ? ! : ; \n \
    document = re.sub('[*.?!,:;/&]', ' ', document)
    # removal of --
    document = re.sub(r'--'," ", document)

    document = " ".join([word for word in document.split() if len(word) > 3 ])
    
    document = " ".join([word for word in document.split() if word not in stop_words])
    
    document = " ".join([WordNetLemmatizer().lemmatize(word, get_pos(word)) for word in document.split()])
    
    return document

In [148]:
# example of lemmatization
word = WordNetLemmatizer().lemmatize('being', pos=wordnet.VERB)
word

'be'

In [149]:
# apply the method on the documents column
df['property_text_en'] = df['property_text_en'].apply(preprocessing)


Lets see how the text changed after preprocessing

In [150]:
for document in df.loc[0:8,'property_text_en']:
    print(document)
    print()
    print()

smart business real estate proud present bedroom sale locatedat dubai marinaelite residenceon floor look court yard property currently rentedat till june elite residenceis ideally locatedat middle dubai marina area close tran stationas well metro stationnear surround lot cafe restaurant marina walkpromenade across streetfew hotel dubai marinais popularand desirable place live building available facility pool security cover parking information view appointment call smart business real estate landline email


island real estate proud present bedroom dubai marina partial view dubai marina princess tower bedroom apartment partial view sqft unfurnished apartment detail please call email marina dubai marina awe-inspiring city-within-a city delight resident cosmopolitan free-spirited atmosphere unique invigorate lifestyle urban center water comparable exclusive waterfront development world lead city unlike anywhere else dubai middle east even world island real estate island real estate indepe

In [152]:
text = df['property_text_en'].tolist()
text_tokenized = [nltk.word_tokenize(doc) for doc in text]

In [177]:
# preparing inputs for the LSI model
dictionary = corpora.Dictionary(text_tokenized)

In [179]:
corpus = [dictionary.doc2bow(doc) for doc in text_tokenized]
# document-term frequency for first 2 documents
for doc in corpus[0:2]:
    print(doc,'\n')
# (0,1) -> word-id 0 occurs 1 time in the first document

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 2), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 3), (15, 1), (16, 1), (17, 2), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 2), (27, 1), (28, 1), (29, 2), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 2), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 2), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 1)] 

[(4, 2), (6, 1), (8, 1), (14, 5), (16, 1), (17, 5), (29, 4), (33, 1), (38, 1), (40, 1), (41, 5), (55, 2), (57, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 3), (65, 1), (66, 1), (67, 1), (68, 2), (69, 1), (70, 1), (71, 1), (72, 1), (73, 1), (74, 1), (75, 1), (76, 1), (77, 1), (78, 1), (79, 1), (80, 1), (81, 1), (82, 1), (83, 1), (84, 1), (85, 1), (86, 1), (87, 1), (88, 1), (89, 3), (90, 1), (91, 1), (92, 1), (93, 1), (94, 2), (95, 1), (96, 1), (97, 1), (98, 1), (

### Initializing the LSI model

In [216]:
lsimodel = LsiModel(corpus=corpus, num_topics=10, id2word=dictionary)

### Inspecting the U matrix

In [190]:
u_df = pd.DataFrame(data= lsimodel.projection.u)
# sort by column 0
u_df.sort_values(axis= 0, by= 0, ascending= False)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
14,0.395613,-0.306558,0.179144,-0.164529,0.092363,0.355959,0.201438,0.236546,-0.142905,0.084133
29,0.383955,-0.362382,0.303764,-0.151923,-0.166626,-0.170040,-0.109949,0.024479,0.022700,0.123919
55,0.232085,0.187964,-0.000206,-0.023951,-0.031610,-0.228445,0.211557,-0.221252,-0.127181,-0.133902
64,0.217057,-0.077603,0.064340,-0.012146,-0.143119,-0.010257,0.163825,-0.481375,0.184774,-0.002238
106,0.212777,-0.200079,-0.507520,-0.156112,-0.025199,-0.146552,0.013174,0.135836,0.102806,-0.083934
...,...,...,...,...,...,...,...,...,...,...
2528,0.000030,0.000235,0.000046,-0.000151,0.000024,-0.000193,0.000205,-0.000022,-0.000463,-0.000368
2529,0.000030,0.000235,0.000046,-0.000151,0.000024,-0.000193,0.000205,-0.000022,-0.000463,-0.000368
2530,0.000030,0.000235,0.000046,-0.000151,0.000024,-0.000193,0.000205,-0.000022,-0.000463,-0.000368
2531,0.000030,0.000235,0.000046,-0.000151,0.000024,-0.000193,0.000205,-0.000022,-0.000463,-0.000368


We can see here that the words with ids 14,29,55 relate most to topic 0

In [194]:
ids = [14,29,55]
for id in ids:
    print(f'word with id {id} is: {dictionary[id]}')

word with id 14 is: dubai
word with id 29 is: marina
word with id 55 is: view


Which does make sense, since those words appear a lot in the corpus

### Inspecting the S matrix

In [195]:
s_df = pd.DataFrame(data= lsimodel.projection.s)
s_df

Unnamed: 0,0
0,199.764581
1,72.147969
2,67.471718
3,62.997631
4,55.20946
5,48.759355
6,45.512727
7,43.436362
8,40.246567
9,38.177323


Since we have specified the number of topics as 10, we got 10 singular values.

This matrix ranks by importance of topic, so topic 0 is the most important and topic 9 is the least important

### Inspecting the V matrix

In [213]:
# V = gensim.matutils.corpus2dense(lsimodel[corpus], len(lsimodel.projection.s)).T / lsimodel.projection.s
# v_df = pd.DataFrame(data = V)
# v_df.sort_values(by=8, ascending=False)

### Most important topics in the corpus

In [217]:
lsimodel.show_topics(num_topics=10)

[(0,
  '0.396*"dubai" + 0.384*"marina" + 0.232*"view" + 0.217*"apartment" + 0.213*"tower" + 0.192*"bedroom" + 0.178*"room" + 0.168*"area" + 0.162*"property" + 0.158*"floor"'),
 (1,
  '-0.362*"marina" + -0.307*"dubai" + 0.256*"bedroom" + 0.253*"property" + -0.200*"tower" + 0.198*"room" + 0.188*"view" + 0.156*"study" + 0.138*"bathroom" + 0.138*"garden"'),
 (2,
  '0.508*"tower" + 0.334*"princess" + -0.304*"marina" + 0.294*"floor" + -0.179*"dubai" + 0.154*"residential" + -0.147*"walk" + 0.146*"tallest" + -0.136*"property" + 0.129*"world"'),
 (3,
  '0.429*"elite" + 0.421*"residence" + -0.235*"estate" + -0.234*"real" + 0.213*"room" + -0.200*"property" + -0.165*"dubai" + -0.156*"tower" + -0.152*"marina" + 0.146*"pool"'),
 (4,
  '0.392*"real" + 0.390*"estate" + 0.320*"elite" + 0.303*"residence" + -0.170*"walk" + -0.167*"marina" + 0.162*"service" + 0.158*"property" + -0.156*"room" + -0.143*"apartment"'),
 (5,
  '-0.372*"room" + -0.356*"dubai" + 0.228*"view" + 0.205*"walk" + 0.170*"marina" + 0.1

In [252]:
lsitopics = [[word for word, prob in topic]
             for topicid, topic in lsimodel.show_topics(formatted=False)]

coherence_model_lsi = CoherenceModel( model=lsimodel,texts=text_tokenized, dictionary=dictionary, coherence='c_v', topics = lsitopics )
coherence_lsi = coherence_model_lsi.get_coherence()
print('\nCoherence Score: ', coherence_lsi)



Coherence Score:  0.43105337640086844


The higher the coherence score, the better.
We got a coherence score of 0.43 , which can be further improved


### Querying the lsi model to get relevant documents

In [239]:
# performing queries
# https://radimrehurek.com/gensim//auto_examples/core/run_similarity_queries.html#sphx-glr-auto-examples-core-run-similarity-queries-py

query = 'spacious bedroom with sea view'
vec_bow = dictionary.doc2bow(query.lower().split())

# convert the query to LSI space
vec_lsi = lsimodel[vec_bow]  
# This shows how the query relates to the 10 topics
vec_lsi

[(0, 0.4797129720563416),
 (1, 0.5115215048363576),
 (2, -0.011951011362102469),
 (3, -0.039910471585753046),
 (4, -0.23176682432645468),
 (5, 0.3186853954058226),
 (6, 0.36022834036304147),
 (7, -0.2744119081399116),
 (8, -0.08750029836058071),
 (9, 0.1635112213392335)]

In [240]:
from gensim import similarities
index = similarities.MatrixSimilarity(lsimodel[corpus]) # transform corpus to LSI space and index it
# index.save('/tmp/deerwester.index')
# index = similarities.MatrixSimilarity.load('/tmp/deerwester.index')


In [241]:
sims = index[vec_lsi]  # perform a similarity query against the corpus

sims = sorted(enumerate(sims), key=lambda item: -item[1])
print("Similarity of query:",query,"\n")
for i, s in enumerate(sims):
    print('doc number:',s[0],'   has similarity of',s[1])

Similarity of query: spacious bedroom with sea view 

doc number: 141    has similarity of 0.97095
doc number: 134    has similarity of 0.9362675
doc number: 148    has similarity of 0.9288845
doc number: 216    has similarity of 0.8921453
doc number: 23    has similarity of 0.8730042
doc number: 39    has similarity of 0.8586298
doc number: 289    has similarity of 0.85724294
doc number: 241    has similarity of 0.8534188
doc number: 207    has similarity of 0.84829426
doc number: 307    has similarity of 0.84438956
doc number: 457    has similarity of 0.8438725
doc number: 106    has similarity of 0.83653474
doc number: 109    has similarity of 0.83057046
doc number: 330    has similarity of 0.8287505
doc number: 747    has similarity of 0.8244047
doc number: 384    has similarity of 0.81013954
doc number: 629    has similarity of 0.8086602
doc number: 42    has similarity of 0.8052194
doc number: 156    has similarity of 0.8049013
doc number: 242    has similarity of 0.80347186
doc 