In [23]:
import numpy as np
import pandas as pd

import re
from nltk.corpus import stopwords
import nltk
import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel, LsiModel, LdaModel
import pyLDAvis.gensim
from nltk.stem import WordNetLemmatizer
# English stop-word list from NLTK's stopwords corpus; preprocessing() uses it to drop stop words.
stop_words = stopwords.words('english')

In [4]:
# Read the property listings workbook into a DataFrame and display it.
excel_path = 'RealEstate.xlsx'
df = pd.read_excel(excel_path)
df

Unnamed: 0,web_id,property_text_en
0,7023850,Smart Business Real Estate is proud to present...
1,6905802,My Island Real Estate is proud to present 1 Be...
2,6998469,"New on the market, amazing opportunity for inv..."
3,6766136,1 Bedroom Apartment in Marina Diamond 2\n\n- 1...
4,6940244,No penalty for early contract termination!<br ...
...,...,...
756,6203680,Castles Plaza Real Estate is pleased to bring ...
757,7090736,The Noble House Real Estate is proud to presen...
758,7081426,Better Homes would like to present this type 2...
759,7123539,Morgan‚Äôs International Realty is proud to pr...


#### Inspecting the first 8 entries

In [5]:
# Inspecting the first 8 entries
# Positional slice so exactly 8 rows are printed; a `.loc[0:8]` label slice is
# end-inclusive and would print row 8 as well (9 rows).
for entry in df['property_text_en'].iloc[:8]:
    # two blank lines after each entry to keep the long texts visually apart
    print(entry, end="\n\n\n")

Smart Business Real Estate is proud to present2Bedrooms for sale locatedat Dubai MarinaElite Residenceon mid floor over looking court yard.\n\nProperty currently Rentedat AED 110,000 till25 of June 2020.\n\nElite Residenceis ideally locatedat the middle of Dubai Marina area , close to the Tran stationas well Metro stationnear by .surrounding by lots of cafe and restaurants , Marina Walkpromenade , across streetfew5 *Hotels. Dubai Marinais very popularand most desirable place to live.\n\nAt the building available facilities:\nGym\nPool\nSecurity 24/7\nCovered parking .\n\nFor more information or viewing appointment, call:\nSmart Business Real Estate LLC\nLandline:+971 4 5808525 / 0503474100\nEmail: info@sbrealestate.ae\nBRN 11575\nORN 2104


My Island Real Estate is proud to present 1 Bedroom Apt. in Dubai Marina with Partial Sea View\n\n\nDubai Marina\nPrincess Tower\n1 Bedroom Apartment\nPartial Sea View\n922.68 sqft\nUnfurnished Apartment\n\n\n\nFor more details, please call 056-9576

#### Here we notice the text includes a lot of:
- html
- '\n' which means new line
- phone numbers starting with + sign 
    - many formats
    - +xxx xx xxx xxxx
    - +xxx x xxx xxxx
    - +xxxxxxxxxxxx
    - xxxxx (continuous block of numbers)
    - xxxx,xxxx
- special characters
- emails

So we need to preprocess the data and clean it

Method for preprocessing each document

In [36]:

def preprocessing(document):
    r"""Clean one raw property description and return it as a space-joined token string.

    Steps, in order: lowercase; strip HTML tags, e-mail addresses, literal
    backslash-n sequences and numbers (comma-grouped amounts, phone numbers,
    any other digit run); replace a fixed set of punctuation characters with
    spaces; drop tokens of three characters or fewer and English stop words;
    lemmatize the remaining tokens with WordNet.

    Parameters
    ----------
    document : str
        Raw listing text. Any non-string value (for example a missing cell
        read from the spreadsheet) is treated as an empty document.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; ``""`` if nothing survives.
    """
    if not isinstance(document, str):
        # a missing cell would otherwise fail on .lower()
        return ""
    document = document.lower()

    # Remove HTML tags. The pattern also covers self-closing tags (<br />) and
    # tags carrying attributes, which a bare <tag> / </tag> pattern misses.
    document = re.sub(r'<\s*/?\s*[a-z][^<>]*>', ' ', document)
    # Any '<' still present is a stray or unterminated tag opener.
    document = re.sub(r'<', ' ', document)
    # Remove e-mail addresses (any whitespace-free run containing '@').
    document = re.sub(r'\S+@\S+', ' ', document)
    # Remove literal backslash-n sequences; real newlines are handled by split() below.
    document = re.sub(r'(\\n)', " ", document)
    # Remove comma-grouped numbers such as 110,000 or 1,250,000.
    document = re.sub(r'(\d+),(\d+),?(\d*)', " ", document)
    # Remove every other number, including phone numbers with an optional
    # leading '+' and up to four space-separated digit groups.
    document = re.sub(r'(\+)?\d+[ ]?\d*[ ]?\d*[ ]?\d*', ' ', document)
    # Replace the punctuation characters * . ? ! , : ; / & with spaces.
    document = re.sub('[*.?!,:;/&]', ' ', document)
    # Replace double dashes with a space.
    document = re.sub(r'--', " ", document)

    # Build the lemmatizer and the stop-word set once per document rather than
    # once per word, and filter + lemmatize in a single pass over the tokens.
    lemmatizer = WordNetLemmatizer()
    stop_set = set(stop_words)
    tokens = [
        lemmatizer.lemmatize(word)
        for word in document.split()
        if len(word) > 3 and word not in stop_set
    ]
    return " ".join(tokens)

In [7]:
#df['property_text_en'][0]

In [37]:
# Apply the cleaning function to every document. The column is overwritten in
# place, so re-run the data-loading cell first if the raw text is needed again.
df['property_text_en'] = df['property_text_en'].apply(preprocessing)

Let's see how the text changed after preprocessing

In [38]:
# Print the cleaned documents with index labels 0 through 8, each followed by two blank lines
for document in df.loc[0:8, 'property_text_en']:
    print(document, end="\n\n\n")

smart business real estate proud present bedroom sale locatedat dubai marinaelite residenceon floor looking court yard property currently rentedat till june elite residenceis ideally locatedat middle dubai marina area close tran stationas well metro stationnear surrounding cafe restaurant marina walkpromenade across streetfew hotel dubai marinais popularand desirable place live building available facility pool security covered parking information viewing appointment call smart business real estate landline email


island real estate proud present bedroom dubai marina partial view dubai marina princess tower bedroom apartment partial view sqft unfurnished apartment detail please call email marina dubai marina awe-inspiring city-within-a city delight resident cosmopolitan free-spirited atmosphere unique invigorating lifestyle urban center water comparable exclusive waterfront development world leading city unlike anywhere else dubai middle east even world island real estate island real e

In [39]:
#re.sub(r'[\.\?\!\,\:\;\\\\/"]', '', df['property_text_en'][0])

In [40]:
df.loc[:,'property_text_en']

0      smart business real estate proud present bedro...
1      island real estate proud present bedroom dubai...
2      market amazing opportunity investor apartment ...
3      bedroom apartment marina diamond bedroom apart...
4      penalty early contract termination price varie...
                             ...                        
756    castle plaza real estate pleased bring market ...
757    noble house real estate proud present full flo...
758    better home would like present type three bedr...
759    morgan‚äôs international realty proud present ...
760    bedroom reem arabian ranch directly opposite p...
Name: property_text_en, Length: 761, dtype: object

In [41]:
# Tokenize every cleaned document; `text` ends up as one list of word tokens per document
text = [nltk.word_tokenize(doc) for doc in df['property_text_en'].tolist()]

In [42]:
# preparing inputs for the LSI model:
# build the gensim Dictionary (token <-> integer id mapping) from the tokenized documents
dictionary = corpora.Dictionary(text)

In [43]:
# Convert each tokenized document to its bag-of-words representation
corpus = list(map(dictionary.doc2bow, text))

In [44]:
# Fit an LSI model with 10 topics on the bag-of-words corpus; `dictionary` maps token ids back to words
lsimodel = LsiModel(corpus=corpus, num_topics=10, id2word=dictionary)

In [45]:
# Display the 10 LSI topics as weighted-term strings
lsimodel.show_topics(num_topics=10)

[(0,
  '0.406*"dubai" + 0.393*"marina" + 0.222*"apartment" + 0.212*"tower" + 0.195*"bedroom" + 0.182*"room" + 0.178*"view" + 0.172*"area" + 0.165*"property" + 0.157*"floor"'),
 (1,
  '-0.363*"marina" + -0.300*"dubai" + 0.268*"bedroom" + 0.258*"property" + 0.213*"room" + -0.168*"tower" + 0.161*"study" + 0.141*"bathroom" + 0.141*"garden" + 0.138*"villa"'),
 (2,
  '0.505*"tower" + 0.342*"princess" + 0.301*"floor" + -0.289*"marina" + -0.174*"dubai" + 0.159*"residential" + 0.152*"tallest" + -0.151*"property" + 0.130*"world" + -0.126*"residence"'),
 (3,
  '0.432*"elite" + 0.425*"residence" + -0.232*"estate" + -0.232*"real" + 0.212*"room" + -0.196*"property" + -0.173*"dubai" + -0.161*"marina" + -0.147*"tower" + 0.145*"pool"'),
 (4,
  '0.405*"real" + 0.403*"estate" + 0.314*"elite" + 0.298*"residence" + -0.175*"marina" + 0.165*"service" + -0.161*"room" + 0.154*"property" + -0.150*"apartment" + -0.127*"bedroom"'),
 (5,
  '-0.380*"room" + -0.347*"dubai" + 0.196*"marina" + 0.168*"walk" + 0.161*"to

Let's try it with bigrams instead

In [17]:
# Build the bigram model: token pairs that co-occur often enough are merged
# into a single token.
bigram = gensim.models.Phrases(text, min_count=5, threshold=100)  # higher threshold, fewer phrases
bigram_mod = gensim.models.phrases.Phraser(bigram)


def make_bigrams(texts):
    """Return `texts` with detected bigrams merged, one token list per document."""
    return [bigram_mod[doc] for doc in texts]


bigramed_words = make_bigrams(text)

# The dictionary and corpus must be built from the bigrammed documents.
# Building them from the unigram `text` / `dictionary` leaves `bigramed_words`
# unused and simply reproduces the unigram model.
bigram_dictionary = corpora.Dictionary(bigramed_words)
bigram_corpus = [bigram_dictionary.doc2bow(doc) for doc in bigramed_words]

# Use a separate name so the unigram `lsimodel` (which other cells pair with
# the unigram `dictionary`) is not overwritten by a model with a different vocabulary.
bigram_lsimodel = LsiModel(corpus=bigram_corpus, num_topics=10, id2word=bigram_dictionary)
bigram_lsimodel.print_topics(num_topics=10)

[(0,
  '0.405*"dubai" + 0.393*"marina" + 0.221*"apartment" + 0.212*"tower" + 0.195*"bedroom" + 0.182*"room" + 0.178*"view" + 0.172*"area" + 0.165*"property" + 0.157*"floor"'),
 (1,
  '-0.363*"marina" + -0.300*"dubai" + 0.269*"bedroom" + 0.257*"property" + 0.213*"room" + -0.167*"tower" + 0.161*"study" + 0.141*"bathroom" + 0.141*"garden" + 0.138*"villa"'),
 (2,
  '-0.504*"tower" + -0.341*"princess" + -0.301*"floor" + 0.289*"marina" + 0.175*"dubai" + -0.159*"residential" + -0.152*"tallest" + 0.152*"property" + -0.130*"world" + 0.125*"walk"'),
 (3,
  '0.432*"elite" + 0.424*"residence" + -0.233*"estate" + -0.232*"real" + 0.211*"room" + -0.196*"property" + -0.171*"dubai" + -0.160*"marina" + -0.149*"tower" + 0.145*"pool"'),
 (4,
  '0.404*"real" + 0.402*"estate" + 0.315*"elite" + 0.299*"residence" + -0.173*"marina" + 0.165*"service" + -0.162*"room" + 0.154*"property" + -0.150*"apartment" + -0.127*"bedroom"'),
 (5,
  '0.376*"room" + 0.349*"dubai" + -0.199*"marina" + -0.168*"walk" + -0.159*"towe

In [46]:
# Score the LSI topics with the c_v coherence measure, using the tokenized
# documents and the dictionary as reference.
coherence_model_lsi = CoherenceModel(
    model=lsimodel,
    texts=text,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lsi = coherence_model_lsi.get_coherence()
print('\nCoherence Score: ', coherence_lsi)



Coherence Score:  0.47553380398703976


So we got a coherence score of about 0.48


[(0,
  '0.022*"bedroom" + 0.018*"dubai" + 0.013*"residence" + 0.012*"property" + 0.012*"elite" + 0.011*"apartment" + 0.010*"estate" + 0.010*"pool" + 0.009*"view" + 0.009*"walk"'),
 (1,
  '0.025*"marina" + 0.024*"dubai" + 0.019*"apartment" + 0.014*"floor" + 0.013*"bedroom" + 0.013*"view" + 0.011*"tower" + 0.011*"room" + 0.011*"property" + 0.009*"area"'),
 (2,
  '0.034*"marina" + 0.034*"dubai" + 0.016*"apartment" + 0.014*"view" + 0.012*"room" + 0.011*"residence" + 0.011*"elite" + 0.011*"property" + 0.011*"bedroom" + 0.011*"pool"'),
 (3,
  '0.022*"dubai" + 0.017*"tower" + 0.011*"marina" + 0.010*"apartment" + 0.010*"bedroom" + 0.008*"room" + 0.008*"beach" + 0.007*"property" + 0.007*"pool" + 0.006*"princess"'),
 (4,
  '0.024*"dubai" + 0.024*"marina" + 0.018*"tower" + 0.018*"bedroom" + 0.016*"room" + 0.015*"area" + 0.012*"property" + 0.012*"view" + 0.011*"residence" + 0.011*"apartment"'),
 (5,
  '0.020*"dubai" + 0.018*"property" + 0.017*"marina" + 0.015*"apartment" + 0.014*"area" + 0.012*"be

In [48]:
# Render the interactive pyLDAvis topic view inline in the notebook.
# NOTE(review): `lda_model` is not defined in any cell visible here — confirm
# that the cell training it exists and runs before this one.
pyLDAvis.enable_notebook()
lda_vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
lda_vis