# Cosine Similarity - KL
first draft of cosine similarity for property recommender 10/3/2022

Data 

1) **State** -  state of the property
2) **District** -  District of the property location
3) **Mukim** - Mukim of the property location
4) **Floor_Num** -  Number of floor of the property
5) **Property_Type**  -
6) **SP_Date** - Date of sale and purchase
7) **Size_Lot** - Size of lot in m.p
8) **Building_Size** - in m.p
9) **Address , Scheme, Tenure_Type, Tenure_Period**

In [1]:
from PIL import Image

In [2]:
import pandas as pd
import numpy as np

#read property csv file
data = pd.read_csv('../data/kl_cleaned_v2.csv')

In [3]:
data.shape

(13374, 16)

In [4]:
# show all columns
data.describe()

Unnamed: 0,floor-num,lot-size,bldg-size,price,ppm,tenure-period,bedroom-num
count,12860.0,13374.0,13374.0,13374.0,13374.0,4896.0,13374.0
mean,14.369673,143.402722,124.549499,507226.5,3148.564902,98.185662,3.097503
std,10.800161,557.646532,85.0529,820682.1,2622.516678,39.13371,0.782428
min,0.0,0.0,0.0,40000.0,0.0,0.0,0.0
25%,4.0,77.0,81.0,148000.0,1651.0,99.0,3.0
50%,15.0,103.0,104.0,264968.0,2461.0,99.0,3.0
75%,20.0,143.0,140.0,548000.0,3984.0,99.0,3.0
max,47.0,45360.0,3112.0,22500000.0,106503.0,999.0,18.0


In [5]:
data.head()

Unnamed: 0,state,district,floor-num,type,property-type,transaction-date,lot-size,bldg-size,report-date,price,ppm,address,scheme,tenure-type,tenure-period,bedroom-num
0,Kuala Lumpur,Mukim Petaling,2.0,semid,2 - 2 1/2 Storey Semi-Detached,19/1/2008,0,0,8/7/2011,1062800,0,"32,JALAN MANDA'RINA DAMAI",TAMAN BUKIT MANDA'RINA,Pajakan,99.0,4
1,Kuala Lumpur,Mukim Petaling,2.0,semid,2 - 2 1/2 Storey Semi-Detached,27/10/2010,0,0,27/4/2011,1493888,0,14150,TMN ALAM DAMAI,Pajakan,99.0,4
2,Kuala Lumpur,Mukim Petaling,2.0,semid,2 - 2 1/2 Storey Semi-Detached,27/10/2010,0,0,27/4/2011,1366888,0,14153,TMN ALAM DAMAI,Pajakan,99.0,4
3,Kuala Lumpur,Mukim Batu,2.0,terrace,2 - 2 1/2 Storey Terraced,20/6/2011,0,0,20/6/2011,480000,0,"LOT 3,OFF JLN KEPONG",TMN FADASON,Pajakan,99.0,6
4,Kuala Lumpur,Mukim Batu,2.0,terrace,2 - 2 1/2 Storey Terraced,12/7/2008,0,0,8/7/2011,530000,0,"NO. 6,JLN 10/17",TMN FADASON,Pajakan,99.0,5


In [6]:
data.columns

Index(['state', 'district', 'floor-num', 'type', 'property-type',
       'transaction-date', 'lot-size', 'bldg-size', 'report-date', 'price',
       'ppm', 'address', 'scheme', 'tenure-type', 'tenure-period',
       'bedroom-num'],
      dtype='object')

In [7]:
data['scheme'].unique()

array(["TAMAN BUKIT MANDA'RINA ", 'TMN ALAM DAMAI ', 'TMN FADASON ',
       'DESA PARK CITY (ZENIA) ', 'KENSINGTON PARKHOMES ', 'KIARA VIEW ',
       'LAMAN RESIDENCE ', 'BANDAR BARU SRI PETALING ', 'KINRARA MAS ',
       '(Not Available)', 'ALAM SUTERA ', 'HIJAUAN KIARA ',
       'ANGKASA IMPIAN ', 'CONTINENTAL HEIGHTS CONDO ', 'HAMPSHIRE PARK ',
       'KIARAMAS SUTERA KONDOMINIUM ', 'MEDAN PUTRA CONDOMINIUM ',
       'SAVANNA BUKIT JALIL CONDOMINIUM ', 'VISTA DAMAI ',
       "10 MON'T KIARA ", 'ANGKASA CONDOMINIUM ',
       'BAYU TASIK CONDOMINIUM ', 'BINTANG MAS CONDOMINIUM ', 'CASA RIA ',
       'CENGAL KONDOMINIUM ', 'DANAU MURNI KONDOMINIUM ',
       'DESA CINDAIMAS ', 'DESA PARK CITY ', 'ENDAH PURI CONDOMINIUM ',
       'GREEN AVENUE KONDOMINIUM ', 'GREENFIELD APARTMENT ',
       'KETUMBAR HILL ', 'LE CHATEAU ', 'MANDY COURT ', 'MANDY VILLA ',
       'MEADOW PARK ', 'MENARA BINJAI ', 'MENARA KENANGA ', 'MENARA KLH ',
       'MIDAH HEIGHTS ', 'MIDAH RIA APARTMENT ', 'PALM COURT 

In [8]:
# Function to convert all strings to lower case and strip names of spaces
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if field exist. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        elif isinstance(x, int):
            return str(x)
        else:
            return ''

In [9]:
# Apply clean_data function to your selected columns.
features = ['scheme', 'property-type','bedroom-num', 'price', ]

for feature in features:
    data[feature] = data[feature].apply(clean_data)

In [10]:
data[['scheme', 'property-type','bedroom-num', 'price']].head(5)

Unnamed: 0,scheme,property-type,bedroom-num,price
0,tamanbukitmanda'rina,2-21/2storeysemi-detached,4,1062800
1,tmnalamdamai,2-21/2storeysemi-detached,4,1493888
2,tmnalamdamai,2-21/2storeysemi-detached,4,1366888
3,tmnfadason,2-21/2storeyterraced,6,480000
4,tmnfadason,2-21/2storeyterraced,5,530000


In [11]:
# Create a new column combination of selected  field
def combineField(x):
    return x['scheme'] + ' ' + x['property-type'] + ' bedroom'+ x['bedroom-num'] + ' price'+ x['price']

In [12]:
# Create a new field feature called 'Desc' for description
data['desc'] = data.apply(combineField, axis=1)
data[['desc']].head()

Unnamed: 0,desc
0,tamanbukitmanda'rina 2-21/2storeysemi-detached...
1,tmnalamdamai 2-21/2storeysemi-detached bedroom...
2,tmnalamdamai 2-21/2storeysemi-detached bedroom...
3,tmnfadason 2-21/2storeyterraced bedroom6 price...
4,tmnfadason 2-21/2storeyterraced bedroom5 price...


# ML Model
Things might get out of hand <br>
From now on, will use a lot of ram depending on data size

In [13]:
# Import CountVectorizer and create the count matrix
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(data['desc'])

In [14]:
count_matrix.shape

(13374, 3666)

In [15]:
# Compute the Cosine Similarity matrix based on the count_matrix
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [16]:
cosine_sim.shape

(13374, 13374)

In [17]:
data.shape

(13374, 17)

In [18]:
data.loc[0]

state                                                    Kuala Lumpur
district                                               Mukim Petaling
floor-num                                                         2.0
type                                                            semid
property-type                               2-21/2storeysemi-detached
transaction-date                                            19/1/2008
lot-size                                                            0
bldg-size                                                           0
report-date                                                  8/7/2011
price                                                         1062800
ppm                                                                 0
address                                    32,JALAN MANDA'RINA DAMAI 
scheme                                           tamanbukitmanda'rina
tenure-type                                                   Pajakan
tenure-period       

In [19]:
# chosen property 
index = 0

sim_scores = list(enumerate(cosine_sim[index]))
sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
sim_scores = sim_scores[1:6]

rec_indices = [i[0] for i in sim_scores]

#
data.iloc[rec_indices]

Unnamed: 0,state,district,floor-num,type,property-type,transaction-date,lot-size,bldg-size,report-date,price,ppm,address,scheme,tenure-type,tenure-period,bedroom-num,desc
9355,Kuala Lumpur,Mukim Petaling,2.0,semid,2-21/2storeysemi-detached,21/1/2011,405,265,14/2/2011,2150800,8116,"LOT 45678 VILLA 33,JALAN MANDA'RINA DAMAI",tamanbukitmanda'rina,Pajakan,99.0,4,tamanbukitmanda'rina 2-21/2storeysemi-detached...
9356,Kuala Lumpur,Mukim Petaling,2.0,semid,2-21/2storeysemi-detached,28/10/2010,283,265,19/1/2011,1850800,6984,"33,JALAN MANDA'RINA DAMAI",tamanbukitmanda'rina,Pajakan,99.0,4,tamanbukitmanda'rina 2-21/2storeysemi-detached...
9357,Kuala Lumpur,Mukim Petaling,2.0,semid,2-21/2storeysemi-detached,10/11/2010,286,265,14/2/2011,1850800,6984,"LOT 45684 VILLA 33,JALAN MANDA'RINA DAMAI",tamanbukitmanda'rina,Pajakan,99.0,4,tamanbukitmanda'rina 2-21/2storeysemi-detached...
9358,Kuala Lumpur,Mukim Petaling,2.0,semid,2-21/2storeysemi-detached,23/12/2010,287,265,14/2/2011,1850800,6984,"LOT 45683 VILLA 33,JALAN MANDA'RINA DAMAI",tamanbukitmanda'rina,Pajakan,99.0,4,tamanbukitmanda'rina 2-21/2storeysemi-detached...
9359,Kuala Lumpur,Mukim Petaling,2.0,semid,2-21/2storeysemi-detached,23/3/2011,288,265,6/4/2011,1850800,6984,"LOT 45682 VILLA 33,JALAN MANDA'RINA DAMAI",tamanbukitmanda'rina,Pajakan,99.0,4,tamanbukitmanda'rina 2-21/2storeysemi-detached...


In [20]:
index = 0

sim_scores = list(enumerate(cosine_sim[index]))
sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
sim_scores = sim_scores[1:6]

rec_indices = [i[0] for i in sim_scores]

#
data.iloc[rec_indices]

Unnamed: 0,state,district,floor-num,type,property-type,transaction-date,lot-size,bldg-size,report-date,price,ppm,address,scheme,tenure-type,tenure-period,bedroom-num,desc
9355,Kuala Lumpur,Mukim Petaling,2.0,semid,2-21/2storeysemi-detached,21/1/2011,405,265,14/2/2011,2150800,8116,"LOT 45678 VILLA 33,JALAN MANDA'RINA DAMAI",tamanbukitmanda'rina,Pajakan,99.0,4,tamanbukitmanda'rina 2-21/2storeysemi-detached...
9356,Kuala Lumpur,Mukim Petaling,2.0,semid,2-21/2storeysemi-detached,28/10/2010,283,265,19/1/2011,1850800,6984,"33,JALAN MANDA'RINA DAMAI",tamanbukitmanda'rina,Pajakan,99.0,4,tamanbukitmanda'rina 2-21/2storeysemi-detached...
9357,Kuala Lumpur,Mukim Petaling,2.0,semid,2-21/2storeysemi-detached,10/11/2010,286,265,14/2/2011,1850800,6984,"LOT 45684 VILLA 33,JALAN MANDA'RINA DAMAI",tamanbukitmanda'rina,Pajakan,99.0,4,tamanbukitmanda'rina 2-21/2storeysemi-detached...
9358,Kuala Lumpur,Mukim Petaling,2.0,semid,2-21/2storeysemi-detached,23/12/2010,287,265,14/2/2011,1850800,6984,"LOT 45683 VILLA 33,JALAN MANDA'RINA DAMAI",tamanbukitmanda'rina,Pajakan,99.0,4,tamanbukitmanda'rina 2-21/2storeysemi-detached...
9359,Kuala Lumpur,Mukim Petaling,2.0,semid,2-21/2storeysemi-detached,23/3/2011,288,265,6/4/2011,1850800,6984,"LOT 45682 VILLA 33,JALAN MANDA'RINA DAMAI",tamanbukitmanda'rina,Pajakan,99.0,4,tamanbukitmanda'rina 2-21/2storeysemi-detached...


In [21]:
def get_recommendations(index, n_rec):
    #index = 0

    sim_scores = list(enumerate(cosine_sim[index]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:n_rec+1]

    rec_indices = [i[0] for i in sim_scores]
    
    return data.iloc[rec_indices]

In [22]:
get_recommendations(234,10)

Unnamed: 0,state,district,floor-num,type,property-type,transaction-date,lot-size,bldg-size,report-date,price,ppm,address,scheme,tenure-type,tenure-period,bedroom-num,desc
53,Kuala Lumpur,Mukim Batu,,condo,condominium/apartment,3/5/2010,0,0,22/6/2011,445000,0,"B-1-2,",10mon'tkiara,Kekal,,3,10mon'tkiara condominium/apartment bedroom3 pr...
235,Kuala Lumpur,Mukim Batu,,condo,condominium/apartment,2/5/2008,102,0,24/6/2011,470000,4608,"B-15-2,JLN KIARA 3",10mon'tkiara,Kekal,,3,10mon'tkiara condominium/apartment bedroom3 pr...
93,Kuala Lumpur,Kuala Lumpur Town Centre,19.0,condo,condominium/apartment,26/4/2004,0,0,30/5/2011,480000,0,"5-14-7,JLN BINJAI OFF JLN AMPANG",menarabinjai,Kekal,,3,menarabinjai condominium/apartment bedroom3 pr...
691,Kuala Lumpur,Kuala Lumpur Town Centre,37.0,condo,condominium/apartment,26/1/2011,115,115,26/7/2011,480000,4174,"20-23-2 (23B),PERSIARAN RAJA CHULAN",angkasaimpian,Pajakan,94.0,3,angkasaimpian condominium/apartment bedroom3 p...
723,Kuala Lumpur,Mukim Batu,35.0,condo,condominium/apartment,24/9/2010,117,117,10/1/2011,480000,4103,"20-15-5(A-15-1),JLN1/70C",angkupuri,Kekal,,3,angkupuri condominium/apartment bedroom3 price...
3179,Kuala Lumpur,Mukim Kuala Lumpur,18.0,condo,condominium/apartment,24/3/2011,104,104,22/4/2011,480000,4615,"24-12-3A(B-12-3A),JLN BUKIT DESA 3",desavillacondo,Kekal,,3,desavillacondo condominium/apartment bedroom3 ...
3515,Kuala Lumpur,Mukim Kuala Lumpur,5.0,condo,condominium/apartment,13/5/2011,154,154,24/6/2011,480000,3117,"26-3,OFF JLN KELANG LAMA",faberria,Kekal,,3,faberria condominium/apartment bedroom3 price4...
3924,Kuala Lumpur,Mukim Batu,15.0,condo,condominium/apartment,25/1/2011,0,109,15/4/2011,480000,4404,"12-2-10,OFF JLN DUTA",hartamasregency,Kekal,,3,hartamasregency condominium/apartment bedroom3...
5244,Kuala Lumpur,Mukim Batu,23.0,condo,condominium/apartment,24/5/2011,161,161,1/7/2011,480000,2981,"1-16-02,JLN KIARA 1/16A",lanaikiara,Kekal,,3,lanaikiara condominium/apartment bedroom3 pric...
5245,Kuala Lumpur,Mukim Batu,23.0,condo,condominium/apartment,13/5/2011,0,148,29/6/2011,480000,3243,"1-13-7,JALAN KIARA 3",lanaikiara,Kekal,,3,lanaikiara condominium/apartment bedroom3 pric...


## Evaluations

In [23]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

In [39]:
X = data
y = data['desc'] 
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
similarities = cosine_similarity().fit(X_train, y_train)

In [40]:
y_pred = similarities.predict(X_test)
model_mae = mean_absolute_error(y_test, y_pred)
print(model_mae)

0.76


===========================================================================================================

In [31]:
# Save the file using picle
import pickle

In [32]:
pickle.dump(data,open('property_list.pkl','wb'))
pickle.dump(data,open('raw_property.pkl','wb'))
pickle.dump(cosine_sim,open('similarity.pkl','wb'))