In [17]:
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Introduction to Recommendation systems
A recommendation system which is also known as recommendation engine is a type of information retrieval system that is used to extract a very personalized information for a person based on the provided query information. There are 3 different types of recommendation systems and these 3 types are
1. Collaborative filtering recommendation system
2. Content based recommendation system
3. Hybrid recommendation system

Out of these 3 types we will be using the `content based recommendation system` where A content-based recommendation system generates recommendations for the user based on the features or attributes of the items that the user have interacted with in the past. This approach is commonly used in scenarios where the features or attributes of items are well-defined, such as movies, books, articles, or products.

Here's how a content-based recommendation system works ⬇️

1. **Item Profile Creation**: For each item in the system (e.g., movies or books), a profile `**(vector representation)**` is created based on features or attributes. These features could include genres, actors, directors, keywords, or any relevant metadata. For example, for movies, the features might include genres (action, comedy, drama), actors, and directors.
2. **User Profile Creation**: The system also maintains a user profile that represents the user's preferences. This user profile is constructed based on the features of items the user has previously liked, rated, or interacted with. For instance, if a user frequently watches action movies, the system infers a preference for action movies in the user profile.
3. **Recommendation Generation**: To make recommendations, the system calculates the similarity between the user profile and item profiles. Items that are most similar to the user profile are recommended. This similarity calculation is typically done using techniques like cosine similarity or euclidean distance metrics
4. **Recommendation Delivery**: The system delivers recommendations based on the highest similarity scores, and the user is presented with items that are expected to match their preferences based on the features.

In [2]:
df = pd.read_csv('Raw_data/appartments.csv')
df.dropna(inplace=True)
df.head(3)

Unnamed: 0,PropertyName,PropertySubName,NearbyLocations,LocationAdvantages,Link,PriceDetails,TopFacilities
0,Smartworld One DXP,"2, 3, 4 BHK Apartment in Sector 113, Gurgaon","['Bajghera Road', 'Palam Vihar Halt', 'DPSG Pa...","{'Bajghera Road': '800 Meter', 'Palam Vihar Ha...",https://www.99acres.com/smartworld-one-dxp-sec...,"{'2 BHK': {'building_type': 'Apartment', 'area...","['Swimming Pool', 'Salon', 'Restaurant', 'Spa'..."
1,M3M Crown,"3, 4 BHK Apartment in Sector 111, Gurgaon","['DPSG Palam Vihar Gurugram', 'The NorthCap Un...","{'DPSG Palam Vihar Gurugram': '1.4 Km', 'The N...",https://www.99acres.com/m3m-crown-sector-111-g...,"{'3 BHK': {'building_type': 'Apartment', 'area...","['Bowling Alley', 'Mini Theatre', 'Manicured G..."
2,Adani Brahma Samsara Vilasa,"Land, 3, 4 BHK Independent Floor in Sector 63,...","['AIPL Business Club Sector 62', 'Heritage Xpe...","{'AIPL Business Club Sector 62': '2.7 Km', 'He...",https://www.99acres.com/adani-brahma-samsara-v...,{'3 BHK': {'building_type': 'Independent Floor...,"['Terrace Garden', 'Gazebo', 'Fountain', 'Amph..."


# Facilities Based recommendation system

In [3]:
def extract_list(s):
    """
    This method will extract the list of facilities from the string
    """
    return re.findall(r"'(.*?)'", s)

df['TopFacilities'] = df['TopFacilities'].apply(extract_list)

In [4]:
# Concetinating all the items in the TopFacilities list as single string
df['Facilities_Str'] = df['TopFacilities'].apply(' '.join)

In [8]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))

# Creating word embedding using tf-idf
tfidf_matrix = tfidf_vectorizer.fit_transform(df['Facilities_Str'])
print("Shape:",tfidf_matrix.toarray().shape)

# Calculating the cosine similarity betweeen the vectors
cosine_sim_facilities = cosine_similarity(tfidf_matrix, tfidf_matrix)
print("Shape:",cosine_sim_facilities.shape)

In [51]:
def recommend_properties(property_name):
    """
    This method will take the property name as an input and will return 5
    most similar properties
    """
    
    # Getting the index of the property that matches the name
    idx = df[df['PropertyName'] == property_name].index[0]

    # Calculating the similarity scores 
    sim_scores = list(enumerate(cosine_sim_facilities[idx]))

    # Sort the properties based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 5 most similar properties
    sim_scores = sim_scores[1:6]

    # Get the property indices
    property_indices = [i[0] for i in sim_scores]
    
    recommendations_df = pd.DataFrame({
        'PropertyName': df['PropertyName'].iloc[property_indices],
        'SimilarityScore': sim_scores
    })

    # Return the top 10 most similar properties
    return recommendations_df

In [53]:
recommend_properties("M3M Crown")

Unnamed: 0,PropertyName,SimilarityScore
86,DLF The Ultima,"(85, 0.35860320390686057)"
227,BPTP Pedestal,"(226, 0.33176299517252417)"
75,Ireo Victory Valley,"(74, 0.3185862339669967)"
146,M3M Sky Lofts,"(145, 0.30917614904944274)"
90,Central Park Flower Valley Mikasa Plots,"(89, 0.29743060271126903)"
