In [5]:
import numpy as np
import pandas as pd
import json
import re
import ast
import gensim
from gensim.models import Word2Vec,KeyedVectors
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

ImportError: cannot import name 'triu' from 'scipy.linalg' (C:\Users\ropar_i9941t3\miniconda3\Lib\site-packages\scipy\linalg\__init__.py)

In [4]:
!pip install --upgrade scipy
!pip install --upgrade gensim



In [2]:
# Loading the pre-trained model
W2V_model = KeyedVectors.load_word2vec_format('/kaggle/input/word2vec-file/GoogleNews-vectors-negative300.bin',binary=True,limit=500000)

# Introduction to Recommendation systems
A recommendation system which is also known as recommendation engine is a type of information retrieval system that is used to extract a very personalized information for a person based on the provided query information. There are 3 different types of recommendation systems and these 3 types are
1. Collaborative filtering recommendation system
2. Content based recommendation system
3. Hybrid recommendation system

Out of these 3 types we will be using the `content based recommendation system` where A content-based recommendation system generates recommendations for the user based on the features or attributes of the items that the user have interacted with in the past. This approach is commonly used in scenarios where the features or attributes of items are well-defined, such as movies, books, articles, or products.

Here's how a content-based recommendation system works ⬇️

1. **Item Profile Creation**: For each item in the system (e.g., movies or books), a profile **(vector representation)** is created based on features or attributes. These features could include genres, actors, directors, keywords, or any relevant metadata. For example, for movies, the features might include genres (action, comedy, drama), actors, and directors.
2. **User Profile Creation**: The system also maintains a user profile that represents the user's preferences. This user profile is constructed based on the features of items the user has previously liked, rated, or interacted with. For instance, if a user frequently watches action movies, the system infers a preference for action movies in the user profile.
3. **Recommendation Generation**: To make recommendations, the system calculates the similarity between the user profile and item profiles. Items that are most similar to the user profile are recommended. This similarity calculation is typically done using techniques like cosine similarity or euclidean distance metrics
4. **Recommendation Delivery**: The system delivers recommendations based on the highest similarity scores, and the user is presented with items that are expected to match their preferences based on the features.

In [3]:
df = pd.read_csv('/kaggle/input/apartment-data/appartments.csv')
df.drop(22,inplace=True)
df.dropna(inplace=True)

df.reset_index(drop=True,inplace=True)
df.head(3)

Unnamed: 0,PropertyName,PropertySubName,NearbyLocations,LocationAdvantages,Link,PriceDetails,TopFacilities
0,Smartworld One DXP,"2, 3, 4 BHK Apartment in Sector 113, Gurgaon","['Bajghera Road', 'Palam Vihar Halt', 'DPSG Pa...","{'Bajghera Road': '800 Meter', 'Palam Vihar Ha...",https://www.99acres.com/smartworld-one-dxp-sec...,"{'2 BHK': {'building_type': 'Apartment', 'area...","['Swimming Pool', 'Salon', 'Restaurant', 'Spa'..."
1,M3M Crown,"3, 4 BHK Apartment in Sector 111, Gurgaon","['DPSG Palam Vihar Gurugram', 'The NorthCap Un...","{'DPSG Palam Vihar Gurugram': '1.4 Km', 'The N...",https://www.99acres.com/m3m-crown-sector-111-g...,"{'3 BHK': {'building_type': 'Apartment', 'area...","['Bowling Alley', 'Mini Theatre', 'Manicured G..."
2,Adani Brahma Samsara Vilasa,"Land, 3, 4 BHK Independent Floor in Sector 63,...","['AIPL Business Club Sector 62', 'Heritage Xpe...","{'AIPL Business Club Sector 62': '2.7 Km', 'He...",https://www.99acres.com/adani-brahma-samsara-v...,{'3 BHK': {'building_type': 'Independent Floor...,"['Terrace Garden', 'Gazebo', 'Fountain', 'Amph..."


Since we will be utilizing the same data for making 3 different recommendation engines so we will now simply create 3 seperate copies of the original dataframe.

In [4]:
# Let's create 3 seperate copies of the original dataframe
Facilities_df = df.copy()
Price_df = df.copy()

# Facilities Based recommendation system

In [5]:
def extract_list(s):
    """
    This method will extract the list of facilities from the string
    """
    return re.findall(r"'(.*?)'", s)

# Extracing the list of facilities from the string
Facilities_df['TopFacilities'] = Facilities_df['TopFacilities'].apply(extract_list)

# Concetinating all the items in the TopFacilities list as single string
Facilities_df['Facilities_Str'] = Facilities_df['TopFacilities'].apply(' '.join)

In [6]:
# Define a function to get the vector representation of a document using Word2Vec
def document_vector(doc):
    
    # Remove out-of-vocabulary words and get Word2Vec vectors for the words in the document
    words = [word for word in doc.split() if word in W2V_model]
    if not words:
        
        # If none of the words are in the Word2Vec model, return zeros
        return np.zeros(300)
        
    # Return the mean of Word2Vec vectors for words in the document
    return np.mean(W2V_model[words], axis=0)

# Apply the function to each document in Facilities_df['Facilities_Str']
word2vec_matrix = np.array([document_vector(doc) for doc in Facilities_df['Facilities_Str']])

# Calculating the cosine similarity
cosine_sim1 = cosine_similarity(word2vec_matrix, word2vec_matrix)

In [7]:
def reccom_facility_based(property_name):
    """
    This method will take the property name as an input and will return 5
    most similar properties
    """

    # Getting the index of the property that matches the name
    idx = Facilities_df[Facilities_df['PropertyName'] == property_name].index[0]

    # Calculating the similarity scores
    sim_scores = list(enumerate(cosine_sim1[idx]))

    # Sort the properties based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 5 most similar properties
    sim_scores = sim_scores[1:6]

    # Get the property indices
    property_indices = [i[0] for i in sim_scores]

    facilities_based_reccm = pd.DataFrame({
        'PropertyName': Facilities_df['PropertyName'].iloc[property_indices],
        'SimilarityScore': sim_scores
    })

    # Return the top 10 most similar properties
    return facilities_based_reccm

In [8]:
# Let's get the facilities based recommendation
reccom_facility_based("DLF The Ultima")

Unnamed: 0,PropertyName,SimilarityScore
71,Emaar MGF The Palm Drive,"(71, 0.92862475)"
1,M3M Crown,"(1, 0.9205359)"
74,Ireo Victory Valley,"(74, 0.9148083)"
183,Silverglades Hightown Residences,"(183, 0.9102179)"
25,Birla Navya,"(25, 0.90776926)"


# Price based recommendation engine

The below mentioned code first defines a function named refined_parse_modified_v2. It takes a string detail_str containing property details and extracts specific features from it. The string is converted into a dictionary format using JSON parsing.

The function processes each item in this dictionary, representing different property types (such as bedroom, hall, kitchen configurations). For each property type, it retrieves details like the building type, area range (both low and high values), and price range (also low and high values).

To handle variations in the provided information, it separates area and price ranges, ensuring numerical values are extracted accurately. If there's only one value provided for an area, it's considered both the low and high range. Similarly, for prices, it accounts for different units (Crores and Lakhs) and converts the currency to numerical representation. After processing, the function creates a dictionary containing these extracted features, organized by property type. This dictionary holds key details such as building type, area range, and price range for each property configuration.

In [9]:
# Function to parse and extract the required features from the PriceDetails column
def refined_parse_modified_v2(detail_str):
    try:
        details = json.loads(detail_str.replace("'", "\""))
    except:
        return {}

    extracted = {}
    for bhk, detail in details.items():
        # Extract building type
        extracted[f'building type_{bhk}'] = detail.get('building_type')

        # Parsing area details
        area = detail.get('area', '')
        area_parts = area.split('-')
        if len(area_parts) == 1:
            try:
                value = float(area_parts[0].replace(',', '').replace(' sq.ft.', '').strip())
                extracted[f'area low {bhk}'] = value
                extracted[f'area high {bhk}'] = value
            except:
                extracted[f'area low {bhk}'] = None
                extracted[f'area high {bhk}'] = None
        elif len(area_parts) == 2:
            try:
                extracted[f'area low {bhk}'] = float(area_parts[0].replace(',', '').replace(' sq.ft.', '').strip())
                extracted[f'area high {bhk}'] = float(area_parts[1].replace(',', '').replace(' sq.ft.', '').strip())
            except:
                extracted[f'area low {bhk}'] = None
                extracted[f'area high {bhk}'] = None

        # Parsing price details
        price_range = detail.get('price-range', '')
        price_parts = price_range.split('-')
        if len(price_parts) == 2:
            try:
                extracted[f'price low {bhk}'] = float(price_parts[0].replace('₹', '').replace(' Cr', '').replace(' L', '').strip())
                extracted[f'price high {bhk}'] = float(price_parts[1].replace('₹', '').replace(' Cr', '').replace(' L', '').strip())
                if 'L' in price_parts[0]:
                    extracted[f'price low {bhk}'] /= 100
                if 'L' in price_parts[1]:
                    extracted[f'price high {bhk}'] /= 100
            except:
                extracted[f'price low {bhk}'] = None
                extracted[f'price high {bhk}'] = None

    return extracted

In [10]:
# Apply the refined parsing and generate the new DataFrame structure
data_refined = []

for _, row in Price_df.iterrows():
    features = refined_parse_modified_v2(row['PriceDetails'])

    # Construct a new row for the transformed dataframe
    new_row = {'PropertyName': row['PropertyName']}

    # Populate the new row with extracted features
    for config in ['1 BHK', '2 BHK', '3 BHK', '4 BHK', '5 BHK', '6 BHK', '1 RK', 'Land']:
        new_row[f'building type_{config}'] = features.get(f'building type_{config}')
        new_row[f'area low {config}'] = features.get(f'area low {config}')
        new_row[f'area high {config}'] = features.get(f'area high {config}')
        new_row[f'price low {config}'] = features.get(f'price low {config}')
        new_row[f'price high {config}'] = features.get(f'price high {config}')

    data_refined.append(new_row)

Price_df = pd.DataFrame(data_refined).set_index('PropertyName')
Price_df['building type_Land'] = Price_df['building type_Land'].replace({'':'Land'})

In [11]:
building_type_catg = Price_df.select_dtypes(include=['object']).columns.tolist()
print(building_type_catg)

['building type_1 BHK', 'building type_2 BHK', 'building type_3 BHK', 'building type_4 BHK', 'building type_5 BHK', 'building type_6 BHK', 'building type_1 RK', 'building type_Land']


In [12]:
# Performing one hot encoding of the input data and imputing missing values with 0
Price_df = pd.get_dummies(Price_df, columns=building_type_catg, drop_first=True)
Price_df.fillna(0,inplace=True)

In [13]:
# Initialize the scaler
scale_price_df = StandardScaler()

# Apply the scaler to the entire dataframe
Price_df = pd.DataFrame(scale_price_df.fit_transform(Price_df), columns=Price_df.columns, index=Price_df.index)

# Let's calculate the cosine_similarity
cosine_sim2 = cosine_similarity(Price_df)

In [14]:
def reccom_price_based(property_name, top_n=247):

    # Get the similarity scores for the property using its name as the index
    sim_scores = list(enumerate(cosine_sim2[Price_df.index.get_loc(property_name)]))

    # Sort properties based on the similarity scores
    sorted_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the indices and scores of the top_n most similar properties
    top_indices = [i[0] for i in sorted_scores[1:top_n+1]]
    top_scores = [i[1] for i in sorted_scores[1:top_n+1]]

    # Retrieve the names of the top properties using the indices
    top_properties = Price_df.index[top_indices].tolist()

    # Create a dataframe with the results
    price_based_reccom = pd.DataFrame({
        'PropertyName': top_properties,
        'SimilarityScore': top_scores
    })

    return price_based_reccom

# Test the recommender function using a property name
reccom_price_based('M3M Golf Hills').head(5)

Unnamed: 0,PropertyName,SimilarityScore
0,AIPL The Peaceful Homes,0.955462
1,Smartworld One DXP,0.95467
2,Unitech Escape,0.953092
3,M3M Capital,0.951156
4,BPTP Terra,0.943128


# Saving the dataframe using pickle

In [15]:
# Save DataFrame using pickle
with open('/kaggle/working/Facilities_RE.pkl', 'wb') as f:
    pickle.dump(Price_df, f)

# Save Cosine similarity 1 and 2 using pickle
with open('/kaggle/working/CosineSim_facilities.pkl', 'wb') as f:
    pickle.dump(cosine_sim1, f)

# Save DataFrame using pickle
with open('/kaggle/working/CosineSim_Prices.pkl', 'wb') as f:
    pickle.dump(cosine_sim2, f)