In [733]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor
from sklearn.metrics import r2_score
import re
import math
from dotenv import load_dotenv
import os
import requests

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

from pysentimiento.preprocessing import preprocess_tweet

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from catboost import CatBoostRegressor
from sklearn.ensemble import ExtraTreesRegressor

In [734]:
import pickle

# open() does not expand '~', so resolve the home directory explicitly;
# otherwise the path is looked up under a literal '~' folder in the cwd.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
pickle_file_path = os.path.expanduser('~/data/train.pickle')
with open(pickle_file_path, 'rb') as f:
    train = pickle.load(f)

pickle_file_path = os.path.expanduser('~/data/test_kaggle.pickle')
with open(pickle_file_path, 'rb') as f:
    test = pickle.load(f)

In [735]:
# Wrap the unpickled train / Kaggle-test records in DataFrames and preview the training rows.
df, df_test = pd.DataFrame(train), pd.DataFrame(test)
df.head()

Unnamed: 0,price,title,loc_string,loc,features,type,subtype,selltype,desc
0,320.000 €,Piso Tallers. Piso con 2 habitaciones con asce...,Barcelona - Sant Antoni,,"[85 m2, 2 hab., 1 baño, 3.647 €/m2]",FLAT,FLAT,SECOND_HAND,Piso en última planta a reformar en calle Tall...
1,335.000 €,Piso C/ de valència. Piso reformado en venta d...,Barcelona - Dreta de l´Eixample,,"[65 m2, 2 hab., 1 baño, 5.000 €/m2]",FLAT,FLAT,SECOND_HAND,"Ubicado en la zona del Camp de l’Arpa, cerca d..."
2,330.000 €,Piso en Dreta de l´Eixample. Acogedor piso al ...,Barcelona - Dreta de l´Eixample,,"[77 m2, 2 hab., 1 baño, 4.286 €/m2]",FLAT,FLAT,SECOND_HAND,"En pleno centro de Barcelona, justo al lado de..."
3,435.000 €,"Piso Barcelona - corts catalanes. Soleado, cén...",Barcelona - Sant Antoni,,"[96 m2, 3 hab., 2 baños, 4.531 €/m2]",FLAT,FLAT,SECOND_HAND,"Vivienda espaciosa en Sant Antoni, cerca de Pl..."
4,410.000 €,"Piso en Carrer de sardenya 271. Alto, reformad...",Barcelona - Sagrada Família,Carrer de Sardenya 271,"[84 m2, 2 hab., 1 baño, 4.881 €/m2]",FLAT,FLAT,SECOND_HAND,"En el corazón de Barcelona, en una hermosa fin..."


In [736]:
# (rows, columns) of the Kaggle test frame
df_test.shape

(132, 10)

## 1) Feature engineering (numerical, categorical and text features)

In [737]:
from functools import lru_cache


@lru_cache(maxsize=None)
def _spanish_text_tools():
    """Build the Spanish stop-word set and Snowball stemmer once and reuse them on every call."""
    return frozenset(stopwords.words('spanish')), SnowballStemmer('spanish')


def preprocess_text(text):
    """Normalise a Spanish listing text into a space-separated string of stems.

    Steps: lowercase, strip every character that is not an ASCII letter,
    whitespace or one of ``áéíóúüñ``, tokenize, drop Spanish stop words, and
    stem each remaining token with the Snowball stemmer.

    Parameters
    ----------
    text : str
        Raw title or description. Any non-string value (e.g. ``None`` or the
        ``NaN`` pandas uses for a missing cell) is treated as empty text.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces; ``''`` for missing input.
    """
    # Missing cells arrive as None/NaN; return empty text instead of failing on .lower()
    if not isinstance(text, str):
        return ''

    # Convert text to lowercase
    text = text.lower()

    # Remove special characters, numbers, and punctuation.
    # Only the listed accented letters survive; others (e.g. 'è', 'à', 'ç') are removed too.
    text = re.sub(r'[^a-zA-Z\sáéíóúüñ]', '', text)

    # Tokenize text
    tokens = word_tokenize(text, language='spanish')

    # Remove stopwords, then stem (not lemmatize) what is left
    stop_words, stemmer = _spanish_text_tools()
    tokens = [stemmer.stem(token) for token in tokens if token not in stop_words]

    # Join tokens back into text
    return ' '.join(tokens)

In [738]:
# Preview the raw training columns that the feature pipeline consumes
df.head()

Unnamed: 0,price,title,loc_string,loc,features,type,subtype,selltype,desc
0,320.000 €,Piso Tallers. Piso con 2 habitaciones con asce...,Barcelona - Sant Antoni,,"[85 m2, 2 hab., 1 baño, 3.647 €/m2]",FLAT,FLAT,SECOND_HAND,Piso en última planta a reformar en calle Tall...
1,335.000 €,Piso C/ de valència. Piso reformado en venta d...,Barcelona - Dreta de l´Eixample,,"[65 m2, 2 hab., 1 baño, 5.000 €/m2]",FLAT,FLAT,SECOND_HAND,"Ubicado en la zona del Camp de l’Arpa, cerca d..."
2,330.000 €,Piso en Dreta de l´Eixample. Acogedor piso al ...,Barcelona - Dreta de l´Eixample,,"[77 m2, 2 hab., 1 baño, 4.286 €/m2]",FLAT,FLAT,SECOND_HAND,"En pleno centro de Barcelona, justo al lado de..."
3,435.000 €,"Piso Barcelona - corts catalanes. Soleado, cén...",Barcelona - Sant Antoni,,"[96 m2, 3 hab., 2 baños, 4.531 €/m2]",FLAT,FLAT,SECOND_HAND,"Vivienda espaciosa en Sant Antoni, cerca de Pl..."
4,410.000 €,"Piso en Carrer de sardenya 271. Alto, reformad...",Barcelona - Sagrada Família,Carrer de Sardenya 271,"[84 m2, 2 hab., 1 baño, 4.881 €/m2]",FLAT,FLAT,SECOND_HAND,"En el corazón de Barcelona, en una hermosa fin..."


In [739]:
def data_pipeline(df, df_lat_long, tfidf_vectorizer_desc=None, tfidf_vectorizer_title=None, encoder=None, if_train=True):
    """Turn raw listings into a dense feature matrix (plus the target when training).

    The matrix is the horizontal concatenation of: the numeric columns
    (sq_meters, num_rooms, num_bathrooms, latitude, longitude), the one-hot
    columns of ``type``, the TF-IDF matrix of the description and the TF-IDF
    matrix of the title.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw listings. Must contain 'subtype', 'selltype', 'features', 'type',
        'desc', 'title', 'loc_string' and 'loc'; additionally 'price' when
        ``if_train`` is True and 'id' when it is False. The caller's frame is
        not modified.
    df_lat_long : pandas.DataFrame
        Frame with 'Latitude' and 'Longitude' columns. They are assigned as
        Series, so rows are matched to ``df`` by index label.
    tfidf_vectorizer_desc, tfidf_vectorizer_title : TfidfVectorizer, optional
        Already-fitted vectorizers to reuse. When None, a new one is fitted on
        this data.
    encoder : OneHotEncoder, optional
        Already-fitted encoder for 'type'. When None, a new one is fitted.
    if_train : bool, default True
        Selects the return shape (see Returns).

    Returns
    -------
    tuple or numpy.ndarray
        If ``if_train``: ``(y, X_combined, tfidf_vectorizer_desc,
        tfidf_vectorizer_title, encoder)`` where ``y`` is the price divided by
        1000. Otherwise only ``X_combined``.
    """
    # Drop unnecessary columns (drop returns a new frame, so later assignments stay local)
    df = df.drop(['subtype', 'selltype'], axis=1)

    # Extract features from 'features' column.
    # NOTE(review): entries are read by list position (0, 1, 2) and only the first
    # run of digits is kept, so a listing that omits an entry or uses a thousands
    # separator (e.g. '1.200 m2') would be mis-read. Confirm against the data.
    df['sq_meters'] = df['features'].str[0].str.extract(r'(\d+)', expand=False).astype(float)
    df['num_rooms'] = df['features'].str[1].str.extract(r'(\d+)', expand=False).astype(float)
    df['num_bathrooms'] = df['features'].str[2].str.extract(r'(\d+)', expand=False).astype(float)

    # Add latitude and longitude from df_lat_long
    df['latitude'] = df_lat_long['Latitude']
    df['longitude'] = df_lat_long['Longitude']

    # One-hot encoding for 'type' column: fit a new encoder only when none was supplied
    type_data = df['type'].values.reshape(-1, 1)
    if encoder is None:
        encoder = OneHotEncoder()
        one_hot_array = encoder.fit_transform(type_data).toarray()
    else:
        one_hot_array = encoder.transform(type_data).toarray()

    for i, category in enumerate(encoder.categories_[0]):
        df[category] = one_hot_array[:, i]
    df = df.drop('type', axis=1)

    # Text preprocessing for description and title
    df['preprocessed_desc'] = df['desc'].apply(preprocess_text)
    df['preprocessed_title'] = df['title'].apply(preprocess_text)

    # TF-IDF vectorization for description
    if tfidf_vectorizer_desc is None:
        tfidf_vectorizer_desc = TfidfVectorizer()
        desc_tfidf = tfidf_vectorizer_desc.fit_transform(df['preprocessed_desc'])
    else:
        desc_tfidf = tfidf_vectorizer_desc.transform(df['preprocessed_desc'])

    # TF-IDF vectorization for title
    if tfidf_vectorizer_title is None:
        tfidf_vectorizer_title = TfidfVectorizer()
        title_tfidf = tfidf_vectorizer_title.fit_transform(df['preprocessed_title'])
    else:
        title_tfidf = tfidf_vectorizer_title.transform(df['preprocessed_title'])

    # Raw/text columns that must not reach the numeric feature matrix
    non_feature_columns = ['title', 'loc_string', 'loc', 'features', 'desc',
                           'preprocessed_desc', 'preprocessed_title']

    if if_train:
        # Price processing: '320.000 €' -> 320.0 (thousands of euros).
        # regex=False makes '.' a literal dot on every pandas version.
        y = (
            df['price']
            .str.replace(' €', '', regex=False)
            .str.replace('.', '', regex=False)
            .astype(float)
            / 1000
        )
        df = df.drop(['price'] + non_feature_columns, axis=1)
    else:
        df = df.drop(non_feature_columns + ['id'], axis=1)

    X_combined = np.concatenate((df, desc_tfidf.toarray(), title_tfidf.toarray()), axis=1)

    if if_train:
        return y, X_combined, tfidf_vectorizer_desc, tfidf_vectorizer_title, encoder
    return X_combined

In [740]:
# Coordinates for the training listings; data_pipeline reads the 'Latitude' and 'Longitude' columns of this file.
df_lat_long_train = pd.read_csv('location_data/latitude_longitude.csv')

# Fit the encoder and both TF-IDF vectorizers on the training data and keep them for reuse on the Kaggle test set.
df_new = df.copy()
y, X,  tfidf_vectorizer_desc, tfidf_vectorizer_title, encoder = data_pipeline(df_new, df_lat_long_train)

  df['price'] = df['price'].str.replace(' €', '').str.replace('.', '').astype(float) / 1000


In [741]:
# Hold out 20% of the training rows for model comparison.
# NOTE(review): X comes from data_pipeline, which fitted the TF-IDF vectorizers and the encoder on all rows
# before this split, so the hold-out rows already influenced the text features.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [742]:
# Coordinates for the Kaggle test listings; used when the test feature matrix is built
df_lat_long_test = pd.read_csv('location_data/longitude_latitude_test.csv')

### Model XGBoost

In [743]:
# Baseline: XGBoost regressor with the library's default hyperparameters
model = XGBRegressor()
model.fit(X_train, y_train)

# Predict on the 20% hold-out split created by train_test_split
y_pred = model.predict(X_test)

# Score the hold-out predictions with R²
r2 = r2_score(y_test, y_pred)

In [744]:
# Hold-out R² of the XGBoost baseline
print(r2)

0.5974830929042595


### Model CatBoost

In [745]:
catboost_model = CatBoostRegressor(iterations=500,  # Number of trees (boosting iterations)
                                   learning_rate=None,  # None: CatBoost picks the rate itself (the training log reports the value)
                                   depth=6,  # Depth of each tree
                                   loss_function='RMSE',  # Loss function to optimize
                                   verbose=100  # Log training progress every 100 iterations
                                   )

# Fit the regressor to the training data, tracking the loss on the hold-out split.
# NOTE(review): the eval_set is the same split that r2_score is computed on below, and the training log
# reports the model being shrunk to its best eval iteration, so this R² is likely optimistic.
# Consider a separate validation split.
catboost_model.fit(X_train, y_train, eval_set=(X_test, y_test))

# Make predictions on the hold-out split
y_pred_catboost = catboost_model.predict(X_test)

# Evaluate the model
r2_catboost = r2_score(y_test, y_pred_catboost)
print(f"R Square (CatBoost): {r2_catboost}")

Learning rate set to 0.073353
0:	learn: 75.3254015	test: 73.2376099	best: 73.2376099 (0)	total: 10.5ms	remaining: 5.24s
100:	learn: 37.4828535	test: 45.5022920	best: 45.5022920 (100)	total: 789ms	remaining: 3.12s
200:	learn: 24.0243458	test: 43.2545889	best: 43.2260990 (198)	total: 1.53s	remaining: 2.28s
300:	learn: 17.0130349	test: 42.6199708	best: 42.6167272 (298)	total: 2.32s	remaining: 1.53s
400:	learn: 12.4001613	test: 42.4692382	best: 42.4614328 (353)	total: 3.1s	remaining: 767ms
499:	learn: 9.4471853	test: 42.4888497	best: 42.3944173 (447)	total: 3.92s	remaining: 0us

bestTest = 42.39441734
bestIteration = 447

Shrink model to first 448 iterations.
R Square (CatBoost): 0.6786126738752609


- CatBoost achieves a clearly better R² on the hold-out split than XGBoost (≈0.68 vs ≈0.60), so we run a RandomizedSearchCV on CatBoost to tune its hyperparameters.

### RandomizedSearchCV on the CatBoost algorithm

In [659]:
from catboost import CatBoostRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold

def perform_random_search(x, y, n_splits=10, random_state=8675309, n_iter=10):
    """Randomized hyperparameter search for a CatBoost regressor.

    20% of ``(x, y)`` is set aside and passed to every fit as CatBoost's
    ``eval_set`` (with 50 early-stopping rounds); the remaining 80% is
    cross-validated with a shuffled ``n_splits``-fold split and scored by R².

    Parameters
    ----------
    x, y : array-like
        Features and target to search on.
    n_splits : int
        Number of cross-validation folds.
    random_state : int
        Seed shared by the split, the folds and the parameter sampling.
    n_iter : int
        Number of parameter combinations sampled.

    Returns
    -------
    tuple
        ``(best_score_, best_params_)`` of the fitted search.
    """
    # Set aside the evaluation split first; it is only used as eval_set during fitting
    x_train, x_eval, y_train, y_eval = train_test_split(
        x, y, test_size=0.2, random_state=random_state
    )

    # Candidate values sampled by the search
    search_space = {
        'iterations': [100, 500, 1000],
        'learning_rate': [None, 0.01, 0.1],
        'depth': [6, 9, 12],
    }

    folds = KFold(n_splits=n_splits, random_state=random_state, shuffle=True)

    search = RandomizedSearchCV(
        CatBoostRegressor(loss_function='RMSE'),
        param_distributions=search_space,
        scoring='r2',  # Adjust scoring metric as needed
        n_iter=n_iter,
        cv=folds,
        verbose=2,
        n_jobs=-1,
        random_state=random_state,
    )

    # Extra keyword arguments are forwarded to CatBoostRegressor.fit
    search.fit(
        x_train,
        y_train,
        eval_set=[(x_eval, y_eval)],
        early_stopping_rounds=50,
        verbose=50,
    )

    return search.best_score_, search.best_params_

In [660]:
# Search on the training split only; the hold-out split (X_test, y_test) is not passed in
best_score, best_params = perform_random_search(X_train, y_train)

# best_score is the search's best_score_ (scoring='r2'); best_params is its best_params_
print("Best Score:", best_score)
print("Best Parameters:", best_params)

Fitting 10 folds for each of 10 candidates, totalling 100 fits
0:	learn: 75.1857167	test: 75.1584751	best: 75.1584751 (0)	total: 9.99ms	remaining: 4.99s
50:	learn: 41.4792140	test: 51.6320095	best: 51.5974356 (49)	total: 350ms	remaining: 3.08s
100:	learn: 29.1738385	test: 48.1942486	best: 48.1942486 (100)	total: 692ms	remaining: 2.73s
150:	learn: 20.9754114	test: 48.0498841	best: 47.9943754 (137)	total: 1.02s	remaining: 2.35s
200:	learn: 15.7088740	test: 47.7106172	best: 47.4346416 (177)	total: 1.34s	remaining: 2s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 47.43464161
bestIteration = 177

Shrink model to first 178 iterations.
Best Score: 0.5612736388473437
Best Parameters: {'learning_rate': 0.1, 'iterations': 500, 'depth': 6}


- The search selected 500 iterations, a learning rate of 0.1 and a depth of 6 as the best parameters (cross-validated R² ≈ 0.56).

### Model Prediction

In [750]:
# Build the Kaggle feature matrix with the encoder and vectorizers fitted on the training data.
# NOTE: this rebinds X_test, which until now held the hold-out split from train_test_split.
df_test_new = df_test.drop('description', axis=1)
X_test = data_pipeline(
    df_test_new,
    df_lat_long_test,
    tfidf_vectorizer_desc,
    tfidf_vectorizer_title,
    encoder,
    if_train=False,
)

In [751]:
# Final model: same values as the random search's best parameters (500 iterations, learning rate 0.1, depth 6)
catboost_model = CatBoostRegressor(iterations=500,  # Number of trees (boosting iterations)
                                   learning_rate=0.1,  # Learning rate
                                   depth=6,  # Depth of each tree
                                   loss_function='RMSE',  # Loss function to optimize
                                   verbose=100  # Log training progress every 100 iterations
                                   )

# Refit on every training row (X, y), not just the 80% training split
catboost_model.fit(X, y)

0:	learn: 73.6096865	total: 40.2ms	remaining: 20s
100:	learn: 32.5818827	total: 946ms	remaining: 3.74s
200:	learn: 19.9792565	total: 1.78s	remaining: 2.65s
300:	learn: 13.5352449	total: 2.7s	remaining: 1.78s
400:	learn: 9.8221635	total: 3.51s	remaining: 867ms
499:	learn: 7.2065041	total: 4.36s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x317b8e550>

In [752]:
# X_test is the Kaggle feature matrix here; predictions are in thousands of euros, the unit of the training target
catboost_predictions = catboost_model.predict(X_test)

In [753]:
# Display the predicted prices for the Kaggle test listings
catboost_predictions

array([329.65635384, 332.31019861, 293.98065835, 329.09506788,
       340.58455438, 350.92081896, 277.98607993, 267.78002829,
       297.95271326, 284.91791528, 358.06505472, 382.01433612,
       374.23760398, 404.41436548, 358.96069921, 317.41778879,
       272.3985736 , 432.02299575, 391.40886416, 314.89469792,
       323.8309567 , 336.29935667, 263.356325  , 431.1000853 ,
       234.86339517, 226.58030501, 355.43865064, 351.8828732 ,
       392.8630642 , 394.37101724, 278.16777367, 361.34849428,
       392.15302784, 332.53304102, 415.51272929, 311.94076206,
       362.25337546, 294.53013814, 408.66160054, 341.25274244,
       413.91485546, 392.15302784, 254.34197667, 261.36545289,
       384.8673695 , 404.22308653, 333.0392116 , 407.53141134,
       363.11956077, 426.51385549, 287.70045918, 362.9803626 ,
       301.47571213, 241.23517675, 391.31578381, 295.53761611,
       435.04817402, 371.78165873, 352.17352522, 416.47643146,
       304.73635634, 304.93951604, 353.28594825, 242.29

In [754]:
# Rebuild a frame from the raw test records to read the listing ids for the submission
df_test_id = pd.DataFrame(test)
ids = df_test_id.id

In [755]:
# Assemble the submission column by column so the ids keep an integer dtype
# instead of round-tripping through the float array that np.column_stack builds.
df_pred = pd.DataFrame({
    'id': np.asarray(ids).astype(int),
    'price': catboost_predictions,
})

# Output location: override with the SUBMISSION_PATH environment variable;
# the default is the original hard-coded path, so existing runs are unchanged.
submission_path = os.environ.get(
    'SUBMISSION_PATH',
    '/Users/ranjeetnagarkar/Desktop/AdvancedML/project/project-group5/solution.csv',
)
df_pred.to_csv(submission_path, index=False)