In [1]:
#Import packages
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
import numpy as np
import seaborn as sns
from sklearn.metrics import mean_squared_error, r2_score
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV

Import all preprocessed CSV files

In [2]:
#load listings data and review data
listings_replication= 'replication.csv'
listings_ordinal_encoded= 'listings_features_ordinal_encoded.csv'
listings_ordinal_encoded_replication= 'listings_features_ordinal_encoded_replication.csv'
reviews= 'listings_features_reviews.csv'
textblob_sentiment= 'average_sentiment_per_listing_textblob.csv'
vader_sentiment= 'average_sentiment_per_listing_vader.csv'
ordinal_encoded_features = pd.read_csv(listings_ordinal_encoded)
ordinal_encoded_replication_features = pd.read_csv(listings_ordinal_encoded_replication)
reviews_features = pd.read_csv(reviews)
textblob_sentiment_features= pd.read_csv(textblob_sentiment)
vader_sentiment_features= pd.read_csv(vader_sentiment)
replication_features= pd.read_csv(listings_replication)



In [3]:
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.max_colwidth', None)  # Show full width of column content
pd.set_option('display.max_rows', None)  # Show all rows

Check the shape of these 4 sets of features. We see that the replication deletes listings with prices above $500. Therefore it has less rows than the label encoded and review-related features.
As we can see those 2 have the same data cleaning approach, such that we can easily compare them together as only the features (columns) differ.
The sentiment related features are preprocessed on the reviews and therefore have a different shape, however the download contains the same listings, listings with high prices are not removed, but for example non-english reviews are removed and therefore the number of listings is different.

We see that the sentiment related sets are divided between Textblob and Vader sentiment

In [4]:
#Shapes
print('Replication shape:', replication_features.shape)
print('Ordinal encoded shape:', ordinal_encoded_features.shape)
print('Ordinal encoded replication shape:', ordinal_encoded_replication_features.shape)
print('Reviews shape:', reviews_features.shape)
print('Textblob Sentiment shape:', textblob_sentiment_features.shape)
print('Vader Sentiment shape:', vader_sentiment_features.shape)

Replication shape: (200361, 12)
Ordinal encoded shape: (210937, 98)
Ordinal encoded replication shape: (200361, 98)
Reviews shape: (210937, 11)
Textblob Sentiment shape: (270585, 6)
Vader Sentiment shape: (270585, 6)


In [5]:
reviews_features.head()

Unnamed: 0,id,number_of_reviews,number_of_reviews_l30d,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,days_between_first_review,days_since_last_review
0,888038298563003925,8,0,5.0,4.88,4.75,4.5,4.88,4.75,645,280
1,50666229,165,6,4.87,4.85,4.99,4.98,4.93,4.62,1131,7
2,933523,140,2,4.84,4.93,4.97,4.99,4.91,4.74,3775,7
3,656222143862971193,14,0,4.86,4.86,5.0,5.0,4.93,4.71,750,64
4,20259345,712,7,4.93,4.84,4.92,4.93,4.83,4.77,2356,13


We make sure that each id column has the same name (id instead of listing_id)

In [6]:
textblob_sentiment_features['id']=textblob_sentiment_features['listing_id']
vader_sentiment_features['id']=vader_sentiment_features['listing_id']
textblob_sentiment_features.drop(columns=['listing_id'], inplace=True)
vader_sentiment_features.drop(columns=['listing_id'], inplace=True)

In [7]:
#We will merge the sentiment and reviews to get the review-related features and keep sure this are the same listings as in the non-review related feature sets.
reviews_sentiment_features_textblob=pd.merge(textblob_sentiment_features, reviews_features, on='id', how='inner')
print('Reviews sentiment shape:', reviews_sentiment_features_textblob.shape)
reviews_sentiment_features_vader=pd.merge(vader_sentiment_features, reviews_features, on='id', how='inner')
print('Reviews sentiment shape:', reviews_sentiment_features_vader.shape)



Reviews sentiment shape: (201865, 16)
Reviews sentiment shape: (201865, 16)


In [8]:
#Keep the same listings in all datasets except for the replication
listings=reviews_sentiment_features_textblob['id']
ordinal_encoded_features=ordinal_encoded_features[ordinal_encoded_features['id'].isin(listings)]
reviews_features=reviews_features[reviews_features['id'].isin(listings)]

Reviews data do not include the price right now, we add the price to reviews_data_including price, because it can be useful later on in the research

In [9]:
#Add the price column from ordinal encoded to the reviews_sentiment_features
reviews_sentiment_including_price_textblob=pd.merge(reviews_sentiment_features_textblob, ordinal_encoded_features[['id','price']], on='id', how='inner')
reviews_features_including_price_textblob=pd.merge(reviews_features, ordinal_encoded_features[['id','price']], on='id', how='inner')
print('Reviews sentiment including price shape:', reviews_sentiment_including_price_textblob.shape)
print('Reviews including price shape:', reviews_features_including_price_textblob.shape)

reviews_sentiment_including_price_vader=pd.merge(reviews_sentiment_features_vader, ordinal_encoded_features[['id','price']], on='id', how='inner')
reviews_features_including_price_vader=pd.merge(reviews_features, ordinal_encoded_features[['id','price']], on='id', how='inner')
print('Reviews sentiment including price shape:', reviews_sentiment_including_price_vader.shape)
print('Reviews including price shape:', reviews_features_including_price_vader.shape)

Reviews sentiment including price shape: (201865, 17)
Reviews including price shape: (201865, 12)
Reviews sentiment including price shape: (201865, 17)
Reviews including price shape: (201865, 12)


In [10]:
reviews_without_sentiment_listings_ordinal_encoded= pd.merge(reviews_sentiment_features_textblob, ordinal_encoded_features, on='id', how='inner').drop(columns=['topic_0_average_sentiment','topic_1_average_sentiment','topic_2_average_sentiment','topic_3_average_sentiment'])
reviews_sentiment_features_textblob_ordinal_encoded= pd.merge(reviews_sentiment_features_textblob, ordinal_encoded_features, on='id', how='inner')  
reviews_sentiment_features_vader_ordinal_encoded= pd.merge(reviews_sentiment_features_vader, ordinal_encoded_features, on='id', how='inner')

This leads to four different feature sets, with the following shapes, divided into textblob and vader sentiment. These are the feature sets that we are using for research question 2 and research question 3.

In [11]:
feature_set_1_replication=ordinal_encoded_replication_features
feature_set_1=ordinal_encoded_features
feature_set_2=reviews_without_sentiment_listings_ordinal_encoded
feature_set_3_textblob=reviews_sentiment_features_textblob_ordinal_encoded
feature_set_3_vader=reviews_sentiment_features_vader_ordinal_encoded
feature_set_4_textblob=reviews_sentiment_including_price_textblob
feature_set_4_vader=reviews_sentiment_including_price_vader

print('Feature set 1 replication:' , feature_set_1_replication.shape)
print('Feature set 1:' , feature_set_1.shape)
print('Feature set 2:' , feature_set_2.shape)
print('Feature set 3 textblob:' , feature_set_3_textblob.shape)
print('Feature set 3 vader:' , feature_set_3_vader.shape)
print('Feature set 4 textblob:' , feature_set_4_textblob.shape)
print('Feature set 4 vader:' , feature_set_4_vader.shape)


Feature set 1 replication: (200361, 98)
Feature set 1: (201865, 98)
Feature set 2: (201865, 109)
Feature set 3 textblob: (201865, 113)
Feature set 3 vader: (201865, 113)
Feature set 4 textblob: (201865, 17)
Feature set 4 vader: (201865, 17)


In [12]:
feature_set_4_vader.columns

Index(['topic_0_average_sentiment', 'topic_1_average_sentiment',
       'topic_2_average_sentiment', 'topic_3_average_sentiment',
       'average_word_count', 'id', 'number_of_reviews',
       'number_of_reviews_l30d', 'review_scores_accuracy',
       'review_scores_cleanliness', 'review_scores_checkin',
       'review_scores_communication', 'review_scores_location',
       'review_scores_value', 'days_between_first_review',
       'days_since_last_review', 'price'],
      dtype='object')

In [13]:
feature_set_2.columns[0:19]

Index(['average_word_count', 'id', 'number_of_reviews',
       'number_of_reviews_l30d', 'review_scores_accuracy',
       'review_scores_cleanliness', 'review_scores_checkin',
       'review_scores_communication', 'review_scores_location',
       'review_scores_value', 'days_between_first_review',
       'days_since_last_review', 'neighborhood_overview', 'host_location',
       'host_about', 'host_response_time', 'host_response_rate',
       'host_acceptance_rate', 'host_is_superhost'],
      dtype='object')

We do the same train-test split for those four feature sets.

In [14]:
# Ensure all feature sets are aligned on the 'id' column
# Use inner join on the 'id' column to ensure consistency across feature sets
feature_set_2 = feature_set_2.merge(feature_set_1[['id']], on='id', how='inner')
feature_set_3_textblob = feature_set_3_textblob.merge(feature_set_1[['id']], on='id', how='inner')
feature_set_3_vader = feature_set_3_vader.merge(feature_set_1[['id']], on='id', how='inner')
feature_set_4_textblob = feature_set_4_textblob.merge(feature_set_1[['id']], on='id', how='inner')
feature_set_4_vader = feature_set_4_vader.merge(feature_set_1[['id']], on='id', how='inner')

# Extract unique IDs
unique_ids = feature_set_1['id'].unique()

# Perform a 90/10 train/test split
train_ids, test_ids = train_test_split(unique_ids, test_size=0.1, random_state=42)

# Define masks for train and test sets
train_mask = feature_set_1['id'].isin(train_ids)
test_mask = feature_set_1['id'].isin(test_ids)

# Apply the split to each feature set
feature_set_1_train = feature_set_1[train_mask]
feature_set_1_test = feature_set_1[test_mask]

feature_set_2_train = feature_set_2[feature_set_2['id'].isin(train_ids)]
feature_set_2_test = feature_set_2[feature_set_2['id'].isin(test_ids)]

feature_set_3_textblob_train = feature_set_3_textblob[feature_set_3_textblob['id'].isin(train_ids)]
feature_set_3_textblob_test = feature_set_3_textblob[feature_set_3_textblob['id'].isin(test_ids)]

feature_set_3_vader_train = feature_set_3_vader[feature_set_3_vader['id'].isin(train_ids)]
feature_set_3_vader_test = feature_set_3_vader[feature_set_3_vader['id'].isin(test_ids)]

feature_set_4_textblob_train = feature_set_4_textblob[feature_set_4_textblob['id'].isin(train_ids)]
feature_set_4_textblob_test = feature_set_4_textblob[feature_set_4_textblob['id'].isin(test_ids)]

feature_set_4_vader_train = feature_set_4_vader[feature_set_4_vader['id'].isin(train_ids)]
feature_set_4_vader_test = feature_set_4_vader[feature_set_4_vader['id'].isin(test_ids)]

# Checking shapes and consistency
print("Feature Set 1 - Train Shape:", feature_set_1_train.shape)
print("Feature Set 1 - Test Shape:", feature_set_1_test.shape)

print("Feature Set 2 - Train Shape:", feature_set_2_train.shape)
print("Feature Set 2 - Test Shape:", feature_set_2_test.shape)

print("Feature Set 3 (TextBlob) - Train Shape:", feature_set_3_textblob_train.shape)
print("Feature Set 3 (TextBlob) - Test Shape:", feature_set_3_textblob_test.shape)

print("Feature Set 3 (Vader) - Train Shape:", feature_set_3_vader_train.shape)
print("Feature Set 3 (Vader) - Test Shape:", feature_set_3_vader_test.shape)

print("Feature Set 4 (TextBlob) - Train Shape:", feature_set_4_textblob_train.shape)
print("Feature Set 4 (TextBlob) - Test Shape:", feature_set_4_textblob_test.shape)

print("Feature Set 4 (Vader) - Train Shape:", feature_set_4_vader_train.shape)
print("Feature Set 4 (Vader) - Test Shape:", feature_set_4_vader_test.shape)



Feature Set 1 - Train Shape: (181678, 98)
Feature Set 1 - Test Shape: (20187, 98)
Feature Set 2 - Train Shape: (181678, 109)
Feature Set 2 - Test Shape: (20187, 109)
Feature Set 3 (TextBlob) - Train Shape: (181678, 113)
Feature Set 3 (TextBlob) - Test Shape: (20187, 113)
Feature Set 3 (Vader) - Train Shape: (181678, 113)
Feature Set 3 (Vader) - Test Shape: (20187, 113)
Feature Set 4 (TextBlob) - Train Shape: (181678, 17)
Feature Set 4 (TextBlob) - Test Shape: (20187, 17)
Feature Set 4 (Vader) - Train Shape: (181678, 17)
Feature Set 4 (Vader) - Test Shape: (20187, 17)


In [15]:
#Delete ID column from all feature sets (Train and Test), because it should not have a meaning
# Delete the 'id' column from all train and test sets
feature_set_1_train = feature_set_1_train.drop(columns=['id'])
feature_set_1_test = feature_set_1_test.drop(columns=['id'])

feature_set_2_train = feature_set_2_train.drop(columns=['id'])
feature_set_2_test = feature_set_2_test.drop(columns=['id'])

feature_set_3_textblob_train = feature_set_3_textblob_train.drop(columns=['id'])
feature_set_3_textblob_test = feature_set_3_textblob_test.drop(columns=['id'])

feature_set_3_vader_train = feature_set_3_vader_train.drop(columns=['id'])
feature_set_3_vader_test = feature_set_3_vader_test.drop(columns=['id'])

feature_set_4_textblob_train = feature_set_4_textblob_train.drop(columns=['id'])
feature_set_4_textblob_test = feature_set_4_textblob_test.drop(columns=['id'])

feature_set_4_vader_train = feature_set_4_vader_train.drop(columns=['id'])
feature_set_4_vader_test = feature_set_4_vader_test.drop(columns=['id'])

In [16]:
print("Feature Set 1 - Train Shape:", feature_set_1_train.shape)
print("Feature Set 1 - Test Shape:", feature_set_1_test.shape)

print("Feature Set 2 - Train Shape:", feature_set_2_train.shape)
print("Feature Set 2 - Test Shape:", feature_set_2_test.shape)

print("Feature Set 3 (TextBlob) - Train Shape:", feature_set_3_textblob_train.shape)
print("Feature Set 3 (TextBlob) - Test Shape:", feature_set_3_textblob_test.shape)

print("Feature Set 3 (Vader) - Train Shape:", feature_set_3_vader_train.shape)
print("Feature Set 3 (Vader) - Test Shape:", feature_set_3_vader_test.shape)

print("Feature Set 4 (TextBlob) - Train Shape:", feature_set_4_textblob_train.shape)
print("Feature Set 4 (TextBlob) - Test Shape:", feature_set_4_textblob_test.shape)

print("Feature Set 4 (Vader) - Train Shape:", feature_set_4_vader_train.shape)
print("Feature Set 4 (Vader) - Test Shape:", feature_set_4_vader_test.shape)






Feature Set 1 - Train Shape: (181678, 97)
Feature Set 1 - Test Shape: (20187, 97)
Feature Set 2 - Train Shape: (181678, 108)
Feature Set 2 - Test Shape: (20187, 108)
Feature Set 3 (TextBlob) - Train Shape: (181678, 112)
Feature Set 3 (TextBlob) - Test Shape: (20187, 112)
Feature Set 3 (Vader) - Train Shape: (181678, 112)
Feature Set 3 (Vader) - Test Shape: (20187, 112)
Feature Set 4 (TextBlob) - Train Shape: (181678, 16)
Feature Set 4 (TextBlob) - Test Shape: (20187, 16)
Feature Set 4 (Vader) - Train Shape: (181678, 16)
Feature Set 4 (Vader) - Test Shape: (20187, 16)


In [17]:
# Save all those train and test sets to CSV files
feature_set_1_train.to_csv('feature_set_1_train.csv', index=False)
feature_set_1_test.to_csv('feature_set_1_test.csv', index=False)

feature_set_2_train.to_csv('feature_set_2_train.csv', index=False)
feature_set_2_test.to_csv('feature_set_2_test.csv', index=False)

feature_set_3_textblob_train.to_csv('feature_set_3_textblob_train.csv', index=False)
feature_set_3_textblob_test.to_csv('feature_set_3_textblob_test.csv', index=False)

feature_set_3_vader_train.to_csv('feature_set_3_vader_train.csv', index=False)
feature_set_3_vader_test.to_csv('feature_set_3_vader_test.csv', index=False)

feature_set_4_textblob_train.to_csv('feature_set_4_textblob_train.csv', index=False)
feature_set_4_textblob_test.to_csv('feature_set_4_textblob_test.csv', index=False)

feature_set_4_vader_train.to_csv('feature_set_4_vader_train.csv', index=False)
feature_set_4_vader_test.to_csv('feature_set_4_vader_test.csv', index=False)



In purpose of research question 1, we start with the hyperparameter tuning of the XGboost model in the replication, and compare the results with the mean-price prediction model (Baseline), Linear regression and XGBoost with the default parameters. This leads to the following set of best hyperparameters.

In comparison with (Peng, 2020) we also keep a separate test set for replication purpose, both for linear regression as well for the XGBoost model

In [20]:
# Ensure reproducibility
RANDOM_SEED = 42

# Splitting the dataset into features and target
features = replication_features.drop(columns=['price'])
target = replication_features['price']

# Train-test split (90% train, 10% test)
X_train_replication, X_test_replication, y_train_replication, y_test_replication = train_test_split(
    features, target, test_size=0.10, random_state=RANDOM_SEED
)

# Scale training data only
scaler = StandardScaler()
X_train_replication_scaled = scaler.fit_transform(X_train_replication)
y_train_replication = y_train_replication.reset_index(drop=True)

# Scale test data
X_test_replication_scaled = scaler.transform(X_test_replication)

# Baseline 1: Mean Prediction
baseline_mean_price = y_train_replication.mean()
baseline_rmse = np.sqrt(mean_squared_error(y_train_replication, [baseline_mean_price] * len(y_train_replication)))
baseline_r2 = r2_score(y_train_replication, [baseline_mean_price] * len(y_train_replication))

print("Baseline (Mean Prediction):")
print(f"RMSE: {baseline_rmse:.3f}, R-squared: {baseline_r2:.3f}")

# Function to evaluate a model with cross-validation
def evaluate_model(model, X, y, cv):
    rmse_scores, r2_scores = [], []
    for train_idx, val_idx in cv.split(X):
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        model.fit(X_train, y_train)
        y_pred_val = model.predict(X_val)
        rmse_scores.append(np.sqrt(mean_squared_error(y_val, y_pred_val)))
        r2_scores.append(r2_score(y_val, y_pred_val))
    return np.mean(rmse_scores), np.mean(r2_scores)

# Cross-validation setup
kf = KFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)

# Linear Regression cross-validation evaluation
linear_model = LinearRegression()
rmse_lr_cv, r2_lr_cv = evaluate_model(linear_model, X_train_replication_scaled, y_train_replication, kf)

print("\nLinear Regression (5-fold CV):")
print(f"Mean RMSE (CV): {rmse_lr_cv:.3f}, Mean R-squared (CV): {r2_lr_cv:.3f}")

# Train Linear Regression on full training data
linear_model.fit(X_train_replication_scaled, y_train_replication)

# Evaluate Linear Regression on test data
y_pred_test_lr = linear_model.predict(X_test_replication_scaled)
rmse_test_lr = np.sqrt(mean_squared_error(y_test_replication, y_pred_test_lr))
r2_test_lr = r2_score(y_test_replication, y_pred_test_lr)

print("\nLinear Regression on Test Set:")
print(f"RMSE: {rmse_test_lr:.3f}, R-squared: {r2_test_lr:.3f}")

# XGBoost with default parameters cross-validation evaluation
xgb_default = XGBRegressor(random_state=RANDOM_SEED)
rmse_xgb_cv, r2_xgb_cv = evaluate_model(xgb_default, X_train_replication_scaled, y_train_replication, kf)

print("\nXGBoost (default parameters, 5-fold CV):")
print(f"Mean RMSE (CV): {rmse_xgb_cv:.3f}, Mean R-squared (CV): {r2_xgb_cv:.3f}")

# Train XGBoost with default parameters on full training data
xgb_default.fit(X_train_replication_scaled, y_train_replication)

# Evaluate XGBoost on test data
y_pred_test_xgb = xgb_default.predict(X_test_replication_scaled)
rmse_test_xgb = np.sqrt(mean_squared_error(y_test_replication, y_pred_test_xgb))
r2_test_xgb = r2_score(y_test_replication, y_pred_test_xgb)

print("\nXGBoost (default parameters) on Test Set:")
print(f"RMSE: {rmse_test_xgb:.3f}, R-squared: {r2_test_xgb:.3f}")

# Define parameter grid for hyperparameter tuning
param_grid = {
    'max_depth': [8, 10, 12],            # Moderate tree depths
    'n_estimators': [400, 800, 1200],  # Number of trees
    'learning_rate': [0.01, 0.1, 0.3],      # Step size shrinkage
    'subsample': [0.8, 1.0],                # Row sampling
}


# GridSearchCV setup
xgb_grid_search = GridSearchCV(
    estimator=XGBRegressor(random_state=RANDOM_SEED),
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=kf,
    verbose=1,
    n_jobs=-1
)

# Fit GridSearchCV on the training data
xgb_grid_search.fit(X_train_replication_scaled, y_train_replication)

# Cross-validation evaluation for the best XGBoost model
best_xgb = xgb_grid_search.best_estimator_
rmse_best_xgb_cv, r2_best_xgb_cv = evaluate_model(best_xgb, X_train_replication_scaled, y_train_replication, kf)

print("\nXGBoost (tuned parameters, 5-fold CV):")
print(f"Best Parameters: {xgb_grid_search.best_params_}")
print(f"Mean RMSE (CV): {rmse_best_xgb_cv:.3f}, Mean R-squared (CV): {r2_best_xgb_cv:.3f}")

# Train the best XGBoost model on full training data
best_xgb.fit(X_train_replication_scaled, y_train_replication)

# Evaluate the best XGBoost model on test data
y_pred_test_best_xgb = best_xgb.predict(X_test_replication_scaled)
rmse_test_best_xgb = np.sqrt(mean_squared_error(y_test_replication, y_pred_test_best_xgb))
r2_test_best_xgb = r2_score(y_test_replication, y_pred_test_best_xgb)

print("\nXGBoost (tuned parameters) on Test Set:")
print(f"RMSE: {rmse_test_best_xgb:.3f}, R-squared: {r2_test_best_xgb:.3f}")


Baseline (Mean Prediction):
RMSE: 0.597, R-squared: 0.000

Linear Regression (5-fold CV):
Mean RMSE (CV): 0.456, Mean R-squared (CV): 0.415

Linear Regression on Test Set:
RMSE: 0.457, R-squared: 0.420

XGBoost (default parameters, 5-fold CV):
Mean RMSE (CV): 0.415, Mean R-squared (CV): 0.516

XGBoost (default parameters) on Test Set:
RMSE: 0.416, R-squared: 0.520
Fitting 5 folds for each of 24 candidates, totalling 120 fits

XGBoost (tuned parameters, 5-fold CV):
Best Parameters: {'learning_rate': 0.01, 'max_depth': 12, 'n_estimators': 1200, 'subsample': 0.8}
Mean RMSE (CV): 0.407, Mean R-squared (CV): 0.535

XGBoost (tuned parameters) on Test Set:
RMSE: 0.407, R-squared: 0.540


We do the same with feature set 1, according to the replication preprocessing approach. This means that all listings with a price above 500 are deleted.

In [18]:
# Ensure reproducibility
RANDOM_SEED = 42

# Splitting the dataset into features and target
features_fs1 = feature_set_1_replication.drop(columns=['price'])
target_fs1 = feature_set_1_replication['price']

# Train-test split (90% train, 10% test)
X_train_fs1, X_test_fs1, y_train_fs1, y_test_fs1 = train_test_split(
    features_fs1, target_fs1, test_size=0.10, random_state=RANDOM_SEED
)

# Scale training data only
scaler_fs1 = StandardScaler()
X_train_fs1_scaled = scaler_fs1.fit_transform(X_train_fs1)
X_train_fs1_scaled = X_train_fs1_scaled.astype('float32')
y_train_fs1 = y_train_fs1.reset_index(drop=True)

# Scale test data
X_test_fs1_scaled = scaler_fs1.transform(X_test_fs1)
X_test_fs1_scaled = X_test_fs1_scaled.astype('float32')

# Baseline 1: Mean Prediction
baseline_mean_price_fs1 = y_train_fs1.mean()
baseline_rmse_fs1 = np.sqrt(mean_squared_error(y_train_fs1, [baseline_mean_price_fs1] * len(y_train_fs1)))
baseline_r2_fs1 = r2_score(y_train_fs1, [baseline_mean_price_fs1] * len(y_train_fs1))

print("Baseline (Mean Prediction):")
print(f"RMSE: {baseline_rmse_fs1:.3f}, R-squared: {baseline_r2_fs1:.3f}")

# Function to evaluate a model with cross-validation
def evaluate_model_fs1(model, X, y, cv):
    rmse_scores, r2_scores = [], []
    for train_idx, val_idx in cv.split(X):
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        model.fit(X_train, y_train)
        y_pred_val = model.predict(X_val)
        rmse_scores.append(np.sqrt(mean_squared_error(y_val, y_pred_val)))
        r2_scores.append(r2_score(y_val, y_pred_val))
    return np.mean(rmse_scores), np.mean(r2_scores)

# Cross-validation setup
kf_fs1 = KFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)

# Linear Regression cross-validation evaluation
linear_model_fs1 = LinearRegression()
rmse_lr_fs1_cv, r2_lr_fs1_cv = evaluate_model_fs1(linear_model_fs1, X_train_fs1_scaled, y_train_fs1, kf_fs1)

print("\nLinear Regression (5-fold CV):")
print(f"Mean RMSE (CV): {rmse_lr_fs1_cv:.3f}, Mean R-squared (CV): {r2_lr_fs1_cv:.3f}")

# Train Linear Regression on full training data
linear_model_fs1.fit(X_train_fs1_scaled, y_train_fs1)

# Evaluate Linear Regression on test data
y_pred_test_lr_fs1 = linear_model_fs1.predict(X_test_fs1_scaled)
rmse_test_lr_fs1 = np.sqrt(mean_squared_error(y_test_fs1, y_pred_test_lr_fs1))
r2_test_lr_fs1 = r2_score(y_test_fs1, y_pred_test_lr_fs1)

print("\nLinear Regression on Test Set:")
print(f"RMSE: {rmse_test_lr_fs1:.3f}, R-squared: {r2_test_lr_fs1:.3f}")

# XGBoost with default parameters cross-validation evaluation
xgb_default_fs1 = XGBRegressor(random_state=RANDOM_SEED)
rmse_xgb_default_fs1_cv, r2_xgb_default_fs1_cv = evaluate_model_fs1(xgb_default_fs1, X_train_fs1_scaled, y_train_fs1, kf_fs1)

print("\nXGBoost (default parameters, 5-fold CV):")
print(f"Mean RMSE (CV): {rmse_xgb_default_fs1_cv:.3f}, Mean R-squared (CV): {r2_xgb_default_fs1_cv:.3f}")

# Train XGBoost with default parameters on full training data
xgb_default_fs1.fit(X_train_fs1_scaled, y_train_fs1)

# Evaluate XGBoost on test data
y_pred_test_xgb_fs1 = xgb_default_fs1.predict(X_test_fs1_scaled)
rmse_test_xgb_fs1 = np.sqrt(mean_squared_error(y_test_fs1, y_pred_test_xgb_fs1))
r2_test_xgb_fs1 = r2_score(y_test_fs1, y_pred_test_xgb_fs1)

print("\nXGBoost (default parameters) on Test Set:")
print(f"RMSE: {rmse_test_xgb_fs1:.3f}, R-squared: {r2_test_xgb_fs1:.3f}")

# Define parameter grid for hyperparameter tuning
param_grid_fs1 = {
    'max_depth': [10, 12],
    'n_estimators': [800, 1200],
    'learning_rate': [0.01, 0.1],
    'subsample': [0.8, 1.0],
}

# GridSearchCV setup
xgb_grid_search_fs1 = GridSearchCV(
    estimator=XGBRegressor(random_state=RANDOM_SEED),
    param_grid=param_grid_fs1,
    scoring='neg_mean_squared_error',
    cv=kf_fs1,
    verbose=1,
    n_jobs=-1
)

# Fit GridSearchCV on the training data
xgb_grid_search_fs1.fit(X_train_fs1_scaled, y_train_fs1)

# Cross-validation evaluation for the best XGBoost model
best_xgb_fs1 = xgb_grid_search_fs1.best_estimator_
rmse_xgb_tuned_fs1_cv, r2_xgb_tuned_fs1_cv = evaluate_model_fs1(best_xgb_fs1, X_train_fs1_scaled, y_train_fs1, kf_fs1)

print("\nXGBoost (tuned parameters, 5-fold CV):")
print(f"Best Parameters: {xgb_grid_search_fs1.best_params_}")
print(f"Mean RMSE (CV): {rmse_xgb_tuned_fs1_cv:.3f}, Mean R-squared (CV): {r2_xgb_tuned_fs1_cv:.3f}")

# Train the best XGBoost model on full training data
best_xgb_fs1.fit(X_train_fs1_scaled, y_train_fs1)

# Evaluate the best XGBoost model on test data
y_pred_test_best_xgb_fs1 = best_xgb_fs1.predict(X_test_fs1_scaled)
rmse_test_best_xgb_fs1 = np.sqrt(mean_squared_error(y_test_fs1, y_pred_test_best_xgb_fs1))
r2_test_best_xgb_fs1 = r2_score(y_test_fs1, y_pred_test_best_xgb_fs1)

print("\nXGBoost (tuned parameters) on Test Set:")
print(f"RMSE: {rmse_test_best_xgb_fs1:.3f}, R-squared: {r2_test_best_xgb_fs1:.3f}")


Baseline (Mean Prediction):
RMSE: 0.597, R-squared: 0.000

Linear Regression (5-fold CV):
Mean RMSE (CV): 0.412, Mean R-squared (CV): 0.522

Linear Regression on Test Set:
RMSE: 0.413, R-squared: 0.528

XGBoost (default parameters, 5-fold CV):
Mean RMSE (CV): 0.300, Mean R-squared (CV): 0.747

XGBoost (default parameters) on Test Set:
RMSE: 0.299, R-squared: 0.752
Fitting 5 folds for each of 16 candidates, totalling 80 fits

XGBoost (tuned parameters, 5-fold CV):
Best Parameters: {'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 1200, 'subsample': 1.0}
Mean RMSE (CV): 0.278, Mean R-squared (CV): 0.783

XGBoost (tuned parameters) on Test Set:
RMSE: 0.274, R-squared: 0.792


In [None]:
#Results on test set
# Scale the test set using the same scaler fitted on the training data
X_test_fs1_scaled = scaler_fs1.transform(X_test_fs1)
# Evaluate Linear Regression on the Test Set
y_pred_test_lr_fs1 = linear_model_fs1.predict(X_test_fs1_scaled)
rmse_test_lr_fs1 = np.sqrt(mean_squared_error(y_test_fs1, y_pred_test_lr_fs1))
r2_test_lr_fs1 = r2_score(y_test_fs1, y_pred_test_lr_fs1)

print("\nLinear Regression:")
print(f"Test RMSE: {rmse_test_lr_fs1:.3f}")
print(f"Test R-squared: {r2_test_lr_fs1:.3f}")

# Best model retrieved from GridSearchCV
best_xgb_fs1 = xgb_grid_search_fs1.best_estimator_

# Predict on the test set
y_pred_test_fs1 = best_xgb_fs1.predict(X_test_fs1_scaled)   

# Evaluate RMSE and R-squared on the test set
rmse_test_fs1 = np.sqrt(mean_squared_error(y_test_fs1, y_pred_test_fs1))
r2_test_fs1 = r2_score(y_test_fs1, y_pred_test_fs1)

# Print test set evaluation metrics
print("\nTest Set Evaluation:")
print(f"RMSE: {rmse_test_fs1:.3f}")
print(f"R-squared: {r2_test_fs1:.3f}")

The baseline model will generate the same results for all 4 feature sets, because they all consist of the same listings in the train set.

Now we compare the four feature sets to each other.

Hyperparameter tuning for feature set 1

In [16]:
# Function to evaluate a model with cross-validation
def evaluate_model(model, X, y, cv):
    rmse_scores, r2_scores = [], []
    for train_idx, val_idx in cv.split(X):
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        model.fit(X_train, y_train)
        y_pred_val = model.predict(X_val)
        rmse_scores.append(np.sqrt(mean_squared_error(y_val, y_pred_val)))
        r2_scores.append(r2_score(y_val, y_pred_val))
    return np.mean(rmse_scores), np.mean(r2_scores)

# Cross-validation setup
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Scale training data only
scaler = StandardScaler()
features_train_1 = scaler.fit_transform(feature_set_1_train.drop(columns=['price']))
target_train_1 = feature_set_1_train['price']

# Baseline 1: Mean Prediction
baseline_mean_price = target_train_1.mean()
baseline_rmse = np.sqrt(mean_squared_error(target_train_1, [baseline_mean_price] * len(target_train_1)))
baseline_r2 = r2_score(target_train_1, [baseline_mean_price] * len(target_train_1))

print("Baseline (Mean Prediction):")
print(f"RMSE: {baseline_rmse:.3f}, R-squared: {baseline_r2:.3f}")

# Baseline 2: Linear Regression
linear_model = LinearRegression()
rmse_lr, r2_lr = evaluate_model(linear_model, features_train_1, target_train_1, kf)

print("\nLinear Regression:")
print(f"Mean RMSE (CV): {rmse_lr:.3f}, Mean R-squared (CV): {r2_lr:.3f}")

# XGBoost with default parameters
xgb_default = XGBRegressor(random_state=42)
rmse_xgb_default, r2_xgb_default = evaluate_model(xgb_default, features_train_1, target_train_1, kf)

print("\nXGBoost (default parameters):")
print(f"Mean RMSE (CV): {rmse_xgb_default:.3f}, Mean R-squared (CV): {r2_xgb_default:.3f}")

# Define parameter grid for hyperparameter tuning
param_grid = {
    'max_depth': [6, 8, 10, 12],            # Moderate tree depths
    'n_estimators': [200, 400, 800, 1200],      # Number of trees
    'learning_rate': [0.01, 0.1, 0.3],    # Step size shrinkage
    'subsample': [0.8, 1.0],              # Row sampling
}

# GridSearchCV setup
xgb_grid_search = GridSearchCV(
    estimator=XGBRegressor(random_state=42),
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=kf,
    verbose=1,
    n_jobs=-1
)

# Fit GridSearchCV on the training data
xgb_grid_search.fit(features_train_1, target_train_1)

# Retrieve the best estimator and evaluate it using cross-validation
best_xgb = xgb_grid_search.best_estimator_
rmse_xgb_tuned, r2_xgb_tuned = evaluate_model(best_xgb, features_train_1, target_train_1, kf)

print("\nXGBoost (tuned parameters):")
print(f"Best Parameters: {xgb_grid_search.best_params_}")
print(f"Mean RMSE (CV): {rmse_xgb_tuned:.3f}, Mean R-squared (CV): {r2_xgb_tuned:.3f}")


Baseline (Mean Prediction):
RMSE: 0.698, R-squared: 0.000

Linear Regression:
Mean RMSE (CV): 0.464, Mean R-squared (CV): 0.557

XGBoost (default parameters):
Mean RMSE (CV): 0.338, Mean R-squared (CV): 0.765
Fitting 5 folds for each of 96 candidates, totalling 480 fits

XGBoost (tuned parameters):
Best Parameters: {'learning_rate': 0.1, 'max_depth': 8, 'n_estimators': 1200, 'subsample': 1.0}
Mean RMSE (CV): 0.313, Mean R-squared (CV): 0.799


Hyperparameter tuning for feature set 2

In [18]:
# Scale training data for feature set 2
features_train_2 = scaler.fit_transform(feature_set_2_train.drop(columns=['price']))
target_train_2 = feature_set_2_train['price']

# Baseline 1: Mean Prediction
baseline_mean_price_2 = target_train_2.mean()
baseline_rmse_2 = np.sqrt(mean_squared_error(target_train_2, [baseline_mean_price_2] * len(target_train_2)))
baseline_r2_2 = r2_score(target_train_2, [baseline_mean_price_2] * len(target_train_2))

print("\nBaseline (Mean Prediction) - Feature Set 2:")
print(f"RMSE: {baseline_rmse_2:.3f}, R-squared: {baseline_r2_2:.3f}")

# Baseline 2: Linear Regression
linear_model_2 = LinearRegression()
rmse_lr_2, r2_lr_2 = evaluate_model(linear_model_2, features_train_2, target_train_2, kf)

print("\nLinear Regression - Feature Set 2:")
print(f"Mean RMSE (CV): {rmse_lr_2:.3f}, Mean R-squared (CV): {r2_lr_2:.3f}")

# XGBoost with default parameters
xgb_default_2 = XGBRegressor(random_state=42)
rmse_xgb_default_2, r2_xgb_default_2 = evaluate_model(xgb_default_2, features_train_2, target_train_2, kf)

print("\nXGBoost (default parameters) - Feature Set 2:")
print(f"Mean RMSE (CV): {rmse_xgb_default_2:.3f}, Mean R-squared (CV): {r2_xgb_default_2:.3f}")

# Define parameter grid for hyperparameter tuning
param_grid_2 = {
    'max_depth': [6, 8, 10, 12],            # Moderate tree depths
    'n_estimators': [200, 400, 800, 1200],      # Number of trees
    'learning_rate': [0.01, 0.1, 0.3],    # Step size shrinkage
    'subsample': [0.8, 1.0],              # Row sampling
}

# GridSearchCV setup
xgb_grid_search_2 = GridSearchCV(
    estimator=XGBRegressor(random_state=42),
    param_grid=param_grid_2,
    scoring='neg_mean_squared_error',
    cv=kf,
    verbose=1,
    n_jobs=-1
)

# Fit GridSearchCV on the training data for feature set 2
xgb_grid_search_2.fit(features_train_2, target_train_2)

# Retrieve the best estimator and evaluate it using cross-validation
best_xgb_2 = xgb_grid_search_2.best_estimator_
rmse_xgb_tuned_2, r2_xgb_tuned_2 = evaluate_model(best_xgb_2, features_train_2, target_train_2, kf)

print("\nXGBoost (tuned parameters) - Feature Set 2:")
print(f"Best Parameters: {xgb_grid_search_2.best_params_}")
print(f"Mean RMSE (CV): {rmse_xgb_tuned_2:.3f}, Mean R-squared (CV): {r2_xgb_tuned_2:.3f}")



Baseline (Mean Prediction) - Feature Set 2:
RMSE: 0.698, R-squared: 0.000

Linear Regression - Feature Set 2:
Mean RMSE (CV): 0.451, Mean R-squared (CV): 0.583

XGBoost (default parameters) - Feature Set 2:
Mean RMSE (CV): 0.329, Mean R-squared (CV): 0.778
Fitting 5 folds for each of 96 candidates, totalling 480 fits

XGBoost (tuned parameters) - Feature Set 2:
Best Parameters: {'learning_rate': 0.1, 'max_depth': 8, 'n_estimators': 1200, 'subsample': 1.0}
Mean RMSE (CV): 0.306, Mean R-squared (CV): 0.808


Hyperparametertuning for feature set 3 comparing Textblob and Vader

In [18]:
# Function to evaluate a model with cross-validation
def evaluate_model(model, X, y, cv):
    rmse_scores, r2_scores = [], []
    for train_idx, val_idx in cv.split(X):
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        model.fit(X_train, y_train)
        y_pred_val = model.predict(X_val)
        rmse_scores.append(np.sqrt(mean_squared_error(y_val, y_pred_val)))
        r2_scores.append(r2_score(y_val, y_pred_val))
    return np.mean(rmse_scores), np.mean(r2_scores)

# Cross-validation setup
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Function to evaluate a feature set
def evaluate_feature_set(features_train, target_train, feature_set_name):
    print(f"\n=== {feature_set_name} ===")
    
    # Baseline 1: Mean Prediction
    baseline_mean_price = target_train.mean()
    baseline_rmse = np.sqrt(mean_squared_error(target_train, [baseline_mean_price] * len(target_train)))
    baseline_r2 = r2_score(target_train, [baseline_mean_price] * len(target_train))
    print("Baseline (Mean Prediction):")
    print(f"RMSE: {baseline_rmse:.3f}, R-squared: {baseline_r2:.3f}")
    
    # Baseline 2: Linear Regression
    linear_model = LinearRegression()
    rmse_lr, r2_lr = evaluate_model(linear_model, features_train, target_train, kf)
    print("\nLinear Regression:")
    print(f"Mean RMSE (CV): {rmse_lr:.3f}, Mean R-squared (CV): {r2_lr:.3f}")
    
    # XGBoost with default parameters
    xgb_default = XGBRegressor(random_state=42)
    rmse_xgb_default, r2_xgb_default = evaluate_model(xgb_default, features_train, target_train, kf)
    print("\nXGBoost (default parameters):")
    print(f"Mean RMSE (CV): {rmse_xgb_default:.3f}, Mean R-squared (CV): {r2_xgb_default:.3f}")
    
    # GridSearchCV for XGBoost
    param_grid = {
    'max_depth': [6, 8, 10, 12],            # Moderate tree depths
    'n_estimators': [200, 400, 800, 1200],      # Number of trees
    'learning_rate': [0.01, 0.1, 0.3],    # Step size shrinkage
    'subsample': [0.8, 1.0],              # Row sampling
}
    xgb_grid_search = GridSearchCV(
        estimator=XGBRegressor(random_state=42),
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=kf,
        verbose=1,
        n_jobs=-1
    )
    xgb_grid_search.fit(features_train, target_train)
    
    # Save best model and parameters
    best_model = xgb_grid_search.best_estimator_
    best_params = xgb_grid_search.best_params_
    rmse_xgb_tuned, r2_xgb_tuned = evaluate_model(best_model, features_train, target_train, kf)
    
    print("\nXGBoost (tuned parameters):")
    print(f"Best Parameters: {best_params}")
    print(f"Mean RMSE (CV): {rmse_xgb_tuned:.3f}, Mean R-squared (CV): {r2_xgb_tuned:.3f}")
    
    return best_model, best_params

# Prepare TextBlob data
scaler_textblob = StandardScaler()
features_train_textblob = scaler_textblob.fit_transform(feature_set_3_textblob_train.drop(columns=['price']))
target_train_textblob = feature_set_3_textblob_train['price']

# Evaluate TextBlob and save results
best_xgb_textblob, best_params_textblob = evaluate_feature_set(features_train_textblob, target_train_textblob, "Feature Set 3 (TextBlob)")

# Prepare Vader data
scaler_vader = StandardScaler()
features_train_vader = scaler_vader.fit_transform(feature_set_3_vader_train.drop(columns=['price']))
target_train_vader = feature_set_3_vader_train['price']

# Evaluate Vader and save results
best_xgb_vader, best_params_vader = evaluate_feature_set(features_train_vader, target_train_vader, "Feature Set 3 (Vader)")





=== Feature Set 3 (TextBlob) ===
Baseline (Mean Prediction):
RMSE: 0.698, R-squared: 0.000

Linear Regression:
Mean RMSE (CV): 0.450, Mean R-squared (CV): 0.585

XGBoost (default parameters):
Mean RMSE (CV): 0.331, Mean R-squared (CV): 0.776
Fitting 5 folds for each of 96 candidates, totalling 480 fits


6 fits failed out of a total of 480.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
2 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\PC\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\PC\AppData\Local\Programs\Python\Python313\Lib\site-packages\xgboost\core.py", line 726, in inner_f
    return func(**kwargs)
  File "c:\Users\PC\AppData\Local\Programs\Python\Python313\Lib\site-packages\xgboost\sklearn.py", line 1108, in fit
    self._Booster = train(
                    ~~~~~^
        params,
        ^^


XGBoost (tuned parameters):
Best Parameters: {'learning_rate': 0.1, 'max_depth': 8, 'n_estimators': 1200, 'subsample': 1.0}
Mean RMSE (CV): 0.308, Mean R-squared (CV): 0.805

=== Feature Set 3 (Vader) ===
Baseline (Mean Prediction):
RMSE: 0.698, R-squared: 0.000

Linear Regression:
Mean RMSE (CV): 0.450, Mean R-squared (CV): 0.584

XGBoost (default parameters):
Mean RMSE (CV): 0.330, Mean R-squared (CV): 0.776
Fitting 5 folds for each of 96 candidates, totalling 480 fits

XGBoost (tuned parameters):
Best Parameters: {'learning_rate': 0.1, 'max_depth': 8, 'n_estimators': 1200, 'subsample': 1.0}
Mean RMSE (CV): 0.309, Mean R-squared (CV): 0.805





=== Test Set Evaluation: Feature Set 3 (TextBlob) ===
Test RMSE: 0.968, Test R-squared: -0.917





=== Test Set Evaluation: Feature Set 3 (Vader) ===
Test RMSE: 1.072, Test R-squared: -1.352


Hyperparameter tuning comparing Textblob and Vader for feature set 4

In [18]:
# Function to evaluate a model with cross-validation
def evaluate_model(model, X, y, cv):
    rmse_scores, r2_scores = [], []
    for train_idx, val_idx in cv.split(X):
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        model.fit(X_train, y_train)
        y_pred_val = model.predict(X_val)
        rmse_scores.append(np.sqrt(mean_squared_error(y_val, y_pred_val)))
        r2_scores.append(r2_score(y_val, y_pred_val))
    return np.mean(rmse_scores), np.mean(r2_scores)

# Cross-validation setup
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Function to evaluate a feature set
def evaluate_feature_set(features_train, target_train, feature_set_name):
    print(f"\n=== {feature_set_name} ===")
    
    # Baseline 1: Mean Prediction
    baseline_mean_price = target_train.mean()
    baseline_rmse = np.sqrt(mean_squared_error(target_train, [baseline_mean_price] * len(target_train)))
    baseline_r2 = r2_score(target_train, [baseline_mean_price] * len(target_train))
    print("Baseline (Mean Prediction):")
    print(f"RMSE: {baseline_rmse:.3f}, R-squared: {baseline_r2:.3f}")
    
    # Baseline 2: Linear Regression
    linear_model = LinearRegression()
    rmse_lr, r2_lr = evaluate_model(linear_model, features_train, target_train, kf)
    print("\nLinear Regression:")
    print(f"Mean RMSE (CV): {rmse_lr:.3f}, Mean R-squared (CV): {r2_lr:.3f}")
    
    # XGBoost with default parameters
    xgb_default = XGBRegressor(random_state=42)
    rmse_xgb_default, r2_xgb_default = evaluate_model(xgb_default, features_train, target_train, kf)
    print("\nXGBoost (default parameters):")
    print(f"Mean RMSE (CV): {rmse_xgb_default:.3f}, Mean R-squared: {r2_xgb_default:.3f}")
    
    # GridSearchCV for XGBoost
    param_grid =  {
    'max_depth': [6, 8, 10, 12],            # Moderate tree depths
    'n_estimators': [200, 400, 800, 1200],      # Number of trees
    'learning_rate': [0.01, 0.1, 0.3],    # Step size shrinkage
    'subsample': [0.8, 1.0],              # Row sampling
}
    
    xgb_grid_search = GridSearchCV(
        estimator=XGBRegressor(random_state=42),
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=kf,
        verbose=1,
        n_jobs=-1
    )
    xgb_grid_search.fit(features_train, target_train)
    
    # Save best model and parameters
    best_model = xgb_grid_search.best_estimator_
    best_params = xgb_grid_search.best_params_
    rmse_xgb_tuned, r2_xgb_tuned = evaluate_model(best_model, features_train, target_train, kf)
    
    print("\nXGBoost (tuned parameters):")
    print(f"Best Parameters: {best_params}")
    print(f"Mean RMSE (CV): {rmse_xgb_tuned:.3f}, Mean R-squared (CV): {r2_xgb_tuned:.3f}")
    
    return best_model, best_params

# Prepare TextBlob data for Feature Set 4
scaler_textblob = StandardScaler()
features_train_textblob = scaler_textblob.fit_transform(feature_set_4_textblob_train.drop(columns=['price']))
target_train_textblob = feature_set_4_textblob_train['price']

# Evaluate TextBlob for Feature Set 4
best_xgb_textblob_4, best_params_textblob_4 = evaluate_feature_set(features_train_textblob, target_train_textblob, "Feature Set 4 (TextBlob)")

# Prepare Vader data for Feature Set 4
scaler_vader = StandardScaler()
features_train_vader = scaler_vader.fit_transform(feature_set_4_vader_train.drop(columns=['price']))
target_train_vader = feature_set_4_vader_train['price']

# Evaluate Vader for Feature Set 4
best_xgb_vader_4, best_params_vader_4 = evaluate_feature_set(features_train_vader, target_train_vader, "Feature Set 4 (Vader)")

# Test set evaluation
def test_model(model, scaler, features_test, target_test, feature_set_name):
    # Scale the test data using the same scaler as training
    features_test_scaled = scaler.transform(features_test)
    # Predict on the test data
    y_pred_test = model.predict(features_test_scaled)
    # Evaluate RMSE and R-squared
    rmse_test = np.sqrt(mean_squared_error(target_test, y_pred_test))
    r2_test = r2_score(target_test, y_pred_test)
    print(f"\n=== Test Set Evaluation: {feature_set_name} ===")
    print(f"Test RMSE: {rmse_test:.3f}, Test R-squared: {r2_test:.3f}")

# Test TextBlob for Feature Set 4
features_test_textblob = feature_set_4_textblob_test.drop(columns=['price'])
target_test_textblob = feature_set_4_textblob_test['price']
test_model(best_xgb_textblob_4, scaler_textblob, features_test_textblob, target_test_textblob, "Feature Set 4 (TextBlob)")

# Test Vader for Feature Set 4
features_test_vader = feature_set_4_vader_test.drop(columns=['price'])
target_test_vader = feature_set_4_vader_test['price']
test_model(best_xgb_vader_4, scaler_vader, features_test_vader, target_test_vader, "Feature Set 4 (Vader)")



=== Feature Set 4 (TextBlob) ===
Baseline (Mean Prediction):
RMSE: 0.698, R-squared: 0.000

Linear Regression:
Mean RMSE (CV): 0.669, Mean R-squared (CV): 0.082

XGBoost (default parameters):
Mean RMSE (CV): 0.633, Mean R-squared: 0.178
Fitting 5 folds for each of 96 candidates, totalling 480 fits

XGBoost (tuned parameters):
Best Parameters: {'learning_rate': 0.01, 'max_depth': 8, 'n_estimators': 1200, 'subsample': 0.8}
Mean RMSE (CV): 0.627, Mean R-squared (CV): 0.193

=== Feature Set 4 (Vader) ===
Baseline (Mean Prediction):
RMSE: 0.698, R-squared: 0.000

Linear Regression:
Mean RMSE (CV): 0.673, Mean R-squared (CV): 0.071

XGBoost (default parameters):
Mean RMSE (CV): 0.641, Mean R-squared: 0.157
Fitting 5 folds for each of 96 candidates, totalling 480 fits

XGBoost (tuned parameters):
Best Parameters: {'learning_rate': 0.01, 'max_depth': 8, 'n_estimators': 1200, 'subsample': 0.8}
Mean RMSE (CV): 0.635, Mean R-squared (CV): 0.171

=== Test Set Evaluation: Feature Set 4 (TextBlob) 