<a href="https://colab.research.google.com/github/torbenbillow/CBS-AML-PROJECT/blob/main/notebooks/00_welcome.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

*Import all the relevant libraries below to process, explore, and model data.*

In [53]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, LogisticRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.svm import SVR, SVC

from sklearn.metrics import (mean_absolute_error, mean_squared_error, r2_score,
                             accuracy_score, precision_score, recall_score,
                             f1_score, confusion_matrix, classification_report,
                             roc_auc_score, roc_curve, max_error, mean_absolute_percentage_error)

from google.colab import drive


In [54]:
# Mount the user's Google Drive inside the Colab runtime so that files under
# /content/drive become readable.
drive.mount('/content/drive')
# Location of the listings CSV inside the mounted Drive.
file_path = '/content/drive/MyDrive/listings.csv'
# Read the CSV into the working DataFrame used by the cells below.
df = pd.read_csv(file_path)
# Last expression of the cell: displays (rows, columns) of the loaded frame.
df.shape

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


(22684, 79)

In [55]:
# Clean the target column in one chained expression:
#   1. discard listings that have no price at all;
#   2. remove the dollar sign and thousands separators from the remaining
#      values and cast the result to float.
df = (
    df.dropna(subset=['price'])
      .assign(
          price=lambda frame: frame['price']
          .replace({'\\$': '', ',': ''}, regex=True)
          .astype(float)
      )
)


In [56]:
# Predictor columns pulled from the listings frame, grouped by theme.
features = [
    # size / capacity of the listing
    'accommodates',
    'bathrooms',
    'bedrooms',
    'beds',
    # categorical descriptors (one-hot encoded in a later cell)
    'room_type',
    'property_type',
    'neighbourhood_cleansed',
    # host attributes (converted to numbers in a later cell)
    'host_is_superhost',
    'host_response_rate',
    'host_acceptance_rate',
]

# Independent copy, so feature engineering on X never writes back into df.
X = df.loc[:, features].copy()
# Target: listing price. It is re-selected in a later cell once rows have
# been dropped from X, so that X and y stay aligned.
y = df.loc[:, 'price']

In [57]:
# Convert the host columns to numbers.
#
# The values are read from the untouched columns in `df` (restricted to the
# rows currently in X) instead of from X itself. That makes the cell safe to
# re-run: reading the already-converted float columns back through `.str`
# raises AttributeError, and pushing already-encoded 1/0 values through the
# {'t', 'f'} mapping would turn every row into NaN. On a first top-to-bottom
# run the result is identical to converting X in place.
# Column assignment aligns on index, and X.index is a subset of df.index
# because X was built from df.

# Percentage strings such as "95%" become 95.0; missing values stay NaN.
X['host_response_rate'] = (
    df.loc[X.index, 'host_response_rate'].str.rstrip('%').astype(float)
)
X['host_acceptance_rate'] = (
    df.loc[X.index, 'host_acceptance_rate'].str.rstrip('%').astype(float)
)

# 't' -> 1, 'f' -> 0; any other value (including missing) becomes NaN and is
# dropped by the later dropna on this column.
X['host_is_superhost'] = df.loc[X.index, 'host_is_superhost'].map({'t': 1, 'f': 0})

In [58]:
# One-hot encode the three categorical columns. drop_first=True omits one
# indicator per column so the dummies are not perfectly collinear.
X = pd.get_dummies(
    X,
    columns=['room_type', 'property_type', 'neighbourhood_cleansed'],
    drop_first=True,
)


In [61]:
# Missing-value policy, applied as one chain:
#   - rows lacking any of the size fields or the superhost flag are dropped;
#   - missing host response/acceptance rates are replaced with 0.
# NOTE(review): filling with 0 makes an unknown rate indistinguishable from a
# true 0% rate -- confirm this is the intended imputation.
X = (
    X.dropna(subset=['bathrooms', 'bedrooms', 'beds', 'host_is_superhost'])
     .fillna({'host_response_rate': 0, 'host_acceptance_rate': 0})
)

In [65]:
# Re-select the target so that it holds exactly the rows that survived the
# feature cleaning, in the same order as X.
y = df['price'].loc[X.index]
# Sanity check: both shapes must report the same number of rows.
print(X.shape, y.shape)

(13719, 69) (13719,)


In [66]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Baseline: ordinary least squares on the training split (fit returns the
# estimator itself), then predictions for the held-out rows.
lr_model = LinearRegression().fit(X_train, y_train)
y_pred = lr_model.predict(X_test)

In [70]:
# Held-out performance of the linear baseline.
# R²: share of price variance explained on the test split.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
# RMSE: square root of the mean squared error, in the same units as price.
rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))

print("Linear Regression R²:", r2)
print("Linear Regression RMSE:", rmse)

Linear Regression R²: 0.3335328359923392
Linear Regression RMSE: 738.5395165452009


In [71]:
# Second model: a 100-tree random forest trained on the same split as the
# linear baseline. The fixed seed keeps the fitted forest reproducible.
rf_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Predictions for the held-out rows.
y_pred_rf = rf_model.predict(X_test)

In [72]:
# Held-out performance of the random forest, using the same metrics as the
# linear baseline so the two models are directly comparable.
# NOTE(review): the recorded R² for this model is negative, i.e. worse than
# predicting the mean price -- worth checking the target for extreme values.
r2_rf = r2_score(y_true=y_test, y_pred=y_pred_rf)
rmse_rf = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred_rf))

print("Random Forest R²:", r2_rf)
print("Random Forest RMSE:", rmse_rf)

Random Forest R²: -0.01850871646258967
Random Forest RMSE: 912.9914999911936


In [73]:
# Impurity-based importance of every feature in the fitted forest.
#
# The labels come from the column names the model recorded when it was
# fitted (`feature_names_in_`), not from the global `X`. `X` is reassigned
# by several cells, so after an out-of-order re-run its columns may no longer
# match the fitted model; that would mislabel the importances or raise a
# length-mismatch error. Estimators that do not expose `feature_names_in_`
# fall back to the training frame's columns.
feature_importances = pd.Series(
    rf_model.feature_importances_,
    index=getattr(rf_model, 'feature_names_in_', X_train.columns),
)
# Show the ten most influential features, largest first.
feature_importances.sort_values(ascending=False).head(10)

Unnamed: 0,0
host_response_rate,0.192295
accommodates,0.150302
host_acceptance_rate,0.125564
bedrooms,0.11972
beds,0.080114
neighbourhood_cleansed_Indre By,0.050273
bathrooms,0.047987
neighbourhood_cleansed_Østerbro,0.030579
property_type_Entire condo,0.029081
neighbourhood_cleansed_Nørrebro,0.022685
