In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import catboost
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [3]:
# Read 'rentfaster.csv' (path is relative to the current working directory) into a DataFrame.
data = pd.read_csv('rentfaster.csv')

In [None]:
# Show the loaded DataFrame as the cell output.
data

In [None]:
# Print column names, non-null counts and dtypes to spot missing values and text columns.
data.info()

In [None]:
# Summary statistics of the numeric columns before any rows are removed.
data.describe()

In [7]:
# Remove every row that contains at least one missing value.
# The frame is modified in place, so `data` itself shrinks.
data.dropna(axis=0, how='any', inplace=True)

In [None]:
# Summary statistics of the numeric columns after rows with missing values were dropped.
data.describe()

In [9]:
# Split the frame into the prediction target ('price') and the remaining feature columns.
y = data['price']
X = data.drop(columns=['price'])

In [10]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [11]:
# Re-attach the target to the training features (aligned on the row index) so that
# exploration and feature engineering can work on a single frame.
train_data = X_train.assign(price=y_train)

In [None]:
# Draw one histogram per numeric column of the training frame (default figure size).
train_data.hist()

In [None]:
# Correlation heatmap restricted to the numeric columns of the training frame.
numeric_data = train_data.select_dtypes(include='number')
plt.figure(figsize=(20, 10))
sns.heatmap(data=numeric_data.corr(), cmap="YlGnBu", annot=True)

In [14]:
# Flag rows where both 'cats' and 'dogs' are true, stored as 1/0 instead of True/False.
train_data['pet_eligibility'] = (train_data['cats'] & train_data['dogs']).astype(int)


In [None]:
# Count training rows per pet_eligibility value (1 = both 'cats' and 'dogs' are true).
train_data.pet_eligibility.value_counts()

In [None]:
# List the dtype of every training column to find the ones that still need conversion.
print(train_data.dtypes)

In [17]:
# Parse 'baths' and 'sq_feet' as numbers; entries that cannot be parsed become NaN.
# Note: 'beds' is not converted by this cell.
for column in ('baths', 'sq_feet'):
    train_data[column] = pd.to_numeric(train_data[column], errors='coerce')


In [None]:
# Print the training column dtypes again to inspect the current state of each column.
print(train_data.dtypes)

In [None]:
# Histograms of the numeric training columns on a 20x10 inch figure.
train_data.hist(figsize=(20,10))

In [20]:
# A studio has no separate bedroom, so relabel 'Studio' as '0 Beds' to let the column
# be handled numerically afterwards.
train_data['beds'] = train_data['beds'].replace({'Studio': '0 Beds'})


In [None]:
# Count how often each distinct 'beds' value occurs in the training frame.
train_data.beds.value_counts()

In [22]:
# Keep only the leading token of each entry (e.g. '0 Beds' -> '0') and parse it as a number;
# anything that cannot be parsed becomes NaN.
# A single split is sufficient (the column used to be split twice). Casting to str first
# keeps the cell re-runnable: once 'beds' is numeric the .str accessor would otherwise raise.
train_data['beds'] = pd.to_numeric(
    train_data['beds'].astype(str).str.split().str[0],
    errors='coerce',
)


In [None]:
# Print the training column dtypes to inspect them after the 'beds' handling.
print(train_data.dtypes)

In [24]:
# Combined room count; computed on the raw counts, before the log transform further down.
train_data['total_rooms'] = train_data['beds'] + train_data['baths']

# 'link' and 'address' are removed rather than encoded.
train_data = train_data.drop(columns=['link', 'address'])

# One-hot encode each categorical column: append its 0/1 indicator columns (the first
# level is dropped to avoid a redundant column) and remove the original text column.
categorical_features = ['city', 'province', 'lease_term', 'type', 'furnishing', 'availability_date', 'smoking']
for category in categorical_features:
    dummies = pd.get_dummies(train_data[category], prefix=category, drop_first=True).astype(int)
    train_data = train_data.join(dummies).drop(columns=category)

# Apply log(x + 1) to the size-related features; the +1 keeps zero values finite.
for column in ('total_rooms', 'beds', 'baths', 'sq_feet'):
    train_data[column] = np.log(train_data[column] + 1)

In [None]:
# Print the training column dtypes to inspect the frame after feature engineering.
print(train_data.dtypes)

In [26]:
# Re-attach the target to the held-out features (aligned on the row index) so the test
# rows can go through the same preparation steps as the training rows.
test_data = X_test.assign(price=y_test)

In [27]:
# Flag rows where both 'cats' and 'dogs' are true, stored as 1/0 instead of True/False.
test_data['pet_eligibility'] = (test_data['cats'] & test_data['dogs']).astype(int)

# 'link' and 'address' are removed rather than encoded.
test_data = test_data.drop(columns=['link', 'address'])

In [28]:
# A studio has no separate bedroom, so relabel 'Studio' as '0 Beds' before parsing.
test_data['beds'] = test_data['beds'].replace('Studio', '0 Beds')

# Keep only the leading token of each entry (e.g. '0 Beds' -> '0') and parse it as a number;
# anything that cannot be parsed becomes NaN.
# A single split is sufficient (the column used to be split twice). Casting to str first
# keeps the cell re-runnable: once 'beds' is numeric the .str accessor would otherwise raise.
test_data['beds'] = pd.to_numeric(
    test_data['beds'].astype(str).str.split().str[0],
    errors='coerce',
)

# Parse the remaining size columns as numbers; non-parsable entries become NaN.
test_data['baths'] = pd.to_numeric(test_data['baths'], errors='coerce')
test_data['sq_feet'] = pd.to_numeric(test_data['sq_feet'], errors='coerce')

In [29]:
# Combined room count; computed on the raw counts, before the log transform further down.
test_data['total_rooms'] = test_data['beds'] + test_data['baths']

# One-hot encode each categorical column: append its 0/1 indicator columns (the first
# level is dropped) and remove the original text column.
categorical_features = ['city', 'province', 'lease_term', 'type', 'furnishing', 'availability_date', 'smoking']
for category in categorical_features:
    dummies = pd.get_dummies(test_data[category], prefix=category, drop_first=True).astype(int)
    test_data = test_data.join(dummies).drop(columns=category)

# Apply log(x + 1) to the size-related features; the +1 keeps zero values finite.
for column in ('total_rooms', 'beds', 'baths', 'sq_feet'):
    test_data[column] = np.log(test_data[column] + 1)




In [30]:
# Force every training column to a numeric dtype, column by column.
# Values that cannot be parsed as numbers are replaced with NaN instead of raising.
train_data = train_data.apply(lambda column: pd.to_numeric(column, errors='coerce'))


In [31]:
# Align columns of test data to match the training data: same columns, same order.
# Indicator columns that exist only in the training frame are added and filled with 0;
# columns that exist only in the test frame are discarded.
test_data = test_data.reindex(columns=train_data.columns, fill_value=0)

# The training frame is coerced to numeric dtypes before fitting; apply the same coercion
# here so both frames reach the model with matching dtypes (non-parsable values -> NaN).
test_data = test_data.apply(pd.to_numeric, errors='coerce')


In [None]:
# Print the dtype of every test column for comparison with the training frame.
print(test_data.dtypes)

In [None]:
# Print the dtype of every training column as a final check before model fitting.
print(train_data.dtypes)

In [None]:
# Baseline random forest: 100 trees, fixed seed for reproducible results.
# Every column except 'price' is used as a feature; 'price' is the target.
rf_model = RandomForestRegressor(random_state=1, n_estimators=100)
rf_model.fit(X=train_data.drop(columns=['price']), y=train_data['price'])

In [None]:
# Evaluate the baseline forest on the held-out rows.
test_features = test_data.drop(columns=['price'])
test_target = test_data['price']

# Predicted prices for the test rows.
y_pred = rf_model.predict(test_features)

# Coefficient of determination, as reported by the estimator's own score() method.
# NOTE(review): this variable name would shadow sklearn.metrics.r2_score if that
# function were ever imported into the notebook.
r2_score = rf_model.score(test_features, test_target)

# Error metrics: MAE, MSE and its square root (RMSE).
mae = mean_absolute_error(test_target, y_pred)
mse = mean_squared_error(test_target, y_pred)
rmse = np.sqrt(mse)

print(f'R² Score: {r2_score}')
print(f'Mean Absolute Error (MAE): {mae}')
print(f'Mean Squared Error (MSE): {mse}')
print(f'Root Mean Squared Error (RMSE): {rmse}')


In [None]:
# Exhaustive hyper-parameter search for the random forest.
# GridSearchCV is imported in the notebook's import cell at the top.
# 3 * 3 * 3 * 3 = 81 parameter combinations, each fitted on 3 folds.
param_grid = {
    'n_estimators': [50, 100, 150],  # Number of trees in the forest
    'max_depth': [10, 20, None],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],  # Minimum number of samples required to be at a leaf node
}

# Build the feature/target frames once instead of re-dropping 'price' for every call.
train_features = train_data.drop('price', axis=1)
train_target = train_data['price']
test_features = test_data.drop('price', axis=1)
test_target = test_data['price']

# rf_model only serves as the template estimator; the search fits its own copies.
# n_jobs=-1 uses all available cores, verbose=2 prints progress for each fit.
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(train_features, train_target)

# Best parameter combination, refitted on the full training data.
best_rf_model = grid_search.best_estimator_

y_pred = best_rf_model.predict(test_features)

# Evaluate the predictions (MSE is computed once and reused for the RMSE)
tuned_mse = mean_squared_error(test_target, y_pred)
print("R² Score: ", best_rf_model.score(test_features, test_target))
print("Mean Absolute Error (MAE): ", mean_absolute_error(test_target, y_pred))
print("Mean Squared Error (MSE): ", tuned_mse)
print("Root Mean Squared Error (RMSE): ", np.sqrt(tuned_mse))
