# GitHub: https://github.com/williamclark13/HousingPredictions

# Import Libraries

In [132]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# Data Wrangling

In [133]:
df = pd.read_csv("Housing.csv")

df = df[(df['price'] < 12000000) & (df['price'] > 2000000)]
y = df['price']

prediction_goal = "Predict house prices (regression)"
print(prediction_goal)

Predict house prices (regression)


In [134]:
categorical_features = ["mainroad", "guestroom", "basement", "hotwaterheating", "airconditioning", "prefarea", "furnishingstatus"]
df = pd.get_dummies(df, columns=categorical_features, drop_first=True)

df = df.dropna()
df = df.drop_duplicates()

df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking,mainroad_yes,guestroom_yes,basement_yes,hotwaterheating_yes,airconditioning_yes,prefarea_yes,furnishingstatus_semi-furnished,furnishingstatus_unfurnished
4,11410000,7420,4,1,2,2,1,1,1,0,1,0,0,0
5,10850000,7500,3,3,1,2,1,0,1,0,1,1,1,0
6,10150000,8580,4,3,4,2,1,0,0,0,1,1,1,0
7,10150000,16200,5,3,2,0,1,0,0,0,0,0,0,1
8,9870000,8100,4,1,2,2,1,1,1,0,1,1,0,0


# Feature Engineering

In [135]:
df['total_bed_bath'] = df['bedrooms'] + df['bathrooms']

X = df.drop("price", axis=1)
anova = SelectKBest(score_func=f_classif, k=5)
X_new = anova.fit_transform(X, y)

# Feature Scaling

In [136]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_new)

# Build Prediction Model: Linear Regression

In [137]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
model = RandomForestRegressor()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse:.2f}")
print(f"R-squared Score: {r2:.2f} \n")

cross_val_scores = cross_val_score(model, X_scaled, y, cv=5, scoring='neg_mean_squared_error')
mse_cv = -cross_val_scores.mean()
print(f"Cross-Validation Mean Squared Error: {mse_cv:.2f}")

Mean Squared Error: 1900344376308.84
R-squared Score: 0.52 

Cross-Validation Mean Squared Error: 2840388204773.03


# 1: Problem Statement
* **Dataset**: housing prices prediction
* **Target Variable**: prices
* **State Prediction Goal**: regression

# 3: Feature Engineering
* **Variables with little variance**: none chosen
* **Features irrelevant to target variable**: none are, but some are less relevant
* **Highly-Correlated features**: none chosen
* **Algorithm require normalized/standardized data**: yes
* **Categorical variables need be changed to dummy variables**: yes

# 4: Build Prediction Model
* **Does data need to be balanced**: no
* **What do you use to validate result**: train_test_split
* **Which algorithm is appropriate for this problem**: random forest regressor
* **Which metrics do you use to evalutate prediction model**:
    - Chose MSE because: gives clear understanding of sqrd prediction error.
    - Chose R2 because: states how well model fits data.

# Task: 2, 5, 6 dont have answerable questions
# Ultimately, the model that I had built failed.