## Housing Data Analysis

In [None]:
# import statements

import pandas as pd
import math
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn import metrics
import numpy as np
import seaborn as sns
from sklearn import svm
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestRegressor

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the housing dataset; the relative path means the CSV must sit in the
# notebook's working directory.

data = pd.read_csv('Housing Data.csv')
# A separate test file is not loaded; this line is left disabled and
# `data_test` is not referenced in the cells that follow.
#data_test = pd.read_csv('test.csv')

In [None]:
# Preview the first rows of the raw frame (head() defaults to five rows)
data.head()

In [None]:
# Print the DataFrame summary produced by DataFrame.info(); handy for
# checking column dtypes and spotting missing values before modelling.

data.info()

## House Price Distribution

In [None]:
# Summary statistics of the target variable
print(data['SalePrice'].describe())

# Histogram of sale prices with a KDE overlay.
# `sns.distplot` is deprecated, so `sns.histplot` is used instead;
# stat='density' keeps the density-normalised y-axis that distplot drew,
# and alpha=0.4 replaces the former hist_kws={'alpha': 0.4}.
plt.figure(figsize=(9, 8))
sns.histplot(data['SalePrice'], color='g', bins=100, stat='density', kde=True, alpha=0.4)
plt.grid(True)
plt.show()

## Numerical Data Distribution

In [None]:
# Restrict the frame to its int64 / float64 columns
num_features = data.select_dtypes(include=['int64', 'float64'])

# Show the first five rows of the numeric subset
num_features.head()

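The `Id` column is only a row identifier and carries no predictive information, so we drop it. `SalePrice` stays in the frame because it is the target variable used in the regression below.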

In [None]:
# Drop the row identifier. `columns=` already names the axis, so the redundant
# `axis=1` is removed. errors='ignore' keeps this cell re-runnable: without it,
# running the cell a second time raises KeyError because 'Id' is already gone.
num_features = num_features.drop(columns=['Id'], errors='ignore')

In [None]:
# One histogram per numeric column. DataFrame.hist creates its own figure, so
# no separate plt.figure() call is made (it would only render an empty figure).
num_features.hist(figsize=(16, 20), bins=50, xlabelsize=8, ylabelsize=8)

# Title the whole grid; plt.title would only label the last subplot drawn.
plt.suptitle('Numerical Features Distribution')
plt.show()


In [None]:
# Flag, per column, whether at least one value is missing
num_features.isnull().any()

In [None]:
# Discard every row that has a missing value in any numeric column
num_features = num_features.dropna()

In [None]:
# Pairwise correlations between all remaining numeric columns
corr_matrix = num_features.corr()

# Draw the correlation matrix as a heatmap on an explicit Axes
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(corr_matrix, cmap='coolwarm', annot=False, linewidths=.5, ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

## Linear Regression

In [None]:
# Separate the SalePrice target from the remaining numeric predictors
y = num_features['SalePrice']
X = num_features.drop(columns='SalePrice')

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a linear regression on the training split
model = LinearRegression().fit(X_train, y_train)

# Predict the held-out rows and score the predictions
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)


In [None]:
# Scatter of actual vs. predicted sale prices on the held-out split
plt.scatter(y_test, y_pred)
# Dashed diagonal marks perfect predictions (predicted == actual)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)
plt.xlabel("Actual Price")
plt.ylabel("Predicted Price")
# Title corrected: the target plotted here is SalePrice, not rent
plt.title("Actual vs. Predicted Housing Sale Prices")
plt.grid(True)
plt.show()

In [None]:
# Pair every feature with its fitted coefficient
feature_names = X.columns
coefficients = model.coef_

# Raw coefficients are expressed per unit of each feature, so they cannot be
# compared across features measured on different scales (a coefficient per
# square foot vs. one per quality point, for instance). Multiplying each
# coefficient by the feature's training-set standard deviation gives the
# predicted price change for a one-standard-deviation change in that feature,
# which is comparable across features.
coeff_df = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': coefficients,
    'ScaledCoefficient': coefficients * X_train.std().to_numpy(),
})

# Order features by the magnitude of the scaled effect, largest first
coeff_df = coeff_df.reindex(
    coeff_df['ScaledCoefficient'].abs().sort_values(ascending=False).index
)

# Plotting the top 10 most important features
plt.figure(figsize=(10, 6))
sns.barplot(x='ScaledCoefficient', y='Feature', data=coeff_df.head(10), palette='coolwarm')
plt.xlabel('Coefficient x feature standard deviation')
plt.title('Top 10 Most Important Features for Predicting House Prices')
plt.show()

## Multiple Regression

In [None]:
# Numeric columns only, with missing values replaced by each column's median
numeric_df = data.select_dtypes(include=[np.number])
numeric_df = numeric_df.fillna(numeric_df.median())
correlation_matrix = numeric_df.corr()

# Rank candidate predictors by absolute correlation with SalePrice.
# The target (and the 'Id' row identifier, if present) is removed by label
# instead of skipping the first sorted entry, so the selection no longer
# depends on SalePrice happening to sort first.
target_corr = correlation_matrix['SalePrice'].drop(labels=['SalePrice', 'Id'], errors='ignore')
top_3_features = target_corr.abs().sort_values(ascending=False).head(3).index.tolist()

X = numeric_df[top_3_features]
y = numeric_df['SalePrice']

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit on the three selected features and score on the held-out rows
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse:.2f}")
print(f"R^2 Score: {r2:.2f}")

In [None]:
# Actual vs. predicted sale prices for the held-out rows
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred, alpha=0.5)

# Dashed diagonal: where points would fall if predictions were exact
ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=4)

ax.set_xlabel('Actual')
ax.set_ylabel('Predicted')
ax.set_title('Actual vs. Predicted Sale Prices')
ax.grid(True)
plt.show()


## Random Forest Regressor

In [None]:
# Hand-picked predictor columns and the SalePrice target
X = data[[
    'OverallQual',
    'GrLivArea',
    'TotalBsmtSF',
    'YearBuilt',
    'FullBath',
    'BedroomAbvGr',
]]
y = data['SalePrice']

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a 100-tree random forest on the training rows
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict the held-out rows
y_pred = model.predict(X_test)

# Score the predictions
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Squared Error (Random Forest Regression):", mse)
print("R-squared (Random Forest Regression):", r2)

In [None]:
# Actual vs. predicted sale prices, with a dashed perfect-prediction diagonal
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred)
ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)
ax.set_xlabel('Actual Sale Price')
ax.set_ylabel('Predicted Sale Price')
ax.set_title('Actual vs. Predicted Sale Prices')
ax.grid(True)
plt.show()

# Histogram of the residuals (actual minus predicted)
prediction_errors = y_test - y_pred
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(prediction_errors, bins=50, color='blue', alpha=0.7)
ax.set_title('Prediction Error vs. Frequency')
ax.set_xlabel('Prediction Error')
ax.set_ylabel('Frequency')
ax.grid(True)
plt.show()

## Modified Random Forest Regressor

In [None]:
# Reduced predictor set: only GrLivArea, YearBuilt and BedroomAbvGr
# (narrowed from the six columns used by the previous random forest)
X = data[['GrLivArea', 'YearBuilt', 'BedroomAbvGr']]
y = data['SalePrice']

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a 100-tree random forest; kept under its own name so the interactive
# prediction cell can use it
model_forest = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict the held-out rows and score the predictions
y_pred = model_forest.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Squared Error (Random Forest Regression):", mse)
print("R-squared (Random Forest Regression):", r2)

In [None]:
# Actual vs. predicted sale prices, with a dashed perfect-prediction diagonal
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred)
ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)
ax.set_xlabel('Actual Sale Price')
ax.set_ylabel('Predicted Sale Price')
ax.set_title('Actual vs. Predicted Sale Prices')
ax.grid(True)
plt.show()

# Histogram of the residuals (actual minus predicted)
prediction_errors = y_test - y_pred
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(prediction_errors, bins=50, color='blue', alpha=0.7)
ax.set_title('Prediction Error vs. Frequency')
ax.set_xlabel('Prediction Error')
ax.set_ylabel('Frequency')
ax.grid(True)
plt.show()

In [None]:
# Function to take user input and predict housing price
def predict_housing_price(model=None):
    """Prompt for three house attributes and print the predicted sale price.

    Parameters
    ----------
    model : fitted regressor, optional
        Any object with a ``predict`` method that was fitted on the columns
        'GrLivArea', 'YearBuilt' and 'BedroomAbvGr'. Defaults to the
        notebook-level ``model_forest``.

    Raises
    ------
    ValueError
        If an entered value cannot be parsed as a number.
    """
    if model is None:
        model = model_forest

    # Get user input for features
    num_bedrooms = int(input('Enter the number of bedrooms: '))
    # Prompt reworded: this value feeds 'GrLivArea', not a single living room.
    # NOTE(review): assumes GrLivArea means above-ground living area, as in the
    # Ames housing data dictionary - confirm against the CSV's documentation.
    size_sqft = float(input('Enter the above-ground living area in square feet: '))
    year_built = int(input('Enter the year in which the house was built: '))

    # Build the row by column name. The previous positional list was ordered
    # [bedrooms, area, year] while the model was fitted on
    # ['GrLivArea', 'YearBuilt', 'BedroomAbvGr'], so each value landed in the
    # wrong feature.
    features = pd.DataFrame([{
        'GrLivArea': size_sqft,
        'YearBuilt': year_built,
        'BedroomAbvGr': num_bedrooms,
    }])

    # When the estimator recorded its training columns, follow that exact order
    trained_order = getattr(model, 'feature_names_in_', None)
    if trained_order is not None:
        features = features[list(trained_order)]

    # Predict housing price
    predicted_price = model.predict(features)
    print('Predicted housing price:', predicted_price[0])
# Run the interactive prediction once. The function reads its values with
# input(), so this cell waits for keyboard entry and needs an interactive session.
predict_housing_price()