In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.svm import SVR

In [None]:
# Load the raw dataset from the Excel workbook; the relative path is resolved
# against the notebook's current working directory.
data = pd.read_excel("innercity.xlsx")
# Preview the first rows to sanity-check the load.
data.head()

In [None]:
# Print a summary of the frame: column names, non-null counts and dtypes.
data.info()

# Dataset Cleaning

In [None]:
# Flag every cell whose string form contains a literal '$' (regex disabled),
# then keep only the rows in which no column is flagged.
# NOTE(review): '$' is presumably a placeholder for missing values - confirm.
has_dollar = data.apply(lambda column: column.astype(str).str.contains('$', regex=False))
data = data[~has_dollar.any(axis=1)]

In [None]:
# Count the missing values in each column
data.isna().sum()

In [None]:
# Drop every row that has at least one missing value
data = data.dropna(axis=0, how='any')

# Convert 'dayhours' to datetime

In [None]:
# Parse 'dayhours' once, then derive the calendar parts from the parsed values.
# `data` comes from a boolean-mask selection, so writing columns into it in
# place can raise SettingWithCopyWarning; .assign() returns a new frame instead.
# Column order is unchanged: 'dayhours' is replaced where it stands and the
# four new columns are appended in the order given.
dayhours_parsed = pd.to_datetime(data['dayhours'])

data = data.assign(
    dayhours=dayhours_parsed,
    year=dayhours_parsed.dt.year,
    month=dayhours_parsed.dt.month,
    day=dayhours_parsed.dt.day,
    hour=dayhours_parsed.dt.hour,
)

In [None]:
# The derived year/month/day/hour columns replace the raw timestamp,
# so the original 'dayhours' column is dropped here.
data = data.drop('dayhours', axis=1)

In [None]:
# Remove the 'cid' column from the frame.
# NOTE(review): presumably a record identifier with no predictive value - confirm.
data = data.drop('cid', axis=1)

In [None]:
# Preview the frame after the cleaning and date-feature steps above.
data.head()

# Check for duplicate rows, if any

In [None]:
# Collect every row that has an exact duplicate across all columns.
# keep=False marks all members of a duplicate group, not only the repeats.
duplicates = data[data.duplicated(keep=False)]
# Leave the frame as the cell's last expression so Jupyter renders it as a
# rich, truncated table instead of a plain-text print.
duplicates

In [None]:
# Show summary statistics for the columns of the cleaned frame.
data.describe()

# EDA (Exploratory Data Analysis)

## Visualize feature distributions and their relationship to the target variable 'price'

In [None]:
living_measure = data['living_measure']
price = data['price']

# Scatter of 'living_measure' against 'price', drawn through the explicit Axes API
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(living_measure, price, alpha=0.5)
ax.set_title('Scatter Plot of Living Measure vs. Price')
ax.set_xlabel('Living Measure')
ax.set_ylabel('Price')
ax.grid(True)
plt.show()


In [None]:
room_bed = data['room_bed']

# Ten-bin histogram of the 'room_bed' column, drawn through the explicit Axes API
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(room_bed, bins=10, color='skyblue', edgecolor='black')
ax.set_title('Histogram of Room Bed Distribution')
ax.set_xlabel('Room Bed')
ax.set_ylabel('Frequency')
ax.grid(True)
plt.show()


In [None]:
# Columns included in the pairwise-correlation heatmap
numerical_columns = [
    'living_measure', 'price', 'room_bed', 'room_bath',
    'lot_measure', 'ceil_measure', 'coast', 'yr_built',
    'yr_renovated', 'lat', 'long', 'total_area',
]
corr_matrix = data[numerical_columns].corr()

# Annotated heatmap of the correlation matrix, two decimals per cell
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5, ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

# Selecting Features

In [None]:
# Target (Y) is the 'price' column; features (X) are all remaining columns
Y = data['price']
X = data.drop('price', axis=1)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

# Model Building

## Applying Linear Regression

In [None]:
# Fit an ordinary least-squares model on the training split and predict the test split
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Actual vs. predicted scatter; the dashed diagonal marks a perfect prediction
price_span = [min(y_test), max(y_test)]
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred, alpha=0.5)
ax.plot(price_span, price_span, '--', color='black')
ax.set_title('Actual vs. Predicted Prices (Linear Regression)')
ax.set_xlabel('Actual Price')
ax.set_ylabel('Predicted Price')
ax.grid(True)
plt.show()

In [None]:
# Distribution of the linear-regression residuals (actual - predicted).
# sns.distplot is deprecated; histplot with a density-normalised histogram and
# a KDE overlay is the documented replacement for its default output.
lr_residuals = y_test - y_pred
sns.histplot(lr_residuals, bins=50, kde=True, stat="density", kde_kws=dict(cut=3))
plt.title('Distplot of Linear Regression')
plt.show()

In [None]:
# Score the fitted linear model on the held-out test split
# (for scikit-learn regressors, .score() returns the R^2 coefficient).
model.score(X_test, y_test)

## Model Interpretation

In [None]:
# Pair each feature column with the weight the linear model learned for it
feature_names = X.columns
coefficients = model.coef_

for name, weight in zip(feature_names, coefficients):
    print(f"{name}: {weight}")

## Applying Random Forest Method

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fit a 10-tree random forest with a fixed seed, then predict the test split
regressor = RandomForestRegressor(n_estimators=10, random_state=0)
regressor.fit(X_train, y_train)
rf_predictions = regressor.predict(X_test)

# Actual vs. predicted scatter; the dashed diagonal marks a perfect prediction
price_span = [min(y_test), max(y_test)]
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, rf_predictions, alpha=0.5)
ax.plot(price_span, price_span, '--', color='black')
ax.set_title('Actual vs. Predicted Prices (Random Forest)')
ax.set_xlabel('Actual Price')
ax.set_ylabel('Predicted Price')
ax.grid(True)
plt.show()

In [None]:
# Distribution of the random-forest residuals (actual - predicted).
# The residuals must come from rf_predictions; y_pred holds the linear-regression
# predictions, so using it here would re-plot the linear model's errors.
# sns.distplot is deprecated; histplot with a density-normalised histogram and
# a KDE overlay is the documented replacement for its default output.
rf_residuals = y_test - rf_predictions
sns.histplot(rf_residuals, bins=50, kde=True, stat="density", kde_kws=dict(cut=3))
plt.title('Distplot of Random Forest Model')
plt.show()

In [None]:
# Score the fitted random forest on the held-out test split
# (for scikit-learn regressors, .score() returns the R^2 coefficient).
regressor.score(X_test, y_test)

# Evaluation

In [None]:
# Predict prices on the test set.
# `model` is the LinearRegression fitted earlier, so the evaluation below
# covers the linear model only, not the random forest (`regressor`).
y_pred = model.predict(X_test)

In [None]:
# Mean squared error and R^2 of the predictions against the held-out targets
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Report the two test-set metrics computed in the previous cell.
print("Mean Squared Error(MSE):", mse)
print("R-squared (R2) Score:", r2)

## Model Interpretation

In [None]:
# Print each feature name next to the coefficient the linear model learned for it.
# NOTE(review): this cell repeats the earlier "Model Interpretation" cell verbatim;
# `model` has not been refit in between, so the output is the same.
coefficients = model.coef_
feature_names = X.columns

for feature, coef in zip(feature_names, coefficients):
    print(f"{feature}: {coef}")

In [None]:
# Feature values for one hypothetical listing, keyed by column name.
# Key order is kept exactly as written because the single-row DataFrame
# built from this dict inherits it.
new_data_point = {
    # rooms and floor areas
    'room_bed': 3, 'room_bath': 2,
    'living_measure': 1800, 'lot_measure': 6000,
    'ceil': 1,
    # site and quality attributes
    'coast': 0, 'sight': 0, 'condition': 4, 'quality': 7,
    'ceil_measure': 1800, 'basement': 0,
    # construction history
    'yr_built': 1995, 'yr_renovated': 0,
    # location
    'zipcode': 98034, 'lat': 47.7228, 'long': -122.183,
    # the '...15' columns, furnishing flag and total area
    'living_measure15': 1800, 'lot_measure15': 6000,
    'furnished': 0, 'total_area': 7800,
    # date parts in the same form as those derived from 'dayhours'
    'year': 2023, 'month': 10, 'day': 15, 'hour': 14,
}

In [None]:
# Convert the dictionary to a single-row DataFrame and select its columns in
# the same order as the training features. scikit-learn validates feature
# names against those seen during fit, so aligning to X.columns makes the
# prediction independent of the dict's key order. A training feature missing
# from the dict raises a KeyError here; keys not in X.columns are dropped.
new_data = pd.DataFrame([new_data_point])[list(X.columns)]

In [None]:
# Predict the price for the new data point using the trained model.
# `model` is the LinearRegression fitted earlier; predict() returns an array
# with one entry per input row, so here it holds a single value.
predicted_price = model.predict(new_data)

In [None]:
# Show the single predicted value (first and only element of the result array).
print("Predicted Price:", predicted_price[0])

## The predicted price is approximately 806552.51.