In [None]:
import pandas as pd

# Importing the Final Dataset

In [None]:
final_df = pd.read_csv("/content/drive/MyDrive/Housing Datasets/Dataframe_final.csv")
final_df.head()

Unnamed: 0,DATE,Population,Interest Rate,Inflation,Unemployment Percentage,Median Family Income,SP500,GDP,CSUSHPISA
0,2003-01-01,219897.0,5.916,1.754286,5.8,43320.0,855.7,11174.129,128.461
1,2003-02-01,220114.0,5.8425,1.912632,5.9,43320.0,841.15,11174.129,129.355
2,2003-03-01,220317.0,5.745,1.862857,5.9,43320.0,848.18,11174.129,130.148
3,2003-04-01,220540.0,5.8125,1.774762,6.0,43320.0,916.92,11312.766,130.884
4,2003-05-01,220768.0,5.484,1.660952,6.1,43320.0,963.59,11312.766,131.735


# Data Cleaning and Preprocessing

In [None]:
# Calculating and checking for Null values

null_values = final_df.isnull().sum()
print("Number of NULL values in each column:")
print(null_values)

Number of NULL values in each column:
DATE                        3
Population                  3
Interest Rate               3
Inflation                   3
Unemployment Percentage     3
Median Family Income       11
SP500                       3
GDP                         2
CSUSHPISA                   3
dtype: int64


In [None]:
# Checking Rows with Null values

rows_with_null = final_df[final_df.isnull().any(axis=1)]
print("Rows with NULL values:")
print(rows_with_null)

Rows with NULL values:
           DATE  Population  Interest Rate  Inflation  \
240  2023-01-01    265962.0         6.2725   2.237000   
241  2023-02-01    266112.0         6.2575   2.333684   
242  2023-03-01    266272.0         6.5440   2.302174   
243  2023-04-01    266443.0         6.3425   2.266500   
244  2023-05-01    266618.0         6.4250   2.213636   
245  2023-06-01    266801.0         6.7140   2.200476   
246  2023-07-01    267002.0         6.8400   2.296500   
247  2023-08-01    267213.0         7.0720   2.335217   
248         NaN         NaN            NaN        NaN   
249         NaN         NaN            NaN        NaN   
250         NaN         NaN            NaN        NaN   

     Unemployment Percentage  Median Family Income     SP500        GDP  \
240                      3.4                   NaN  4,076.60  26813.601   
241                      3.6                   NaN  3,970.15  26813.601   
242                      3.5                   NaN  4,109.31  26813

In [None]:
# Dropping the last three Empty rows

final_df = final_df.iloc[:-3]

In [None]:
# As Median Income for Final year was not avaliable, Using Linear Interpolation to Fill those values

final_df["Median Family Income"].interpolate(method="linear", inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df["Median Family Income"].interpolate(method="linear", inplace=True)


In [None]:
null_values = final_df.isnull().sum()
print("Number of NULL values in each column:")
print(null_values)

Number of NULL values in each column:
DATE                       0
Population                 0
Interest Rate              0
Inflation                  0
Unemployment Percentage    0
Median Family Income       0
SP500                      0
GDP                        0
CSUSHPISA                  0
dtype: int64


In [None]:
print(final_df.dtypes)

DATE                        object
Population                 float64
Interest Rate              float64
Inflation                  float64
Unemployment Percentage    float64
Median Family Income       float64
SP500                       object
GDP                         object
CSUSHPISA                  float64
dtype: object


In [None]:
# Converting Object Datatypes into Numeric DataTypes

# As SP500 Column has a , in numeric values, first removing that
final_df['SP500'] = final_df['SP500'].str.replace(',', '').astype(float)

columns_to_convert = ['SP500', 'GDP']

for column in columns_to_convert:
    final_df[column] = pd.to_numeric(final_df[column])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['SP500'] = final_df['SP500'].str.replace(',', '').astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df[column] = pd.to_numeric(final_df[column])


In [None]:
print(final_df.dtypes)

DATE                        object
Population                 float64
Interest Rate              float64
Inflation                  float64
Unemployment Percentage    float64
Median Family Income       float64
SP500                      float64
GDP                        float64
CSUSHPISA                  float64
dtype: object


# Checking for Impact of each of these columns of Final Housing Price

**Correlation, Linear Regression, ANOVA and Random Forest Feature Importance**

In [None]:
correlation_matrix = final_df.corr()

# Extract the correlation coefficients for the "CSUSHPISA" column
correlation_with_housing_price = correlation_matrix["CSUSHPISA"]
print("Pearson Correlation with Housing Price (CSUSHPISA):")
print(correlation_with_housing_price)

Pearson Correlation with Housing Price (CSUSHPISA):
Population                 0.739373
Interest Rate             -0.047991
Inflation                  0.142515
Unemployment Percentage   -0.535753
Median Family Income       0.892649
SP500                      0.931535
GDP                        0.884637
CSUSHPISA                  1.000000
Name: CSUSHPISA, dtype: float64


  correlation_matrix = final_df.corr()


In [None]:
# Implementing Linear Regression

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

X = final_df[['Population', 'Interest Rate', 'Inflation', 'Unemployment Percentage', 'Median Family Income', 'SP500','GDP']]
y = final_df['CSUSHPISA']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

# Getting the coefficients (impact) of each feature
coefficients = model.coef_

print("Coefficients (Impact of Features):")
for feature, coefficient in zip(X.columns, coefficients):
    print(f"{feature}: {coefficient}")


Coefficients (Impact of Features):
Population: -0.002018926364403867
Interest Rate: 8.559294149451542
Inflation: -0.2836051028624317
Unemployment Percentage: -0.07391112161791825
Median Family Income: 0.0024308792441072313
SP500: 0.023926675527280006
GDP: 0.005861130220022659


In [None]:
# Implementing one-way ANOVA

from scipy.stats import f_oneway

features = ['Population', 'Interest Rate', 'Inflation', 'Unemployment Percentage', 'Median Family Income', 'SP500','GDP']

for feature in features:
    # Group the data by the feature
    groups = final_df.groupby(feature)

    # Extract the housing price data for each group
    data = [group[1]['CSUSHPISA'] for group in groups]

    # Perform one-way ANOVA
    f_statistic, p_value = f_oneway(*data)

    # Display the feature name and its ANOVA p-value
    print(f"Feature: {feature}")
    print("ANOVA p-value:", p_value)


Feature: Population
ANOVA p-value: nan
Feature: Interest Rate
ANOVA p-value: 0.7406117493368806
Feature: Inflation
ANOVA p-value: 0.1543963017172223
Feature: Unemployment Percentage
ANOVA p-value: 9.087364406653531e-21
Feature: Median Family Income
ANOVA p-value: 9.49438884794728e-211
Feature: SP500
ANOVA p-value: nan
Feature: GDP
ANOVA p-value: 2.526899120789098e-224




In [None]:
# Implementing Random Forest Regression

from sklearn.ensemble import RandomForestRegressor

X = final_df[['Population', 'Interest Rate', 'Inflation', 'Unemployment Percentage', 'Median Family Income', 'SP500','GDP']]
y = final_df['CSUSHPISA']

model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X, y)

feature_importance = model.feature_importances_

# Display feature importance scores
print("Feature Importance:")
for feature, importance in zip(X.columns, feature_importance):
    print(f"{feature}: {importance}")


Feature Importance:
Population: 0.4771772309094921
Interest Rate: 0.01074882943304159
Inflation: 0.0015672180810437357
Unemployment Percentage: 0.11352697052230477
Median Family Income: 0.038164086493545205
SP500: 0.13494514137981645
GDP: 0.2238705231807562
