In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression



  from pandas.core import (


In [2]:
# Load the data
data = pd.read_csv('Resources/clean_property_data.csv')

# Print the columns to check for the correct column name
print(data.columns)

Index(['proptype', 'district', 'nbhd', 'style', 'stories', 'year_built',
       'rooms', 'finishedsqft', 'units', 'bdrms', 'fbath', 'lotsize',
       'sale_date', 'sale_price', 'sale_year', 'sale_month'],
      dtype='object')


In [3]:

# Clean column names by stripping any extra spaces
data.columns = data.columns.str.strip()

In [4]:

# Check the cleaned column names
print(data.columns)

Index(['proptype', 'district', 'nbhd', 'style', 'stories', 'year_built',
       'rooms', 'finishedsqft', 'units', 'bdrms', 'fbath', 'lotsize',
       'sale_date', 'sale_price', 'sale_year', 'sale_month'],
      dtype='object')


In [5]:
data.info(type)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55870 entries, 0 to 55869
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   proptype      55870 non-null  object 
 1   district      55870 non-null  float64
 2   nbhd          55870 non-null  float64
 3   style         55870 non-null  object 
 4   stories       55870 non-null  float64
 5   year_built    55870 non-null  float64
 6   rooms         55870 non-null  float64
 7   finishedsqft  55870 non-null  float64
 8   units         55870 non-null  int64  
 9   bdrms         55870 non-null  float64
 10  fbath         55870 non-null  float64
 11  lotsize       55870 non-null  float64
 12  sale_date     55870 non-null  object 
 13  sale_price    55870 non-null  float64
 14  sale_year     55870 non-null  int64  
 15  sale_month    55870 non-null  int64  
dtypes: float64(10), int64(3), object(3)
memory usage: 6.8+ MB


In [6]:
#Checking to make sure there are no nulls 
data.isnull().sum()

proptype        0
district        0
nbhd            0
style           0
stories         0
year_built      0
rooms           0
finishedsqft    0
units           0
bdrms           0
fbath           0
lotsize         0
sale_date       0
sale_price      0
sale_year       0
sale_month      0
dtype: int64

In [7]:
#converting the sale date to datetime , becasue when imported here it couldnt read it as datetime
data['sale_date'] = pd.to_datetime(data['sale_date'], errors='coerce')
data.info(type)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55870 entries, 0 to 55869
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   proptype      55870 non-null  object        
 1   district      55870 non-null  float64       
 2   nbhd          55870 non-null  float64       
 3   style         55870 non-null  object        
 4   stories       55870 non-null  float64       
 5   year_built    55870 non-null  float64       
 6   rooms         55870 non-null  float64       
 7   finishedsqft  55870 non-null  float64       
 8   units         55870 non-null  int64         
 9   bdrms         55870 non-null  float64       
 10  fbath         55870 non-null  float64       
 11  lotsize       55870 non-null  float64       
 12  sale_date     55870 non-null  datetime64[ns]
 13  sale_price    55870 non-null  float64       
 14  sale_year     55870 non-null  int64         
 15  sale_month    55870 non-null  int64 

In [8]:
#dropping saledate b/c it's not needed for the model. We're going to use sale-year and sale-month
data = data.drop(columns=['sale_date'])
data

Unnamed: 0,proptype,district,nbhd,style,stories,year_built,rooms,finishedsqft,units,bdrms,fbath,lotsize,sale_price,sale_year,sale_month
0,commercial,9.0,6202.0,office building - 1 story,1.0,1981.0,0.0,12960.0,5,0.0,0.0,54885.0,530000.0,2023,8
1,residential,9.0,40.0,cape cod,1.0,1942.0,7.0,1182.0,1,4.0,1.0,33541.0,160000.0,2023,12
2,residential,9.0,40.0,ranch,1.0,2006.0,9.0,1880.0,1,3.0,2.0,10607.0,387500.0,2023,8
3,residential,9.0,40.0,ranch,1.0,1980.0,6.0,1489.0,1,3.0,3.0,8640.0,335000.0,2023,8
4,residential,9.0,40.0,ranch,1.0,1986.0,5.0,1209.0,1,3.0,1.0,7200.0,250000.0,2023,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55865,commercial,12.0,6275.0,"store building - single tenant, 1 story",1.0,1926.0,0.0,1533.0,1,0.0,0.0,421.0,203000.0,2013,12
55866,commercial,14.0,6282.0,tavern,2.0,1906.0,0.0,2880.0,1,0.0,0.0,3000.0,225000.0,2013,12
55867,commercial,4.0,6296.0,warehouse building - 1 story,1.0,1964.0,0.0,42141.0,1,0.0,0.0,91440.0,10000000.0,2013,12
55868,commercial,1.0,6234.0,warehouse building - 1 story,1.0,1951.0,0.0,11297.0,1,0.0,0.0,16403.0,200000.0,2013,12


In [9]:
# Converting all non-numerical vvalues to numerical category 
data_dummies = pd.get_dummies(data)
data_dummies

Unnamed: 0,district,nbhd,stories,year_built,rooms,finishedsqft,units,bdrms,fbath,lotsize,...,style_townhse,style_trilevel,style_triplex,style_trucking terminal,style_tudor,style_unkwn,style_used car sale,style_warehouse,style_warehouse building - 1 story,style_warehouse vintage
0,9.0,6202.0,1.0,1981.0,0.0,12960.0,5,0.0,0.0,54885.0,...,False,False,False,False,False,False,False,False,False,False
1,9.0,40.0,1.0,1942.0,7.0,1182.0,1,4.0,1.0,33541.0,...,False,False,False,False,False,False,False,False,False,False
2,9.0,40.0,1.0,2006.0,9.0,1880.0,1,3.0,2.0,10607.0,...,False,False,False,False,False,False,False,False,False,False
3,9.0,40.0,1.0,1980.0,6.0,1489.0,1,3.0,3.0,8640.0,...,False,False,False,False,False,False,False,False,False,False
4,9.0,40.0,1.0,1986.0,5.0,1209.0,1,3.0,1.0,7200.0,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55865,12.0,6275.0,1.0,1926.0,0.0,1533.0,1,0.0,0.0,421.0,...,False,False,False,False,False,False,False,False,False,False
55866,14.0,6282.0,2.0,1906.0,0.0,2880.0,1,0.0,0.0,3000.0,...,False,False,False,False,False,False,False,False,False,False
55867,4.0,6296.0,1.0,1964.0,0.0,42141.0,1,0.0,0.0,91440.0,...,False,False,False,False,False,False,False,False,True,False
55868,1.0,6234.0,1.0,1951.0,0.0,11297.0,1,0.0,0.0,16403.0,...,False,False,False,False,False,False,False,False,True,False


In [10]:
#creating the independent variables set 
X = data_dummies.drop(columns=['sale_price'])
X

Unnamed: 0,district,nbhd,stories,year_built,rooms,finishedsqft,units,bdrms,fbath,lotsize,...,style_townhse,style_trilevel,style_triplex,style_trucking terminal,style_tudor,style_unkwn,style_used car sale,style_warehouse,style_warehouse building - 1 story,style_warehouse vintage
0,9.0,6202.0,1.0,1981.0,0.0,12960.0,5,0.0,0.0,54885.0,...,False,False,False,False,False,False,False,False,False,False
1,9.0,40.0,1.0,1942.0,7.0,1182.0,1,4.0,1.0,33541.0,...,False,False,False,False,False,False,False,False,False,False
2,9.0,40.0,1.0,2006.0,9.0,1880.0,1,3.0,2.0,10607.0,...,False,False,False,False,False,False,False,False,False,False
3,9.0,40.0,1.0,1980.0,6.0,1489.0,1,3.0,3.0,8640.0,...,False,False,False,False,False,False,False,False,False,False
4,9.0,40.0,1.0,1986.0,5.0,1209.0,1,3.0,1.0,7200.0,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55865,12.0,6275.0,1.0,1926.0,0.0,1533.0,1,0.0,0.0,421.0,...,False,False,False,False,False,False,False,False,False,False
55866,14.0,6282.0,2.0,1906.0,0.0,2880.0,1,0.0,0.0,3000.0,...,False,False,False,False,False,False,False,False,False,False
55867,4.0,6296.0,1.0,1964.0,0.0,42141.0,1,0.0,0.0,91440.0,...,False,False,False,False,False,False,False,False,True,False
55868,1.0,6234.0,1.0,1951.0,0.0,11297.0,1,0.0,0.0,16403.0,...,False,False,False,False,False,False,False,False,True,False


In [11]:
# creating the dependent variable set 
y = data_dummies['sale_price']
y

0          530000.0
1          160000.0
2          387500.0
3          335000.0
4          250000.0
            ...    
55865      203000.0
55866      225000.0
55867    10000000.0
55868      200000.0
55869       25000.0
Name: sale_price, Length: 55870, dtype: float64

In [12]:
# spliting data into training and testing 80/20 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
# setting the model to linear regression 
model = LinearRegression()
# fitting the model 
model.fit(X_train, y_train)

In [14]:
# Prediction with test data 
y_pred = model.predict(X_test)
# getting mean square 
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 744184368009.607


In [15]:
# determining accuracy score 
model.score(X_test, y_test)

0.5602274545689324

# Optimization Attempts (incomplete)

In [None]:
# 
X_train.info()

In [None]:
X_test.info()

In [None]:
y_train.info()

In [256]:
# Create a StandardScaler instances
scaler_x = StandardScaler()
scaler_y = StandardScaler()

# Fit the StandardScaler
x_scaler = scaler_x.fit(X_train)
y_scaler = scaler_y.fit(y_train)

# Scale the data
X_train_scaled = data_scaler.transform(X_train)
X_test_scaled = data_scaler.transform(X_test)
df_X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
df_X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_train.columns)



model_2 = LinearRegression()
model_2.fit(df_X_train_scaled, y_train)


In [278]:
y_pred_2 = model.predict(df_X_test_scaled)
y_pred_2

array([-6229058.21934363, -4739615.62862225, -5772307.1190488 , ...,
       -5733956.23168218, -5014605.84440024, -3689317.97982302])

In [279]:
mse = mean_squared_error(y_test, y_pred_2)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 123660044910223.69


In [283]:
model_2.score(X_test, y_test)

-2.4797394871491074e+22