<a href="https://colab.research.google.com/github/leyli16/HousingPricePrediction/blob/main/housing_price_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Packages

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt

# Data Wrangling and Cleaning

## Loading data through kaggle setup

In [2]:
%%capture
!pip instsall kaggle

In [3]:
# Mount Google Drive at /content/drive (Colab-only); the Kaggle API token is copied from there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
!mkdir ~/.kaggle
!cp /content/drive/MyDrive/kaggle.json ~/.kaggle/

## Loading in USA Real Estate Dataset


In [5]:
# Download the USA real-estate dataset archive with the Kaggle CLI.
# `!!` runs the shell command and returns its output as a list of lines (shown as the cell result).
!!kaggle datasets download ahmedshahriarsakib/usa-real-estate-dataset

['Dataset URL: https://www.kaggle.com/datasets/ahmedshahriarsakib/usa-real-estate-dataset',
 'License(s): other']

In [6]:
!unzip usa-real-estate-dataset.zip

Archive:  usa-real-estate-dataset.zip
  inflating: realtor-data.zip.csv    


In [8]:
# Load the extracted listings CSV and preview the first 10 rows
df_raw = pd.read_csv('realtor-data.zip.csv')
df_raw.head(10)

Unnamed: 0,brokered_by,status,price,bed,bath,acre_lot,street,city,state,zip_code,house_size,prev_sold_date
0,103378.0,for_sale,105000.0,3.0,2.0,0.12,1962661.0,Adjuntas,Puerto Rico,601.0,920.0,
1,52707.0,for_sale,80000.0,4.0,2.0,0.08,1902874.0,Adjuntas,Puerto Rico,601.0,1527.0,
2,103379.0,for_sale,67000.0,2.0,1.0,0.15,1404990.0,Juana Diaz,Puerto Rico,795.0,748.0,
3,31239.0,for_sale,145000.0,4.0,2.0,0.1,1947675.0,Ponce,Puerto Rico,731.0,1800.0,
4,34632.0,for_sale,65000.0,6.0,2.0,0.05,331151.0,Mayaguez,Puerto Rico,680.0,,
5,103378.0,for_sale,179000.0,4.0,3.0,0.46,1850806.0,San Sebastian,Puerto Rico,612.0,2520.0,
6,1205.0,for_sale,50000.0,3.0,1.0,0.2,1298094.0,Ciales,Puerto Rico,639.0,2040.0,
7,50739.0,for_sale,71600.0,3.0,2.0,0.08,1048466.0,Ponce,Puerto Rico,731.0,1050.0,
8,81909.0,for_sale,100000.0,2.0,1.0,0.09,734904.0,Ponce,Puerto Rico,730.0,1092.0,
9,65672.0,for_sale,300000.0,5.0,3.0,7.46,1946226.0,Las Marias,Puerto Rico,670.0,5403.0,


In [9]:
# Column dtypes, row count and memory usage of the raw listings
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2226382 entries, 0 to 2226381
Data columns (total 12 columns):
 #   Column          Dtype  
---  ------          -----  
 0   brokered_by     float64
 1   status          object 
 2   price           float64
 3   bed             float64
 4   bath            float64
 5   acre_lot        float64
 6   street          float64
 7   city            object 
 8   state           object 
 9   zip_code        float64
 10  house_size      float64
 11  prev_sold_date  object 
dtypes: float64(8), object(4)
memory usage: 203.8+ MB


In [10]:
# Summary statistics of the numeric columns (note the extreme max values)
df_raw.describe()

Unnamed: 0,brokered_by,price,bed,bath,acre_lot,street,zip_code,house_size
count,2221849.0,2224841.0,1745065.0,1714611.0,1900793.0,2215516.0,2226083.0,1657898.0
mean,52939.89,524195.5,3.275841,2.49644,15.22303,1012325.0,52186.68,2714.471
std,30642.75,2138893.0,1.567274,1.652573,762.8238,583763.5,28954.08,808163.5
min,0.0,0.0,1.0,1.0,0.0,0.0,0.0,4.0
25%,23861.0,165000.0,3.0,2.0,0.15,506312.8,29617.0,1300.0
50%,52884.0,325000.0,3.0,2.0,0.26,1012766.0,48382.0,1760.0
75%,79183.0,550000.0,4.0,3.0,0.98,1521173.0,78070.0,2413.0
max,110142.0,2147484000.0,473.0,830.0,100000.0,2001357.0,99999.0,1040400000.0


In [11]:
# count / unique / top / freq for the object-typed columns
df_raw.describe(include = 'object')

Unnamed: 0,status,city,state,prev_sold_date
count,2226382,2224975,2226374,1492085
unique,3,20098,55,14954
top,for_sale,Houston,Florida,2022-03-31
freq,1389306,23862,249432,17171


## Cleaning up USA Real Estate data

In [12]:
# Number of rows that exactly duplicate an earlier row
df_raw.duplicated().sum()

np.int64(0)

In [13]:
# Restrict the raw listings to the columns used in the rest of the analysis
listing_columns = [
    'price', 'bed', 'bath', 'acre_lot', 'city',
    'state', 'zip_code', 'house_size', 'status',
]
df_req_cols = df_raw[listing_columns]

In [14]:
# Per-column share of missing values, as a percentage of all rows
n_rows = len(df_req_cols)
total_missing = df_req_cols.isna().sum() * 100 / n_rows
print('Percentage Missing Value %')
total_missing

Percentage Missing Value %


Unnamed: 0,0
price,0.069215
bed,21.618797
bath,22.986666
acre_lot,14.62413
city,0.063197
state,0.000359
zip_code,0.01343
house_size,25.533983
status,0.0


In [15]:
# Remove every row with at least one missing value, then renumber the index from 0
df_nadropped = (
    df_req_cols
    .dropna()
    .reset_index(drop=True)
)
df_nadropped.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1360347 entries, 0 to 1360346
Data columns (total 9 columns):
 #   Column      Non-Null Count    Dtype  
---  ------      --------------    -----  
 0   price       1360347 non-null  float64
 1   bed         1360347 non-null  float64
 2   bath        1360347 non-null  float64
 3   acre_lot    1360347 non-null  float64
 4   city        1360347 non-null  object 
 5   state       1360347 non-null  object 
 6   zip_code    1360347 non-null  float64
 7   house_size  1360347 non-null  float64
 8   status      1360347 non-null  object 
dtypes: float64(6), object(3)
memory usage: 93.4+ MB


In [16]:
# Re-check the per-column missing percentage after dropping nulls
n_rows = len(df_nadropped)
total_missing = df_nadropped.isna().sum() * 100 / n_rows
print('Percentage Missing Value %')
total_missing

Percentage Missing Value %


Unnamed: 0,0
price,0.0
bed,0.0
bath,0.0
acre_lot,0.0
city,0.0
state,0.0
zip_code,0.0
house_size,0.0
status,0.0


In [17]:
# Cast columns to their intended data types
listing_dtypes = {
    'price': 'float',
    'bed': 'int',
    'bath': 'int',
    'acre_lot': 'float',
    'city': 'str',
    'state': 'str',
    'zip_code': 'int',
    'house_size': 'float',
}
df_typed = df_nadropped.astype(listing_dtypes)

In [18]:
# Confirm the dtypes after casting
df_typed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1360347 entries, 0 to 1360346
Data columns (total 9 columns):
 #   Column      Non-Null Count    Dtype  
---  ------      --------------    -----  
 0   price       1360347 non-null  float64
 1   bed         1360347 non-null  int64  
 2   bath        1360347 non-null  int64  
 3   acre_lot    1360347 non-null  float64
 4   city        1360347 non-null  object 
 5   state       1360347 non-null  object 
 6   zip_code    1360347 non-null  int64  
 7   house_size  1360347 non-null  float64
 8   status      1360347 non-null  object 
dtypes: float64(3), int64(3), object(3)
memory usage: 93.4+ MB


In [19]:
# Keep only listings whose status is 'sold', then discard the status column
is_sold = df_typed['status'].eq('sold')
df_sold = df_typed.loc[is_sold].drop(columns=['status'])

In [20]:
# Add price_per_sqft: price divided by house_size
df_sold['price_per_sqft'] = df_sold['price'].div(df_sold['house_size'])

In [21]:
# Preview the sold listings; the index still carries the row labels from df_typed
df_sold.head()

Unnamed: 0,price,bed,bath,acre_lot,city,state,zip_code,house_size,price_per_sqft
750602,524900.0,3,2,0.09,Aguada,Puerto Rico,602,2200.0,238.590909
750603,90000.0,3,2,0.08,Aguadilla,Puerto Rico,603,1421.0,63.335679
750604,22500.0,2,1,0.32,Anasco,Puerto Rico,610,850.0,26.470588
750605,168000.0,6,4,0.05,Anasco,Puerto Rico,610,3422.0,49.094097
750606,200000.0,3,1,4.98,Arecibo,Puerto Rico,612,580.0,344.827586


In [22]:
# Bind a second name to the same DataFrame object (this is an alias, not a copy)
df_real_estate = df_sold

## Loading in Zip code to County Dataset

In [23]:
# Download the ZIP-to-county crosswalk archive with the Kaggle CLI.
# `!!` runs the shell command and returns its output as a list of lines (shown as the cell result).
!!kaggle datasets download danofer/zipcodes-county-fips-crosswalk

['Dataset URL: https://www.kaggle.com/datasets/danofer/zipcodes-county-fips-crosswalk',
 'License(s): CC0-1.0']

In [24]:
!unzip zipcodes-county-fips-crosswalk.zip

Archive:  zipcodes-county-fips-crosswalk.zip
  inflating: ZIP-COUNTY-FIPS_2017-06.csv  


In [25]:
# Load the ZIP-to-county crosswalk CSV and preview the first 10 rows
zip_county_df = pd.read_csv('ZIP-COUNTY-FIPS_2017-06.csv')
zip_county_df.head(10)

Unnamed: 0,ZIP,COUNTYNAME,STATE,STCOUNTYFP,CLASSFP
0,36003,Autauga County,AL,1001,H1
1,36006,Autauga County,AL,1001,H1
2,36067,Autauga County,AL,1001,H1
3,36066,Autauga County,AL,1001,H1
4,36703,Autauga County,AL,1001,H1
5,36701,Autauga County,AL,1001,H1
6,36091,Autauga County,AL,1001,H1
7,36051,Autauga County,AL,1001,H1
8,36068,Autauga County,AL,1001,H1
9,36008,Autauga County,AL,1001,H1


In [26]:
# Column dtypes and non-null counts of the crosswalk as loaded
zip_county_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52889 entries, 0 to 52888
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   ZIP         52889 non-null  int64 
 1   COUNTYNAME  52889 non-null  object
 2   STATE       52889 non-null  object
 3   STCOUNTYFP  52889 non-null  int64 
 4   CLASSFP     52889 non-null  object
dtypes: int64(2), object(3)
memory usage: 2.0+ MB


## Cleaning up Zip Code to County Dataset

In [27]:
# Number of crosswalk rows that exactly duplicate an earlier row
zip_county_df.duplicated().sum()

np.int64(0)

In [28]:
# Per-column share of missing values in the crosswalk, as a percentage of all rows
n_rows = len(zip_county_df)
total_missing = zip_county_df.isna().sum() * 100 / n_rows
print('Percentage Missing Value %')
total_missing

Percentage Missing Value %


Unnamed: 0,0
ZIP,0.0
COUNTYNAME,0.0
STATE,0.0
STCOUNTYFP,0.0
CLASSFP,0.0


In [29]:
# Cast columns to their intended data types
crosswalk_dtypes = {
    'ZIP': 'int',
    'COUNTYNAME': 'str',
    'STATE': 'str',
}
zip_county_df = zip_county_df.astype(crosswalk_dtypes)

In [30]:
# Confirm the crosswalk dtypes after casting
zip_county_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52889 entries, 0 to 52888
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   ZIP         52889 non-null  int64 
 1   COUNTYNAME  52889 non-null  object
 2   STATE       52889 non-null  object
 3   STCOUNTYFP  52889 non-null  int64 
 4   CLASSFP     52889 non-null  object
dtypes: int64(2), object(3)
memory usage: 2.0+ MB


In [31]:
# Align the crosswalk column names with the listings frame (zip_code / county / state)
crosswalk_names = {'ZIP': 'zip_code', 'COUNTYNAME': 'county', 'STATE': 'state'}
zip_county_df = zip_county_df.rename(columns=crosswalk_names)

In [32]:
# Discard the FIPS columns, which are not used for the ZIP -> county lookup.
# Reassigning (rather than inplace=True) with errors='ignore' lets this cell be
# re-run without a KeyError once the columns are already gone.
zip_county_df = zip_county_df.drop(columns=['STCOUNTYFP', 'CLASSFP'], errors='ignore')

In [33]:
# Count how many crosswalk rows each ZIP code has (each row pairs a ZIP with one county)
zip_counts = zip_county_df['zip_code'].value_counts()

# Step 1: ZIP codes that appear exactly once, i.e. are listed under a single county
appears_once = (zip_counts == 1).to_numpy()
zips_one_county = zip_counts.index[appears_once]
# Step 2: Keep only the rows for those ZIP codes; ZIPs listed under several counties are dropped
in_single_county = zip_county_df['zip_code'].isin(zips_one_county)
zip_county_df = zip_county_df.loc[in_single_county]

In [34]:
# Size and dtypes of the crosswalk after keeping only single-county ZIP codes
zip_county_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 29102 entries, 0 to 52888
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   zip_code  29102 non-null  int64 
 1   county    29102 non-null  object
 2   state     29102 non-null  object
dtypes: int64(1), object(2)
memory usage: 909.4+ KB


## Create new column "county" in USA Real Estate Dataset

In [35]:
# Attach the county (and the crosswalk's two-letter state code) to every listing.
# The listings' own full-name `state` column is dropped up front, so the merge
# produces no state_x/state_y pair that would need in-place cleanup afterwards.
# validate='many_to_one' makes pandas raise MergeError if a ZIP code is ever
# duplicated in the crosswalk, which would otherwise silently duplicate listings.
# how='inner' discards listings whose ZIP code is absent from the crosswalk.
real_estate_df = pd.merge(
    df_real_estate.drop(columns=['state']),
    zip_county_df[['zip_code', 'county', 'state']],
    on='zip_code',
    how='inner',
    validate='many_to_one',
)
real_estate_df.head(10)

Unnamed: 0,price,bed,bath,acre_lot,city,zip_code,house_size,price_per_sqft,county,state
0,22500.0,2,1,0.32,Anasco,610,850.0,26.470588,Anasco Municipio,PR
1,168000.0,6,4,0.05,Anasco,610,3422.0,49.094097,Anasco Municipio,PR
2,120000.0,4,2,0.18,Sabana Grande,637,1188.0,101.010101,Sabana Grande Municipio,PR
3,133000.0,3,1,0.6,Hatillo,659,1049.0,126.787417,Hatillo Municipio,PR
4,177000.0,2,2,0.1,Isabela,662,4674.0,37.869063,Isabela Municipio,PR
5,220000.0,5,3,0.11,Isabela,662,2992.0,73.529412,Isabela Municipio,PR
6,175000.0,3,3,0.45,Las Marias,670,3090.0,56.634304,Las Marias Municipio,PR
7,149000.0,3,2,0.28,Mayaguez,682,2000.0,74.5,Mayaguez Municipio,PR
8,2750000.0,3,4,0.15,San Juan,907,2820.0,975.177305,San Juan Municipio,PR
9,185000.0,3,2,13.47,Maunabo,707,1159.0,159.620362,Maunabo Municipio,PR


In [36]:
# (rows, columns) of the listings after the inner merge with the crosswalk
real_estate_df.shape

(451948, 10)

In [37]:
# Summary statistics of the numeric columns after the merge
real_estate_df.describe()

Unnamed: 0,price,bed,bath,acre_lot,zip_code,house_size,price_per_sqft
count,451948.0,451948.0,451948.0,451948.0,451948.0,451948.0,451948.0
mean,551866.5,3.284531,2.428421,11.169715,59467.386436,1943.216633,285.079374
std,750810.3,1.316137,1.195875,774.451696,31666.360915,1763.082967,240.109022
min,1.0,1.0,1.0,0.0,610.0,100.0,0.000282
25%,249907.5,3.0,2.0,0.13,31312.0,1303.0,150.130548
50%,395000.0,3.0,2.0,0.19,70433.0,1714.0,214.868181
75%,625000.0,4.0,3.0,0.32,91106.0,2300.0,333.266667
max,80000000.0,444.0,222.0,100000.0,99402.0,927828.0,15015.015015


In [38]:
# count / unique / top / freq for the object-typed columns after the merge
real_estate_df.describe(include = 'object')

Unnamed: 0,city,county,state
count,451948,451948,451948
unique,8090,1267,51
top,Houston,Los Angeles County,CA
freq,10570,20836,93327


## Loading in Zip Code Demographics

In [39]:
# Download the ZIP-code demographics archive with the Kaggle CLI.
# `!!` runs the shell command and returns its output as a list of lines (shown as the cell result).
!!kaggle datasets download erdi28/zip-codes-demographics

['Dataset URL: https://www.kaggle.com/datasets/erdi28/zip-codes-demographics',
 'License(s): MIT']

In [40]:
!unzip zip-codes-demographics.zip

Archive:  zip-codes-demographics.zip
  inflating: zip_code_demographics.csv  


In [41]:
# Load the ZIP-code demographics CSV and preview the first 10 rows
zip_code_demographics_df_raw = pd.read_csv('zip_code_demographics.csv')
zip_code_demographics_df_raw.head(10)

Unnamed: 0,zip,lat,lng,city,state_id,state_name,population,density,county_name,po_box,dist_highway,dist2_large_airport,dist2_medium_airport,dist_to_shore,number_of_business,adjusted_gross_income,total_income_amount,number_of_returns
0,1001,42.06262,-72.62521,Agawam,MA,Massachusetts,16088,550.1,Hampden,0,1.387035,106.145765,12.946212,93.049251,438.0,598807,604769.0,9320
1,1002,42.37633,-72.46462,Amherst,MA,Massachusetts,27323,198.1,Hampshire,0,14.438177,112.264368,21.080079,133.370144,571.0,989558,1005796.0,9880
2,1005,42.42117,-72.10655,Barre,MA,Massachusetts,4947,44.2,Worcester,0,16.788339,90.664964,25.547718,97.639881,97.0,164207,166054.0,2490
3,1007,42.28163,-72.40009,Belchertown,MA,Massachusetts,15304,107.7,Hampshire,0,13.663839,101.552921,14.762395,114.406034,217.0,647074,654739.0,7970
4,1008,42.18234,-72.95819,Blandford,MA,Massachusetts,1171,7.4,Hampden,0,2.593655,136.548797,20.17795,107.466779,18.0,47826,48241.0,660
5,1010,42.12904,-72.20597,Brimfield,MA,Massachusetts,3703,40.6,Hampden,0,4.737271,78.6455,28.049262,94.928024,73.0,155666,157677.0,1980
6,1011,42.30233,-72.96448,Chester,MA,Massachusetts,1332,15.7,Hampden,0,8.058693,142.414627,26.041908,128.56317,13.0,38223,38553.0,630
7,1012,42.38495,-72.84675,Chesterfield,MA,Massachusetts,503,16.1,Hampshire,0,18.331096,138.381684,27.466664,137.693285,10.0,24826,25102.0,410
8,1013,42.16059,-72.60788,Chicopee,MA,Massachusetts,22709,1549.1,Hampden,0,0.062286,109.114246,7.075893,104.576258,317.0,516431,520174.0,11270
9,1020,42.17618,-72.56538,Chicopee,MA,Massachusetts,30704,951.4,Hampden,0,0.92514,106.769095,3.205533,105.789421,550.0,828125,834614.0,15760


In [42]:
# Column dtypes and non-null counts of the raw demographics data
zip_code_demographics_df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33704 entries, 0 to 33703
Data columns (total 18 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   zip                    33704 non-null  int64  
 1   lat                    33704 non-null  float64
 2   lng                    33704 non-null  float64
 3   city                   33704 non-null  object 
 4   state_id               33704 non-null  object 
 5   state_name             33704 non-null  object 
 6   population             33704 non-null  int64  
 7   density                33704 non-null  float64
 8   county_name            33704 non-null  object 
 9   po_box                 33704 non-null  int64  
 10  dist_highway           33704 non-null  float64
 11  dist2_large_airport    33704 non-null  float64
 12  dist2_medium_airport   33704 non-null  float64
 13  dist_to_shore          33704 non-null  float64
 14  number_of_business     33624 non-null  float64
 15  ad

In [43]:
# Summary statistics of the numeric demographics columns
zip_code_demographics_df_raw.describe()

Unnamed: 0,zip,lat,lng,population,density,po_box,dist_highway,dist2_large_airport,dist2_medium_airport,dist_to_shore,number_of_business,adjusted_gross_income,total_income_amount,number_of_returns
count,33704.0,33704.0,33704.0,33704.0,33704.0,33704.0,33704.0,33704.0,33704.0,33704.0,33624.0,33704.0,33704.0,33704.0
mean,49842.607821,38.888195,-91.051366,11287.959738,528.704068,0.179949,29.291294,158.648048,40.854624,304.798736,260.69492,425134.1,429894.3,5341.873665
std,27451.111442,5.228075,15.460095,15200.216997,2219.277025,0.384151,174.104869,166.207042,80.224956,298.948686,432.112814,756151.9,764441.6,7273.61366
min,1001.0,-14.21984,-176.62962,0.0,0.0,0.0,0.000194,0.112768,0.134153,0.011449,1.0,577.0,577.0,90.0
25%,27052.75,35.427243,-97.255342,1253.75,8.2,0.0,3.05042,59.434074,19.513217,61.286177,19.0,32475.0,32879.25,580.0
50%,49780.5,39.50297,-88.22482,4099.0,29.0,0.0,11.541912,125.957938,35.131017,207.328858,74.0,112866.5,114036.5,1880.0
75%,72210.25,42.11,-80.289333,16113.25,238.825,0.0,30.046597,209.476362,55.231087,471.857289,334.0,505229.0,509227.0,7560.0
max,99929.0,71.27434,144.87637,130352.0,68424.5,1.0,10767.713382,6127.454722,5101.697518,1336.551268,7263.0,15224670.0,15277980.0,61920.0


In [44]:
# count / unique / top / freq for the object-typed demographics columns
zip_code_demographics_df_raw.describe(include = 'object')

Unnamed: 0,city,state_id,state_name,county_name
count,33704,33704,33704,33704
unique,17551,51,51,1800
top,Houston,TX,Texas,Washington
freq,106,1990,1990,401


## Cleaning Zip Code Demographics Data

In [45]:
# Number of demographics rows that exactly duplicate an earlier row
zip_code_demographics_df_raw.duplicated().sum()

np.int64(0)

In [46]:
# Restrict the demographics data to the ZIP key and the features used downstream
demographic_columns = [
    'zip', 'population', 'density', 'dist_highway',
    'dist2_large_airport', 'dist_to_shore',
    'number_of_business', 'adjusted_gross_income',
]
zip_code_demographics_df = zip_code_demographics_df_raw[demographic_columns]

In [47]:
# Per-column share of missing values in the demographics data, as a percentage of all rows
n_rows = len(zip_code_demographics_df)
total_missing = zip_code_demographics_df.isna().sum() * 100 / n_rows
print('Percentage Missing Value %')
total_missing

Percentage Missing Value %


Unnamed: 0,0
zip,0.0
population,0.0
density,0.0
dist_highway,0.0
dist2_large_airport,0.0
dist_to_shore,0.0
number_of_business,0.237361
adjusted_gross_income,0.0


In [50]:
# Remove every row with at least one missing value, then renumber the index from 0
zip_code_demographics_df = (
    zip_code_demographics_df
    .dropna()
    .reset_index(drop=True)
)

In [52]:
# Re-check the per-column missing percentage after dropping nulls
n_rows = len(zip_code_demographics_df)
total_missing = zip_code_demographics_df.isna().sum() * 100 / n_rows
print('Percentage Missing Value %')
total_missing

Percentage Missing Value %


Unnamed: 0,0
zip,0.0
population,0.0
density,0.0
dist_highway,0.0
dist2_large_airport,0.0
dist_to_shore,0.0
number_of_business,0.0
adjusted_gross_income,0.0


In [53]:
# Cast columns to their intended data types
demographic_dtypes = {
    'zip': 'int',
    'population': 'float',
    'density': 'float',
    'dist_highway': 'float',
    'dist2_large_airport': 'float',
    'dist_to_shore': 'float',
    'number_of_business': 'float',
    'adjusted_gross_income': 'float',
}
zip_code_demographics_df = zip_code_demographics_df.astype(demographic_dtypes)

In [54]:
# Confirm the demographics dtypes and row count after cleaning
zip_code_demographics_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33624 entries, 0 to 33623
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   zip                    33624 non-null  int64  
 1   population             33624 non-null  float64
 2   density                33624 non-null  float64
 3   dist_highway           33624 non-null  float64
 4   dist2_large_airport    33624 non-null  float64
 5   dist_to_shore          33624 non-null  float64
 6   number_of_business     33624 non-null  float64
 7   adjusted_gross_income  33624 non-null  float64
dtypes: float64(7), int64(1)
memory usage: 2.1 MB


In [55]:
# Rename the key column so it matches `zip_code` in the listings frame
zip_code_demographics_df = zip_code_demographics_df.rename(columns={'zip': 'zip_code'})

## Merge real_estate_df with zip_code_demographics_df to add the zip code related features

In [56]:
# Inner join on zip_code: adds the ZIP-level demographic columns to each listing
# and discards listings whose ZIP code has no demographics row
real_estate_features_df = real_estate_df.merge(
    zip_code_demographics_df,
    how='inner',
    on='zip_code',
)
real_estate_features_df.head(10)

Unnamed: 0,price,bed,bath,acre_lot,city,zip_code,house_size,price_per_sqft,county,state,population,density,dist_highway,dist2_large_airport,dist_to_shore,number_of_business,adjusted_gross_income
0,215000.0,3,2,0.19,Chicopee,1020,1828.0,117.61488,Hampden County,MA,30704.0,951.4,0.92514,106.769095,105.789421,550.0,828125.0
1,389900.0,3,1,24.0,Northampton,1062,1559.0,250.096216,Hampshire County,MA,9835.0,217.6,3.630387,122.235452,124.761768,191.0,639133.0
2,269900.0,2,1,0.46,South Hadley,1075,1312.0,205.716463,Hampshire County,MA,18051.0,393.3,2.910859,112.181953,115.181126,297.0,697896.0
3,314900.0,5,2,0.28,Chicopee,1013,2219.0,141.910771,Hampden County,MA,22709.0,1549.1,0.062286,109.114246,104.576258,317.0,516431.0
4,244999.0,4,1,0.23,Springfield,1104,1285.0,190.6607,Hampden County,MA,23226.0,1714.0,0.645911,104.785728,101.063511,528.0,453300.0
5,199000.0,2,2,0.85,Deerfield,1373,1664.0,119.591346,Franklin County,MA,5076.0,85.0,2.294487,126.845728,139.556753,139.0,224315.0
6,279000.0,6,3,0.13,Chicopee,1020,3348.0,83.333333,Hampden County,MA,30704.0,951.4,0.92514,106.769095,105.789421,550.0,828125.0
7,589000.0,4,3,1.6,Ludlow,1056,2964.0,198.717949,Hampden County,MA,21050.0,298.9,2.608102,99.921771,105.057502,412.0,747067.0
8,399900.0,3,3,0.35,Easthampton,1027,1380.0,289.782609,Hampshire County,MA,17708.0,168.5,9.873599,125.949999,124.367513,372.0,671249.0
9,359000.0,3,2,1.03,Westfield,1085,1670.0,214.97006,Hampden County,MA,41715.0,269.7,2.075971,118.450463,101.61725,845.0,1371042.0


In [57]:
# Column dtypes and non-null counts of the merged feature table
real_estate_features_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 451719 entries, 0 to 451718
Data columns (total 17 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   price                  451719 non-null  float64
 1   bed                    451719 non-null  int64  
 2   bath                   451719 non-null  int64  
 3   acre_lot               451719 non-null  float64
 4   city                   451719 non-null  object 
 5   zip_code               451719 non-null  int64  
 6   house_size             451719 non-null  float64
 7   price_per_sqft         451719 non-null  float64
 8   county                 451719 non-null  object 
 9   state                  451719 non-null  object 
 10  population             451719 non-null  float64
 11  density                451719 non-null  float64
 12  dist_highway           451719 non-null  float64
 13  dist2_large_airport    451719 non-null  float64
 14  dist_to_shore          451719 non-nu

In [58]:
# Summary statistics of the numeric columns in the merged feature table
real_estate_features_df.describe()

Unnamed: 0,price,bed,bath,acre_lot,zip_code,house_size,price_per_sqft,population,density,dist_highway,dist2_large_airport,dist_to_shore,number_of_business,adjusted_gross_income
count,451719.0,451719.0,451719.0,451719.0,451719.0,451719.0,451719.0,451719.0,451719.0,451719.0,451719.0,451719.0,451719.0,451719.0
mean,551816.6,3.284365,2.428187,11.148105,59483.667176,1943.094021,285.082448,33545.607781,1265.905413,6.976981,76.397552,176.731627,691.471818,1374152.0
std,750813.3,1.315466,1.195087,774.598407,31654.732566,1763.175229,240.130182,19403.167687,1784.271593,14.525969,101.157852,235.796885,513.583607,1163807.0
min,1.0,1.0,1.0,0.0,1001.0,100.0,0.000282,0.0,0.0,0.000283,0.42442,0.011449,1.0,2925.0
25%,249950.0,3.0,2.0,0.13,31320.0,1303.0,150.142993,19787.0,244.8,1.365326,19.630244,18.578272,319.0,615253.0
50%,395000.0,3.0,2.0,0.19,70601.0,1714.0,214.863498,31480.0,811.7,2.912591,36.311981,65.523818,585.0,1108688.0
75%,625000.0,4.0,3.0,0.32,91106.0,2300.0,333.236996,44608.0,1707.0,6.40378,90.491868,270.690594,943.0,1785317.0
max,80000000.0,444.0,222.0,100000.0,99402.0,927828.0,15015.015015,130352.0,58289.6,299.940968,994.951376,1330.442647,7263.0,15224670.0


In [59]:
# count / unique / top / freq for the object-typed columns in the merged feature table
real_estate_features_df.describe(include = 'object')

Unnamed: 0,city,county,state
count,451719,451719,451719
unique,8038,1243,49
top,Houston,Los Angeles County,CA
freq,10570,20835,93286
