# USA Housing Analysis and Prediction

## Step 1: Load the dependencies

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import sklearn
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

## Step 2: Data Understanding

In [4]:
# Loading the Datasets using pandas Library
df = pd.read_csv('USA_Housing.csv')

In [5]:
df

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price,Address
0,79545.458574,5.682861,7.009188,4.09,23086.800503,1.059034e+06,"208 Michael Ferry Apt. 674\nLaurabury, NE 3701..."
1,79248.642455,6.002900,6.730821,3.09,40173.072174,1.505891e+06,"188 Johnson Views Suite 079\nLake Kathleen, CA..."
2,61287.067179,5.865890,8.512727,5.13,36882.159400,1.058988e+06,"9127 Elizabeth Stravenue\nDanieltown, WI 06482..."
3,63345.240046,7.188236,5.586729,3.26,34310.242831,1.260617e+06,USS Barnett\nFPO AP 44820
4,59982.197226,5.040555,7.839388,4.23,26354.109472,6.309435e+05,USNS Raymond\nFPO AE 09386
...,...,...,...,...,...,...,...
4995,60567.944140,7.830362,6.137356,3.46,22837.361035,1.060194e+06,USNS Williams\nFPO AP 30153-7653
4996,78491.275435,6.999135,6.576763,4.02,25616.115489,1.482618e+06,"PSC 9258, Box 8489\nAPO AA 42991-3352"
4997,63390.686886,7.250591,4.805081,2.13,33266.145490,1.030730e+06,"4215 Tracy Garden Suite 076\nJoshualand, VA 01..."
4998,68001.331235,5.534388,7.130144,5.44,42625.620156,1.198657e+06,USS Wallace\nFPO AE 73316


In [6]:
# Displaying the first 5 records
df.head()

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price,Address
0,79545.458574,5.682861,7.009188,4.09,23086.800503,1059034.0,"208 Michael Ferry Apt. 674\nLaurabury, NE 3701..."
1,79248.642455,6.0029,6.730821,3.09,40173.072174,1505891.0,"188 Johnson Views Suite 079\nLake Kathleen, CA..."
2,61287.067179,5.86589,8.512727,5.13,36882.1594,1058988.0,"9127 Elizabeth Stravenue\nDanieltown, WI 06482..."
3,63345.240046,7.188236,5.586729,3.26,34310.242831,1260617.0,USS Barnett\nFPO AP 44820
4,59982.197226,5.040555,7.839388,4.23,26354.109472,630943.5,USNS Raymond\nFPO AE 09386


In [7]:
# Displaying the shape

df.shape

(5000, 7)

In [8]:
print(f'The data has {df.shape[0]} rows, and {df.shape[1]} columns')

The data has 5000 rows, and 7 columns


In [12]:
# Displaying the Statistics

df.describe()

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,68583.108984,5.977222,6.987792,3.98133,36163.516039,1232073.0
std,10657.991214,0.991456,1.005833,1.234137,9925.650114,353117.6
min,17796.63119,2.644304,3.236194,2.0,172.610686,15938.66
25%,61480.562388,5.322283,6.29925,3.14,29403.928702,997577.1
50%,68804.286404,5.970429,7.002902,4.05,36199.406689,1232669.0
75%,75783.338666,6.650808,7.665871,4.49,42861.290769,1471210.0
max,107701.748378,9.519088,10.759588,6.5,69621.713378,2469066.0


In [14]:
# Displaying the information of the Dataset

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 7 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Avg. Area Income              5000 non-null   float64
 1   Avg. Area House Age           5000 non-null   float64
 2   Avg. Area Number of Rooms     5000 non-null   float64
 3   Avg. Area Number of Bedrooms  5000 non-null   float64
 4   Area Population               5000 non-null   float64
 5   Price                         5000 non-null   float64
 6   Address                       5000 non-null   object 
dtypes: float64(6), object(1)
memory usage: 273.6+ KB


In [15]:
# Checking for missing values
df.isnull().sum()

Avg. Area Income                0
Avg. Area House Age             0
Avg. Area Number of Rooms       0
Avg. Area Number of Bedrooms    0
Area Population                 0
Price                           0
Address                         0
dtype: int64

In [16]:
# Checking the datatypes

df.dtypes

Avg. Area Income                float64
Avg. Area House Age             float64
Avg. Area Number of Rooms       float64
Avg. Area Number of Bedrooms    float64
Area Population                 float64
Price                           float64
Address                          object
dtype: object

In [19]:
# Checking for duplicate records
df.duplicated().sum()

0

## Step 3: Data Wrangling

In [20]:
df.columns

Index(['Avg. Area Income', 'Avg. Area House Age', 'Avg. Area Number of Rooms',
       'Avg. Area Number of Bedrooms', 'Area Population', 'Price', 'Address'],
      dtype='object')

In [21]:
# Converting all columns to lower case
df.columns = df.columns.str.lower()
df.columns

Index(['avg. area income', 'avg. area house age', 'avg. area number of rooms',
       'avg. area number of bedrooms', 'area population', 'price', 'address'],
      dtype='object')

In [26]:
# Checking the Highest, Lowest, and Average House Price
print('maximum price: ', {df['price'].max()})
print('minimum price: ', {df['price'].min()})
print('average price: ', {df['price'].mean()})

maximum price:  {2469065.5941747027}
minimum price:  {15938.657923287848}
average price:  {1232072.65414236}


In [36]:
df['address']

0       208 Michael Ferry Apt. 674\nLaurabury, NE 3701...
1       188 Johnson Views Suite 079\nLake Kathleen, CA...
2       9127 Elizabeth Stravenue\nDanieltown, WI 06482...
3                               USS Barnett\nFPO AP 44820
4                              USNS Raymond\nFPO AE 09386
                              ...                        
4995                     USNS Williams\nFPO AP 30153-7653
4996                PSC 9258, Box 8489\nAPO AA 42991-3352
4997    4215 Tracy Garden Suite 076\nJoshualand, VA 01...
4998                            USS Wallace\nFPO AE 73316
4999    37778 George Ridges Apt. 509\nEast Holly, NV 2...
Name: address, Length: 5000, dtype: object

In [37]:
df.head()

Unnamed: 0,avg. area income,avg. area house age,avg. area number of rooms,avg. area number of bedrooms,area population,price,address
0,79545.458574,5.682861,7.009188,4.09,23086.800503,1059034.0,"208 Michael Ferry Apt. 674\nLaurabury, NE 3701..."
1,79248.642455,6.0029,6.730821,3.09,40173.072174,1505891.0,"188 Johnson Views Suite 079\nLake Kathleen, CA..."
2,61287.067179,5.86589,8.512727,5.13,36882.1594,1058988.0,"9127 Elizabeth Stravenue\nDanieltown, WI 06482..."
3,63345.240046,7.188236,5.586729,3.26,34310.242831,1260617.0,USS Barnett\nFPO AP 44820
4,59982.197226,5.040555,7.839388,4.23,26354.109472,630943.5,USNS Raymond\nFPO AE 09386


## Correlation

In [38]:
# Selecting only the numerical columns

numerical_columns = df.select_dtypes(include = [int, float])

numerical_columns

Unnamed: 0,avg. area income,avg. area house age,avg. area number of rooms,avg. area number of bedrooms,area population,price
0,79545.458574,5.682861,7.009188,4.09,23086.800503,1.059034e+06
1,79248.642455,6.002900,6.730821,3.09,40173.072174,1.505891e+06
2,61287.067179,5.865890,8.512727,5.13,36882.159400,1.058988e+06
3,63345.240046,7.188236,5.586729,3.26,34310.242831,1.260617e+06
4,59982.197226,5.040555,7.839388,4.23,26354.109472,6.309435e+05
...,...,...,...,...,...,...
4995,60567.944140,7.830362,6.137356,3.46,22837.361035,1.060194e+06
4996,78491.275435,6.999135,6.576763,4.02,25616.115489,1.482618e+06
4997,63390.686886,7.250591,4.805081,2.13,33266.145490,1.030730e+06
4998,68001.331235,5.534388,7.130144,5.44,42625.620156,1.198657e+06


In [39]:
numerical_columns.corr()

Unnamed: 0,avg. area income,avg. area house age,avg. area number of rooms,avg. area number of bedrooms,area population,price
avg. area income,1.0,-0.002007,-0.011032,0.019788,-0.016234,0.639734
avg. area house age,-0.002007,1.0,-0.009428,0.006149,-0.018743,0.452543
avg. area number of rooms,-0.011032,-0.009428,1.0,0.462695,0.00204,0.335664
avg. area number of bedrooms,0.019788,0.006149,0.462695,1.0,-0.022168,0.171071
area population,-0.016234,-0.018743,0.00204,-0.022168,1.0,0.408556
price,0.639734,0.452543,0.335664,0.171071,0.408556,1.0


## Step 4: Data Preparation for Prediction

In [40]:
df.columns

Index(['avg. area income', 'avg. area house age', 'avg. area number of rooms',
       'avg. area number of bedrooms', 'area population', 'price', 'address'],
      dtype='object')

In [45]:
# Splitting into dependent and independent variables

x = df.drop(['price', 'address'], axis = 1)
y = df.drop(['avg. area income', 'avg. area house age', 'avg. area number of rooms',
       'avg. area number of bedrooms', 'area population', 'address'], axis = 1)

In [46]:
print(x)
print(y)

      avg. area income  avg. area house age  avg. area number of rooms  \
0         79545.458574             5.682861                   7.009188   
1         79248.642455             6.002900                   6.730821   
2         61287.067179             5.865890                   8.512727   
3         63345.240046             7.188236                   5.586729   
4         59982.197226             5.040555                   7.839388   
...                ...                  ...                        ...   
4995      60567.944140             7.830362                   6.137356   
4996      78491.275435             6.999135                   6.576763   
4997      63390.686886             7.250591                   4.805081   
4998      68001.331235             5.534388                   7.130144   
4999      65510.581804             5.992305                   6.792336   

      avg. area number of bedrooms  area population  
0                             4.09     23086.800503  
1  

In [49]:
Scaler = StandardScaler()
Scaler

StandardScaler()

In [51]:
# Fit and transform our x variable
scaledx = Scaler.fit_transform(x)
scaledx

array([[ 1.02865969, -0.29692705,  0.02127433,  0.08806222, -1.31759867],
       [ 1.00080775,  0.02590164, -0.25550611, -0.72230146,  0.40399945],
       [-0.68462916, -0.11230283,  1.5162435 ,  0.93084045,  0.07240989],
       ...,
       [-0.48723454,  1.28447022, -2.17026949, -1.50025059, -0.29193658],
       [-0.05459152, -0.44669439,  0.14154061,  1.18205319,  0.65111608],
       [-0.28831272,  0.01521477, -0.19434166,  0.07185495,  1.04162464]])

## Split into training and testing set

In [54]:
x_train, x_test, y_train, y_test = train_test_split(scaledx, y, test_size = 0.2, random_state = 42)
display(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(4000, 5)

(1000, 5)

(4000, 1)

(1000, 1)

## Model Build -- Linear Regression

In [57]:
from sklearn.linear_model import LinearRegression
regress = LinearRegression()
regress

LinearRegression()

In [59]:
regress.fit(x_train, y_train)

LinearRegression()

In [60]:
y_pred = regress.predict(x_test)
display(y_test, y_pred)

Unnamed: 0,price
1501,1.339096e+06
2586,1.251794e+06
2653,1.340095e+06
1055,1.431508e+06
705,1.042374e+06
...,...
4711,1.107031e+06
2313,1.405505e+06
3214,1.924156e+06
2732,1.571254e+06


array([[1308587.92699753],
       [1237037.22949428],
       [1243429.34030687],
       [1228900.21360379],
       [1063320.9071082 ],
       [1544058.05034856],
       [1094774.70493022],
       [ 833284.72339228],
       [ 788412.85578724],
       [1469714.86615707],
       [ 671728.43662066],
       [1606818.21977935],
       [1004166.61331062],
       [1796798.97595927],
       [1288566.96221017],
       [1087782.93301077],
       [1423072.37492526],
       [1078178.68169673],
       [ 802286.03537901],
       [ 930761.03695714],
       [1134829.86477819],
       [ 916398.42023136],
       [1489972.69335422],
       [1284580.15538819],
       [1582071.3532273 ],
       [1132519.15991993],
       [1089888.39644513],
       [ 974510.51872158],
       [ 924057.96820669],
       [1740759.72092272],
       [1286481.5951232 ],
       [1621289.9517161 ],
       [1435264.20161716],
       [1234014.77924484],
       [1485434.57300368],
       [1718335.00753687],
       [1538953.74882846],
 

In [61]:
from sklearn.metrics import r2_score

In [62]:
print(r2_score(y_test, y_pred))

0.9179971706834331
