# Dependencies

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%matplotlib inline

# Preprocessing the Data

In [14]:
# read csv into dataframe
filepath = "Austin Properties 2019.1.6.csv"
csv_df = pd.read_csv(filepath)

In [15]:
# check the column names
csv_df.columns

Index(['Address', 'Building Name', 'Zip Code', 'Bed', 'Bath', 'Avg SF',
       '# Units', 'Mix %', 'Units Available Units', 'Units Available Percent',
       'Avg Asking Rent/Unit', 'Avg Asking Rent/SF', 'Avg Effective Rent/Unit',
       'Avg Effective Rent/SF', 'Concessions %', 'Year Built',
       'Year Renovated', 'Units', 'Lande (Acres)', 'Vacancy %', 'Washer/Dryer',
       'Walk in Closet', 'Hardwood/Vinyl Floor', 'Walk Score', 'Transit Score',
       '1 Mile Population', '1 Mile Median Household Income',
       'Miles from Domain', 'Miles from Downtown', 'Unnamed: 29'],
      dtype='object')

In [16]:
csv_df = pd.get_dummies(data=csv_df, columns=['Washer/Dryer'])

In [17]:
# select certain variables
original_df = csv_df[['Bed', 'Bath', 'Avg SF', 'Concessions %', 'Year Built', 
   'Walk in Closet', 'Hardwood/Vinyl Floor',
   'Washer/Dryer_Yes but not in unit', 'Washer/Dryer_Yes in unit', 'Washer/Dryer_no',
   'Walk Score', 'Transit Score', '1 Mile Population', '1 Mile Median Household Income',
   'Miles from Domain', 'Miles from Downtown', 
   'Avg Effective Rent/Unit']]

In [18]:
# check na values
original_df.count()

Bed                                  5543
Bath                                 5544
Avg SF                               5544
Concessions %                        5544
Year Built                           5544
Walk in Closet                       5544
Hardwood/Vinyl Floor                 5544
Washer/Dryer_Yes but not in unit    11079
Washer/Dryer_Yes in unit            11079
Washer/Dryer_no                     11079
Walk Score                           5526
Transit Score                        5526
1 Mile Population                    5490
1 Mile Median Household Income       5490
Miles from Domain                    5544
Miles from Downtown                  5544
Avg Effective Rent/Unit              5541
dtype: int64

In [19]:
# process na values
austin_properties_df = original_df.dropna()
# check the result of na value processing
austin_properties_df.count()

Bed                                 5468
Bath                                5468
Avg SF                              5468
Concessions %                       5468
Year Built                          5468
Walk in Closet                      5468
Hardwood/Vinyl Floor                5468
Washer/Dryer_Yes but not in unit    5468
Washer/Dryer_Yes in unit            5468
Washer/Dryer_no                     5468
Walk Score                          5468
Transit Score                       5468
1 Mile Population                   5468
1 Mile Median Household Income      5468
Miles from Domain                   5468
Miles from Downtown                 5468
Avg Effective Rent/Unit             5468
dtype: int64

In [20]:
# check the data type
austin_properties_df.dtypes

Bed                                 float64
Bath                                float64
Avg SF                              float64
Concessions %                       float64
Year Built                          float64
Walk in Closet                      float64
Hardwood/Vinyl Floor                float64
Washer/Dryer_Yes but not in unit      uint8
Washer/Dryer_Yes in unit              uint8
Washer/Dryer_no                       uint8
Walk Score                          float64
Transit Score                       float64
1 Mile Population                   float64
1 Mile Median Household Income      float64
Miles from Domain                   float64
Miles from Downtown                 float64
Avg Effective Rent/Unit             float64
dtype: object

In [21]:
# overview of the new dataframe
austin_properties_df.head()

Unnamed: 0,Bed,Bath,Avg SF,Concessions %,Year Built,Walk in Closet,Hardwood/Vinyl Floor,Washer/Dryer_Yes but not in unit,Washer/Dryer_Yes in unit,Washer/Dryer_no,Walk Score,Transit Score,1 Mile Population,1 Mile Median Household Income,Miles from Domain,Miles from Downtown,Avg Effective Rent/Unit
0,1.0,1.0,560.0,0.01,2016.0,1.0,1.0,0,1,0,88.0,44.0,20115.0,70385.0,12.6,2.1,1528.0
1,1.0,1.0,612.0,0.01,2016.0,1.0,1.0,0,1,0,88.0,44.0,20115.0,70385.0,12.6,2.1,1633.0
2,1.0,1.0,629.0,0.01,2016.0,1.0,1.0,0,1,0,88.0,44.0,20115.0,70385.0,12.6,2.1,1740.0
3,1.0,1.0,774.0,0.01,2016.0,1.0,1.0,0,1,0,88.0,44.0,20115.0,70385.0,12.6,2.1,1778.0
4,1.0,1.0,778.0,0.01,2016.0,1.0,1.0,0,1,0,88.0,44.0,20115.0,70385.0,12.6,2.1,1852.0


# Determine X and y for Machine Learning

In [23]:
X = austin_properties_df.iloc[:, :-1]
y = austin_properties_df['Avg Effective Rent/Unit']

# Splitting the data into training and testing sets

In [26]:
from sklearn.model_selection import train_test_split

# 80/20 training and testing data split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=5)

In [27]:
# check the shape of training and test data
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(4374, 16)
(1094, 16)
(4374,)
(1094,)


# Model (I) - Linear Regression

In [28]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [29]:
# train the model
lin_model = LinearRegression()
lin_model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [30]:
# model evaluation
y_test_predicted = lin_model.predict(X_test)
mse_test = mean_squared_error(y_test, y_test_predicted)
r2_test = r2_score(y_test, y_test_predicted)
print(f"Mean Squared Error (MSE): {mse_test}")
print(f"R-quared (R2): {r2_test}")

Mean Squared Error (MSE): 296512.9266368084
R-quared (R2): 0.5344241825987113


## SGD

In [40]:
from sklearn.linear_model import SGDRegressor
sgd_reg = SGDRegressor(n_iter=100, penalty=None)

In [41]:
sgd_reg.fit(X_train, y_train.ravel())



SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.1,
       eta0=0.01, fit_intercept=True, l1_ratio=0.15,
       learning_rate='invscaling', loss='squared_loss', max_iter=None,
       n_iter=100, n_iter_no_change=5, penalty=None, power_t=0.25,
       random_state=None, shuffle=True, tol=None, validation_fraction=0.1,
       verbose=0, warm_start=False)

In [42]:
print(mean_squared_error(y_test, sgd_reg.predict(X_test)))
print(r2_score(y_test, sgd_reg.predict(X_test)))

1.1584558766154334e+36
-1.8189731145825983e+30


# Model (II) - Polynomial Regression

In [44]:
from sklearn.preprocessing import PolynomialFeatures
poly_reg = PolynomialFeatures(degree=3)
X_train_poly = poly_reg.fit_transform(X_train)
X_test_poly = poly_reg.fit_transform(X_test)
poly_reg.fit(X_train_poly, y_train)

PolynomialFeatures(degree=3, include_bias=True, interaction_only=False)

In [45]:
lin_reg_2 = LinearRegression()
lin_reg_2.fit(X_train_poly, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [47]:
print(mean_squared_error(y_test, lin_reg_2.predict(X_test_poly)))
print(r2_score(y_test, lin_reg_2.predict(X_test_poly)))

1942868.716456647
-2.0506349285596617


# Residuals

In [None]:
plt.scatter(y_test_predicted, y_test_predicted-y_test)
plt.hlines(y=0, xmin=y_test_predicted.min(), xmax=y_test_predicted.max())
plt.show()