# Performing Linear Regression with Bag of Words

In [138]:
import csv
import gzip
import math
import hashlib
import numpy as np
import pandas as pd
import re

In [139]:
df = pd.read_csv('all.csv', dtype=str, keep_default_na=False)

In [140]:
df["Price"] = pd.to_numeric(df["Price"])
df["PropertyType"] = pd.to_numeric(df["PropertyType"])
df["Beds"] = pd.to_numeric(df["Beds"])
df["Baths"] = pd.to_numeric(df["Baths"])
df["Sqft"] = pd.to_numeric(df["Sqft"])
df["YearBuilt"] = pd.to_numeric(df["YearBuilt"], downcast='integer')
df["WalkScore"] = pd.to_numeric(df["WalkScore"])
df["TransitScore"] = pd.to_numeric(df["TransitScore"])
df["ParkingPrice"] = pd.to_numeric(df["ParkingPrice"])
df["ParkingType"] = pd.to_numeric(df["ParkingType"])
df["Cooling"] = pd.to_numeric(df["Cooling"])
df["Laundry"] = pd.to_numeric(df["Laundry"])

price = df['Price']
df.drop(labels=['Price'], axis=1,inplace = True)
df.insert(0, 'Price', price)

df1 = df.drop(['Address', 'Description'], axis=1)

## Baseline Regression

In [141]:
def split_data(df):
    """split the data into training and validation sets, and convert them to np.ndarray. (Step 1 and 2 above.)

    args:
        df : pandas.DataFrame -- the parsed data, as returned by parse_stackoverflow_data()

    returns: X_train, y_train, X_val, y_val
      X_train  : np.ndarray -- the second 80% of the data features
      y_train : np.ndarray -- the second 80% of the target values
      X_val : np.ndarray -- the first 20% (rounded down) of the data features
      y_val : np.ndarray -- the first 20% of the target valuesn
    """
    n = len(df)
    df['final'] = 1.0
    df = df.to_numpy()
    i = int(np.floor(n*0.2))
    val = df[0:i,:]
    train = df[i:,:]
    Y_train = train[:, 0]
    X_train = train[:, 1:]
    Y_val = val[:, 0]
    X_val = val[:, 1:]
    return(X_train, Y_train, X_val, Y_val)
X_train, Y_train, X_val, Y_val = split_data(df1)

In [169]:
#https://stattrek.com/regression/slope-confidence-interval.aspx?Tutorial=AP
#https://online.stat.psu.edu/stat462/node/102/

def squared_error(y_pred, y):
    return(np.mean(np.square(np.subtract(y_pred,y))))

def standard_error(X_train, y_train):
    lm = LinearRegression(X_train, y_train)
    y_hat = lm.predict(X_train)
    x_bar = np.mean(X_train)
    n = len(y_train)
    print(np.sqrt(np.sum(np.square(y_train - y_hat), axis=0)/(n - 2))/np.sqrt(np.sum((X_train - x_bar)**2, axis=0)))
    return np.sqrt(np.sum(np.square(y_train - y_hat), axis=0)/(n - 2))/np.sqrt(np.sum(np.square(X_train - x_bar), axis=0)), lm.theta

class LinearRegression():
    def __init__(self, X, y):
        self.theta = np.linalg.solve(X.T @ X, X.T @ y)

    def predict(self, X): 
        return(X @ self.theta)

In [170]:
y_hat = lm.predict(X_train)
y_hat
#np.square(Y_train - y_hat)**2

array([ 968.48899749, 1295.08264675, 1451.1358168 , 1049.53054859,
       1072.67147448, 1021.79458475,  964.86565541, 1363.23629534,
       1560.42712941,  737.22235677, 1552.50343611, 1134.50594958,
       2842.67175374, 1052.3616799 , 1248.07432472, 1153.12432828,
        979.65686807, 1391.15495802, 1102.4116207 , 1089.45493061,
       1218.13884248, 1364.36018942, 1037.37233915, 1130.70280366,
       1370.91814531, 1068.21090987,  899.47950069, 1465.62294687,
       1254.57077327, 1364.64469216, 1269.37437625, 2074.95267381,
       1081.39109214, 1248.280247  , 2035.49890621, 1375.18238024,
       1531.94710901, 1167.96539111, 1348.57974297, 1166.30167899,
       1339.83534086, 1432.83480005, 1368.8988148 , 1068.10015137,
       1470.48993237, 1166.45025474, 1756.98314727, 1123.02034151,
        800.94419825, 1833.41178056, 1089.56063868, 1512.61123328,
       1943.01184759, 3382.10942386,  895.46358804, 1408.2058252 ,
        965.33853648, 1112.81257111, 1760.02484129, 2354.61556

In [171]:
def evaluate_linear_regression(X_train, y_train, X_val, y_val):
    lm = LinearRegression(X_train, y_train)
    baseline_mse = squared_error(np.mean(y_train), y_val)
    validation_mse = squared_error(lm.predict(X_val), y_val)
    return((validation_mse, baseline_mse))
evaluate_linear_regression(X_train, Y_train, X_val, Y_val)   

(123607.94780833411, 319678.33238067484)

In [184]:
std_err

array([25.28980765, 25.43445194, 25.39368967,  7.45652624,  3.85629954,
       35.31965988, 32.96810895, 26.34757083, 25.33533365, 25.33377822,
       25.3628136 , 25.37063606])

In [177]:
std_err, coefs = standard_error(X_train, Y_train)
cols = list(df1.drop(['Price'], axis=1).columns)

[0.04010835 0.04033775 0.0402731  0.01182567 0.0061159  0.05601519
 0.05228575 0.04178591 0.04018055 0.04017809 0.04022414 0.04023654]


In [178]:
import statsmodels.api as sm

mod = sm.OLS(Y_train,X_train)

fii = mod.fit()

p_values = fii.summary2().tables[1]['P>|t|']
fii.summary2()

0,1,2,3
Model:,OLS,Adj. R-squared:,0.622
Dependent Variable:,y,AIC:,11487.4692
Date:,2021-05-16 13:29,BIC:,11543.7891
No. Observations:,807,Log-Likelihood:,-5731.7
Df Model:,11,F-statistic:,121.4
Df Residuals:,795,Prob (F-statistic):,8.94e-162
R-squared:,0.627,Scale:,87741.0

0,1,2,3,4,5,6
,Coef.,Std.Err.,t,P>|t|,[0.025,0.975]
x1,9.2367,32.9928,0.2800,0.7796,-55.5266,73.9999
x2,192.0743,15.1414,12.6854,0.0000,162.3525,221.7961
x3,368.2880,28.9261,12.7320,0.0000,311.5073,425.0686
x4,0.1487,0.0288,5.1593,0.0000,0.0921,0.2053
x5,2.0750,0.4099,5.0619,0.0000,1.2703,2.8797
x6,3.4606,0.6646,5.2075,0.0000,2.1562,4.7651
x7,4.5972,0.8239,5.5797,0.0000,2.9799,6.2146
x8,-0.1336,0.2648,-0.5045,0.6140,-0.6534,0.3862
x9,49.5426,20.3334,2.4365,0.0150,9.6290,89.4561

0,1,2,3
Omnibus:,90.042,Durbin-Watson:,1.987
Prob(Omnibus):,0.0,Jarque-Bera (JB):,156.453
Skew:,0.724,Prob(JB):,0.0
Kurtosis:,4.6,Condition No.:,169056.0


In [179]:
#https://online.stat.psu.edu/stat501/lesson/2/2.12
#https://www.statisticshowto.com/probability-and-statistics/coefficient-of-determination-r-squared/
#https://en.wikipedia.org/wiki/Simple_linear_regression
#https://calcworkshop.com/linear-regression/t-test/

from scipy import stats
t_value = coefs/std_err
p_value = 2*(1-stats.t.cdf(abs(t_value),df=len(Y_train)-2))

In [183]:
std_err

array([25.28980765, 25.43445194, 25.39368967,  7.45652624,  3.85629954,
       35.31965988, 32.96810895, 26.34757083, 25.33533365, 25.33377822,
       25.3628136 , 25.37063606])

In [180]:
from scipy.stats.distributions import  t
n = len(Y_train)
k = len(p)

sigma2 = np.sum((Y_train - np.dot(X_train, p))**2) / (n - k)
C = sigma2 * np.linalg.inv(np.dot(X_train.T, X_train)) # covariance matrix
np.sqrt(np.diag(C))

array([3.29927955e+01, 1.51413701e+01, 2.89261471e+01, 2.88291188e-02,
       4.09924563e-01, 6.64551413e-01, 8.23920768e-01, 2.64805499e-01,
       2.03334217e+01, 2.37612253e+01, 1.48760029e+01, 8.00209573e+02])

In [181]:
summary = pd.DataFrame()
summary['variables'] = cols
summary['coefficients'] = coefs
summary['std error'] = std_err
summary['t_value'] = t_value
summary['p_value'] = p_value
print(summary)

       variables  coefficients  std error     t_value       p_value
0   PropertyType      9.236659  25.289808    0.365232  7.150338e-01
1           Beds    192.074261  25.434452    7.551736  1.165734e-13
2          Baths    368.287970  25.393690   14.503129  0.000000e+00
3           Sqft      0.148738   7.456526    0.019947  9.840904e-01
4      YearBuilt      2.075000   3.856300    0.538081  5.906701e-01
5      WalkScore      3.460642  35.319660    0.097981  9.219721e-01
6   TransitScore      4.597247  32.968109    0.139445  8.891332e-01
7   ParkingPrice     -0.133602  26.347571   -0.005071  9.959554e-01
8    ParkingType     49.542558  25.335334    1.955473  5.087289e-02
9        Cooling     83.001999  25.333778    3.276337  1.096651e-03
10       Laundry     59.428875  25.362814    2.343150  1.936426e-02
11         final  -4295.699246  25.370636 -169.317759  0.000000e+00


### Cleaning Text Data

In [110]:
def clean_description(description):
    description = description.lower()
    return ' '.join(re.sub('(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|([-,\"@\'?\.$%_\d\+\:])', ' ', description).split())

df["Clean Description"] = df['Description'].apply(lambda x: clean_description(x))

In [111]:
df["Clean Description"][1]

'this town home is for rent in the east end section of pittsburgh swiss helm park area near regent square and has been completely renovated complete with new kitchen bathroom and hardwood floors the first floor has been updated to have an open floor plan so the living room dining room and kitchen all flow into each other granite counter tops in kitchen wet room style bath and exposed brick the second floor is loft style with an open floor plan at sq feet the home is flooded with natural light park views and is very contemporary great location sits you a block from frick park it has both front and back porches and has a fenced in backyard complete with large private patio garden with raised beds for planting and a green rain barrel the home is conveniently located near regent square swiss helm park squirrel hill and frick park a few exits away from upmc pitt oakland sq hill and downtown pittsburgh home includes central ac refrigerator dishwasher gas range microwave washer and dryer gara

## Bag of Words

In [112]:
import nltk

stopwords = {'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours',
                       'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself',
                       'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which',
                       'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be',
                       'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'an',
                       'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by',
                       'for', 'with', 'about', 'between', 'into', 'through', 'during', 'before',
                       'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over',
                       'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why',
                       'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such',
                       'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 'can',
                       'just', 'should', 'now', '', 'a', 's'}

def getAllWords(df):
    allWords = {}
    for index, row in df.iterrows():
        tmpDict = row['Clean Description'].split()
        for word in tmpDict:
            if word not in stopwords:
                if word not in allWords:
                    allWords[word] = 1
                else:
                    allWords[word] += 1
    allWords = pd.DataFrame(list(allWords.items()),columns = ['word','count']) 
    return allWords

wordcounts = getAllWords(df).sort_values(by=['count'], ascending=False)
vocab = wordcounts[1:25]
print(wordcounts[1:25])

           word  count
13    apartment    739
99      kitchen    717
297     bedroom    700
98          new    592
108        room    558
73      parking    553
189        less    551
193  apartments    547
83         rent    535
34      located    532
166   available    520
104       floor    510
186       water    508
385         one    492
160         gas    488
192      street    470
3        living    465
260      access    454
102      floors    452
77       center    450
302    features    437
67         site    434
313     laundry    433
154    downtown    428


In [113]:
df2 = df
for word in vocab["word"]: 
    df2[word] = 0

In [114]:
bow = []
for index, row in df2.iterrows():
    tmpDict = row['Clean Description']
    for word in vocab["word"]:
        df2.loc[index, word] = tmpDict.count(word)

In [115]:
print(df2)

       Price                                  Address  PropertyType  Beds  \
0     1285.0  2565 Boyce Plaza Rd Pittsburgh PA 15241           0.0   1.0   
1      925.0      7122 Whipple St Pittsburgh PA 15218           1.0   1.0   
2     1125.0     5510 Stanton Ave Pittsburgh PA 15206           1.0   2.0   
3      990.0      10 Allegheny Ct Pittsburgh PA 15212           0.0   0.0   
4     1270.0     340 Highland Ave Pittsburgh PA 15206           0.0   2.0   
...      ...                                      ...           ...   ...   
1003   645.0        306 Saline St Pittsburgh PA 15207           0.0   1.0   
1004  1190.0       3 Allegheny Ct Pittsburgh PA 15212           0.0   1.0   
1005  1295.0       5515 Baum Blvd Pittsburgh PA 15232           0.0   1.0   
1006  1495.0         909 Penn Ave Pittsburgh PA 15222           0.0   1.0   
1007  1489.0      157 Fairmont St Pittsburgh PA 15206           0.0   2.0   

      Baths   Sqft  YearBuilt  WalkScore  TransitScore  ParkingPrice  ...  

In [116]:
df2.to_csv('data_with_text.csv')

In [117]:
df2

Unnamed: 0,Price,Address,PropertyType,Beds,Baths,Sqft,YearBuilt,WalkScore,TransitScore,ParkingPrice,...,gas,street,living,access,floors,center,features,site,laundry,downtown
0,1285.0,2565 Boyce Plaza Rd Pittsburgh PA 15241,0.0,1.0,1.0,828.0,1950,20.0,0.0,0.0,...,0,0,2,0,0,1,0,2,0,0
1,925.0,7122 Whipple St Pittsburgh PA 15218,1.0,1.0,1.0,800.0,2012,56.0,47.0,0.0,...,2,0,1,0,1,0,0,0,0,1
2,1125.0,5510 Stanton Ave Pittsburgh PA 15206,1.0,2.0,1.0,900.0,1950,59.0,50.0,0.0,...,0,1,0,0,0,0,0,0,0,0
3,990.0,10 Allegheny Ct Pittsburgh PA 15212,0.0,0.0,1.0,630.0,1950,83.0,62.0,0.0,...,0,0,0,2,0,4,0,0,0,0
4,1270.0,340 Highland Ave Pittsburgh PA 15206,0.0,2.0,1.0,900.0,1913,94.0,69.0,0.0,...,1,0,0,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1003,645.0,306 Saline St Pittsburgh PA 15207,0.0,1.0,1.0,845.0,1970,48.0,38.0,0.0,...,1,1,0,0,0,0,0,0,0,0
1004,1190.0,3 Allegheny Ct Pittsburgh PA 15212,0.0,1.0,1.0,670.0,1970,87.0,83.0,0.0,...,0,1,0,0,0,1,0,0,0,1
1005,1295.0,5515 Baum Blvd Pittsburgh PA 15232,0.0,1.0,1.0,845.0,1970,94.0,68.0,0.0,...,1,0,1,0,2,0,2,0,1,1
1006,1495.0,909 Penn Ave Pittsburgh PA 15222,0.0,1.0,1.0,866.0,2014,99.0,99.0,0.0,...,1,0,1,0,0,0,0,0,0,1


In [127]:
import statsmodels.api as sm

mod = sm.OLS(Y_train,X_train)

fii = mod.fit()

p_values = fii.summary2().tables[1]['P>|t|']
fii.summary2()

fii.predict(X_val)

array([ 914.29169176, 1000.16125483, 1079.21309688,  934.64706851,
       1182.16320968, 1169.84747026, 1894.77799222, 2445.43819535,
       1509.77812923, 1658.81022951, 1176.40092042, 1416.44276628,
       1043.03554527, 1638.31468764, 1086.07981628, 1200.9507411 ,
       2098.36108296, 1993.81972585, 1414.51369747, 1594.16346745,
       1089.63220986, 1513.35193269, 1170.48425734, 1034.52238178,
       1108.67883618, 2092.63193947,  932.24434684,  891.33120773,
       1178.04363864, 1661.11767462, 1532.74158619, 1161.66442409,
       1581.28977175,  815.70828681, 1019.64771796, 1565.21622966,
        810.78561642,  863.96486475,  844.11954924, 1177.88034185,
       1266.16284656, 1338.81190138, 1318.54638534, 2199.32040166,
       1098.56814333,  849.486789  , 1099.92687882, 1853.92925673,
        890.16503608, 1644.55836265, 1623.99992979, 1297.25655794,
       1137.40601841, 1885.01106856, 2188.47237504, 1158.52677646,
        914.94715192, 1277.71020696, 1042.87375872, 1439.59970

In [120]:
#df2 = df2.drop(['Description'], axis=1)
df2 = df2.drop(['Clean Description'], axis=1)
df2 = df2.drop(['Address'], axis=1)
df2.astype("int")
X_train, Y_train, X_val, Y_val = split_data(df2)
evaluate_linear_regression(X_train, Y_train, X_val, Y_val) 

import statsmodels.api as sm

mod = sm.OLS(Y_train,X_train)

fii = mod.fit()

p_values = fii.summary2().tables[1]['P>|t|']
fii.summary2()


std_err, coefs = standard_error(X_train, Y_train)
cols = list(df2.drop(['Price'], axis=1).columns)
t_value = coefs/std_err
p_value = (1-stats.t.cdf(abs(t_value),df=len(Y_train)-2))*2

In [121]:
fii.summary2()

0,1,2,3
Model:,OLS,Adj. R-squared:,0.653
Dependent Variable:,y,AIC:,11440.6666
Date:,2021-05-14 18:39,BIC:,11609.6262
No. Observations:,807,Log-Likelihood:,-5684.3
Df Model:,35,F-statistic:,44.35
Df Residuals:,771,Prob (F-statistic):,2.26e-159
R-squared:,0.668,Scale:,80445.0

0,1,2,3,4,5,6
,Coef.,Std.Err.,t,P>|t|,[0.025,0.975]
x1,-19.8161,34.8581,-0.5685,0.5699,-88.2440,48.6119
x2,203.2766,15.1948,13.3780,0.0000,173.4484,233.1048
x3,352.6297,28.4644,12.3885,0.0000,296.7528,408.5066
x4,0.1357,0.0287,4.7231,0.0000,0.0793,0.1921
x5,1.5440,0.4321,3.5735,0.0004,0.6958,2.3922
x6,3.6701,0.7316,5.0165,0.0000,2.2339,5.1063
x7,3.8502,0.8987,4.2841,0.0000,2.0859,5.6144
x8,-0.9920,0.3335,-2.9742,0.0030,-1.6468,-0.3373
x9,65.8814,21.4542,3.0708,0.0022,23.7657,107.9970

0,1,2,3
Omnibus:,80.222,Durbin-Watson:,2.004
Prob(Omnibus):,0.0,Jarque-Bera (JB):,156.145
Skew:,0.619,Prob(JB):,0.0
Kurtosis:,4.764,Condition No.:,186805.0


In [122]:
summary = pd.DataFrame()
summary['variables'] = cols
summary['coefficients'] = coefs
summary['std error'] = std_err
summary['t_value'] = t_value
summary['p_value'] = p_value
print(summary)

       variables  coefficients   std error    t_value       p_value
0   PropertyType    -19.816074   68.384848  -0.289773  7.720646e-01
1           Beds    203.276628   69.564309   2.922140  3.573667e-03
2          Baths    352.629734   69.230944   5.093528  4.380677e-07
3           Sqft      0.135732    5.771718   0.023517  9.812439e-01
4      YearBuilt      1.544015    3.168299   0.487332  6.261554e-01
5      WalkScore      3.670092  238.348967   0.015398  9.877185e-01
6   TransitScore      3.850152  186.568059   0.020637  9.835406e-01
7   ParkingPrice     -0.992020   67.894794  -0.014611  9.883460e-01
8    ParkingType     65.881352   68.753319   0.958228  3.382355e-01
9        Cooling     57.944439   68.741103   0.842937  3.995139e-01
10       Laundry     81.343361   68.976413   1.179292  2.386302e-01
11     apartment      6.735627   69.237672   0.097283  9.225261e-01
12       kitchen    -14.532978   68.993287  -0.210643  8.332189e-01
13       bedroom    -15.368410   68.993269  -0.2

In [None]:
import matplotlib.pyplot as plt
#plt.plot(X_train, Y_train)