# Performing Linear Regression with Bag of Words

In [61]:
import csv
import gzip
import math
import hashlib
import numpy as np
import pandas as pd
import re

In [130]:
df = pd.read_csv('all.csv', dtype=str, keep_default_na=False)

In [131]:
df["Price"] = pd.to_numeric(df["Price"])
df["PropertyType"] = pd.to_numeric(df["PropertyType"])
df["Beds"] = pd.to_numeric(df["Beds"])
df["Baths"] = pd.to_numeric(df["Baths"])
df["Sqft"] = pd.to_numeric(df["Sqft"])
df["YearBuilt"] = pd.to_numeric(df["YearBuilt"], downcast='integer')
df["WalkScore"] = pd.to_numeric(df["WalkScore"])
df["TransitScore"] = pd.to_numeric(df["TransitScore"])
df["ParkingPrice"] = pd.to_numeric(df["ParkingPrice"])
df["ParkingType"] = pd.to_numeric(df["ParkingType"])
df["Cooling"] = pd.to_numeric(df["Cooling"])
df["Laundry"] = pd.to_numeric(df["Laundry"])

df1 = df.drop(['Address', 'Description'], axis=1)

## Baseline Regression

In [74]:
def split_data(df):
    """split the data into training and validation sets, and convert them to np.ndarray. (Step 1 and 2 above.)

    args:
        df : pandas.DataFrame -- the parsed data, as returned by parse_stackoverflow_data()

    returns: X_train, y_train, X_val, y_val
      X_train  : np.ndarray -- the second 80% of the data features
      y_train : np.ndarray -- the second 80% of the target values
      X_val : np.ndarray -- the first 20% (rounded down) of the data features
      y_val : np.ndarray -- the first 20% of the target valuesn
    """
    n = len(df)
    df['final'] = 1.0
    df = df.to_numpy()
    i = int(np.floor(n*0.2))
    val = df[0:i,:]
    train = df[i:,:]
    Y_train = train[:, 0]
    X_train = train[:, 1:]
    Y_val = val[:, 0]
    X_val = val[:, 1:]
    return(X_train, Y_train, X_val, Y_val)
X_train, Y_train, X_val, Y_val = split_data(df1)

In [145]:
#https://stattrek.com/regression/slope-confidence-interval.aspx?Tutorial=AP

def squared_error(y_pred, y):
    return(np.mean(np.square(np.subtract(y_pred,y))))

def standard_error(X_train, y_train):
    lm = LinearRegression(X_train, y_train)
    y_hat = lm.predict(X_train)
    x_bar = np.mean(X_train)
    n = len(y_train)
    return np.sqrt(sum((y_train - y_hat)**2)/(n - 2))/np.sqrt(sum((X_train - x_bar)**2)), lm.theta

class LinearRegression():
    def __init__(self, X, y):
        self.theta = np.linalg.solve(X.T @ X, X.T @ y)

    def predict(self, X): 
        return(X @ self.theta)

In [77]:
def evaluate_linear_regression(X_train, y_train, X_val, y_val):
    lm = LinearRegression(X_train, y_train)
    baseline_mse = squared_error(np.mean(y_train), y_val)
    validation_mse = squared_error(lm.predict(X_val), y_val)
    return((validation_mse, baseline_mse))
evaluate_linear_regression(X_train, Y_train, X_val, Y_val)   

(143821.39515347502, 292466.19087478047)

In [108]:
std_err, coefs = standard_error(X_train, Y_train)
cols = list(df.drop(['Price','Description', 'Address'], axis=1).columns)

In [109]:
#https://online.stat.psu.edu/stat501/lesson/2/2.12
#https://stats.stackexchange.com/questions/324260/manually-calculate-the-parameters-std-error-of-lm-output-in-r
#https://www.statisticshowto.com/probability-and-statistics/coefficient-of-determination-r-squared/
#https://en.wikipedia.org/wiki/Simple_linear_regression

from scipy import stats
t_value = coefs/std_err
p_value = (1-stats.t.cdf(abs(t_value),df=len(Y_train)-2))*2

In [110]:
summary = pd.DataFrame()
summary['variables'] = cols
summary['coefficients'] = coefs
summary['std error'] = std_err
summary['t_value'] = t_value
summary['p_value'] = p_value
print(summary)

       variables  coefficients  std error      t_value   p_value
0   PropertyType     59.885976   0.021990  2723.330558  0.000000
1           Beds    199.515307   0.022092  9031.088123  0.000000
2          Baths    136.130106   0.022097  6160.647255  0.000000
3           Sqft      0.157870   0.007820    20.187685  0.000000
4      YearBuilt      0.010993   0.002974     3.696187  0.000225
5      WalkScore      2.379358   0.037789    62.963957  0.000000
6   TransitScore      6.384853   0.036178   176.482136  0.000000
7   ParkingPrice      0.080419   0.038492     2.089211  0.036816
8    ParkingType    -14.604286   0.022070  -661.736832  0.000000
9        Cooling    153.803979   0.022074  6967.776690  0.000000
10       Laundry     38.744494   0.022083  1754.503895  0.000000
11         final     56.575380   0.022087  2561.449559  0.000000


### Cleaning Text Data

In [134]:
def clean_description(description):
    description = description.lower()
    return ' '.join(re.sub('(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|([-,\"@\'?\.$%_\d\+\:])', ' ', description).split())

df["Clean Description"] = df['Description'].apply(lambda x: clean_description(x))

In [135]:
df["Clean Description"][1]

'from virtual to reality call us for an in person social distancing tour located in the heart of pittsburgh s north shore park view apartments offer a full range of amenities to its residents that include controlled entrance access system beautiful roof top decks fitness center large windows and balconies residents will also enjoy convenient access to a number of the city s most popular cultural and athletic attractions including pnc park heinz field the benedum center o reilly theatre carnegie science center andy warhol museum convention center children s museum and three rivers casino park view prides itself in providing affordable luxury in the heart of an urban park environment'

## Bag of Words

In [136]:
import nltk

stopwords = {'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours',
                       'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself',
                       'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which',
                       'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be',
                       'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'an',
                       'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by',
                       'for', 'with', 'about', 'between', 'into', 'through', 'during', 'before',
                       'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over',
                       'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why',
                       'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such',
                       'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 'can',
                       'just', 'should', 'now', '', 'a', 's'}

def getAllWords(df):
    allWords = {}
    for index, row in df.iterrows():
        tmpDict = row['Clean Description'].split()
        for word in tmpDict:
            if word not in stopwords:
                if word not in allWords:
                    allWords[word] = 1
                else:
                    allWords[word] += 1
    allWords = pd.DataFrame(list(allWords.items()),columns = ['word','count']) 
    return allWords

wordcounts = getAllWords(df).sort_values(by=['count'], ascending=False)
vocab = wordcounts[1:25]
print(wordcounts[1:25])

              word  count
51          access   5602
237           site   5585
324       features   5542
528         floors   4293
58          center   4293
250     appliances   4222
57         fitness   4187
465          rooms   4071
1970         fifth   3985
2827         grand   3967
3383      kaufmann   3948
38      pittsburgh   3651
135           room   3151
379           unit   2961
256       hardwood   2947
125         washer   2939
126          dryer   2930
298            air   2886
375           walk   2853
297     dishwasher   2805
257           high   2790
299   conditioning   2786
267      stainless   2782
268          steel   2765


In [151]:
df2 = df
for word in vocab["word"]: 
    df2[word] = 0

In [152]:
bow = []
for index, row in df2.iterrows():
    tmpDict = row['Clean Description']
    for word in vocab["word"]:
        df2.loc[index, word] = tmpDict.count(word)

In [155]:
print(df2)

       Price                                Address  PropertyType  Beds  \
0     1185.0     1 Magdalene St Pittsburgh PA 15203           1.0   1.0   
1     1575.0    10 Allegheny Ct Pittsburgh PA 15212           0.0   2.0   
2      800.0    10 Allegheny Ct Pittsburgh PA 15212           0.0   0.0   
3     1575.0    10 Allegheny Ct Pittsburgh PA 15212           0.0   2.0   
4     1395.0    10 Allegheny Ct Pittsburgh PA 15212           0.0   1.0   
...      ...                                    ...           ...   ...   
2475   780.0  957 Bockstoce Ave Pittsburgh PA 15234           0.0   1.0   
2476   950.0  957 Bockstoce Ave Pittsburgh PA 15234           0.0   2.0   
2477   645.0         97 23rd St Pittsburgh PA 15203           1.0  -1.0   
2478   995.0         97 27th St Pittsburgh PA 15203           1.0   1.0   
2479   950.0   978 Garfield Ave Pittsburgh PA 15221           0.0   1.0   

      Baths    Sqft  YearBuilt  WalkScore  TransitScore  ParkingPrice  ...  \
0       1.5  1250.0  

In [165]:
df2['Description']

0       Great house in The Slopes. Quiet Dead End. Gre...
1       FROM VIRTUAL. TO REALITY.CALL US FOR AN IN-PER...
2       FROM VIRTUAL. TO REALITY.CALL US FOR AN IN-PER...
3       FROM VIRTUAL. TO REALITY.CALL US FOR AN IN-PER...
4       FROM VIRTUAL. TO REALITY.CALL US FOR AN IN-PER...
                              ...                        
2475    Two bedroom apartment available at $930 + elec...
2476    Two bedroom apartment available at $930 + elec...
2477    Studio layout, open room plan on second floor,...
2478    This 1000 square foot single family home has 1...
2479    Now Available!\n\nWelcome home to this beautif...
Name: Description, Length: 2480, dtype: object

In [166]:
df2 = df2.drop(['Description'], axis=1)
#df2 = df2.drop(['Clean Description'], axis=1)
#df2 = df2.drop(['Address'], axis=1)
df2.astype("int")
X_train, Y_train, X_val, Y_val = split_data(df2)
evaluate_linear_regression(X_train, Y_train, X_val, Y_val) 

std_err, coefs = standard_error(X_train, Y_train)
cols = list(df2.drop(['Price'], axis=1).columns)
t_value = coefs/std_err
p_value = (1-stats.t.cdf(abs(t_value),df=len(Y_train)-2))*2

In [167]:
summary = pd.DataFrame()
summary['variables'] = cols
summary['coefficients'] = coefs
summary['std error'] = std_err
summary['t_value'] = t_value
summary['p_value'] = p_value
print(summary)

       variables  coefficients  std error      t_value  p_value
0   PropertyType     67.823641   0.060651  1118.266162      0.0
1           Beds    202.927401   0.061483  3300.520415      0.0
2          Baths    122.481763   0.061524  1990.793373      0.0
3           Sqft      0.174062   0.006144    28.329910      0.0
4      YearBuilt      0.033825   0.002589    13.063840      0.0
5      WalkScore      3.106867   0.186160    16.689265      0.0
6   TransitScore      4.541271   0.172450    26.333891      0.0
7   ParkingPrice     -0.810168   0.044247   -18.310158      0.0
8    ParkingType    -43.114156   0.061300  -703.328808      0.0
9        Cooling    115.739533   0.061334  1887.046867      0.0
10       Laundry     60.580073   0.061409   986.497619      0.0
11        access     74.846210   0.064130  1167.105084      0.0
12          site    -50.281972   0.062973  -798.470470      0.0
13      features     62.369849   0.062930   991.105445      0.0
14        floors     34.191209   0.06236

In [None]:
import matplotlib.pyplot as plt
#plt.plot(X_train, Y_train)