# Performing Linear Regression with Bag of Words

In [21]:
import csv
import gzip
import math
import hashlib
import numpy as np
import pandas as pd
import re

In [22]:
df = pd.read_csv('all.csv', dtype=str, keep_default_na=False)

In [23]:
df["Price"] = pd.to_numeric(df["Price"])
df["PropertyType"] = pd.to_numeric(df["PropertyType"])
df["Beds"] = pd.to_numeric(df["Beds"])
df["Baths"] = pd.to_numeric(df["Baths"])
df["Sqft"] = pd.to_numeric(df["Sqft"])
df["YearBuilt"] = pd.to_numeric(df["YearBuilt"], downcast='integer')
df["WalkScore"] = pd.to_numeric(df["WalkScore"])
df["TransitScore"] = pd.to_numeric(df["TransitScore"])
df["ParkingPrice"] = pd.to_numeric(df["ParkingPrice"])
df["ParkingType"] = pd.to_numeric(df["ParkingType"])
df["Cooling"] = pd.to_numeric(df["Cooling"])
df["Laundry"] = pd.to_numeric(df["Laundry"])

price = df['Price']
df.drop(labels=['Price'], axis=1,inplace = True)
df.insert(0, 'Price', price)

df1 = df.drop(['Address', 'Description'], axis=1)

## Baseline Regression

In [24]:
def split_data(df):
    """split the data into training and validation sets, and convert them to np.ndarray. (Step 1 and 2 above.)

    args:
        df : pandas.DataFrame -- the parsed data, as returned by parse_stackoverflow_data()

    returns: X_train, y_train, X_val, y_val
      X_train  : np.ndarray -- the second 80% of the data features
      y_train : np.ndarray -- the second 80% of the target values
      X_val : np.ndarray -- the first 20% (rounded down) of the data features
      y_val : np.ndarray -- the first 20% of the target valuesn
    """
    n = len(df)
    df['final'] = 1.0
    df = df.to_numpy()
    i = int(np.floor(n*0.2))
    val = df[0:i,:]
    train = df[i:,:]
    Y_train = train[:, 0]
    X_train = train[:, 1:]
    Y_val = val[:, 0]
    X_val = val[:, 1:]
    return(X_train, Y_train, X_val, Y_val)
X_train, Y_train, X_val, Y_val = split_data(df1)

In [25]:
#https://stattrek.com/regression/slope-confidence-interval.aspx?Tutorial=AP

def squared_error(y_pred, y):
    return(np.mean(np.square(np.subtract(y_pred,y))))

def standard_error(X_train, y_train):
    lm = LinearRegression(X_train, y_train)
    y_hat = lm.predict(X_train)
    x_bar = np.mean(X_train)
    n = len(y_train)
    return np.sqrt(sum((y_train - y_hat)**2)/(n - 2))/np.sqrt(sum((X_train - x_bar)**2)), lm.theta

class LinearRegression():
    def __init__(self, X, y):
        self.theta = np.linalg.solve(X.T @ X, X.T @ y)

    def predict(self, X): 
        return(X @ self.theta)

In [26]:
def evaluate_linear_regression(X_train, y_train, X_val, y_val):
    lm = LinearRegression(X_train, y_train)
    baseline_mse = squared_error(np.mean(y_train), y_val)
    validation_mse = squared_error(lm.predict(X_val), y_val)
    return((validation_mse, baseline_mse))
evaluate_linear_regression(X_train, Y_train, X_val, Y_val)   

(123607.94780833411, 319678.33238067484)

In [28]:
std_err, coefs = standard_error(X_train, Y_train)
cols = list(df1.drop(['Price'], axis=1).columns)

In [29]:
#https://online.stat.psu.edu/stat501/lesson/2/2.12
#https://stats.stackexchange.com/questions/324260/manually-calculate-the-parameters-std-error-of-lm-output-in-r
#https://www.statisticshowto.com/probability-and-statistics/coefficient-of-determination-r-squared/
#https://en.wikipedia.org/wiki/Simple_linear_regression

from scipy import stats
t_value = coefs/std_err
p_value = (1-stats.t.cdf(abs(t_value),df=len(Y_train)-2))*2

In [30]:
df1

Unnamed: 0,Price,PropertyType,Beds,Baths,Sqft,YearBuilt,WalkScore,TransitScore,ParkingPrice,ParkingType,Cooling,Laundry,final
0,1285.0,0.0,1.0,1.0,828.0,1950,20.0,0.0,0.0,1.0,1.0,2.0,1.0
1,925.0,1.0,1.0,1.0,800.0,2012,56.0,47.0,0.0,1.0,1.0,1.0,1.0
2,1125.0,1.0,2.0,1.0,900.0,1950,59.0,50.0,0.0,0.0,0.0,0.0,1.0
3,990.0,0.0,0.0,1.0,630.0,1950,83.0,62.0,0.0,1.0,0.0,1.0,1.0
4,1270.0,0.0,2.0,1.0,900.0,1913,94.0,69.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1003,645.0,0.0,1.0,1.0,845.0,1970,48.0,38.0,0.0,1.0,0.0,0.0,1.0
1004,1190.0,0.0,1.0,1.0,670.0,1970,87.0,83.0,0.0,1.0,1.0,0.0,1.0
1005,1295.0,0.0,1.0,1.0,845.0,1970,94.0,68.0,0.0,0.0,1.0,0.0,1.0
1006,1495.0,0.0,1.0,1.0,866.0,2014,99.0,99.0,0.0,1.0,1.0,2.0,1.0


In [31]:
summary = pd.DataFrame()
summary['variables'] = cols
summary['coefficients'] = coefs
summary['std error'] = std_err
summary['t_value'] = t_value
summary['p_value'] = p_value
print(summary)

       variables  coefficients  std error        t_value   p_value
0   PropertyType      9.236659   0.040108     230.292655  0.000000
1           Beds    192.074261   0.040338    4761.650206  0.000000
2          Baths    368.287970   0.040273    9144.762398  0.000000
3           Sqft      0.148738   0.011826      12.577509  0.000000
4      YearBuilt      2.075000   0.006116     339.279857  0.000000
5      WalkScore      3.460642   0.056015      61.780420  0.000000
6   TransitScore      4.597247   0.052286      87.925423  0.000000
7   ParkingPrice     -0.133602   0.041786      -3.197307  0.001441
8    ParkingType     49.542558   0.040181    1232.998350  0.000000
9        Cooling     83.001999   0.040178    2065.852401  0.000000
10       Laundry     59.428875   0.040224    1477.443165  0.000000
11         final  -4295.699246   0.040237 -106761.142657  0.000000


### Cleaning Text Data

In [32]:
def clean_description(description):
    description = description.lower()
    return ' '.join(re.sub('(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|([-,\"@\'?\.$%_\d\+\:])', ' ', description).split())

df["Clean Description"] = df['Description'].apply(lambda x: clean_description(x))

In [33]:
df["Clean Description"][1]

'this town home is for rent in the east end section of pittsburgh swiss helm park area near regent square and has been completely renovated complete with new kitchen bathroom and hardwood floors the first floor has been updated to have an open floor plan so the living room dining room and kitchen all flow into each other granite counter tops in kitchen wet room style bath and exposed brick the second floor is loft style with an open floor plan at sq feet the home is flooded with natural light park views and is very contemporary great location sits you a block from frick park it has both front and back porches and has a fenced in backyard complete with large private patio garden with raised beds for planting and a green rain barrel the home is conveniently located near regent square swiss helm park squirrel hill and frick park a few exits away from upmc pitt oakland sq hill and downtown pittsburgh home includes central ac refrigerator dishwasher gas range microwave washer and dryer gara

## Bag of Words

In [34]:
import nltk

stopwords = {'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours',
                       'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself',
                       'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which',
                       'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be',
                       'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'an',
                       'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by',
                       'for', 'with', 'about', 'between', 'into', 'through', 'during', 'before',
                       'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over',
                       'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why',
                       'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such',
                       'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 'can',
                       'just', 'should', 'now', '', 'a', 's'}

def getAllWords(df):
    allWords = {}
    for index, row in df.iterrows():
        tmpDict = row['Clean Description'].split()
        for word in tmpDict:
            if word not in stopwords:
                if word not in allWords:
                    allWords[word] = 1
                else:
                    allWords[word] += 1
    allWords = pd.DataFrame(list(allWords.items()),columns = ['word','count']) 
    return allWords

wordcounts = getAllWords(df).sort_values(by=['count'], ascending=False)
vocab = wordcounts[1:25]
print(wordcounts[1:25])

           word  count
13    apartment    739
99      kitchen    717
297     bedroom    700
98          new    592
108        room    558
73      parking    553
189        less    551
193  apartments    547
83         rent    535
34      located    532
166   available    520
104       floor    510
186       water    508
385         one    492
160         gas    488
192      street    470
3        living    465
260      access    454
102      floors    452
77       center    450
302    features    437
67         site    434
313     laundry    433
154    downtown    428


In [35]:
df2 = df
for word in vocab["word"]: 
    df2[word] = 0

In [36]:
bow = []
for index, row in df2.iterrows():
    tmpDict = row['Clean Description']
    for word in vocab["word"]:
        df2.loc[index, word] = tmpDict.count(word)

In [37]:
print(df2)

       Price                                  Address  PropertyType  Beds  \
0     1285.0  2565 Boyce Plaza Rd Pittsburgh PA 15241           0.0   1.0   
1      925.0      7122 Whipple St Pittsburgh PA 15218           1.0   1.0   
2     1125.0     5510 Stanton Ave Pittsburgh PA 15206           1.0   2.0   
3      990.0      10 Allegheny Ct Pittsburgh PA 15212           0.0   0.0   
4     1270.0     340 Highland Ave Pittsburgh PA 15206           0.0   2.0   
...      ...                                      ...           ...   ...   
1003   645.0        306 Saline St Pittsburgh PA 15207           0.0   1.0   
1004  1190.0       3 Allegheny Ct Pittsburgh PA 15212           0.0   1.0   
1005  1295.0       5515 Baum Blvd Pittsburgh PA 15232           0.0   1.0   
1006  1495.0         909 Penn Ave Pittsburgh PA 15222           0.0   1.0   
1007  1489.0      157 Fairmont St Pittsburgh PA 15206           0.0   2.0   

      Baths   Sqft  YearBuilt  WalkScore  TransitScore  ParkingPrice  ...  

In [38]:
df2['Description']

0       Experience the best in luxury living with brea...
1       This Town Home is for rent in the East End sec...
2       Highland Park - 2BR Apartment With Off Street ...
3       FROM VIRTUAL. TO REALITY.CALL US FOR AN IN-PER...
4       2 Bedroom 1 Bathroom Apartment in Shadyside\n\...
                              ...                        
1003    One Bedroom in 4-Mile Run Available Now\n - On...
1004    Sublet: 1 Bedroom 1Bath with Spacious Balcony ...
1005    This is a first floor apartment located in gro...
1006    Live the Lando Life. Walking, dining, socializ...
1007    **Pictures shown are showing what the finishes...
Name: Description, Length: 1008, dtype: object

In [166]:
df2 = df2.drop(['Description'], axis=1)
#df2 = df2.drop(['Clean Description'], axis=1)
#df2 = df2.drop(['Address'], axis=1)
df2.astype("int")
X_train, Y_train, X_val, Y_val = split_data(df2)
evaluate_linear_regression(X_train, Y_train, X_val, Y_val) 

std_err, coefs = standard_error(X_train, Y_train)
cols = list(df2.drop(['Price'], axis=1).columns)
t_value = coefs/std_err
p_value = (1-stats.t.cdf(abs(t_value),df=len(Y_train)-2))*2

In [167]:
summary = pd.DataFrame()
summary['variables'] = cols
summary['coefficients'] = coefs
summary['std error'] = std_err
summary['t_value'] = t_value
summary['p_value'] = p_value
print(summary)

       variables  coefficients  std error      t_value  p_value
0   PropertyType     67.823641   0.060651  1118.266162      0.0
1           Beds    202.927401   0.061483  3300.520415      0.0
2          Baths    122.481763   0.061524  1990.793373      0.0
3           Sqft      0.174062   0.006144    28.329910      0.0
4      YearBuilt      0.033825   0.002589    13.063840      0.0
5      WalkScore      3.106867   0.186160    16.689265      0.0
6   TransitScore      4.541271   0.172450    26.333891      0.0
7   ParkingPrice     -0.810168   0.044247   -18.310158      0.0
8    ParkingType    -43.114156   0.061300  -703.328808      0.0
9        Cooling    115.739533   0.061334  1887.046867      0.0
10       Laundry     60.580073   0.061409   986.497619      0.0
11        access     74.846210   0.064130  1167.105084      0.0
12          site    -50.281972   0.062973  -798.470470      0.0
13      features     62.369849   0.062930   991.105445      0.0
14        floors     34.191209   0.06236

In [None]:
import matplotlib.pyplot as plt
#plt.plot(X_train, Y_train)