In [153]:
#Recap::
#with linear models, the actual numeric range of data doesnt matter in terms of final predictions 
##standardizing the input data will however rescale your coefficients

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

url = "/Users/theodoreplotkin/desktop/postmalone/GA_Data_Science/DAT-06-24/class material/Unit 3/data/bikeshare.csv"

bikes = pd.read_csv(url, index_col = 'datetime', parse_dates = True)

In [154]:
#a statistical penalty (L1-L2 regularization) 
#can allow us to narrow down the feature space for our linear models, these penalties work best on standardized data

#hierarchical clustering allows us to deduce the relationships between variables
#downside of PCA is it changes the interpretation of your results 

In [155]:
bikes.head(10)

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,count
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2011-01-01 00:00:00,Spring,0,0,Clear Skies,9.84,14.395,81,0.0,16
2011-01-01 01:00:00,Spring,0,0,Clear Skies,9.02,13.635,80,0.0,40
2011-01-01 02:00:00,Spring,0,0,Clear Skies,9.02,13.635,80,0.0,32
2011-01-01 03:00:00,Spring,0,0,Clear Skies,9.84,14.395,75,0.0,13
2011-01-01 04:00:00,Spring,0,0,Clear Skies,9.84,14.395,75,0.0,1
2011-01-01 05:00:00,Spring,0,0,Partly Cloudy,9.84,12.88,75,6.0032,1
2011-01-01 06:00:00,Spring,0,0,Clear Skies,9.02,13.635,80,0.0,2
2011-01-01 07:00:00,Spring,0,0,Clear Skies,8.2,12.88,86,0.0,3
2011-01-01 08:00:00,Spring,0,0,Clear Skies,9.84,14.395,75,0.0,8
2011-01-01 09:00:00,Spring,0,0,Clear Skies,13.12,17.425,76,0.0,14


In [156]:
#todays agenda:
## -variable handling
## -feature engineering (with lab)
## -training/test sets & cross-validation

In [157]:
set(bikes["weather"]) #note this needs to be converted to a numeric encoding <- ordered category

{'Clear Skies', 'Heavy Storms/Rain', 'Light Storms/Rain', 'Partly Cloudy'}

In [158]:
set(bikes["season"]) #note this needs to be converted to a numeric encoding <- unordered category

{'Fall', 'Spring', 'Summer', 'Winter'}

In [159]:
bikes.dtypes

season         object
holiday         int64
workingday      int64
weather        object
temp          float64
atemp         float64
humidity        int64
windspeed     float64
count           int64
dtype: object

In [160]:
#note if we feed something into scikit learn, the arrays must all be numeric np arrays!
#-- this leaves a few additional steps for categorical variables

# Note the distinction between ordered categories and unordered categories

## -with `unordered categories` we use dummy "one-hot" encodings and each category becomes a seperate feature


## -with `ordered categories` we transform to sensible numeric values (ex/ small = 1, medium = 2, large = 3)

In [161]:
bikes.season.value_counts() #note this is an unordered category 

Winter    2734
Summer    2733
Fall      2733
Spring    2686
Name: season, dtype: int64

In [162]:
bikes.weather.value_counts() #note this is an ordered category since there exists a natural hierarchy

                             #clear skies > partly cloudy > light storms > heavy storms 

Clear Skies          7192
Partly Cloudy        2834
Light Storms/Rain     859
Heavy Storms/Rain       1
Name: weather, dtype: int64

In [163]:
# def weather_encoding(row):
#     if "Clear Skies" in bikes["weather"]:
#         return 4
#     elif "Partly Cloudy" in bikes["weather"]:
#         return 3
#     elif "Light Storms/Rain" in bikes["weather"]:
#         return 2
#     elif "Heavy Storms/Rain" in bikes["weather"]:
#         return 1

# bikes["weather_encoding"] = bikes.apply(weather_encoding, axis = 1 )

In [164]:
weather_map = {
    'Clear Skies':       4,
    'Partly Cloudy':     3,
    'Light Storms/Rain': 2,
    'Heavy Storms/Rain': 1
}

bikes.weather = bikes.weather.map(weather_map) #for ordered categoricals we use the .map() method
#note that we are simply overwriting the current weather column with the numeric encoding 

In [165]:
bikes.head()

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,count
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2011-01-01 00:00:00,Spring,0,0,4,9.84,14.395,81,0.0,16
2011-01-01 01:00:00,Spring,0,0,4,9.02,13.635,80,0.0,40
2011-01-01 02:00:00,Spring,0,0,4,9.02,13.635,80,0.0,32
2011-01-01 03:00:00,Spring,0,0,4,9.84,14.395,75,0.0,13
2011-01-01 04:00:00,Spring,0,0,4,9.84,14.395,75,0.0,1


In [166]:
pd.get_dummies(bikes.season).head() 

#the pd.get_dummies() method allows us to quickly create a "one-hot" encoding of unordered categoricals

Unnamed: 0_level_0,Fall,Spring,Summer,Winter
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2011-01-01 00:00:00,0,1,0,0
2011-01-01 01:00:00,0,1,0,0
2011-01-01 02:00:00,0,1,0,0
2011-01-01 03:00:00,0,1,0,0
2011-01-01 04:00:00,0,1,0,0


In [167]:
#the best way to do a one-hot (if there are n-categories), is to create n-1 categories 
#and then the nth category will get captured in the intercept (i.e. when all variables are 0)

pd.get_dummies(bikes.season, drop_first = True)
#note setting "drop_first = True" sets us up with n-1 categories, where the n-th category is captured by
#setting all other categories to zero

#keeping columns down to a minimum is actually very important with the "one-hot encoding"
#you want to try to reduce the horizontal dimensionality of your dataset as much as possible

Unnamed: 0_level_0,Spring,Summer,Winter
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2011-01-01 00:00:00,1,0,0
2011-01-01 01:00:00,1,0,0
2011-01-01 02:00:00,1,0,0
2011-01-01 03:00:00,1,0,0
2011-01-01 04:00:00,1,0,0
2011-01-01 05:00:00,1,0,0
2011-01-01 06:00:00,1,0,0
2011-01-01 07:00:00,1,0,0
2011-01-01 08:00:00,1,0,0
2011-01-01 09:00:00,1,0,0


In [168]:
#to actually apply this to the original dataset do the following

bikes = pd.get_dummies(bikes, drop_first = True)
bikes.head()

Unnamed: 0_level_0,holiday,workingday,weather,temp,atemp,humidity,windspeed,count,season_Spring,season_Summer,season_Winter
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2011-01-01 00:00:00,0,0,4,9.84,14.395,81,0.0,16,1,0,0
2011-01-01 01:00:00,0,0,4,9.02,13.635,80,0.0,40,1,0,0
2011-01-01 02:00:00,0,0,4,9.02,13.635,80,0.0,32,1,0,0
2011-01-01 03:00:00,0,0,4,9.84,14.395,75,0.0,13,1,0,0
2011-01-01 04:00:00,0,0,4,9.84,14.395,75,0.0,1,1,0,0


In [169]:
#run regression on the entire dataset 

#step 1 import the API for regression
from sklearn.linear_model import LinearRegression

In [170]:
#step 2 create an instance

entire_data = LinearRegression()
type(entire_data)

sklearn.linear_model.base.LinearRegression

In [171]:
#step 3 create the design matrix of standardized data

bikes["weather_standard"] = (bikes["weather"] - bikes["weather"].mean())/(bikes["weather"].std())
bikes["temp_standard"] = (bikes["temp"] - bikes["temp"].mean())/(bikes["temp"].std())
bikes["atemp_standard"] = (bikes["atemp"] - bikes["atemp"].mean())/(bikes["atemp"].std())
bikes["humidity_standard"] = (bikes["humidity"] - bikes["humidity"].mean())/(bikes["humidity"].std())
bikes["windspeed_standard"] = (bikes["windspeed"] - bikes["windspeed"].mean())/(bikes["windspeed"].std())

In [172]:
bikes.head()

Unnamed: 0_level_0,holiday,workingday,weather,temp,atemp,humidity,windspeed,count,season_Spring,season_Summer,season_Winter,weather_standard,temp_standard,atemp_standard,humidity_standard,windspeed_standard
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2011-01-01 00:00:00,0,0,4,9.84,14.395,81,0.0,16,1,0,0,0.660148,-1.333599,-1.092687,0.993167,-1.567682
2011-01-01 01:00:00,0,0,4,9.02,13.635,80,0.0,40,1,0,0,0.660148,-1.438841,-1.182367,0.941206,-1.567682
2011-01-01 02:00:00,0,0,4,9.02,13.635,80,0.0,32,1,0,0,0.660148,-1.438841,-1.182367,0.941206,-1.567682
2011-01-01 03:00:00,0,0,4,9.84,14.395,75,0.0,13,1,0,0,0.660148,-1.333599,-1.092687,0.681399,-1.567682
2011-01-01 04:00:00,0,0,4,9.84,14.395,75,0.0,1,1,0,0,0.660148,-1.333599,-1.092687,0.681399,-1.567682


In [173]:
X_entire = bikes[["holiday", "workingday", "weather_standard", "temp_standard", 
                  "atemp_standard", "humidity_standard", "windspeed_standard", 
                  "season_Spring", "season_Summer", "season_Winter"]]
y = bikes["count"]

X_entire.head()

Unnamed: 0_level_0,holiday,workingday,weather_standard,temp_standard,atemp_standard,humidity_standard,windspeed_standard,season_Spring,season_Summer,season_Winter
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2011-01-01 00:00:00,0,0,0.660148,-1.333599,-1.092687,0.993167,-1.567682,1,0,0
2011-01-01 01:00:00,0,0,0.660148,-1.438841,-1.182367,0.941206,-1.567682,1,0,0
2011-01-01 02:00:00,0,0,0.660148,-1.438841,-1.182367,0.941206,-1.567682,1,0,0
2011-01-01 03:00:00,0,0,0.660148,-1.333599,-1.092687,0.681399,-1.567682,1,0,0
2011-01-01 04:00:00,0,0,0.660148,-1.333599,-1.092687,0.681399,-1.567682,1,0,0


In [174]:
entire_data.fit(X_entire,y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [175]:
entire_data.score(X_entire,y) #R-squared for todays model which includes categorical features

0.275250323209943

In [176]:
entire_data.coef_

array([ -8.63047234,  -2.86154948,  -1.68074593,  62.03831631,
        24.65228382, -54.25768584,   4.42002657,  37.77071609,
        34.97948629, 102.97920297])

In [177]:
#A nice way to view the coefficents when they are done

coeffs = pd.DataFrame({
    "Variable": X_entire.columns,
    "Weight": entire_data.coef_    
}).sort_values(by = "Weight", ascending = False)

coeffs

Unnamed: 0,Variable,Weight
9,season_Winter,102.979203
3,temp_standard,62.038316
7,season_Spring,37.770716
8,season_Summer,34.979486
4,atemp_standard,24.652284
6,windspeed_standard,4.420027
2,weather_standard,-1.680746
1,workingday,-2.861549
0,holiday,-8.630472
5,humidity_standard,-54.257686


In [178]:
entire_data.intercept_

149.80467217555673

In [179]:
#also to get a nice regression summary we can use the statsmodel.api 
import statsmodels.api as sm

mod = sm.OLS(y, X_entire)
results = mod.fit()
print(results.summary())

#the sm.OLS() is more traditional form or regression, oriented towards traditional statistical tests

#the SkLearn version uses modern versions of machine learning, this is best for larger datasets
#as it has more optimization built into it

                            OLS Regression Results                            
Dep. Variable:                  count   R-squared:                       0.625
Model:                            OLS   Adj. R-squared:                  0.625
Method:                 Least Squares   F-statistic:                     1816.
Date:                Wed, 31 Jul 2019   Prob (F-statistic):               0.00
Time:                        19:40:05   Log-Likelihood:                -70787.
No. Observations:               10886   AIC:                         1.416e+05
Df Residuals:                   10876   BIC:                         1.417e+05
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
holiday               45.0705      9

In [None]:
#note that having higher sample size naturally makes p-values smaller, so this can bias our decision
#to remove a variable from the dataset 

#this is why we often rely on more modern forms of variable selection that p-values alone 

In [117]:
#now fit on the variables from monday
X_mon = bikes[["weather_standard", "temp_standard", "atemp_standard", "humidity_standard"]]

entire_data.fit(X_mon,y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [118]:
entire_data.score(X_mon,y) #R-squared for monday's model 

0.24307517355292296

In [119]:
entire_data.coef_

array([ -2.91057674,  23.16714273,  45.5475192 , -55.18486553])

In [120]:
entire_data.intercept_

191.57413191254764

In [None]:
#note to standardize a bunch of columns at the same time, "select dtypes"

#first we grab the design matrix 
#X = bikes.loc[:, df.columns != "count"]

#next perform the ordered categorical conversion with .map()

#then the following code grabs just the numeric part of the design matrix and standardizes all columns at once

#num_cols = X.select_dtypes(include=np.number).columns.tolist() <- select only numeric types from X
##note that include = np.number allows us to include only numeric data


#X[num_cols] = (X[num_cols] - X[num_cols].mean())/(X[num_cols].std()) <- simultaniously standardize all numeric data

In [None]:
#notes about categorical variables:

#if you're unsure if a variable is truly 'ordered' then just go with "one-hot encoding"

#if our dataset has more columns than rows, then the covariance matrix becomes non-invertible
##more categorical variables creates a higher number of unique columns 

#How to deal with high-dimensional categories::
##group them into a smaller number of categories (compress categories)
    #ex/ bin street address into different neighborhoods 

##extract a smaller piece of information from them
    #ex/someone's surname or greeting from their full name
    #someone's deck level from their seat on a ticket number (there may be 1000s of ticket numbers but only 3 levels)

##group together values with a low count into one aggregate value i.e. "other or N/A"

#generally we want at least 25-30 occurances of a particular category in order to use it
##t-tests done on linear regression coefficents will work best with 25 or more observations per variable

In [378]:
sac_csv = '/Users/theodoreplotkin/desktop/postmalone/GA_Data_Science/DAT-06-24/class material/Unit 3/data/sac_re_cat.csv'

In [379]:
data = pd.read_csv(sac_csv)

In [380]:
data = data.set_index("sale_date") #note the .set_index() method allows us to reset an index column to a
                                   #column in the dataset

In [381]:
data.head()

Unnamed: 0_level_0,street,city,zip,state,beds,baths,sq__ft,type,price
sale_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Wed May 21 00:00:00 EDT 2008,3526 HIGH ST,SACRAMENTO,95838,CA,2,1,836,Residential,59222
Wed May 21 00:00:00 EDT 2008,51 OMAHA CT,SACRAMENTO,95823,CA,3,1,1167,Residential,68212
Wed May 21 00:00:00 EDT 2008,2796 BRANCH ST,SACRAMENTO,95815,CA,2,1,796,Residential,68880
Wed May 21 00:00:00 EDT 2008,2805 JANETTE WAY,SACRAMENTO,95815,CA,2,1,852,Residential,69307
Wed May 21 00:00:00 EDT 2008,6001 MCMAHON DR,SACRAMENTO,95824,CA,2,1,797,Residential,81900


In [382]:
#'type' is a categorical variable
#'city' is a categorical variable
#'street' is a categorical variable <- would require lots of extra work to get into a category 
#'zip' is a categorical variable

In [383]:
data["city"].value_counts()

SACRAMENTO         439
ELK GROVE          114
LINCOLN             72
ROSEVILLE           48
CITRUS HEIGHTS      35
ANTELOPE            33
RANCHO CORDOVA      28
EL DORADO HILLS     23
NORTH HIGHLANDS     21
GALT                21
CARMICHAEL          20
ROCKLIN             17
FOLSOM              17
RIO LINDA           13
ORANGEVALE          11
PLACERVILLE         10
CAMERON PARK         9
FAIR OAKS            9
AUBURN               5
WILTON               5
ELVERTA              4
GOLD RIVER           4
GRANITE BAY          3
RANCHO MURIETA       3
WEST SACRAMENTO      3
POLLOCK PINES        3
LOOMIS               2
EL DORADO            2
GREENWOOD            1
PENRYN               1
COOL                 1
MEADOW VISTA         1
GARDEN VALLEY        1
WALNUT GROVE         1
DIAMOND SPRINGS      1
SHINGLE SPRINGS      1
FORESTHILL           1
SLOUGHHOUSE          1
MATHER               1
Name: city, dtype: int64

In [384]:
data["zip"].value_counts()

95648    72
95823    61
95828    45
95758    44
95838    37
95835    37
95757    36
95624    34
95843    33
95621    28
95822    24
95820    23
95762    23
95842    22
95834    22
95632    21
95660    21
95670    21
95608    20
95678    20
95833    20
95747    20
95815    18
95826    18
95630    17
95825    13
95673    13
95824    12
95832    12
95742    11
         ..
95661     8
95610     7
95817     7
95818     7
95841     7
95677     6
95821     6
95864     5
95603     5
95693     5
95819     4
95816     4
95683     4
95626     4
95726     3
95691     3
95814     3
95746     3
95623     2
95650     2
95811     2
95614     1
95619     1
95663     1
95631     1
95633     1
95635     1
95655     1
95690     1
95722     1
Name: zip, Length: 68, dtype: int64

In [385]:
data["zip"].dtype #we need to start to group various zip codes into blocks
                  #since many zip codes have so few occurances, they wont do well as "one-hot" categories

dtype('int64')

In [386]:
data.zip = data.zip.astype("category") 
            #we must first rewrite these ints into "category" types

In [387]:
data.groupby("zip").zip.count() #list of value counts for each unique zip code in dataset
                                #note that here the index is the value you want to group by

zip
95603     5
95608    20
95610     7
95614     1
95619     1
95621    28
95623     2
95624    34
95626     4
95628     9
95630    17
95631     1
95632    21
95633     1
95635     1
95648    72
95650     2
95655     1
95660    21
95661     8
95662    11
95663     1
95667    10
95670    21
95673    13
95677     6
95678    20
95682    10
95683     4
95690     1
         ..
95758    44
95762    23
95765    11
95811     2
95814     3
95815    18
95816     4
95817     7
95818     7
95819     4
95820    23
95821     6
95822    24
95823    61
95824    12
95825    13
95826    18
95827     9
95828    45
95829    11
95831    10
95832    12
95833    20
95834    22
95835    37
95838    37
95841     7
95842    22
95843    33
95864     5
Name: zip, Length: 68, dtype: int64

In [388]:
#entire the .transform() method which only applies to .groupby() objects

data.groupby("zip").zip.transform("count")
#transform the zip column via the "count" aggregator

#this makes the index the original index of the dataset.. the groups are linked to the original index

sale_date
Wed May 21 00:00:00 EDT 2008    37
Wed May 21 00:00:00 EDT 2008    61
Wed May 21 00:00:00 EDT 2008    18
Wed May 21 00:00:00 EDT 2008    18
Wed May 21 00:00:00 EDT 2008    12
Wed May 21 00:00:00 EDT 2008     7
Wed May 21 00:00:00 EDT 2008    22
Wed May 21 00:00:00 EDT 2008    23
Wed May 21 00:00:00 EDT 2008    21
Wed May 21 00:00:00 EDT 2008    13
Wed May 21 00:00:00 EDT 2008    37
Wed May 21 00:00:00 EDT 2008    61
Wed May 21 00:00:00 EDT 2008    18
Wed May 21 00:00:00 EDT 2008    24
Wed May 21 00:00:00 EDT 2008    22
Wed May 21 00:00:00 EDT 2008    22
Wed May 21 00:00:00 EDT 2008    13
Wed May 21 00:00:00 EDT 2008    28
Wed May 21 00:00:00 EDT 2008    20
Wed May 21 00:00:00 EDT 2008    13
Wed May 21 00:00:00 EDT 2008    28
Wed May 21 00:00:00 EDT 2008    61
Wed May 21 00:00:00 EDT 2008    21
Wed May 21 00:00:00 EDT 2008    22
Wed May 21 00:00:00 EDT 2008    21
Wed May 21 00:00:00 EDT 2008    33
Wed May 21 00:00:00 EDT 2008    23
Wed May 21 00:00:00 EDT 2008    12
Wed May 21

In [389]:
transformed = data.groupby("zip").zip.transform("count")

In [390]:
data["zip"] = np.where(transformed < 25, "Other", data["zip"])

#so if the transformed group in that row has a population of smaller than 25, put "other"
#if the transformed group in that row is greater than 25, put the zip code value

In [391]:
data.head()

Unnamed: 0_level_0,street,city,zip,state,beds,baths,sq__ft,type,price
sale_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Wed May 21 00:00:00 EDT 2008,3526 HIGH ST,SACRAMENTO,95838,CA,2,1,836,Residential,59222
Wed May 21 00:00:00 EDT 2008,51 OMAHA CT,SACRAMENTO,95823,CA,3,1,1167,Residential,68212
Wed May 21 00:00:00 EDT 2008,2796 BRANCH ST,SACRAMENTO,Other,CA,2,1,796,Residential,68880
Wed May 21 00:00:00 EDT 2008,2805 JANETTE WAY,SACRAMENTO,Other,CA,2,1,852,Residential,69307
Wed May 21 00:00:00 EDT 2008,6001 MCMAHON DR,SACRAMENTO,Other,CA,2,1,797,Residential,81900


In [392]:
data["zip"].value_counts() #now we've created a "other" bin for all the categories which were smaller than 25

Other    558
95648     72
95823     61
95828     45
95758     44
95838     37
95835     37
95757     36
95624     34
95843     33
95621     28
Name: zip, dtype: int64

In [393]:
#lets try to do the same grouping for "city"
data["city"].value_counts()

SACRAMENTO         439
ELK GROVE          114
LINCOLN             72
ROSEVILLE           48
CITRUS HEIGHTS      35
ANTELOPE            33
RANCHO CORDOVA      28
EL DORADO HILLS     23
NORTH HIGHLANDS     21
GALT                21
CARMICHAEL          20
ROCKLIN             17
FOLSOM              17
RIO LINDA           13
ORANGEVALE          11
PLACERVILLE         10
CAMERON PARK         9
FAIR OAKS            9
AUBURN               5
WILTON               5
ELVERTA              4
GOLD RIVER           4
GRANITE BAY          3
RANCHO MURIETA       3
WEST SACRAMENTO      3
POLLOCK PINES        3
LOOMIS               2
EL DORADO            2
GREENWOOD            1
PENRYN               1
COOL                 1
MEADOW VISTA         1
GARDEN VALLEY        1
WALNUT GROVE         1
DIAMOND SPRINGS      1
SHINGLE SPRINGS      1
FORESTHILL           1
SLOUGHHOUSE          1
MATHER               1
Name: city, dtype: int64

In [394]:
data.city = data.city.astype("category")

In [395]:
transformed_city = data.groupby("city").city.transform("count")

In [396]:
data["city"] = np.where(transformed_city < 25, "Other", data["city"])

In [397]:
data.city.value_counts() #note this does the same with "city" as we've done with "zip"

SACRAMENTO        439
Other             216
ELK GROVE         114
LINCOLN            72
ROSEVILLE          48
CITRUS HEIGHTS     35
ANTELOPE           33
RANCHO CORDOVA     28
Name: city, dtype: int64

In [398]:
data.type.value_counts() #type looks good, although we might need to remove the one unknown row

Residential     917
Condo            54
Multi-Family     13
Unkown            1
Name: type, dtype: int64

In [399]:
#lets transform "city" and "zip" to a 1-hot encoding
#since as far as we know there is no ordering here 

#city and zip will most likely be collinear, so lets try just city
data.drop("street", axis = 1, inplace = True)

In [400]:
data = pd.get_dummies(data, drop_first = True)

In [401]:
data.head() #we've created one-hot encodings for city, zip and type 

Unnamed: 0_level_0,beds,baths,sq__ft,price,city_CITRUS HEIGHTS,city_ELK GROVE,city_LINCOLN,city_Other,city_RANCHO CORDOVA,city_ROSEVILLE,...,zip_95823,zip_95828,zip_95835,zip_95838,zip_95843,zip_Other,state_CA,type_Multi-Family,type_Residential,type_Unkown
sale_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Wed May 21 00:00:00 EDT 2008,2,1,836,59222,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,1,0
Wed May 21 00:00:00 EDT 2008,3,1,1167,68212,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,1,0
Wed May 21 00:00:00 EDT 2008,2,1,796,68880,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,1,0
Wed May 21 00:00:00 EDT 2008,2,1,852,69307,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,1,0
Wed May 21 00:00:00 EDT 2008,2,1,797,81900,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,1,0


In [402]:
data.columns

Index(['beds', 'baths', 'sq__ft', 'price', 'city_CITRUS HEIGHTS',
       'city_ELK GROVE', 'city_LINCOLN', 'city_Other', 'city_RANCHO CORDOVA',
       'city_ROSEVILLE', 'city_SACRAMENTO', 'zip_95624', 'zip_95648',
       'zip_95757', 'zip_95758', 'zip_95823', 'zip_95828', 'zip_95835',
       'zip_95838', 'zip_95843', 'zip_Other', 'state_CA', 'type_Multi-Family',
       'type_Residential', 'type_Unkown'],
      dtype='object')

In [403]:
model = LinearRegression()

data["sqft_standard"] = (data["sq__ft"] - data["sq__ft"].mean())/(data["sq__ft"].std())

X_dat = data[["sqft_standard"]+data.columns.tolist()[4:11]]
y_hat = data["price"]

In [404]:
X_dat.shape

(985, 8)

In [405]:
y_hat.shape

(985,)

In [406]:
model.fit(X_dat,y_hat)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [407]:
model.score(X_dat,y_hat)

#note that this regression includes only the "sqft" numeric type, and the new "city" category

0.2965858017988928

In [408]:
#note the same model with sqft alone

#X_dat = np.array(data["sqft_standard"])
#y_hat = data["price"]

In [409]:
#Evaluation metrics for regression problems in addition to the R-squared 

#can also use the Mean-absolute Error (MAE) a
#and the Mean-squared error (MSE)

In [410]:
#lets create a column of predictions using our model

data["prediction"] = model.predict(X_dat)

In [413]:
data[["price","prediction"]].head(100)

Unnamed: 0_level_0,price,prediction
sale_date,Unnamed: 1_level_1,Unnamed: 2_level_1
Wed May 21 00:00:00 EDT 2008,59222,168664.646605
Wed May 21 00:00:00 EDT 2008,68212,185466.572940
Wed May 21 00:00:00 EDT 2008,68880,166634.202335
Wed May 21 00:00:00 EDT 2008,69307,169476.824313
Wed May 21 00:00:00 EDT 2008,81900,166684.963442
Wed May 21 00:00:00 EDT 2008,89921,183182.323136
Wed May 21 00:00:00 EDT 2008,90895,182268.623215
Wed May 21 00:00:00 EDT 2008,91002,185974.184008
Wed May 21 00:00:00 EDT 2008,94905,242031.876836
Wed May 21 00:00:00 EDT 2008,98937,312542.271728


In [415]:
#calculate the MAE by hand::

MAE = np.mean(np.abs(data["price"] - data["prediction"]))
MAE

85397.3063626454

In [417]:
#calculate the MSE by hand::

MSE = np.mean((data["price"] - data["prediction"])**2)
MSE

13594224463.226055

In [None]:
#SKlearn for preprocessing and metrics

#preprocessing : https://scikit-learn.org/stable/modules/preprocessing.html
#metrics : https://scikit-learn.org/stable/modules/classes.html#regression-metrics

#to import basically do from sklearn.metrics import "name of method"

#preprocessing helps us automatically scale the data without having to do it line by line

In [426]:
from sklearn import metrics
print("MAE:", "$"+format(metrics.mean_absolute_error(data["price"],data["prediction"]), ",.0f") )

#on average the model is off by 85,397

MAE: $85,397


In [429]:
RMSE = MSE ** (1/2)
print("RMSE:", "$"+format(RMSE, ",.0f"))

RMSE: $116,594


In [430]:
#feature engineering :: creating new columns out of old ones 

##ex log-transforming numeric data to make it follow a bell-shaped distribution
##ex taking the ratio of two different columns
## transforming low value count categorical values into something useful
##gathering the time stamp from the time data

bikes.head()

Unnamed: 0_level_0,holiday,workingday,weather,temp,atemp,humidity,windspeed,count,season_Spring,season_Summer,season_Winter,weather_standard,temp_standard,atemp_standard,humidity_standard,windspeed_standard
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2011-01-01 00:00:00,0,0,4,9.84,14.395,81,0.0,16,1,0,0,0.660148,-1.333599,-1.092687,0.993167,-1.567682
2011-01-01 01:00:00,0,0,4,9.02,13.635,80,0.0,40,1,0,0,0.660148,-1.438841,-1.182367,0.941206,-1.567682
2011-01-01 02:00:00,0,0,4,9.02,13.635,80,0.0,32,1,0,0,0.660148,-1.438841,-1.182367,0.941206,-1.567682
2011-01-01 03:00:00,0,0,4,9.84,14.395,75,0.0,13,1,0,0,0.660148,-1.333599,-1.092687,0.681399,-1.567682
2011-01-01 04:00:00,0,0,4,9.84,14.395,75,0.0,1,1,0,0,0.660148,-1.333599,-1.092687,0.681399,-1.567682


In [432]:
bikes.index #note that this is encoded as a timestamp in pandas


DatetimeIndex(['2011-01-01 00:00:00', '2011-01-01 01:00:00',
               '2011-01-01 02:00:00', '2011-01-01 03:00:00',
               '2011-01-01 04:00:00', '2011-01-01 05:00:00',
               '2011-01-01 06:00:00', '2011-01-01 07:00:00',
               '2011-01-01 08:00:00', '2011-01-01 09:00:00',
               ...
               '2012-12-19 14:00:00', '2012-12-19 15:00:00',
               '2012-12-19 16:00:00', '2012-12-19 17:00:00',
               '2012-12-19 18:00:00', '2012-12-19 19:00:00',
               '2012-12-19 20:00:00', '2012-12-19 21:00:00',
               '2012-12-19 22:00:00', '2012-12-19 23:00:00'],
              dtype='datetime64[ns]', name='datetime', length=10886, freq=None)

In [433]:
bikes.index.hour #one way of feature engineering could be to change the units of time

Int64Index([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,
            ...
            14, 15, 16, 17, 18, 19, 20, 21, 22, 23],
           dtype='int64', name='datetime', length=10886)

In [434]:
bikes["hour"] = bikes.index.hour

In [436]:
bikes.head(100)

Unnamed: 0_level_0,holiday,workingday,weather,temp,atemp,humidity,windspeed,count,season_Spring,season_Summer,season_Winter,weather_standard,temp_standard,atemp_standard,humidity_standard,windspeed_standard,hour
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2011-01-01 00:00:00,0,0,4,9.84,14.395,81,0.0000,16,1,0,0,0.660148,-1.333599,-1.092687,0.993167,-1.567682,0
2011-01-01 01:00:00,0,0,4,9.02,13.635,80,0.0000,40,1,0,0,0.660148,-1.438841,-1.182367,0.941206,-1.567682,1
2011-01-01 02:00:00,0,0,4,9.02,13.635,80,0.0000,32,1,0,0,0.660148,-1.438841,-1.182367,0.941206,-1.567682,2
2011-01-01 03:00:00,0,0,4,9.84,14.395,75,0.0000,13,1,0,0,0.660148,-1.333599,-1.092687,0.681399,-1.567682,3
2011-01-01 04:00:00,0,0,4,9.84,14.395,75,0.0000,1,1,0,0,0.660148,-1.333599,-1.092687,0.681399,-1.567682,4
2011-01-01 05:00:00,0,0,3,9.84,12.880,75,6.0032,1,1,0,0,-0.917541,-1.333599,-1.271456,0.681399,-0.832404,5
2011-01-01 06:00:00,0,0,4,9.02,13.635,80,0.0000,2,1,0,0,0.660148,-1.438841,-1.182367,0.941206,-1.567682,6
2011-01-01 07:00:00,0,0,4,8.20,12.880,86,0.0000,3,1,0,0,0.660148,-1.544083,-1.271456,1.252975,-1.567682,7
2011-01-01 08:00:00,0,0,4,9.84,14.395,75,0.0000,8,1,0,0,0.660148,-1.333599,-1.092687,0.681399,-1.567682,8
2011-01-01 09:00:00,0,0,4,13.12,17.425,76,0.0000,14,1,0,0,0.660148,-0.912633,-0.735148,0.733360,-1.567682,9


In [None]:
#note that if the date-time is not the default index of the dataframe, we would do the following::

#bikes["datetime"].dt.hour, note we must call .dt in order for this to work 

#we could use similar notation to get seconds, quarters, year, etc...