In [20]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm

%matplotlib inline
plt.style.use('fivethirtyeight')

df = pd.read_csv('../data/bikeshare.csv',
                 index_col = 'datetime',
                 parse_dates= True)


In [21]:
df.head()

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,count
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2011-01-01 00:00:00,Spring,0,0,Clear Skies,9.84,14.395,81,0.0,16
2011-01-01 01:00:00,Spring,0,0,Clear Skies,9.02,13.635,80,0.0,40
2011-01-01 02:00:00,Spring,0,0,Clear Skies,9.02,13.635,80,0.0,32
2011-01-01 03:00:00,Spring,0,0,Clear Skies,9.84,14.395,75,0.0,13
2011-01-01 04:00:00,Spring,0,0,Clear Skies,9.84,14.395,75,0.0,1


In [22]:
weather_map = {'Clear Skies':4,
               'Partly Cloudy':3,
               'Light Storms/Rain':2,
               'Heavy Storms/Rain':1}

df['weather'] = df['weather'].map(weather_map)

In [23]:
pd.get_dummies(df['season'],
               drop_first = True).head()

Unnamed: 0_level_0,Spring,Summer,Winter
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2011-01-01 00:00:00,1,0,0
2011-01-01 01:00:00,1,0,0
2011-01-01 02:00:00,1,0,0
2011-01-01 03:00:00,1,0,0
2011-01-01 04:00:00,1,0,0


In [24]:
df = pd.get_dummies(df,
                    drop_first = True)
df.head()

Unnamed: 0_level_0,holiday,workingday,weather,temp,atemp,humidity,windspeed,count,season_Spring,season_Summer,season_Winter
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2011-01-01 00:00:00,0,0,4,9.84,14.395,81,0.0,16,1,0,0
2011-01-01 01:00:00,0,0,4,9.02,13.635,80,0.0,40,1,0,0
2011-01-01 02:00:00,0,0,4,9.02,13.635,80,0.0,32,1,0,0
2011-01-01 03:00:00,0,0,4,9.84,14.395,75,0.0,13,1,0,0
2011-01-01 04:00:00,0,0,4,9.84,14.395,75,0.0,1,1,0,0


In [25]:
df['temp'] = (df['temp'] - df['temp'].mean()) / df['temp'].std()
df['atemp'] = (df['atemp'] - df['atemp'].mean()) / df['atemp'].std()
df['humidity'] = (df['humidity'] - df['humidity'].mean()) / df['humidity'].std()
df['windspeed'] = (df['windspeed'] - df['windspeed'].mean()) / df['windspeed'].std()

In [26]:
features_some = ['temp',
                 'atemp',
                 'weather',
                 'humidity']

In [27]:
features_all = ['holiday',
                'workingday',
                'weather',
                'temp',
                'atemp',
                'humidity',
                'windspeed',
                'season_Spring',
                'season_Summer',
                'season_Winter']

In [28]:
X_some = df[features_some]
X_all = df[features_all]
y = df['count']

In [29]:
lr = LinearRegression()

In [30]:
lr.fit(X_some, y)
print(lr.score(X_some, y))
print(list(zip(features_some,lr.coef_)))

0.24307517355292274
[('temp', 23.167142726257005), ('atemp', 45.5475191985618), ('weather', -4.591984143850331), ('humidity', -55.18486553237941)]


In [31]:
lr.fit(X_all, y)
print(lr.score(X_all, y))
print(list(zip(features_all, lr.coef_)))

0.2752503232099429
[('holiday', -8.630472341320143), ('workingday', -2.861549482797657), ('weather', -2.6516939362713394), ('temp', 62.03831630929894), ('atemp', 24.652283817352092), ('humidity', -54.257685839318555), ('windspeed', 4.420026574191246), ('season_Spring', 37.770716087580034), ('season_Summer', 34.97948628938873), ('season_Winter', 102.9792029705163)]


In [32]:
# alt, quick implementation:
#############################
X = df.loc[:, df.columns != 'count']
y = df['count']

# weather_map = {'Clear Skies':4,
#                'Partly Cloudy':3,
#                'Light Storms/Rain':2,
#                'Heavy Storms/Rain':1}

# df['weather'] = df['weather'].map(weather_map)

num_cols = X.select_dtypes(include=np.number).columns.tolist()
X_std = (X[num_cols] - X[num_cols].mean()) / X[num_cols].std()

In [33]:
lr.fit(X,y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [34]:
coeffs = pd.DataFrame({
    'Vars': X_std.columns,
    'Weight': lr.coef_
}).sort_values(by='Weight', ascending=False)
coeffs

Unnamed: 0,Vars,Weight
9,season_Winter,102.979203
3,temp,62.038316
7,season_Spring,37.770716
8,season_Summer,34.979486
4,atemp,24.652284
6,windspeed,4.420027
2,weather,-2.651694
1,workingday,-2.861549
0,holiday,-8.630472
5,humidity,-54.257686


In [35]:
mod = sm.OLS(y, X)
results = mod.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                  count   R-squared:                       0.651
Model:                            OLS   Adj. R-squared:                  0.651
Method:                 Least Squares   F-statistic:                     2030.
Date:                Wed, 31 Jul 2019   Prob (F-statistic):               0.00
Time:                        21:05:27   Log-Likelihood:                -70401.
No. Observations:               10886   AIC:                         1.408e+05
Df Residuals:                   10876   BIC:                         1.409e+05
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
holiday           2.0266      9.246      0.219

In [37]:
df['hour'] = df.index.hour

In [38]:
df

Unnamed: 0_level_0,holiday,workingday,weather,temp,atemp,humidity,windspeed,count,season_Spring,season_Summer,season_Winter,hour
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2011-01-01 00:00:00,0,0,4,-1.333599,-1.092687,0.993167,-1.567682,16,1,0,0,0
2011-01-01 01:00:00,0,0,4,-1.438841,-1.182367,0.941206,-1.567682,40,1,0,0,1
2011-01-01 02:00:00,0,0,4,-1.438841,-1.182367,0.941206,-1.567682,32,1,0,0,2
2011-01-01 03:00:00,0,0,4,-1.333599,-1.092687,0.681399,-1.567682,13,1,0,0,3
2011-01-01 04:00:00,0,0,4,-1.333599,-1.092687,0.681399,-1.567682,1,1,0,0,4
2011-01-01 05:00:00,0,0,3,-1.333599,-1.271456,0.681399,-0.832404,1,1,0,0,5
2011-01-01 06:00:00,0,0,4,-1.438841,-1.182367,0.941206,-1.567682,2,1,0,0,6
2011-01-01 07:00:00,0,0,4,-1.544083,-1.271456,1.252975,-1.567682,3,1,0,0,7
2011-01-01 08:00:00,0,0,4,-1.333599,-1.092687,0.681399,-1.567682,8,1,0,0,8
2011-01-01 09:00:00,0,0,4,-0.912633,-0.735148,0.733360,-1.567682,14,1,0,0,9


In [None]:
prices = pd.read_csv('../data/sac_re_cat.csv',
                     index_col = 'sale_date')

In [None]:
prices.head()

In [None]:
prices.info()

In [None]:
prices['zip'] = prices['zip'].astype('category')

In [None]:
transformed = prices.groupby('zip')['zip'].transform('count')

In [None]:
prices['zip'] = np.where(transformed < 25,
                         'Other',
                         prices['zip'])

In [None]:
prices['zip'].value_counts()

In [None]:
prices.loc[prices['sq__ft'] < 0, 'sq__ft'] = prices.loc[prices['sq__ft'] < 0, 'sq__ft'] * -1

In [None]:
prices.loc[prices['price'] < 0, 'price'] = prices.loc[prices['price'] < 0, 'price'] * - 1

In [None]:
prices = prices[(prices['sq__ft'] > 0)
               & (prices['baths'] > 0)
               & (prices['beds'] > 0)]

In [None]:
prices.head()

In [None]:
transformed_city = prices.groupby('city')['city'].transform('count')

In [None]:
prices['city'] = np.where(transformed_city < 25,
                         'OtherC',
                          prices['city'])

In [None]:
prices['city'].value_counts()

In [None]:
prices['state'].value_counts()

In [None]:
pd.get_dummies(prices.drop(['state','street'], axis = 1),
               drop_first = True)

In [None]:
prices = pd.get_dummies(prices.drop(['state','street'], axis = 1),
                        drop_first = True)
prices.head()

In [None]:
X_p = prices.loc[:, prices.columns != 'price']
y_p = prices['price']


num_cols = X_p.select_dtypes(include=np.number).columns.tolist()
X_p_std = (X_p[num_cols] - X_p[num_cols].mean()) / X_p[num_cols].std()

In [None]:
lr.fit(X_p, y_p)

In [None]:
lr.score(X_p, y_p)

In [None]:
mod = sm.OLS(y_p, X_p)
results_p = mod.fit()
print(results_p.summary())

In [None]:
prices['prediction']  = lr.predict(X_p)

In [None]:
mae = np.mean(np.abs(prices['price'] - prices['prediction']))

In [None]:
mae

In [None]:
mse = np.mean((prices['price'] - prices['prediction'])**2)

In [None]:
mse

In [None]:
print('MAE:', metrics.mean_absolute_error(prices['price'], prices['prediction']))
print('MSE:', metrics.mean_squared_error(prices['price'], prices['prediction']))