In [193]:
# reprinted for educational purposes with minor tweaking from Alice Zheng's "Mastering Feature Engineering"

In [194]:
# Imports for the linear-regression examples

import numpy as np
import pandas as pd
from sklearn import linear_model

In [195]:
# Toy data set: three rental listings for each of SF, NYC and Seattle
city_rents = [
    ('SF', (3999, 4000, 4001)),
    ('NYC', (3499, 3500, 3501)),
    ('Seattle', (2499, 2500, 2501)),
]
# Flatten to one (City, Rent) record per listing, keeping the city order above
df = pd.DataFrame(
    [(city, rent) for city, rents in city_rents for rent in rents],
    columns=['City', 'Rent'],
)

In [196]:
# Display the raw frame: one row per listing, three listings per city
df

Unnamed: 0,City,Rent
0,SF,3999
1,SF,4000
2,SF,4001
3,NYC,3499
4,NYC,3500
5,NYC,3501
6,Seattle,2499
7,Seattle,2500
8,Seattle,2501


In [197]:
# The global mean rent equals the intercept of the full one-hot (dummy) model fitted below
df['Rent'].mean()

3333.3333333333335

In [198]:
# One-hot encode the categorical City column: one indicator column per city,
# named with the 'city' prefix; the numeric Rent column is passed through as-is
df1 = pd.get_dummies(df, prefix=['city'])
df1

Unnamed: 0,Rent,city_NYC,city_SF,city_Seattle
0,3999,0,1,0
1,4000,0,1,0
2,4001,0,1,0
3,3499,1,0,0
4,3500,1,0,0
5,3501,1,0,0
6,2499,0,0,1
7,2500,0,0,1
8,2501,0,0,1


In [200]:
# Fit a linear regression on the full one-hot encoding (all three city dummies)
onehot_cols = ['city_NYC', 'city_SF', 'city_Seattle']
model = linear_model.LinearRegression()
model.fit(df1[onehot_cols], df1['Rent'])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [202]:
# Coefficients follow the column order passed to fit: NYC, SF, Seattle
model.coef_

array([ 166.66666667,  666.66666667, -833.33333333])

In [204]:
# New York is coef_[0]; adding its coefficient to the intercept
# (the global average) yields NYC's average rental rate
model.coef_[0] + model.intercept_

3500.0000000000005

In [207]:
# How do we obtain an intercept that already equals NYC's average rent?
# Drop the first dummy column (city_NYC) so that NYC becomes the baseline.
df_nyc = pd.get_dummies(df, prefix=['city'], drop_first=True)
df_nyc

Unnamed: 0,Rent,city_SF,city_Seattle
0,3999,1,0
1,4000,1,0
2,4001,1,0
3,3499,0,0
4,3500,0,0
5,3501,0,0
6,2499,0,1
7,2500,0,1
8,2501,0,1


In [208]:
# Refit the same model object on the NYC-baseline encoding (this replaces the earlier fit)
baseline_cols = ['city_SF', 'city_Seattle']
model.fit(df_nyc[baseline_cols], df_nyc['Rent'])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [209]:
# Note how the coefficients are now offsets from the NYC baseline
# (the intercept, 3500): SF is +500 and Seattle is -1000
model.coef_

array([  500., -1000.])

In [212]:
# With NYC as the baseline, the intercept is NYC's average rent directly
model.intercept_

3500.0

In [213]:
# The overall global mean is unchanged by the choice of encoding
df_nyc['Rent'].mean()

3333.3333333333335

#### Effect coding for the original three cities, then again after adding Miami

In [216]:
# Work on a copy so the one-hot frame df1 is left untouched
effect_df_1 = df1.copy()
effect_df_1 # full dummies

Unnamed: 0,Rent,city_NYC,city_SF,city_Seattle
0,3999,0,1,0
1,4000,0,1,0
2,4001,0,1,0
3,3499,1,0,0
4,3500,1,0,0
5,3501,1,0,0
6,2499,0,0,1
7,2500,0,0,1
8,2501,0,0,1


In [221]:
# Effect coding with NYC as the reference city: the NYC rows get -1.0 in every
# non-NYC column (city_NYC itself is not used in the fit that follows).
effect_cols = ['city_SF', 'city_Seattle']
# Cast first so the indicator columns can hold -1.0 whatever dtype get_dummies produced.
effect_df_1[effect_cols] = effect_df_1[effect_cols].astype(float)
# .ix has been removed from pandas; .loc slices by label and, like .ix did here,
# includes both end points (rows 3, 4 and 5 are the NYC listings).
effect_df_1.loc[3:5, effect_cols] = -1.0

In [222]:
# Inspect the result: the NYC rows (3-5) now hold -1.0 in city_SF and city_Seattle
effect_df_1

Unnamed: 0,Rent,city_NYC,city_SF,city_Seattle
0,3999,0,1.0,0.0
1,4000,0,1.0,0.0
2,4001,0,1.0,0.0
3,3499,1,-1.0,-1.0
4,3500,1,-1.0,-1.0
5,3501,1,-1.0,-1.0
6,2499,0,0.0,1.0
7,2500,0,0.0,1.0
8,2501,0,0.0,1.0


In [226]:
 # Fit on the effect-coded frame effect_df_1 (the original referenced an undefined
 # name, effect_df). city_NYC is left out because NYC is the -1 reference city.
 model.fit(effect_df_1[['city_SF', 'city_Seattle']], effect_df_1['Rent'])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [229]:
# Now the coefficients (SF, Seattle) are offsets from the global average
model.coef_

array([ 666.66666667, -833.33333333])

In [230]:
# NYC's average rent is the intercept minus all the other cities' coefficients,
# because the NYC rows are coded -1 in every column
model.intercept_  - model.coef_[1] - model.coef_[0] # NYC's mean (not the global mean)

3500.0

### What if we add a fourth city?

In [234]:
# Same three cities as before plus Miami, which has only two (very different) listings
city_rents_m = [
    ('SF', (3999, 4000, 4001)),
    ('NYC', (3499, 3500, 3501)),
    ('Seattle', (2499, 2500, 2501)),
    ('Miami', (5000, 1800)),
]
# Flatten to one (City, Rent) record per listing, keeping the city order above
df_m = pd.DataFrame(
    [(city, rent) for city, rents in city_rents_m for rent in rents],
    columns=['City', 'Rent'],
)

In [235]:
# drop_first=True removes the first dummy column, city_Miami, so Miami becomes the
# reference city. Dropping the Miami column does not affect the final results
# below - try it and see.
# NOTE: this cell overwrites df_m, so re-run the cell above before re-running it.

df_m = pd.get_dummies(df_m, prefix=['city'], drop_first = True)
df_m

Unnamed: 0,Rent,city_NYC,city_SF,city_Seattle
0,3999,0,1,0
1,4000,0,1,0
2,4001,0,1,0
3,3499,1,0,0
4,3500,1,0,0
5,3501,1,0,0
6,2499,0,0,1
7,2500,0,0,1
8,2501,0,0,1
9,5000,0,0,0


In [236]:
# Copy so that the effect coding applied next does not modify df_m
effect_df_2 = df_m.copy()

In [238]:
# Miami is the reference city here: locate its rows (labels 9 and 10) and set
# every city column in those rows to -1.0.
miami_effect_cols = ['city_SF', 'city_Seattle', 'city_NYC']
# Cast first so the indicator columns can hold -1.0 whatever dtype get_dummies produced.
effect_df_2[miami_effect_cols] = effect_df_2[miami_effect_cols].astype(float)
# .ix has been removed from pandas; .loc slices by label and includes both end points.
effect_df_2.loc[9:10, miami_effect_cols] = -1.0
effect_df_2

Unnamed: 0,Rent,city_NYC,city_SF,city_Seattle
0,3999,0.0,1.0,0.0
1,4000,0.0,1.0,0.0
2,4001,0.0,1.0,0.0
3,3499,1.0,0.0,0.0
4,3500,1.0,0.0,0.0
5,3501,1.0,0.0,0.0
6,2499,0.0,0.0,1.0
7,2500,0.0,0.0,1.0
8,2501,0.0,0.0,1.0
9,5000,-1.0,-1.0,-1.0


In [239]:
# Refit the same model object on the effect-coded frame that includes Miami
city_cols = ['city_NYC', 'city_SF', 'city_Seattle']
model.fit(effect_df_2[city_cols], effect_df_2['Rent'])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [240]:
# The new coefficients (NYC, SF, Seattle) are offsets from the new intercept (3350),
# which now reflects Miami as well
model.coef_

array([ 150.,  650., -850.])

In [246]:
# The intercept is NO LONGER the overall mean rent: under effect coding it is the
# unweighted mean of the four city means, (4000 + 3500 + 2500 + 3400) / 4 = 3350
model.intercept_

3350.0

In [248]:
# Miami's average rent (verified below): the intercept minus every other city's
# coefficient, because the Miami rows are coded -1 in every column
model.intercept_  - model.coef_[2] - model.coef_[1] - model.coef_[0] # Miami's mean (not the global mean)

3400.0

In [249]:
# Miami's average rent verified manually from its two rows (labels 9 and 10).
# .loc replaces the removed .ix indexer and, like it, includes both end labels.
np.mean(df_m.loc[9:10, 'Rent'])

3400.0

In [244]:
# NYC's average rent: the intercept plus the first coefficient (city_NYC)
model.intercept_ + model.coef_[0]

3500.0

In [254]:
# Note that with unequal group sizes (Miami has two rows, the other cities three)
# the overall mean rent no longer equals the effect-coded intercept (3350)
# shown four cells above
effect_df_2['Rent'].mean()

3345.4545454545455