Load data
---

In [None]:
import pandas as pd

# Read the bike-sharing dataset from the CSV file in the working directory
data_df = pd.read_csv('bike-sharing.csv')

# Extract the two columns used in the rest of the notebook as NumPy arrays
temp = data_df['temp'].values
users = data_df['users'].values

# Preview the first five rows (displayed as the cell's output)
data_df.head()

1st scenario: perfect collinearity
---

In [None]:
# Build a second feature that is an exact affine function of temp, so both
# columns carry the same information (perfect collinearity).
# NOTE(review): the name suggests this rescales temp to degrees Celsius - confirm
temp_C = temp * 47 - 8

In [None]:
import numpy as np

# Input matrix X: one column for temp, one for temp_C
X = np.c_[temp, temp_C]

# Design matrix X1: X with a constant column of ones prepended
X1 = np.c_[np.ones(len(X)), X]

# Numerical rank of the design matrix
rank = np.linalg.matrix_rank(X1)
print('Rank', rank)

In [None]:
from scipy.linalg import lstsq

# Solve the least-squares problem min ||X1 w - users||^2.
# lstsq returns (solution, residues, effective rank, singular values).
w, rss, rank, _ = lstsq(X1, users)

# scipy only reports the residual sum of squares when X1 has full column
# rank; for a rank-deficient matrix (the perfectly collinear case) it returns
# an empty array. Fall back to computing the RSS from the fitted values so
# the printed number is meaningful.
if np.size(rss) == 0:
    rss = np.sum(np.square(users - np.matmul(X1, w)))

print('rank:', rank)
print('RSS:', rss)

In [None]:
# Show the coefficients found by lstsq. When the cells are run in order, X1's
# columns are [ones, temp, temp_C], so w holds one weight per column.
print('w:', w)

In [None]:
from sklearn.metrics import r2_score

# Baseline: R^2 of a degree-1 polynomial (simple linear regression) on temp
coefs = np.polyfit(temp, users, 1)
y_pred_normal = np.polyval(coefs, temp)
r2_normal = r2_score(y_true=users, y_pred=y_pred_normal)
print('R^2 normal:', r2_normal)

# Same metric for the lstsq solution fitted on the collinear design matrix
y_pred_collinear = X1 @ w
r2_collinear = r2_score(y_true=users, y_pred=y_pred_collinear)
print('R^2 collinear:', r2_collinear)

2nd scenario: nearly collinear features
---

In [None]:
# Convert degrees Celsius to degrees Fahrenheit
temp_F = temp_C * 1.8 + 32

In [None]:
# Fahrenheit version of the temperature feature
temp_F = temp_C * 1.8 + 32

# Perturb it with small Gaussian noise (mean 0, std 0.01) so that temp_F is
# almost, but not exactly, a linear function of temp_C
noise = np.random.normal(0, 0.01, temp_F.shape)
temp_F = temp_F + noise

# Input matrix with the two nearly collinear columns
X = np.c_[temp_C, temp_F]

# Prepend the column of ones and solve the least-squares problem
X1 = np.c_[np.ones(len(X)), X]
w, rss, rank, _ = lstsq(X1, users)

# RMSE and coefficients change from run to run with the random noise
print('rank:', rank)
print('RMSE:', np.sqrt(rss / users.shape[0]))
print('w:', w)

In [None]:
# run: rank, RMSE, coefficients
# 1: 3, 233.311949333, [-15888.54114451   -870.94761598    501.53502237]
# 2: 3, 233.358483738, [ 6360.7676213    380.60388517  -193.77200641]
# 3: 3, 233.204776769, [-30144.45548463  -1672.99380321    947.06415472]
# ..

In [None]:
# 2-norm condition number of the design matrix (ratio of its largest to its
# smallest singular value); the value changes with the random noise
cn = np.linalg.cond(X1, p=2)
print('Condition number:', cn)

In [None]:
# Predictions of the model fitted on the nearly collinear design matrix
y_pred_nearcol = X1 @ w

# R^2 of that model; expected to come out around 0.59
r2_nearcol = r2_score(y_true=users, y_pred=y_pred_nearcol)
print('R^2 nearly collinear:', r2_nearcol)

In [None]:
from sklearn.linear_model import Ridge

# Draw fresh Gaussian noise (mean 0, std 0.01) and rebuild the nearly
# collinear Fahrenheit feature from temp_C
noise = np.random.normal(0, 0.01, temp_F.shape)
temp_F = 1.8*temp_C + 32 + noise

# Input matrix with the two nearly collinear columns (no column of ones here)
X = np.c_[temp_C, temp_F]

# Ridge regression with regularization strength alpha=100;
# fit() returns the fitted estimator, so construction and fitting are chained
ridge = Ridge(alpha=100).fit(X, users)

print('Coefficients:', ridge.coef_)
print('Intercept:', ridge.intercept_)
print('R^2:', ridge.score(X, users))

In [None]:
# run: coefficients, intercept, R^2
# 1: [  7.60970645  13.43336788], -269.389715324, 0.595405394009
# 2: [  7.2575577   13.63021335], -275.733836067, 0.595460937325
# 3: [  7.72013938  13.37129656], -267.372846183, 0.595388255988
# ..