In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## One-hot encoding

In [2]:
# toy categorical vector: six observations taking three distinct browser names
v = np.array([
    'Chrome', 'Chrome', 'Firefox',
    'Chrome', 'Firefox', 'Safari',
])
v

array(['Chrome', 'Chrome', 'Firefox', 'Chrome', 'Firefox', 'Safari'],
      dtype='<U7')

In [3]:
# distinct values of v; np.unique returns them sorted, and that order fixes the column order used below
labels = np.unique(v)
labels

array(['Chrome', 'Firefox', 'Safari'], dtype='<U7')

In [4]:
# hand-written mapping from each label to its integer code
dic_labels = dict(Chrome=0, Firefox=1, Safari=2)
dic_labels

{'Chrome': 0, 'Firefox': 1, 'Safari': 2}

In [5]:
# look up the integer code assigned to a single label
dic_labels['Chrome']

0

In [6]:
# using dictionary comprehension: each label is numbered by its position in `labels`
dic_labels = {label: code for code, label in enumerate(labels)}
dic_labels

{'Chrome': 0, 'Firefox': 1, 'Safari': 2}

In [7]:
# ordinal encoding: replace every entry of v by its integer code
ord_labels = np.array([dic_labels[label] for label in v])
ord_labels

array([0, 0, 1, 0, 1, 2])

In [8]:
# one-hot encoding: one row per observation, one column per label;
# row i gets a single 1 in the column given by ord_labels[i]
V = np.zeros(shape=(v.size, labels.size))
V[np.arange(v.size), ord_labels] = 1.0
V

array([[1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [9]:
def one_hot_encoding(v):
    """One-hot encode a 1-D vector of categorical values.

    Parameters
    ----------
    v : array-like of shape (m,)
        Categorical observations (strings, integers, ...). Anything that
        ``np.asarray`` turns into a 1-D array is accepted, including an
        empty sequence.

    Returns
    -------
    V : numpy.ndarray of shape (m, k), dtype float
        ``k`` is the number of distinct values in ``v``. Column ``j``
        corresponds to the ``j``-th entry of ``np.unique(v)`` (sorted order);
        ``V[i, j]`` is 1 when ``v[i]`` equals that label and 0 otherwise.

    Raises
    ------
    ValueError
        If ``v`` is not one-dimensional.
    """
    v = np.asarray(v)
    if v.ndim != 1:
        raise ValueError(f"one_hot_encoding expects a 1-D vector, got an array with shape {v.shape}")
    # labels + ordinal encoding in one call: ord_labels[i] is the position of v[i] in labels.
    # The inverse is always an integer array, so an empty input is indexed correctly too.
    labels, ord_labels = np.unique(v, return_inverse=True)
    # one-hot encoding: a single 1 per row, in the column given by the ordinal code
    V = np.zeros((v.size, labels.size))
    V[np.arange(v.size), ord_labels] = 1
    return V

In [10]:
# check that our function works: it must reproduce the matrix V built step by step above
assert np.array_equal(one_hot_encoding(v), V)
one_hot_encoding(v)

array([[1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

## Fremont bridge traffic dataset

In [11]:
# load Fremont traffic dataset
# 'Fremont.csv' is resolved relative to the current working directory;
# the 'Date' column becomes the index and is parsed as dates
data = pd.read_csv('Fremont.csv',index_col='Date',parse_dates=True)
data

FileNotFoundError: [Errno 2] No such file or directory: 'Fremont.csv'

The columns are:

- traffic: number of bikes that cross the Fremont bridge (Seattle)
- day_of_week: 0 (Monday), 1 (Tuesday), ..., 6 (Sunday)
- month: 1 (Jan), 2 (Feb), 3 (Mar), ..., 12 (Dec)
- year: calendar year of the observation
- covid: 1 (pandemic), 0 (no pandemic)
- holiday: 1 (holiday), 0 (no holiday)
- hours_daylight: hours of daylight
- TAVG: Average temperature
- PRCP: Precipitation
- AWND: Average wind speed
- SNOW: Snowfall

In [None]:
# time series of the target; explicit axes so the figure can be labelled and stands alone
fig, ax = plt.subplots(figsize=(20,10))
data['traffic'].plot(ax=ax)
ax.set_ylabel('traffic (number of bikes)')
ax.set_title('Fremont bridge bike traffic');

In [None]:
# target vector: the traffic column as a 1-D NumPy array
y = data['traffic'].to_numpy()

In [None]:
# numerical features: columns that enter the feature matrix as they are, without encoding
X = data.loc[
    :,
    ['year', 'covid', 'holiday', 'hours_daylight', 'TAVG', 'PRCP', 'AWND', 'SNOW'],
].to_numpy()
X.shape

In [None]:
# day of the week: append one indicator column per distinct day_of_week value
# NOTE: X is rebuilt from itself, so running this cell twice appends the columns twice
X = np.hstack((X, one_hot_encoding(data['day_of_week'].to_numpy())))

In [None]:
# columns so far: the numerical features plus one indicator column per distinct day_of_week value
X.shape

In [None]:
# month: append one indicator column per distinct month value
# NOTE: X is rebuilt from itself, so running this cell twice appends the columns twice
X = np.hstack((X, one_hot_encoding(data['month'].to_numpy())))

In [None]:
# columns so far: numerical features, day_of_week indicators and month indicators
X.shape

In [None]:
# polynomial features 
def build_poly_features(X,degree):
    """Build the matrix of all monomials of the columns of X up to a total degree.

    Parameters
    ----------
    X : array-like of shape (m, n) or (m,)
        Data matrix with one row per datapoint and one column per feature.
        A 1-D input is treated as a single feature (n = 1). Nested lists are
        accepted.
    degree : int
        Maximum total degree of the monomials.

    Returns
    -------
    X_poly : numpy.ndarray of shape (m, n_poly), dtype float
        One column per monomial, ordered by increasing degree and, within a
        degree, in the order produced by
        ``itertools.combinations_with_replacement(range(n), d)``.
        For ``degree >= 0`` the first column is the empty product, i.e. all ones.

    Raises
    ------
    ValueError
        If X is neither 1-D nor 2-D.
    """
    from itertools import chain
    from itertools import combinations_with_replacement as comb_w_r

    # number of datapoints (rows), number of features (columns)
    X = np.asarray(X)
    if X.ndim == 1:
        X = X.reshape(-1, 1)  # a vector is a single feature column
    elif X.ndim != 2:
        raise ValueError(f"X must be 1-D or 2-D, got an array with shape {X.shape}")
    m, n = X.shape

    # every multiset of column indices of size 0..degree is one monomial;
    # materialised once so it can be both counted and iterated
    combinations = list(chain.from_iterable(comb_w_r(range(n), d) for d in range(degree + 1)))

    # polynomial features matrix
    X_poly = np.ones((m, len(combinations)))
    for column_index, combination in enumerate(combinations):
        # product of the selected columns; the empty selection gives the all-ones column
        X_poly[:, column_index] = np.prod(X[:, list(combination)], axis=1)

    return X_poly



In [None]:
# no polynomial features: degree=1 keeps the columns of X unchanged
X_poly = build_poly_features(X,degree=1) # it will add the column of all-ones (intercept term)

In [None]:
# one column more than X: the all-ones column added by build_poly_features
X_poly.shape

In [None]:
# least squares problem: lstsq returns several values; only the first one, the solution, is kept
theta = np.linalg.lstsq(a=X_poly, b=y, rcond=None)[0]
theta.shape

In [None]:
# prediction: fitted values of the linear model, stored next to the actual traffic
data['prediction'] = X_poly @ theta

In [None]:
# plot actual traffic and the predictions over the same date axis
data.loc[:, ['traffic', 'prediction']].plot(alpha=0.6, figsize=(15, 7))

In [None]:
# plot actual against predicted; a point on the dashed red diagonal is predicted exactly
plt.scatter(x=data['prediction'], y=data['traffic'])
plt.xlabel('predicted traffic')
plt.ylabel('actual traffic')
plt.plot([0, 12000], [0, 12000], linestyle='--', color='red')

In [None]:
# with polynomial features: degree=2 adds the squares and pairwise products of the columns of X
X_poly = build_poly_features(X,degree=2)
X_poly.shape

In [None]:
# least squares problem: lstsq returns several values; only the first one, the solution, is kept
theta = np.linalg.lstsq(a=X_poly, b=y, rcond=None)[0]
theta.shape

In [None]:
# prediction: fitted values of the degree-2 model, overwriting the earlier prediction column
data['prediction'] = X_poly @ theta

In [None]:
# plot actual traffic and the predictions over the same date axis
data.loc[:, ['traffic', 'prediction']].plot(alpha=0.6, figsize=(15, 7))

In [None]:
# plot actual against predicted; a point on the dashed red diagonal is predicted exactly
plt.scatter(x=data['prediction'], y=data['traffic'])
plt.xlabel('predicted traffic')
plt.ylabel('actual traffic')
plt.plot([0, 12000], [0, 12000], linestyle='--', color='red')