In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model
import seaborn as sns
import datetime
from group_lasso import GroupLasso

In [2]:
def standardize(X,y):
    """Standardize the design matrix and center the response.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix. Each column is shifted to mean 0 and scaled to std 1.
    y : array-like
        Response. Only its mean is removed; it is not rescaled.

    Returns
    -------
    X_scaled, y_scaled
        The standardized features and the centered response.

    Notes
    -----
    A column with zero standard deviation (a constant feature) would give
    0/0 = NaN under a plain division. Such columns are divided by 1 instead,
    so they come out as all zeros rather than NaN.
    """
    col_std = np.std(X,axis=0)
    # Guard against constant columns: dividing by 1 leaves the centered zeros untouched.
    safe_std = np.where(col_std == 0, 1.0, col_std)
    X_scaled = (X-np.mean(X,axis=0))/safe_std
    y_scaled = y-np.mean(y)
    return X_scaled, y_scaled


def plot_coefficients(beta,alpha,alpha_lim=10,name=None):
    """Plot regression coefficients against the regularization strength.

    Parameters
    ----------
    beta : array of shape (n_features, n_alpha)
        Coefficient paths; column j holds the coefficients fitted at alpha[j].
    alpha : array of shape (n_alpha,)
        Regularization values used as the x-axis.
    alpha_lim : float, default 10
        Position of the dashed vertical marker line.
    name : str, optional
        Figure title.
    """
    plt.figure()
    # One curve per coefficient (rows of beta become separate lines).
    plt.plot(alpha,beta.T,'-')
    # Dashed vertical marker at alpha_lim, spanning the full range of the coefficients.
    plt.plot(alpha_lim*np.array([1,1]), [np.min(beta), np.max(beta)], 'k--')
    plt.xlabel(r'$\lambda$')
    plt.ylabel(r'$\beta$')
    plt.title(name)
    plt.show()
    

In [None]:
# Read the CSV, parse the 'date' column into timestamps and use it as the index.
energy = pd.read_csv('energydata_complete.csv')
energy['date'] = pd.to_datetime(energy['date'])
df = energy.set_index('date')

# One figure per column, each plotted against the datetime index.
for column_name in df.columns:
    plt.figure()
    plt.plot(df[column_name])
    plt.ylabel(column_name)
    plt.show()

### Taking an n-hour mean

In [4]:
# Optional down-sampling step, currently disabled: uncommenting the line below
# replaces df with its 2-hour resampled mean.
#df = df.resample('2h').mean()

### Generating extra features to describe time
weekday: day of the week as a number in [0, 6] (Monday = 0)\
weekstatus: binary flag, 1 for weekend and 0 for workday\
NSM: Number of Seconds from Midnight\
month: month of the year as a number in [1, 12]

These are used for filtering the data

In [5]:
# Derive the calendar features from the datetime index in one vectorized pass
# instead of looping over every row in Python.
timestamps = df.index

# Day of week, stored as float like the other helper arrays.
weekday = np.asarray(timestamps.weekday, dtype=float)
# 0.0 for workday, 1.0 for weekend (weekday values 5 and 6).
weekstatus = (weekday >= 5).astype(float)
# Seconds elapsed since midnight of the same day.
NSM = np.asarray((timestamps - timestamps.normalize()).total_seconds(), dtype=float)
month = np.asarray(timestamps.month, dtype=float)

df['weekday'] = weekday
df['week status'] = weekstatus
df['NSM'] = NSM
df['month'] = month

In [None]:
# Scatter of the 'Appliances' column against seconds from midnight ('NSM').
x_col, y_col = 'NSM', 'Appliances'

plt.figure()
plt.scatter(df[x_col], df[y_col])
plt.xlabel(x_col)
plt.ylabel(y_col)
plt.show()

### Filtering data and making training set
Example: only February, after 15:00 (the filter below uses NSM > 54000 s, i.e. 15 h × 3600 s), and workdays only

In [7]:
# Row filter: February only, later than 54000 s after midnight (= 15:00), workdays only.
is_february = df.index.month == 2
is_late = df['NSM'] > 54000
is_workday = df['week status'] == 0
df_train = df[is_february & is_late & is_workday]
df_train = df_train.drop(['weekday', 'week status','month','NSM'], axis=1) # dropping the features used for filtering

# Training data
# The target is selected by name and the features are "everything except the target",
# so the split does not depend on 'Appliances' being the first column.
y = np.array(df_train['Appliances']).reshape(-1,1)
X = np.array(df_train.drop(columns=['Appliances']))
X, y = standardize(X,y)

### Lasso

In [9]:
# Lasso coefficient path: fit one model per penalty value and keep its
# coefficients as one column of beta_lasso.
n_alpha = 50
alpha_vals = np.linspace(0.1, 15, n_alpha)
beta_lasso = np.zeros((X.shape[1], n_alpha))

for col, penalty in enumerate(alpha_vals):
    reg = linear_model.Lasso(alpha=penalty, max_iter=1000, fit_intercept=False)
    beta_lasso[:, col] = reg.fit(X, y).coef_

### Ridge

In [10]:
# Ridge coefficient path from the closed form (X^T X + lambda I)^{-1} X^T y.
n_alpha = 50
alpha_vals = np.linspace(0.1,50,n_alpha)
beta_ridge = np.zeros((X.shape[1],n_alpha))

# These products do not depend on the penalty, so compute them once.
gram = X.T @ X
moment = X.T @ y
identity = np.eye(X.shape[1])

for i in range(n_alpha):
    # Solve the linear system directly instead of forming an explicit inverse.
    beta_ridge[:,i] = np.linalg.solve(gram + alpha_vals[i] * identity, moment).ravel()

### Elastic net

In [11]:
# Elastic net coefficient path (l1_ratio fixed at 0.5): one fit per penalty
# value, coefficients stored column-wise in beta_elnet.
n_alpha = 50
alpha_vals = np.linspace(0.1, 15, n_alpha)
beta_elnet = np.zeros((X.shape[1], n_alpha))

for col, penalty in enumerate(alpha_vals):
    reg = linear_model.ElasticNet(alpha=penalty, l1_ratio=.5, max_iter=1000, fit_intercept=False)
    reg.fit(X, y)
    beta_elnet[:, col] = reg.coef_

### Grouped Lasso
Does not work yet

In [12]:
# Group lasso coefficient path: one fit per value of the group penalty.
n_alpha = 50
alpha_vals = np.linspace(0.1,15,n_alpha)
beta_glasso = np.zeros((X.shape[1],n_alpha))

for i in range(n_alpha):
    # The group penalty follows the grid. With a fixed group_reg every iteration
    # would fit the same model and all columns of beta_glasso would be identical.
    # NOTE(review): no `groups` argument is passed, so the estimator's default
    # grouping is used; confirm that is the intended group structure.
    # NOTE(review): l1_reg is held at 1 for the whole path; confirm whether it
    # should scale with the grid as well.
    reg = GroupLasso(
        group_reg=alpha_vals[i],
        l1_reg=1,
        frobenius_lipschitz=True,
        scale_reg="inverse_group_size",
        subsampling_scheme=1,
        supress_warning=True,
        n_iter=1000,
        tol=1e-3,
    )
    reg.fit(X, y)
    beta_glasso[:,i] = reg.coef_.reshape(-1,)

### Plotting coefficients vs lambda

In [None]:
alpha_opt = 8   # Should be chosen for each regression using CV

# `alpha_vals` at this point holds the 0.1-15 grid last assigned by the lasso-type
# cells, but the ridge path was computed on its own 0.1-50 grid. Rebuild that grid
# here so the ridge curves are drawn against the penalties they were fitted with.
# Keep the end points in sync with the ridge cell.
alpha_vals_ridge = np.linspace(0.1, 50, beta_ridge.shape[1])

plot_coefficients(beta_ridge,alpha_vals_ridge,alpha_opt,name='Ridge')
plot_coefficients(beta_lasso,alpha_vals,alpha_opt,name='Lasso')
plot_coefficients(beta_elnet,alpha_vals,alpha_opt,name='Elastic net')
plot_coefficients(beta_glasso,alpha_vals,alpha_opt,name='Grouped lasso')