Assignment 1

In [3]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import matplotlib.pyplot as plt

## Fuel Economy

In [4]:
DATA_PATH = "FuelEconomy.csv"
fuel_df = pd.read_csv(DATA_PATH)

print("Shape:", fuel_df.shape)
print("\nColumns:")
print(fuel_df.columns.tolist())

display(fuel_df.head())

print("\nSummary statistics:")
display(fuel_df.describe(include="all"))

print("\nMissing values per column:")
display(fuel_df.isna().sum())

Shape: (100, 2)

Columns:
['Horse Power', 'Fuel Economy (MPG)']


Unnamed: 0,Horse Power,Fuel Economy (MPG)
0,118.770799,29.344195
1,176.326567,24.695934
2,219.262465,23.95201
3,187.310009,23.384546
4,218.59434,23.426739



Summary statistics:


Unnamed: 0,Horse Power,Fuel Economy (MPG)
count,100.0,100.0
mean,213.67619,23.178501
std,62.061726,4.701666
min,50.0,10.0
25%,174.996514,20.439516
50%,218.928402,23.143192
75%,251.706476,26.089933
max,350.0,35.0



Missing values per column:


Horse Power           0
Fuel Economy (MPG)    0
dtype: int64

In [5]:
# Label inputs and outputs
x = fuel_df.drop(columns=['Horse Power'])
y = fuel_df['Horse Power']

# Split into train and test
x_test, x_train, y_test, y_train = train_test_split(x, y, test_size=0.3, random_state=344)

In [6]:
# Train models
models = []

fuel_LR = LinearRegression()
fuel_LR.fit(x, y)
models.append(('Linear', fuel_LR))

def generate_poly_reg(degree = 2):
    model = Pipeline([
        ('poly_features', PolynomialFeatures(degree, include_bias=False)),
        ('linear_regression', LinearRegression())
    ])

    model.fit(x, y)
    return model

for deg in range (2, 5):
    models.append(('Polynomial deg. ' + str(deg), generate_poly_reg(deg)))


In [7]:
# Model evaluation
eval_table = []

for (name, model) in models:
    # Predict
    yhat_train = model.predict(x_train)
    yhat_test  = model.predict(x_test)

    # Metrics
    train_MSE = mean_squared_error(y_train, yhat_train)
    train_MAE = mean_squared_error(y_train, yhat_train)
    train_R2 = r2_score(y_train, yhat_train)
    
    test_MSE = mean_squared_error(y_test, yhat_test)
    test_MAE = mean_squared_error(y_test, yhat_test)
    test_R2 = r2_score(y_test, yhat_test)

    eval_table.append([
        name,
        train_MSE, train_MAE, train_R2,
        test_MSE, test_MAE, test_R2
    ])

# Display table
eval_cols = [
    'Model',
    'Train MSE', 'Train MAE', 'Train R^2',
    'Test MSE', 'Test MAE', 'Test R^2'
]
display(pd.DataFrame(eval_table, columns=eval_cols))

Unnamed: 0,Model,Train MSE,Train MAE,Train R^2,Test MSE,Test MAE,Test R^2
0,Linear,188.197275,188.197275,0.94166,412.113335,412.113335,0.892553
1,Polynomial deg. 2,188.932218,188.932218,0.941432,408.581222,408.581222,0.893473
2,Polynomial deg. 3,185.511847,185.511847,0.942492,399.394567,399.394567,0.895869
3,Polynomial deg. 4,189.212346,189.212346,0.941345,391.590724,391.590724,0.897903


With this dataset the fourth-degree model performs best on the test set, although the differences between models are quite small. Here increasing the degree of the model improves test performance in all cases (test MSE, MAE, and R^2 each increase every time degree is increased). This would suggest that the higher-order polynomials are more accurately modelling the real distribution of the data (which itself may be some sort of smooth not-quite-polynomial shape). 

Although the training set performs better than the test set, the model does not appear to be severely overfitted as the training and test R^2 values are not vastly different, and the gulf between them does not explode in the higher-order polynomial models with more freedom.

## Electricity Consumption

In [8]:
DATA_PATH = "electricity_consumption_based_weather_dataset.csv"
elec_df = pd.read_csv(DATA_PATH)

print("Shape:", elec_df.shape)
print("\nColumns:")
print(elec_df.columns.tolist())

display(elec_df.head())

print("\nSummary statistics:")
display(elec_df.describe(include="all"))

print("\nMissing values per column:")
display(elec_df.isna().sum())

Shape: (1433, 6)

Columns:
['date', 'AWND', 'PRCP', 'TMAX', 'TMIN', 'daily_consumption']


Unnamed: 0,date,AWND,PRCP,TMAX,TMIN,daily_consumption
0,2006-12-16,2.5,0.0,10.6,5.0,1209.176
1,2006-12-17,2.6,0.0,13.3,5.6,3390.46
2,2006-12-18,2.4,0.0,15.0,6.7,2203.826
3,2006-12-19,2.4,0.0,7.2,2.2,1666.194
4,2006-12-20,2.4,0.0,7.2,1.1,2225.748



Summary statistics:


Unnamed: 0,date,AWND,PRCP,TMAX,TMIN,daily_consumption
count,1433,1418.0,1433.0,1433.0,1433.0,1433.0
unique,1433,,,,,
top,2006-12-16,,,,,
freq,1,,,,,
mean,,2.642313,3.800488,17.187509,9.141242,1561.078061
std,,1.140021,10.973436,10.136415,9.028417,606.819667
min,,0.0,0.0,-8.9,-14.4,14.218
25%,,1.8,0.0,8.9,2.2,1165.7
50%,,2.4,0.0,17.8,9.4,1542.65
75%,,3.3,1.3,26.1,17.2,1893.608



Missing values per column:


date                  0
AWND                 15
PRCP                  0
TMAX                  0
TMIN                  0
daily_consumption     0
dtype: int64

In [None]:
# Drop incomplete rows
elec_df = elec_df.dropna()

# elec_df['date'] = pd.to_datetime(elec_df['date'], format='%Y-%m-%d').astype(int)
elec_df = elec_df.drop(columns=['date'])
# Unsure how to correctly format date column; removed it because that seemed to improve model performance somewhat

# Label inputs and outputs
x = elec_df.drop(columns=['daily_consumption'])
y = elec_df['daily_consumption']

# Split into train and test
x_test, x_train, y_test, y_train = train_test_split(x, y, test_size=0.3, random_state=344)

In [10]:
# Train models
models = []

elec_LR = LinearRegression()
elec_LR.fit(x, y)
models.append(('Linear', elec_LR))

def generate_poly_reg(degree = 2):
    model = Pipeline([
        ('poly_features', PolynomialFeatures(degree, include_bias=False)),
        ('linear_regression', LinearRegression())
    ])

    model.fit(x, y)
    return model

for deg in range (2, 5):
    models.append(('Polynomial deg. ' + str(deg), generate_poly_reg(deg)))


In [11]:
# Model evaluation
eval_table = []

for (name, model) in models:
    # Predict
    yhat_train = model.predict(x_train)
    yhat_test  = model.predict(x_test)

    # Metrics
    train_MSE = mean_squared_error(y_train, yhat_train)
    train_MAE = mean_squared_error(y_train, yhat_train)
    train_R2 = r2_score(y_train, yhat_train)
    
    test_MSE = mean_squared_error(y_test, yhat_test)
    test_MAE = mean_squared_error(y_test, yhat_test)
    test_R2 = r2_score(y_test, yhat_test)

    eval_table.append([
        name,
        train_MSE, train_MAE, train_R2,
        test_MSE, test_MAE, test_R2
    ])

# Display table
eval_cols = [
    'Model',
    'Train MSE', 'Train MAE', 'Train R^2',
    'Test MSE', 'Test MAE', 'Test R^2'
]
display(pd.DataFrame(eval_table, columns=eval_cols))

Unnamed: 0,Model,Train MSE,Train MAE,Train R^2,Test MSE,Test MAE,Test R^2
0,Linear,272161.966181,272161.966181,0.321706,261737.557279,261737.557279,0.26482
1,Polynomial deg. 2,265030.948836,265030.948836,0.339478,255347.839335,255347.839335,0.282768
2,Polynomial deg. 3,257451.905521,257451.905521,0.358367,248912.319865,248912.319865,0.300844
3,Polynomial deg. 4,248880.198336,248880.198336,0.37973,244582.721746,244582.721746,0.313005


The linear model seems be far more successful on at generalizing to the test data (it is also significantly better on the training data). I removed the date column entirely since I was not sure how to format it correctly. With the date column included, both training and test columns had atrocious performance (R^2 ~= 0.01) on the polynomial models, but with that column removed the training metrics improve with model degree. Here the test R^2 values slowly increase with the degree of the model, suggesting that there is some additional non-linear association between the weather metrics and energy usage that the model is not fully capturing.

One reason that weather patterns and electricity usage may be non-linearly related is that multiple types of extreme weather could increase (or maybe decrease?) energy usage. This is obvious with temperature, where hot days will use a lot of energy on A/C, cold days will use a lot on heating, and moderate days will use less.

The R^2 values are only around 0.3, suggesting that the majority of variance in electricity consumption is not weather-related. This seems plausible, although I'm not sure how to reasonably verify this decomposition without assuming that the model is successfully capturing most of the weather-related variance.