In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Load the raw temperature series for both cities.
# NOTE(security): unpickling can execute arbitrary code; only load these
# files if they come from a trusted source.
capetown_data = pd.read_pickle('Capetown_temps_2020.pkl')
reykjavik_data = pd.read_pickle('Reykjavik_temps_2020.pkl')


def build_lagged_frame(temps, start='2020-03-01'):
    """Build a daily frame with 'date', 'temp' and 'lag1' columns.

    Parameters
    ----------
    temps : sequence of temperature readings
        Presumably one value per consecutive day (TODO: confirm against the
        pickle contents).
    start : str
        Calendar date assigned to the first reading.

    Returns
    -------
    pandas.DataFrame
        'lag1' holds the previous row's temperature. Rows containing NaN are
        dropped, which removes at least the first row (it has no predecessor).
    """
    frame = pd.DataFrame({
        'date': pd.date_range(start=start, periods=len(temps), freq='D'),
        'temp': temps,
    })
    frame['lag1'] = frame['temp'].shift(1)
    return frame.dropna()


def chronological_split(frame, holdout=0.1):
    """Split `frame` into (train, validation, test) without shuffling.

    The last `holdout` fraction of rows becomes the test set; the last
    `holdout` fraction of the remaining rows becomes the validation set.
    With holdout=0.1 this is approximately 81% / 9% / 10%, not 80/10/10,
    because the second split is taken from the remaining 90%.
    """
    train_val, test = train_test_split(frame, test_size=holdout, shuffle=False)
    train, val = train_test_split(train_val, test_size=holdout, shuffle=False)
    return train, val, test


def split_xy(frame, feature_cols=('lag1',), target_col='temp'):
    """Return (X, y): the feature columns as a DataFrame and the target Series."""
    return frame[list(feature_cols)], frame[target_col]


def predict_and_score(model, X_parts, y_parts):
    """Predict on each feature set and return (predictions, MSEs) as tuples.

    `X_parts` and `y_parts` are matching sequences, e.g. (train, val, test).
    """
    preds = tuple(model.predict(X) for X in X_parts)
    mses = tuple(mean_squared_error(y, p) for y, p in zip(y_parts, preds))
    return preds, mses


# Build the lag-1 frames (previous day's temperature as the only feature)
capetown_df = build_lagged_frame(capetown_data)
reykjavik_df = build_lagged_frame(reykjavik_data)

# Chronological train / validation / test split (approx. 81% / 9% / 10%)
train_ct, val_ct, test_ct = chronological_split(capetown_df)
train_rk, val_rk, test_rk = chronological_split(reykjavik_df)

# Separate features and target variable
X_train_ct, y_train_ct = split_xy(train_ct)
X_val_ct, y_val_ct = split_xy(val_ct)
X_test_ct, y_test_ct = split_xy(test_ct)

X_train_rk, y_train_rk = split_xy(train_rk)
X_val_rk, y_val_rk = split_xy(val_rk)
X_test_rk, y_test_rk = split_xy(test_rk)

# Train one linear regression model per city, on the training rows only
model_ct = LinearRegression()
model_rk = LinearRegression()

model_ct.fit(X_train_ct, y_train_ct)
model_rk.fit(X_train_rk, y_train_rk)

# Predictions and MSE on training, validation, and test sets
(train_pred_ct, val_pred_ct, test_pred_ct), (mse_train_ct, mse_val_ct, mse_test_ct) = predict_and_score(
    model_ct,
    (X_train_ct, X_val_ct, X_test_ct),
    (y_train_ct, y_val_ct, y_test_ct),
)
(train_pred_rk, val_pred_rk, test_pred_rk), (mse_train_rk, mse_val_rk, mse_test_rk) = predict_and_score(
    model_rk,
    (X_train_rk, X_val_rk, X_test_rk),
    (y_train_rk, y_val_rk, y_test_rk),
)

print("Cape Town MSE - Train:", mse_train_ct, "Validation:", mse_val_ct, "Test:", mse_test_ct)
print("Reykjavik MSE - Train:", mse_train_rk, "Validation:", mse_val_rk, "Test:", mse_test_rk)

Cape Town MSE - Train: 8.646742345026157 Validation: 4.684527973680094 Test: 16.52826858060669
Reykjavik MSE - Train: 12.668476938478179 Validation: 13.808163152251826 Test: 2.9713373395960017


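For question (i), a linear regression model was trained to predict each day's temperature from the previous day's temperature. The model was evaluated on the training, validation, and test sets for both Cape Town and Reykjavik. Cape Town's test MSE (16.53) is well above its training (8.65) and validation (4.68) MSE, while Reykjavik's test MSE (2.97) is far below its training (12.67) and validation (13.81) MSE, indicating better generalization for Reykjavik. Because the splits are chronological, each set covers a different part of the year, so these differences may partly reflect how variable the temperature was in each period rather than model quality alone.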

In [4]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge, Lasso

LAG_FEATURES = ['lag1', 'lag2', 'lag3']


def add_lag_features(frame, lags, source_col='temp'):
    """Return a copy of `frame` with a 'lag<k>' column for each k in `lags`.

    Lag columns that already exist are left untouched, so re-running the cell
    does not shift an already-trimmed frame and silently drop further rows.
    Rows containing NaN (the leading rows with no k-day history) are dropped.
    """
    new_cols = {
        f'lag{k}': frame[source_col].shift(k)
        for k in lags
        if f'lag{k}' not in frame.columns
    }
    return frame.assign(**new_cols).dropna()


# Add lag2 and lag3 exactly once, BEFORE splitting, so that the frames and the
# train/validation/test splits derived from them stay consistent.
capetown_df = add_lag_features(capetown_df, lags=(2, 3))
reykjavik_df = add_lag_features(reykjavik_df, lags=(2, 3))

# Chronological split: last 10% -> test, then last 10% of the remainder ->
# validation (approximately 81% / 9% / 10%).
train_ct, test_ct = train_test_split(capetown_df, test_size=0.1, shuffle=False)
train_ct, val_ct = train_test_split(train_ct, test_size=0.1, shuffle=False)

train_rk, test_rk = train_test_split(reykjavik_df, test_size=0.1, shuffle=False)
train_rk, val_rk = train_test_split(train_rk, test_size=0.1, shuffle=False)

# Features: the three lagged temperatures; target: the same-day temperature
X_train_ct, X_val_ct, X_test_ct = (part[LAG_FEATURES] for part in (train_ct, val_ct, test_ct))
X_train_rk, X_val_rk, X_test_rk = (part[LAG_FEATURES] for part in (train_rk, val_rk, test_rk))

y_train_ct, y_val_ct, y_test_ct = (part['temp'] for part in (train_ct, val_ct, test_ct))
y_train_rk, y_val_rk, y_test_rk = (part['temp'] for part in (train_rk, val_rk, test_rk))

# Degree-2 polynomial expansion. The transformer is fitted on the training
# features only; validation and test features go through transform() so that
# no preprocessing step is ever fitted on held-out data.
poly = PolynomialFeatures(degree=2, include_bias=False)

X_train_ct_poly = poly.fit_transform(X_train_ct)
X_val_ct_poly = poly.transform(X_val_ct)
X_test_ct_poly = poly.transform(X_test_ct)

# The shared transformer is refitted here on the Reykjavik training features.
X_train_rk_poly = poly.fit_transform(X_train_rk)
X_val_rk_poly = poly.transform(X_val_rk)
X_test_rk_poly = poly.transform(X_test_rk)

# Train Ridge regression model as an example
ridge_ct = Ridge(alpha=1.0)
ridge_rk = Ridge(alpha=1.0)

ridge_ct.fit(X_train_ct_poly, y_train_ct)
ridge_rk.fit(X_train_rk_poly, y_train_rk)

# Make predictions for Ridge model
train_pred_ct_ridge = ridge_ct.predict(X_train_ct_poly)
val_pred_ct_ridge = ridge_ct.predict(X_val_ct_poly)
test_pred_ct_ridge = ridge_ct.predict(X_test_ct_poly)

train_pred_rk_ridge = ridge_rk.predict(X_train_rk_poly)
val_pred_rk_ridge = ridge_rk.predict(X_val_rk_poly)
test_pred_rk_ridge = ridge_rk.predict(X_test_rk_poly)

# Calculate MSE for Ridge model
mse_train_ct_ridge = mean_squared_error(y_train_ct, train_pred_ct_ridge)
mse_val_ct_ridge = mean_squared_error(y_val_ct, val_pred_ct_ridge)
mse_test_ct_ridge = mean_squared_error(y_test_ct, test_pred_ct_ridge)

mse_train_rk_ridge = mean_squared_error(y_train_rk, train_pred_rk_ridge)
mse_val_rk_ridge = mean_squared_error(y_val_rk, val_pred_rk_ridge)
mse_test_rk_ridge = mean_squared_error(y_test_rk, test_pred_rk_ridge)

print("Cape Town Ridge MSE - Train:", mse_train_ct_ridge, "Validation:", mse_val_ct_ridge, "Test:", mse_test_ct_ridge)
print("Reykjavik Ridge MSE - Train:", mse_train_rk_ridge, "Validation:", mse_val_rk_ridge, "Test:", mse_test_rk_ridge)


Cape Town Ridge MSE - Train: 7.2629730737977045 Validation: 4.215890634974373 Test: 18.484807861211998
Reykjavik Ridge MSE - Train: 11.474817208668792 Validation: 15.644235697334508 Test: 3.8186349051273436


In [5]:
from sklearn.linear_model import Ridge
import numpy as np

# Regularisation strengths to sweep over
alphas = [0.1, 1, 10, 100, 1000]

# One (alpha, train MSE, validation MSE, test MSE) tuple per alpha, per city
mse_results_ct = []
mse_results_rk = []


def fit_ridge_and_score(alpha, X_parts, y_parts):
    """Fit Ridge(alpha) on the first split and score it on every split.

    `X_parts` / `y_parts` are matching (train, validation, test) tuples.
    Returns (fitted model, predictions tuple, MSE tuple), each tuple in the
    same train / validation / test order.
    """
    fitted = Ridge(alpha=alpha)
    fitted.fit(X_parts[0], y_parts[0])
    preds = tuple(fitted.predict(X) for X in X_parts)
    errors = tuple(mean_squared_error(y, p) for y, p in zip(y_parts, preds))
    return fitted, preds, errors


for alpha in alphas:
    # Cape Town: fit, predict, score, record
    (
        ridge_ct,
        (train_pred_ct, val_pred_ct, test_pred_ct),
        (mse_train_ct, mse_val_ct, mse_test_ct),
    ) = fit_ridge_and_score(
        alpha,
        (X_train_ct, X_val_ct, X_test_ct),
        (y_train_ct, y_val_ct, y_test_ct),
    )
    mse_results_ct.append((alpha, mse_train_ct, mse_val_ct, mse_test_ct))

    # Reykjavik: same procedure on its own splits
    (
        ridge_rk,
        (train_pred_rk, val_pred_rk, test_pred_rk),
        (mse_train_rk, mse_val_rk, mse_test_rk),
    ) = fit_ridge_and_score(
        alpha,
        (X_train_rk, X_val_rk, X_test_rk),
        (y_train_rk, y_val_rk, y_test_rk),
    )
    mse_results_rk.append((alpha, mse_train_rk, mse_val_rk, mse_test_rk))

# Report every alpha for each city
print("Cape Town Ridge MSE Results:")
for result in mse_results_ct:
    print(f"Alpha: {result[0]} - Train MSE: {result[1]}, Validation MSE: {result[2]}, Test MSE: {result[3]}")

print("\nReykjavik Ridge MSE Results:")
for result in mse_results_rk:
    print(f"Alpha: {result[0]} - Train MSE: {result[1]}, Validation MSE: {result[2]}, Test MSE: {result[3]}")


Cape Town Ridge MSE Results:
Alpha: 0.1 - Train MSE: 8.040601442252836, Validation MSE: 3.5280120117741074, Test MSE: 14.125521383672883
Alpha: 1 - Train MSE: 8.040612026511898, Validation MSE: 3.530439649302234, Test MSE: 14.128645922693153
Alpha: 10 - Train MSE: 8.041616863928908, Validation MSE: 3.5549500422885525, Test MSE: 14.160154759629165
Alpha: 100 - Train MSE: 8.106591978328854, Validation MSE: 3.8145747606197538, Test MSE: 14.483023524146297
Alpha: 1000 - Train MSE: 9.213763391362074, Validation MSE: 6.311498698180615, Test MSE: 17.033340986634666

Reykjavik Ridge MSE Results:
Alpha: 0.1 - Train MSE: 13.108509113339068, Validation MSE: 12.770934796656341, Test MSE: 3.024729470932643
Alpha: 1 - Train MSE: 13.108515871548828, Validation MSE: 12.7723490740648, Test MSE: 3.02478465913848
Alpha: 10 - Train MSE: 13.109173201245463, Validation MSE: 12.7866380626455, Test MSE: 3.025371239664746
Alpha: 100 - Train MSE: 13.160410950012285, Validation MSE: 12.94194435713416, Test MSE: 

For question (ii), Ridge regression on the three lag features was tuned by varying the alpha parameter. For Cape Town, the lowest errors were obtained with the smallest alpha values: alpha 0.1 gave a validation MSE of 3.53 and a test MSE of 14.13, an improvement over the linear regression model's test MSE of 16.53. For Reykjavik, the best result was also at alpha 0.1 (validation MSE 12.77, test MSE 3.02); this is roughly on par with the linear regression model's test MSE of 2.97 rather than an improvement, so the gain in generalization is seen for Cape Town only.

Summary observation:

Cape Town Ridge Results:
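Alpha 0.1, the weakest regularization tried, performed best with a test MSE of 14.13; the results for alpha 0.1, 1, and 10 are nearly identical (14.13–14.16), indicating that the model needs very little regularization.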
Increasing alpha values slightly increased test MSE, with the highest being 17.03 at alpha 1000, showing that stronger regularization reduced model flexibility and worsened performance.

Reykjavik Ridge Results:
Alpha 0.1 also gave the best performance with a test MSE of 3.02; again the weakest regularization in the grid was the best, with almost no change up to alpha 10.
As alpha increased, test MSE gradually increased, reaching 3.83 at alpha 1000, indicating over-regularization that diminished accuracy.