In [16]:
# Set up the environment
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Load the climate data set from 'climate_change.csv' (path is relative to the
# notebook's working directory). Later cells read the 'Year' and 'Temp' columns
# and a selection of predictor columns from this frame.
df = pd.read_csv('climate_change.csv')

In [28]:
# Chronological split: rows with Year up to and including 2002 form the
# training set; rows with Year after 2002 are held out as the test set.
df_train = df.loc[df['Year'].le(2002)]
df_test = df.loc[df['Year'].gt(2002)]

a) Build a linear regression model to predict the dependent variable Temp, using CO2,
CH4, N2O, CFC-11, CFC-12, Aerosols, TSI and MEI as features (Year and Month
should NOT be used as features in the model). As always, use only the training set to
train your model. What are the in-sample and out-of-sample $R^2$, MSE, and MAE?

In [29]:
# Predictors for the first model, as requested in part (a).
# Year and Month are deliberately not included.
features = ['MEI', 'CO2', 'CH4', 'N2O', 'CFC-11', 'CFC-12', 'TSI', 'Aerosols']

# Design matrix and target, taken from the training rows only.
X_train1 = df_train.loc[:, features]
Y_train1 = df_train.loc[:, 'Temp']

# Create the estimator and fit it in one step; fit() returns the fitted
# estimator, so LR_train1 is the trained model.
LR_train1 = LinearRegression().fit(X_train1, Y_train1)

# Learned parameters: coef_ holds one coefficient per entry of `features`
# (same order); the intercept is stored separately in intercept_.
print("training dataset coefficients:", LR_train1.coef_)
print("training dataset intercept:", LR_train1.intercept_)

# Predictions on the same rows the model was fitted on.
y_pred_train1 = LR_train1.predict(X_train1)

# In-sample fit quality: R2, MSE and MAE, printed in that order.
_in_sample_metrics1 = (
    ("in-sample R2:", r2_score),
    ("in-sample MSE:", mean_squared_error),
    ("in-sample MAE:", mean_absolute_error),
)
for _label, _metric in _in_sample_metrics1:
    print(_label, _metric(Y_train1, y_pred_train1))

training dataset coefficients: [ 6.59008764e-02  6.24077568e-03  2.62189354e-04 -3.48478075e-02
 -8.87194950e-03  5.48441303e-03  1.19394890e-01 -1.65036522e+00]
training dataset intercept: -155.18672675256192
in-sample R2: 0.6920595959984741
in-sample MSE: 0.008731426409911177
in-sample MAE: 0.07260918612938931


In [33]:
# Held-out predictors and target for the first model (same columns as training).
X_test1 = df_test.loc[:, features]
Y_test1 = df_test.loc[:, 'Temp']

# Score the already-fitted model on the test rows; no refitting happens here.
y_pred_test1 = LR_train1.predict(X_test1)

# Out-of-sample fit quality: R2, MSE and MAE, printed in that order.
_out_of_sample_metrics1 = (
    ("out-of-sample R2:", r2_score),
    ("out-of-sample MSE:", mean_squared_error),
    ("out-of-sample MAE:", mean_absolute_error),
)
for _label, _metric in _out_of_sample_metrics1:
    print(_label, _metric(Y_test1, y_pred_test1))

out-of-sample R2: -0.541325583402297
out-of-sample MSE: 0.012206974835137179
out-of-sample MAE: 0.09312747891276273


b) Build another linear regression model, this time with only N2O, Aerosols, TSI, and
MEI as features. What are the in-sample and out-of-sample $R^2$?

In [34]:
# Reduced predictor set for the second model, as requested in part (b).
features2 = ['MEI', 'N2O', 'TSI', 'Aerosols']

# Design matrix and target, taken from the training rows only.
X_train2 = df_train.loc[:, features2]
Y_train2 = df_train.loc[:, 'Temp']

# Create the estimator and fit it in one step; fit() returns the fitted
# estimator, so LR_train2 is the trained model.
LR_train2 = LinearRegression().fit(X_train2, Y_train2)

# Learned parameters: coef_ holds one coefficient per entry of `features2`
# (same order); the intercept is stored separately in intercept_.
print("training dataset coefficients:", LR_train2.coef_)
print("training dataset intercept:", LR_train2.intercept_)

# Predictions on the same rows the model was fitted on.
y_pred_train2 = LR_train2.predict(X_train2)

# In-sample fit quality: R2, MSE and MAE, printed in that order.
_in_sample_metrics2 = (
    ("in-sample R2:", r2_score),
    ("in-sample MSE:", mean_squared_error),
    ("in-sample MAE:", mean_absolute_error),
)
for _label, _metric in _in_sample_metrics2:
    print(_label, _metric(Y_train2, y_pred_train2))

training dataset coefficients: [ 0.06549568  0.02427612  0.08577046 -1.72465971]
training dataset intercept: -124.4841255734033
in-sample R2: 0.6490120806760372
in-sample MSE: 0.009952007429105784
in-sample MAE: 0.07666650280233205


In [36]:
# Held-out predictors and target for the second model (same columns as training).
X_test2 = df_test.loc[:, features2]
Y_test2 = df_test.loc[:, 'Temp']

# Score the already-fitted model on the test rows; no refitting happens here.
y_pred_test2 = LR_train2.predict(X_test2)

# Out-of-sample fit quality: R2, MSE and MAE, printed in that order.
_out_of_sample_metrics2 = (
    ("out-of-sample R2:", r2_score),
    ("out-of-sample MSE:", mean_squared_error),
    ("out-of-sample MAE:", mean_absolute_error),
)
for _label, _metric in _out_of_sample_metrics2:
    print(_label, _metric(Y_test2, y_pred_test2))

out-of-sample R2: 0.20031861104556226
out-of-sample MSE: 0.006333308611894036
out-of-sample MAE: 0.06154027269393422
