In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
%matplotlib inline

# Regression Analysis: Seasonal Effects with Sklearn Linear Regression
In this notebook, you will build a SKLearn linear regression model to predict Yen futures ("settle") returns with *lagged* CAD/JPY exchange rate returns. 

In [2]:
# Currency pair exchange rates for CAD/JPY
cad_jpy_df = pd.read_csv(
    Path("cad_jpy.csv"), index_col="Date", infer_datetime_format=True, parse_dates=True
)
cad_jpy_df.head()

Unnamed: 0_level_0,Price,Open,High,Low
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1982-01-05,184.65,184.65,184.65,184.65
1982-01-06,185.06,185.06,185.06,185.06
1982-01-07,186.88,186.88,186.88,186.88
1982-01-08,186.58,186.58,186.58,186.58
1982-01-11,187.64,187.64,187.64,187.64


In [3]:
# Trim the dataset to begin on January 1st, 1990
cad_jpy_df = cad_jpy_df.loc["1990-01-01":, :]
cad_jpy_df.head()

Unnamed: 0_level_0,Price,Open,High,Low
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1990-01-02,126.37,126.31,126.37,126.31
1990-01-03,125.3,125.24,125.3,125.24
1990-01-04,123.46,123.41,123.46,123.41
1990-01-05,124.54,124.48,124.54,124.48
1990-01-08,124.27,124.21,124.27,124.21


# Data Preparation

### Returns

In [4]:
# Create a series using "Price" percentage returns, drop any nan"s, and check the results:

# (Make sure to multiply the pct_change() results by 100)
# In this case, you may have to replace inf, -inf values with np.nan"s
cad_jpy_df['Returns']=cad_jpy_df.Price.pct_change().dropna() *100
cad_jpy_df.dropna(inplace=True)
cad_jpy_df.tail()

Unnamed: 0_level_0,Price,Open,High,Low,Returns
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-05-29,78.29,78.21,78.41,77.75,0.076697
2020-06-01,79.27,78.21,79.36,78.04,1.251756
2020-06-02,80.4,79.26,80.56,79.15,1.425508
2020-06-03,80.7,80.4,80.82,79.96,0.373134
2020-06-04,80.71,80.8,80.89,80.51,0.012392


### Lagged Returns 

In [5]:
# Create a lagged return using the shift function
cad_jpy_df['Lagged_Return']=cad_jpy_df['Returns'].shift()
cad_jpy_df.dropna(inplace=True)
cad_jpy_df.tail()

Unnamed: 0_level_0,Price,Open,High,Low,Returns,Lagged_Return
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-05-29,78.29,78.21,78.41,77.75,0.076697,-0.114913
2020-06-01,79.27,78.21,79.36,78.04,1.251756,0.076697
2020-06-02,80.4,79.26,80.56,79.15,1.425508,1.251756
2020-06-03,80.7,80.4,80.82,79.96,0.373134,1.425508
2020-06-04,80.71,80.8,80.89,80.51,0.012392,0.373134


### Train Test Split

In [6]:
# Create a train/test split for the data using 2018-2019 for testing and the rest for training
train = cad_jpy_df[:'2017']
test = cad_jpy_df['2018':]

In [7]:
# Create four dataframes:
# X_train (training set using just the independent variables), X_test (test set of of just the independent variables)
# Y_train (training set using just the "y" variable, i.e., "Futures Return"), Y_test (test set of just the "y" variable):
X_train=train[['Lagged_Return']]
X_test=test[['Lagged_Return']]
y_train=train[['Returns']]
y_test=test[['Returns']]

In [8]:
# Preview the X_train data
X_train

Unnamed: 0_level_0,Lagged_Return
Date,Unnamed: 1_level_1
1990-01-04,-0.846720
1990-01-05,-1.468476
1990-01-08,0.874777
1990-01-09,-0.216798
1990-01-10,0.667901
...,...
2017-12-25,-0.011240
2017-12-26,0.033723
2017-12-27,0.269693
2017-12-28,0.392245


# Linear Regression Model

In [9]:
# Create a Linear Regression model and fit it to the training data
from sklearn.linear_model import LinearRegression
model = LinearRegression()

# Fit a SKLearn linear regression using  just the training set (X_train, Y_train):
model.fit(X_train, y_train)

LinearRegression()

# Make predictions using the Testing Data

**Note:** We want to evaluate the model using data that it has never seen before, in this case: `X_test`.

In [11]:
# Make a prediction of "y" values using just the test dataset
predictions = model.predict(y_test)
# Make a prediction of "y" values using just the test dataset
predictions = model.predict(X_test)

Feature names unseen at fit time:
- Returns
Feature names seen at fit time, yet now missing:
- Lagged_Return



In [12]:
# Assemble actual y data (Y_test) with predicted y data (from just above) into two columns in a dataframe:
results = y_test.to_frame()
results['Predicted_Return'] = predictions

AttributeError: 'DataFrame' object has no attribute 'to_frame'

In [13]:
# Plot the first 20 predictions vs the true values
results.head(20).plot(subplots=True, 
                  figsize=(12,10),
                  title = "First 20 Predictions from 2018")

NameError: name 'results' is not defined

# Out-of-Sample Performance

Evaluate the model using "out-of-sample" data (`X_test` and `y_test`)

In [14]:
from sklearn.metrics import mean_squared_error
# Calculate the mean_squared_error (MSE) on actual versus predicted test "y" 
# (Hint: use the dataframe from above)
mse = mean_squared_error(results["Percentage"],results["Predicted_Return"])

# Using that mean-squared-error, calculate the root-mean-squared error (RMSE):
rmse = np.sqrt(mse)
print(f"The Out-of-Sample Root Mean Squared Error (RMSE) is: {rmse}")

NameError: name 'results' is not defined

# In-Sample Performance

Evaluate the model using in-sample data (X_train and y_train)

In [15]:
# Construct a dataframe using just the "y" training data:
in_sample_results_df = pd.DataFrame(y_train)

# Add a column of "in-sample" predictions to that dataframe:  
in_sample_results_df["In-sample_Predictions"] = model.predict(X_train)

In [16]:
# Display the in_sample_results_df dataframe
in_sample_results_df.head()

Unnamed: 0_level_0,Returns,In-sample_Predictions
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
1990-01-04,-1.468476,0.020446
1990-01-05,0.874777,0.036249
1990-01-08,-0.216798,-0.023309
1990-01-09,0.667901,0.004436
1990-01-10,0.567546,-0.018051


In [20]:
# Calculate in-sample mean_squared_error (for comparison to out-of-sample)
in_sample_mse = mean_squared_error(
    in_sample_results_df['Percentage'],
    in_sample_results_df['In-sample_Predictions'])

# Calculate in-sample root mean_squared_error (for comparison to out-of-sample)
in_sample_rmse = np.sqrt(in_sample_mse)

print(f"The In-sample Root Mean Squared Error (RMSE) is: {in_sample_rmse}")

KeyError: 'Percentage'

# Conclusions

**Question:** Does this model perform better or worse on out-of-sample data as compared to in-sample data?

**Answer: The model performs better on the out-of-sample data as compared to in-sample data. We can see this with the smaller RMSE value for Out-of-Sample.

The Out-of-Sample Root Mean Squared Error (RMSE) is: 0.6445805658569028
The In-sample Root Mean Squared Error (RMSE) is: 0.841994632894117
In this model, the out-of-sample data is underfitted therefore, it does not provide a good prediciton model. 