In [4]:
import pandas as pd
from sklearn import linear_model

# Load the MSFT price history; the first CSV column is parsed as dates and
# becomes the index.
# `infer_datetime_format` is deliberately not passed: it is deprecated since
# pandas 2.0 (consistent format inference is now the default behaviour) and
# passing it only triggers a deprecation warning.
df = pd.read_csv('MSFT.csv', header=0, parse_dates=[0], index_col=[0])

# Difference the closing price twice to remove trend, i.e. to bring the series
# closer to stationarity. Each .diff() consumes one observation, so the first
# two rows of `Close_diff` are NaN.
df['Close_diff'] = df['Close'].diff().diff()

In [5]:
# Inspect the frame: `Close_diff` is NaN in the first two rows because each
# .diff() call consumes one observation.
df

Unnamed: 0_level_0,Close,Close_diff
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-12-31,336.320007,
2022-01-03,334.750000,
2022-01-04,329.010010,-4.169983
2022-01-05,316.380005,-6.890015
2022-01-06,313.880005,10.130005
...,...,...
2022-12-23,238.729996,6.779985
2022-12-27,236.960007,-2.309983
2022-12-28,234.529999,-0.660019
2022-12-29,241.009995,8.910004


In [6]:
# Append lagged copies of the differenced series to the DataFrame:
# T_(i-1) holds the value one step back, T_(i-2) the value two steps back.
for lag_column, lag_steps in (('T_(i-1)', 1), ('T_(i-2)', 2)):
    df[lag_column] = df['Close_diff'].shift(periods=lag_steps)

In [7]:
# Remove the rows that have no value for the differenced series or its lags
# (for this data: the top four rows - two lost to .diff().diff(), two more to
# shift(2)).
# Filtering on NaN instead of dropping fixed positions 0-3 keeps the cell
# idempotent: re-running it no longer discards four additional, valid rows.
# Any row with a gap in these columns is removed as well, which the
# regressions below need anyway because LinearRegression rejects NaN input.
df = df.dropna(subset=['Close_diff', 'T_(i-1)', 'T_(i-2)'])

In [13]:
# Fit a linear regression of T_i on T_(i-1) and add its predictions to the
# DataFrame as a new column.
# The feature matrix must be 2-D (n_samples, n_features), hence the double
# brackets. The target is kept 1-D so that predict() returns a flat vector
# instead of an (n, 1) matrix, matching how the second regression is set up.
lm = linear_model.LinearRegression()
df_X = df[['T_(i-1)']]
df_y = df['Close_diff']
model = lm.fit(df_X, df_y)  # fit() returns the estimator itself
df['Predicted_T_i|T_(i-1)'] = lm.predict(df_X)

In [14]:
# Inspect the frame now that the lag columns and the fitted values are present.
df

Unnamed: 0_level_0,Close,Close_diff,T_(i-1),T_(i-2),Predicted_T_i|T_(i-1)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2022-01-06,313.880005,10.130005,-6.890015,-4.169983,3.403092
2022-01-07,314.040009,2.660004,10.130005,-6.890015,-4.830544
2022-01-10,314.269989,0.069976,2.660004,10.130005,-1.216842
2022-01-11,314.980011,0.480042,0.069976,2.660004,0.036114
2022-01-12,318.269989,2.579956,0.480042,0.069976,-0.162260
...,...,...,...,...,...
2022-12-23,238.729996,6.779985,-8.869981,1.279984,4.360924
2022-12-27,236.960007,-2.309983,6.779985,-8.869981,-3.209932
2022-12-28,234.529999,-0.660019,-2.309983,6.779985,1.187448
2022-12-29,241.009995,8.910004,-0.660019,-2.309983,0.389258


In [15]:
# First of the two residual series needed for the PACF at LAG=2:
# the part of T_i that the regression on T_(i-1) leaves unexplained
# (observed minus predicted).
df['Residual_T_i|T_(i-1)'] = df['Close_diff'].sub(df['Predicted_T_i|T_(i-1)'])

In [16]:
# Repeat the procedure for the second residual series: regress T_(i-2) on
# T_(i-1) and keep what the regression does not explain.
lm = linear_model.LinearRegression()
df_X = df[['T_(i-1)']]  # double brackets -> 2-D feature frame
df_y = df['T_(i-2)']    # single brackets -> 1-D target series
model = lm.fit(df_X, df_y)
df['Predicted_T_(i-2)|T_(i-1)'] = lm.predict(df_X)
# Observed minus predicted
df['Residual_T_(i-2)|T_(i-1)'] = df['T_(i-2)'] - df['Predicted_T_(i-2)|T_(i-1)']

# Finally, the PACF at LAG=2 is Pearson's r between the two residual series.
# Only those two columns are correlated, rather than building the correlation
# matrix of every column in the frame and picking one entry out of it with
# chained [][] indexing.
residual_columns = ['Residual_T_i|T_(i-1)', 'Residual_T_(i-2)|T_(i-1)']
residual_corr = df[residual_columns].corr(method='pearson')
print(residual_corr.loc['Residual_T_(i-2)|T_(i-1)', 'Residual_T_i|T_(i-1)'])

-0.31214641310046


In [18]:
# Question: Based on the above process, please complete the partial correlation for Apple
# close stock prices with lag 2 from scratch and test its result by using statsmodels library.
#
# NOTE(review): the question asks for Apple, but this cell still loads 'MSFT.csv' and its
# recorded output equals the MSFT result printed earlier in the notebook. Point read_csv at
# the Apple price file once its name is confirmed. The statsmodels cross-check requested by
# the question is also still missing.

import pandas as pd
from sklearn import linear_model

# Read the data into a pandas DataFrame (first CSV column parsed as dates and used as the
# index). `infer_datetime_format` is not passed because it is deprecated since pandas 2.0.
df = pd.read_csv('MSFT.csv', header=0, parse_dates=[0], index_col=[0])

# Difference the closing price twice to bring the series closer to stationarity.
df['Close_diff'] = df['Close'].diff().diff()

# Add two columns containing the LAG=1 and LAG=2 version of the data to the DataFrame.
df['T_(i-1)'] = df['Close_diff'].shift(1)
df['T_(i-2)'] = df['Close_diff'].shift(2)

# Drop the rows that are NaN after differencing and shifting (the top four rows for a
# gap-free series). Filtering on NaN instead of on fixed positions keeps this step safe
# to re-run and independent of the exact number of leading NaNs.
df = df.dropna(subset=['Close_diff', 'T_(i-1)', 'T_(i-2)'])

# Fit a linear regression of T_i on T_(i-1) and add its predictions as a new column.
lm = linear_model.LinearRegression()
df_X = df[['T_(i-1)']]   # double brackets -> 2-D feature frame
df_y = df['Close_diff']  # single brackets -> 1-D target series
model = lm.fit(df_X, df_y)
df['Predicted_T_i|T_(i-1)'] = lm.predict(df_X)

# First residual series needed for the PACF at LAG=2 (observed minus predicted).
df['Residual_T_i|T_(i-1)'] = df['Close_diff'] - df['Predicted_T_i|T_(i-1)']

# Repeat the procedure with T_(i-2) as the target to get the second residual series.
lm = linear_model.LinearRegression()
df_X = df[['T_(i-1)']]
df_y = df['T_(i-2)']
model = lm.fit(df_X, df_y)
df['Predicted_T_(i-2)|T_(i-1)'] = lm.predict(df_X)
# Observed minus predicted
df['Residual_T_(i-2)|T_(i-1)'] = df['T_(i-2)'] - df['Predicted_T_(i-2)|T_(i-1)']

# Finally, Pearson's r between the two residual series is the PACF at LAG=2. Only the two
# residual columns are correlated instead of computing the matrix for the whole frame.
residual_columns = ['Residual_T_i|T_(i-1)', 'Residual_T_(i-2)|T_(i-1)']
residual_corr = df[residual_columns].corr(method='pearson')
print(residual_corr.loc['Residual_T_(i-2)|T_(i-1)', 'Residual_T_i|T_(i-1)'])

-0.31214641310046


### Implementing AR from scratch

In [23]:
!pip install statsmodels

Collecting statsmodels
  Downloading statsmodels-0.13.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m32.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting patsy>=0.5.2
  Downloading patsy-0.5.3-py2.py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.8/233.8 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: patsy, statsmodels
Successfully installed patsy-0.5.3 statsmodels-0.13.5

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0[0m[39;49m -> [0m[32;49m23.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [24]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf,plot_pacf
# Switch off pandas' chained-assignment check for the whole session
# (the default value of this option is 'warn').
pd.set_option('mode.chained_assignment', None)