In [220]:
import yfinance as yf
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [221]:
# 1a. Download two years (2018-2019) of daily prices for Apple and Facebook.
# Empty-string cells are turned into NaN so that a single dropna() removes
# every incomplete row.
# NOTE(review): yfinance documents `end` as exclusive, so 2019-12-31 itself is
# presumably not downloaded -- confirm whether that day should be included.

ticker = ['AAPL', 'FB']
df = (
    yf.download(ticker, start='2018-01-01', end='2019-12-31', progress=False)
    .replace("", float("NaN"))
    .dropna()
)
df.head()

Unnamed: 0_level_0,Adj Close,Adj Close,Close,Close,High,High,Low,Low,Open,Open,Volume,Volume
Unnamed: 0_level_1,AAPL,FB,AAPL,FB,AAPL,FB,AAPL,FB,AAPL,FB,AAPL,FB
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
2018-01-02,166.8,181.42,172.26,181.42,172.3,181.58,169.26,177.55,170.16,177.68,25555900,18151900
2018-01-03,166.77,184.67,172.23,184.67,174.55,184.78,171.96,181.33,172.53,181.88,29517900,16886600
2018-01-04,167.55,184.33,173.03,184.33,173.47,186.21,172.08,184.1,172.54,184.9,22434600,13880900
2018-01-05,169.46,186.85,175.0,186.85,175.37,186.9,173.05,184.93,173.44,185.59,23660000,13574500
2018-01-08,168.83,188.28,174.35,188.28,175.61,188.9,173.93,186.33,174.35,187.2,20567800,17994700


In [222]:
# 1b. Compute daily log returns from the 'Adj Close' prices of both stocks.


def _clean_log_return(prices):
    """Return the log returns of a price series without zero or missing values.

    The log return is log(p_t) - log(p_{t-1}). Exact zeros are replaced by NaN
    and every NaN (including the first, undefined observation) is dropped.
    NOTE(review): a zero return is a valid observation; confirm that discarding
    those days is intended.
    """
    log_return = np.log(prices).diff()
    return log_return.replace(0, float("NaN")).dropna()


adj_close = df['Adj Close']
stock1_logreturn = _clean_log_return(adj_close[ticker[0]])
stock2_logreturn = _clean_log_return(adj_close[ticker[1]])

In [226]:
# 1c. Mean, standard deviation, skewness and excess kurtosis of the log returns.
# 1d. The same four statistics are reported for both stocks.
# pandas' kurtosis() uses Fisher's definition (normal distribution == 0),
# i.e. it already is the excess kurtosis.

(
    stock1_logreturn_mean,
    stock1_logreturn_std,
    stock1_logreturn_skew,
    stock1_logreturn_kurtosis,
) = (
    stock1_logreturn.mean(),
    stock1_logreturn.std(),
    stock1_logreturn.skew(),
    stock1_logreturn.kurtosis(),
)
# The trailing empty string produces the blank line separating the two reports.
print(
    f"{ticker[0]} log-return mean: {stock1_logreturn_mean}",
    f"{ticker[0]} log-return standard deviation: {stock1_logreturn_std}",
    f"{ticker[0]} log-return skewness: {stock1_logreturn_skew}",
    f"{ticker[0]} log-return excess kurtosis: {stock1_logreturn_kurtosis}",
    "",
    sep="\n",
)

(
    stock2_logreturn_mean,
    stock2_logreturn_std,
    stock2_logreturn_skew,
    stock2_logreturn_kurtosis,
) = (
    stock2_logreturn.mean(),
    stock2_logreturn.std(),
    stock2_logreturn.skew(),
    stock2_logreturn.kurtosis(),
)
print(
    f"{ticker[1]} log return mean: {stock2_logreturn_mean}",
    f"{ticker[1]} log return standard deviation: {stock2_logreturn_std}",
    f"{ticker[1]} log return skewness: {stock2_logreturn_skew}",
    f"{ticker[1]} log return excess kurtosis: {stock2_logreturn_kurtosis}",
    sep="\n",
)

AAPL log-return mean: 0.0011096674932137584
AAPL log-return standard deviation: 0.0174151050193874
AAPL log-return skewness: -0.5860969971892593
AAPL log-return excess kurtosis: 4.33970661778411

FB log return mean: 0.00023910419964947498
FB log return standard deviation: 0.021440505844551056
FB log return skewness: -1.8507563053613612
FB log return excess kurtosis: 20.336050052387982


In [230]:
# 1e.
## Question 1: Compute covariance and correlation.

# Put both return series side by side (aligned on the date index) so that
# cov() / corr() produce the full 2x2 matrices.
df_logreturn = pd.concat([stock1_logreturn, stock2_logreturn], axis=1)
stock1_stock2_covariance = df_logreturn.cov()
stock1_stock2_correlation = df_logreturn.corr()

pair_label = f"{ticker[0]} - {ticker[1]}"
# The empty string yields the blank line between the two tables.
print(
    f"{pair_label} log return covariance",
    stock1_stock2_covariance,
    "",
    f"{pair_label} log return correlation",
    stock1_stock2_correlation,
    sep="\n",
)

## Question 2: Explain their difference. How do you convert one to the other?
### Covariance measures how two variables move with respect to each other, but its magnitude depends on the scale (units) of the variables. Correlation goes one step further: it is the covariance normalized by both standard deviations, so it is a unit-free measure between -1 and 1 of how strongly the two random variables change with respect to each other.
#### Cov(AAPL, FB) = 0.000168 > 0 indicates that the log returns of the two stocks tend to move in the same direction. Corr(AAPL, FB) = 0.450063 cross-checks this and gives the strength of that co-movement on a normalized scale: a moderate positive relationship.

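### Covariance = Correlation * Standard Deviation of Stock A * Standard Deviation of Stock B, and conversely Correlation = Covariance / (Standard Deviation of Stock A * Standard Deviation of Stock B)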
#### Cov(AAPL, FB) = Corr(AAPL, FB) * Std(AAPL) * Std(FB) = 0.450063*0.017415*0.021440 = 0.000168


AAPL - FB log return covariance
          AAPL        FB
AAPL  0.000303  0.000168
FB    0.000168  0.000460

AAPL - FB log return correlation
          AAPL        FB
AAPL  1.000000  0.450063
FB    0.450063  1.000000


In [217]:
# 2.a Split the log returns chronologically: 80% train, 20% test.
### shuffle=False keeps the time order, so the test set is the most recent data
### and the train set is everything older than it.
### Rows with a missing return are dropped from each part after the split.
df_train, df_test = (
    part.dropna()
    for part in train_test_split(df_logreturn, test_size=0.2, shuffle=False)
)

In [216]:
# 2.b This function categorizes each day in the 2-year price history as belonging to one of four categories:
<ul>
<li>Both stocks up</li>
<li>Stock \#1 up, stock \#2 down</li>
<li>Stock \#1 down, stock \#2 up</li>
<li>Both stocks down</li>
</ul>
# 2.c Build transition matrix of portfolio direction
def transition_matrix(df_input, tickers=None):
    """Estimate the together/apart transition matrix of a two-stock portfolio.

    Each day is classified from the signs of the two returns:
    "together" (both up or both down) or "apart" (one up, one down).
    Consecutive days are then counted as transitions and each row of counts is
    normalized into probabilities.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Daily returns, one column per stock. Rows containing NaN are ignored.
        The frame is NOT modified.
    tickers : sequence of two column names, optional
        Columns holding the returns of stock #1 and stock #2. Defaults to the
        notebook-level ``ticker`` list.

    Returns
    -------
    pandas.DataFrame
        2x2 matrix indexed by today's state (rows) and tomorrow's state
        (columns), both ordered ['together', 'apart']. A row is NaN when its
        state never occurs as the origin of a transition.

    Notes
    -----
    A day on which either return is exactly zero fits neither state; any
    transition into or out of such a day is left out of the counts.
    """
    if tickers is None:
        # Keep the original behaviour of reading the notebook-level list.
        tickers = ticker
    first, second = tickers[0], tickers[1]

    # dropna() returns a new frame, so the caller's data is left untouched.
    returns = df_input.dropna()
    first_ret = returns[first].to_numpy()
    second_ret = returns[second].to_numpy()

    same_direction = ((first_ret > 0) & (second_ret > 0)) | ((first_ret < 0) & (second_ret < 0))
    opposite_direction = ((first_ret > 0) & (second_ret < 0)) | ((first_ret < 0) & (second_ret > 0))

    # Integer state per day: 0 = together, 1 = apart, -1 = unclassified.
    together, apart, unclassified = 0, 1, -1
    states = np.where(same_direction, together,
                      np.where(opposite_direction, apart, unclassified))

    # Pair every day with the following one; the last day has no successor.
    today, tomorrow = states[:-1], states[1:]
    counts = np.array(
        [[np.sum((today == origin) & (tomorrow == target))
          for target in (together, apart)]
         for origin in (together, apart)],
        dtype=float,
    )

    # Row-normalize; rows without any observed transition stay NaN instead of
    # raising ZeroDivisionError.
    row_totals = counts.sum(axis=1, keepdims=True)
    probabilities = np.full_like(counts, np.nan)
    np.divide(counts, row_totals, out=probabilities, where=row_totals > 0)

    labels = ['together', 'apart']
    return pd.DataFrame(probabilities, index=labels, columns=labels)

In [218]:
### Transition matrix estimated on the test data set (the latest 20% of the data)
test_transition_matrix = transition_matrix(df_test)
# Last expression of the cell, so the notebook displays the matrix.
test_transition_matrix.head()

Unnamed: 0,together,apart
together,0.676471,0.323529
apart,0.65625,0.34375


In [219]:
### Transition matrix estimated on the train data set (the older 80% of the data)
train_transition_matrix = transition_matrix(df_train)
# Last expression of the cell, so the notebook displays the matrix.
train_transition_matrix.head()

Unnamed: 0,together,apart
together,0.66537,0.33463
apart,0.621429,0.378571


In [None]:
# 2.d
### Both the train and test data sets show that transitions into the "together" state dominate: the probabilities of moving from together to together and from apart to together are higher than those of the other 2 scenarios, in which the portfolio moves from apart to apart or from together to apart.
### The latest 20% of the data (test data set) shows an increase in the together-to-together and apart-to-together probabilities, which means the portfolio has recently tended to move more according to those 2 scenarios.
### The older 80% of the data (train data set) has higher probabilities for the scenarios in which the portfolio moves from together to apart or from apart to apart, which means the portfolio has recently tended to move less according to those 2 scenarios.

In [1]:
# 2.e Is the process Markovian?
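### This process is Markovian in the sense that it is modelled as memory-less: the transition matrix estimates the probability that the portfolio return moves from yesterday's state to today's state using only yesterday's state, without considering the earlier history. Note that this is an assumption of the model; the similar train and test matrices are consistent with it but do not prove it.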