In [1]:
import pandas as pd 
import numpy as np 
import datetime
import pandas_datareader.data as web
import yfinance as yf
import plotly.graph_objects as go
from plotly.offline import iplot, init_notebook_mode
import cufflinks
cufflinks.go_offline(connected=True)
init_notebook_mode(connected=True)


import random 
from ipywidgets import widgets
from chart_studio.widgets import GraphWidget
from scipy.stats import ttest_ind,probplot,bayes_mvs, linregress
from ipywidgets import interact,interact_manual,interactive_output


<IPython.core.display.Javascript object>

In [2]:
# Fetch price data for the symbols between 2018-01-01 and today.
start = datetime.datetime(2018, 1, 1)
end = datetime.date.today()
symbols = ['CPB', 'CL', 'COST', 'SJM', 'K', 'PEP', 'KO', 'HSY', 'KHC', 'WMT']

solar_df = web.get_data_yahoo(symbols, start, end)

# Reshape to long format: stack() moves the innermost column level into the
# rows (presumably the ticker level, exposed as the `Symbols` column that the
# later cells filter on -- confirm against the downloader's output), then the
# frame is re-indexed by Date alone.
# log_return is log(Close / Open) of the same row, i.e. an open-to-close return.
df = (
    solar_df
    .stack()
    .reset_index()
    .set_index('Date')
    .assign(log_return=lambda prices: np.log(prices['Close'] / prices['Open']))
)

In [3]:
# Ticker -> company display name.
# Keep the entries in the same order as `symbols`: cells that zip
# `symbol_names.values()` with per-symbol results rely on that ordering.
symbol_names = dict([
    ('CPB', 'Campbell Soup Company'),
    ('CL', 'Colgate-Palmolive Company'),
    ('COST', 'Costco Wholesale Corporation'),
    ('SJM', 'J.M.Smucker Company'),
    ('K', 'Kellogg Company'),
    ('PEP', 'PepsiCo, Inc.'),
    ('KO', 'The Coca-Cola Company'),
    ('HSY', 'The Hershey Company'),
    ('KHC', 'The Kraft Heinz Company'),
    ('WMT', 'Walmart Inc.'),
])

In [5]:
#one linear regression

In [4]:
from plotly.subplots import make_subplots

# Probability plots of each symbol's log return, one subplot per symbol.
# scipy's probplot compares the sample against a normal distribution by
# default and also returns a least-squares line through the points.
N_COLS = 2
n_rows = -(-len(symbols) // N_COLS)  # ceiling division -> enough rows for every symbol

fig = make_subplots(
    rows=n_rows, cols=N_COLS, shared_yaxes=True, shared_xaxes=True,
    horizontal_spacing=0.02, vertical_spacing=0.08,
    # Titles are derived from `symbols` so they cannot drift from the plotted data.
    subplot_titles=[symbol_names[symbol] for symbol in symbols],
)

for i, symbol in enumerate(symbols):
    # Fill the grid left-to-right, top-to-bottom (plotly rows/cols are 1-based).
    row, col = divmod(i, N_COLS)
    row, col = row + 1, col + 1

    data1 = df.loc[df.Symbols == symbol, 'log_return'].to_numpy()
    # osm: theoretical quantiles, osr: ordered sample values.
    (osm, osr), (slope, intercept, r) = probplot(data1, fit=True)

    fig.add_trace(
        go.Scatter(x=osm, y=osr, mode='markers', opacity=0.6, name='data', marker_color='#6A8372'),
        row=row, col=col,
    )
    fig.add_trace(
        go.Scatter(x=osm, y=osm * slope + intercept, mode='lines', name='fit', marker_color='#70649A'),
        row=row, col=col,
    )

    # The y axis is shared across a row, so only the first column gets a label.
    if col == 1:
        fig.update_yaxes(title_text='Ordered Values', row=row, col=col)
    fig.update_xaxes(title_text="Theoretical quantiles", row=row, col=col)

# Figure-wide settings are loop-invariant, so they are applied once.
# 280 px per row reproduces the original 1400 px height for 5 rows.
fig.update_layout(title_text="Probability plot of log return", showlegend=False, height=280 * n_rows)

fig.show()

In [5]:
# For every symbol, fit a straight line to its cumulative log return
# (log of each Close over the symbol's first Open) against the number of
# calendar days elapsed since `start`. One entry per symbol is appended to
# each result list, in the order of `symbols`.
slopes = []
intercepts = []
r_values = []   # holds r**2 (coefficient of determination), not r
p_values = []
std_errs = []

for symbol in symbols:
    regre_df = df.loc[df.Symbols == symbol]

    # .iloc[0] is explicit positional access. Plain `series[0]` on a
    # date-indexed Series only works through a deprecated positional fallback.
    # Assumes the rows are in chronological order so this is the earliest Open
    # -- TODO confirm the downloader returns sorted dates.
    start_price = regre_df['Open'].iloc[0]

    log_return_vs_180102 = np.log(regre_df['Close'] / start_price).to_numpy()
    accumulate_days = np.array((regre_df.index - start).days)

    slope, intercept, r_value, p_value, std_err = linregress(accumulate_days, log_return_vs_180102)

    slopes.append(slope)
    intercepts.append(round(intercept, 5))
    r_values.append(round(r_value**2, 5))
    p_values.append(p_value)
    std_errs.append(round(std_err, 6))



In [7]:
# Summary table of the per-symbol regression results: one row per company,
# built by pairing the company names with the result lists position by position.
regression_columns = ['symbol', 'slope', 'intercept', 'R^2', 'p_value', 'std_err']
regression_rows = zip(
    symbol_names.values(),
    slopes,
    intercepts,
    r_values,
    p_values,
    std_errs,
)
pd.DataFrame(regression_rows, columns=regression_columns)

Unnamed: 0,symbol,slope,intercept,R^2,p_value,std_err
0,Campbell Soup Company,5.8e-05,-0.19374,0.01399,0.008971976,2.2e-05
1,Colgate-Palmolive Company,5.1e-05,-0.12593,0.02893,0.000162505,1.3e-05
2,Costco Wholesale Corporation,0.000685,-0.03219,0.85836,5.674417e-208,1.3e-05
3,J.M.Smucker Company,-0.000104,-0.07028,0.08055,1.785874e-10,1.6e-05
4,Kellogg Company,-0.000204,-0.00794,0.22824,3.9008170000000003e-29,1.7e-05
5,"PepsiCo, Inc.",0.000379,-0.14812,0.67272,1.031578e-119,1.2e-05
6,The Coca-Cola Company,0.000326,-0.07539,0.74288,3.812401e-145,9e-06
7,The Hershey Company,0.000753,-0.25727,0.77367,1.379619e-158,1.9e-05
8,The Kraft Heinz Company,-0.001608,0.00243,0.90259,2.0748200000000003e-247,2.4e-05
9,Walmart Inc.,0.000419,-0.1489,0.6343,5.191415e-108,1.4e-05


In [32]:
# Welch's t-test (equal_var=False) of each symbol's log returns against
# Walmart's, on random subsamples of the two return series.
SAMPLE_SIZE = 300  # observations drawn (without replacement) from each series

# Dedicated, seeded generator: the subsamples -- and therefore the p-values --
# are the same on every run for the same data, and the global `random` state
# is left untouched.
ttest_rng = random.Random(42)

p_values = []
s1 = list(df.loc[df.Symbols == 'WMT', 'log_return'])

for symbol in symbols:
    s2 = list(df.loc[df.Symbols == symbol, 'log_return'])
    # The two subsamples are drawn independently, so even the WMT-vs-WMT row
    # compares two different random subsets and its p-value is not 1.
    data1 = ttest_rng.sample(s1, SAMPLE_SIZE)
    data2 = ttest_rng.sample(s2, SAMPLE_SIZE)
    t_statistic, p_value = ttest_ind(data1, data2, equal_var=False)
    p_values.append(p_value)

# Labels are looked up per symbol so each row is explicitly tied to its ticker.
pd.DataFrame(zip([symbol_names[symbol] for symbol in symbols], p_values),
             columns=['symbol', 'p_value (& Walmart)'])

Unnamed: 0,symbol,p_value (& Walmart)
0,Campbell Soup Company,0.999381
1,Colgate-Palmolive Company,0.791592
2,Costco Wholesale Corporation,0.775281
3,J.M.Smucker Company,0.651538
4,Kellogg Company,0.671545
5,"PepsiCo, Inc.",0.92001
6,The Coca-Cola Company,0.913062
7,The Hershey Company,0.35963
8,The Kraft Heinz Company,0.14738
9,Walmart Inc.,0.327049


In [38]:
# Square float matrix of zeros labelled by symbol on both axes; it is filled
# with pairwise p-values afterwards. Sized from `symbols` rather than a
# hard-coded 10x10 so it stays valid if the symbol list changes.
new_df = pd.DataFrame(np.zeros((len(symbols), len(symbols))), columns=symbols, index=symbols)

In [39]:
# Fill new_df with Welch t-test p-values (equal_var=False) for every ordered
# pair of symbols, each computed on 300-observation random subsamples.

# Dedicated, seeded generator so the matrix is the same on every run for the
# same data, without touching the global `random` state.
pairwise_rng = random.Random(42)

# Extract each symbol's return series once instead of re-filtering `df`
# inside the inner loop.
returns_by_symbol = {
    symbol: list(df.loc[df.Symbols == symbol, 'log_return'])
    for symbol in symbols
}

for symbol_1 in symbols:
    s1 = returns_by_symbol[symbol_1]
    for symbol_2 in symbols:
        s2 = returns_by_symbol[symbol_2]
        # Independent draws for every cell: the matrix is therefore not
        # symmetric and the diagonal is not 1.
        data1 = pairwise_rng.sample(s1, 300)
        data2 = pairwise_rng.sample(s2, 300)
        t_statistic, p_value = ttest_ind(data1, data2, equal_var=False)
        # Row = inner-loop symbol, column = outer-loop symbol (same cell the
        # positional iloc[i, j] addressed, now by label).
        new_df.loc[symbol_2, symbol_1] = p_value

In [41]:
# Show the pairwise t-test p-value matrix. Each cell was computed on fresh
# random subsamples; rows are the inner-loop symbol and columns the
# outer-loop symbol of the cell that filled it.
new_df

Unnamed: 0,CPB,CL,COST,SJM,K,PEP,KO,HSY,KHC,WMT
CPB,0.718165,0.989802,0.591267,0.457227,0.662979,0.750392,0.983437,0.248371,0.024009,0.91701
CL,0.172734,0.813814,0.571084,0.294649,0.587937,0.604898,0.436422,0.052667,0.387465,0.400404
COST,0.457059,0.869255,0.349523,0.464587,0.917758,0.258718,0.696114,0.534546,0.01787,0.486351
SJM,0.200362,0.394853,0.191246,0.938257,0.502926,0.936412,0.795883,0.667297,0.172355,0.336362
K,0.958591,0.367727,0.179787,0.939341,0.723981,0.365588,0.565557,0.585007,0.062921,0.904395
PEP,0.452726,0.562421,0.061437,0.72341,0.556718,0.698922,0.079922,0.018545,0.072043,0.744907
KO,0.809725,0.21098,0.530759,0.513776,0.981232,0.215717,0.926243,0.071656,0.013246,0.684944
HSY,0.416016,0.00401,0.266656,0.744167,0.160381,0.077589,0.094672,0.594183,0.09817,0.426158
KHC,0.018409,0.012011,0.004293,0.076765,0.060663,0.097136,0.074544,0.020123,0.977318,0.085027
WMT,0.440912,0.503412,0.366836,0.307685,0.980996,0.656081,0.419806,0.82677,0.184521,0.899292


In [8]:
# Regress each symbol's log return on Walmart's log return (x = WMT,
# y = the other symbol), using 300 randomly chosen dates per symbol.
symbol_1 = 'WMT'
slopes = []
intercepts = []
r_values = []   # holds r**2 (coefficient of determination), not r
p_values = []
std_errs = []

# Dedicated, seeded generator so the sampled dates -- and the fitted
# coefficients -- are the same on every run for the same data.
regression_rng = random.Random(42)
all_dates = list(df.index.unique())  # loop-invariant, built once

for symbol_2 in symbols:
    filter_idx = regression_rng.sample(all_dates, 300)
    regre_df = df.loc[filter_idx]

    # Wide layout (one row per date, one column per symbol) pairs the two
    # series by date rather than by row position.
    plot_df = regre_df.pivot(values='log_return', columns='Symbols')

    # Keep only dates where both symbols have a value, so the x/y arrays
    # always have equal length and matching dates.
    paired = plot_df[symbol_1].notna() & plot_df[symbol_2].notna()
    data1 = plot_df.loc[paired, symbol_1].to_numpy()
    data2 = plot_df.loc[paired, symbol_2].to_numpy()

    slope, intercept, r_value, p_value, std_err = linregress(data1, data2)
    slopes.append(slope)
    intercepts.append(round(intercept, 5))
    r_values.append(round(r_value**2, 5))
    p_values.append(p_value)
    std_errs.append(round(std_err, 6))

# Labels are looked up per symbol so each row is explicitly tied to its ticker.
pd.DataFrame(zip([symbol_names[symbol] for symbol in symbols],
                 slopes, intercepts, r_values, p_values, std_errs),
             columns=['symbol', 'slope', 'intercept', 'R^2', 'p_value', 'std_err'])

Unnamed: 0,symbol,slope,intercept,R^2,p_value,std_err
0,Campbell Soup Company,0.415352,0.00054,0.10677,6.79761e-09,0.069593
1,Colgate-Palmolive Company,0.325577,9e-05,0.11928,7.890066e-10,0.051249
2,Costco Wholesale Corporation,0.563878,0.0003,0.29531,1.896342e-24,0.050459
3,J.M.Smucker Company,0.437779,-0.00066,0.13945,2.319894e-11,0.062998
4,Kellogg Company,0.466057,0.00015,0.16573,2.100597e-13,0.060573
5,"PepsiCo, Inc.",0.407179,-0.00023,0.18407,7.272703e-15,0.04966
6,The Coca-Cola Company,0.375594,0.00012,0.20164,2.719297e-16,0.043294
7,The Hershey Company,0.32149,0.00055,0.09307,7.005292e-08,0.058135
8,The Kraft Heinz Company,0.520755,-0.00128,0.14141,1.640627e-11,0.074333
9,Walmart Inc.,1.0,0.0,1.0,0.0,0.0
