 # STATISTICAL ANALYSIS OF BANKING STOCK RETURNS

### INTRODUCTION
Description of Data Sets:

Our data set comprises the stock prices of the following companies, sampled once a month over a period of 35 years:

BAC – Bank of America

BCS – Barclays

C – Citi Group

PNC – PNC Financial Services

WFC - Wells Fargo

All the above companies are from the Finance Industry and we obtained the data from Yahoo Finance.

Project Goals:

Halloween Effect – a market-timing strategy based on the theory that stocks perform better in the period from September through April than in the rest of the year. It is popularized by the well-known saying “Sell in May and go away”, which suggests that we should buy stocks in September and sell them in May to reap positive returns.
Our goal in this project is to understand and analyze the behavior of each stock over this period every year and, by applying statistical tools to the data from past years, to conclude whether it is indeed a good idea to buy stocks in September and sell them in May.


### README

This is an ipython notebook with widgets. This will work perfectly well if the below guidelines for execution are followed:

#### Software Versions: 

User should have Python 3.7 & above installed.
User must have jupyter notebook installed.

##### Guidelines:

1. Kindly ensure that the data files attached with the submission are in the same directory as this ipython file. Storing the data in any other location may not work and may lead to "ERROR 02: File not found". If you encounter such an error, kindly ensure that you supply the correct file path for the data.

2. In the cell below is the list of libraries used for performing statistical analysis. Kindly uncomment the cell and install the libraries if your system does not have them installed.

3. Run each cell for the widgets to be visible.


In [1]:
# INSTALL ALL THE NECESSARY LIBRARIES
# !pip install --user probscale
# !pip install --user matplotlib
# !pip install --user pandas 
# !pip install --user numpy
# !pip install --user seaborn
# !pip install --user scipy
# !pip install --user scikit-learn
# !pip install --user ipywidgets


In [2]:
import ipywidgets as widgets
from IPython.display import display
import matplotlib.pyplot as plt
import pandas as pd
import re
import numpy as np
import seaborn
import probscale
import warnings
import scipy.stats
from scipy.stats import chisquare
from scipy.stats import chi2
from sklearn.metrics import r2_score
import scipy.stats as st
from scipy import stats
from ipywidgets import HBox, Label
from dateutil.parser import parse
import datetime as dt

warnings.simplefilter('ignore')

%matplotlib nbagg
%matplotlib inline

## IMPORTING DATA INTO OUR NOTEBOOK


In [3]:
# Kept as-is so that any cell still referring to `data` / `d` keeps working.
data = pd.read_csv('C.csv')
d = pd.DataFrame(data)

# Monthly price files, one per ticker. Their order fixes the positions in
# `sym_data` / `symbol_data` that the rest of the notebook indexes into.
list_ = ['BAC.csv','WFC.csv','C.csv','BCS.csv','PNC.csv']
pattern = r'\w+'

sym_data = []    # List of DataFrames, each tagged with its ticker symbol
for csv_name in list_:
    # 'BAC.csv' -> ['BAC', 'csv']: the second-to-last word is the ticker.
    ticker = re.findall(pattern, csv_name)[-2]
    frame = pd.read_csv(csv_name)
    # A scalar is broadcast to every row; no per-row Python loop is needed,
    # and `list_` is no longer rebound while it is being iterated.
    frame['Symbol'] = ticker
    sym_data.append(frame)


symbol_data = []    # The same DataFrame objects, now carrying a 'Returns' column
for frame in sym_data:
    # A zero price would make the logarithm below infinite.
    zero_price = (frame['Close'] == 0) | (frame['Open'] == 0)
    frame.drop(frame.index[zero_price], inplace=True)

    # Log return over the period: log(Close) - log(Open).
    frame['Returns'] = np.log(frame['Close']) - np.log(frame['Open'])

    # Discard flat periods (return exactly 0) and every non-finite return.
    # The previous code only removed +inf, so -inf and NaN could slip through
    # and poison the means, variances and regressions computed later.
    unusable = (frame['Returns'] == 0) | ~np.isfinite(frame['Returns'])
    frame.drop(frame.index[unusable], inplace=True)
    symbol_data.append(frame)
    

## WIDGETS FOR DASHBOARD: LOG RETURNS

### RUN THIS CELL TO DISPLAY THE PLOTS

In [4]:
## HISTOGRAM FUNCTION IN USE

# Ticker symbols, listed in the same order as the frames in `symbol_data`.
files = ['BAC', 'WFC', 'C', 'BCS', 'PNC']

# One alias per ticker, bound to the frame at the matching position
# of `symbol_data`.
BAC, WFC, C, BCS, PNC = (symbol_data[position] for position in range(5))

# Position in `symbol_data` and histogram title for every supported ticker.
_HISTOGRAM_PANELS = {
    'BAC': (0, "Histogram for Bank Of America's Log Returns"),
    'WFC': (1, "Histogram for Wells Fargo's Log Returns"),
    'C': (2, "Histogram for Citigroups's Log Returns"),
    'BCS': (3, "Histogram for Barclays's Log Returns"),
    'PNC': (4, "Histogram for PNC Financial Service's Log Returns"),
}


def _mean_half_width(sample, confidence):
    """Return the half-width of the two-sided Student-t interval for the mean."""
    n = len(sample)
    standard_error = np.std(sample, ddof=1) / np.sqrt(n)
    return standard_error * scipy.stats.t.ppf((1 + confidence) / 2., n - 1)


def _variance_interval(sample, confidence):
    """Return (sample variance, lower bound, upper bound) of its chi-square interval."""
    n = len(sample)
    variance = np.var(sample, ddof=1)
    tail = (1 - confidence) / 2.
    upper_critical = chi2.isf(q=tail, df=n - 1)        # large critical value
    lower_critical = chi2.isf(q=1 - tail, df=n - 1)    # small critical value
    # Dividing by the LARGE critical value yields the LOWER bound.
    return (variance,
            (n - 1) * variance / upper_critical,
            (n - 1) * variance / lower_critical)


def stock_histogram(stock,bins,color,edgecolor, confidence_mean, confidence_variance):
    """Print interval estimates and draw three diagnostic plots for one stock.

    Printed output: the mean log return with its two-sided t confidence
    interval, the sample variance with its chi-square confidence interval,
    and the slope, intercept and R² of an OLS fit of log return against the
    observation index.

    Plots: a histogram of the log returns with the mean and its confidence
    bounds (first figure), followed by a normal probability plot and the
    OLS scatter/fit (second figure).

    Parameters
    ----------
    stock : one of 'BAC', 'WFC', 'C', 'BCS', 'PNC'; any other value is
        ignored and nothing is printed or drawn.
    bins, color, edgecolor : forwarded to ``plt.hist``.
    confidence_mean : confidence level (1 - alpha) for the mean interval.
    confidence_variance : confidence level (1 - alpha) for the variance interval.

    Changes versus the previous implementation
    ------------------------------------------
    * The mean interval is ``mean ± SE * t((1+conf)/2, n-1)``; it used to be
      ``t.ppf(conf/2)`` without the standard error, which is not an interval
      half-width.
    * The variance interval uses the sample variance (ddof=1) and is printed
      as [lower, upper]; the bounds used to be swapped.
    * R² is rounded to 6 digits for every ticker (BCS used 8).
    * The empty trailing figure that every call created is gone.
    """
    if stock not in _HISTOGRAM_PANELS:
        return

    position, histogram_title = _HISTOGRAM_PANELS[stock]
    returns = 1.0 * np.array(symbol_data[position].Returns)
    n = len(returns)
    if n < 2:
        # Neither interval nor the regression is defined for fewer than two points.
        print("Not enough observations to analyse", stock)
        return

    m = np.mean(returns)                                                                            #CI for Mean
    h = _mean_half_width(returns, confidence_mean)
    print ("Mean:",m, "Confidence Interval:[", m-h, m+h, "]")

    var, var_low, var_high = _variance_interval(returns, confidence_variance)                       #CI for Variance
    print ("Variance:",var, "Confidence Interval:[",var_low, var_high, "]")

    plt.figure(figsize=(30,5))
    plt.subplot(131)                                                                                 #histogram
    plt.xlabel("Log Returns")
    plt.ylabel("Frequency")
    plt.title(histogram_title)
    mean_line = plt.axvline(m, color='RED', linestyle='-', linewidth=1)
    handles, labels = [mean_line], ['Mean']
    if np.isfinite(h):
        # A confidence level of exactly 1 gives an infinite half-width,
        # which cannot be drawn.
        upper_line = plt.axvline(m + h, color='k', linestyle='dashed', linewidth=1)
        plt.axvline(m - h, color='k', linestyle='dashed', linewidth=1)
        handles.append(upper_line)
        labels.append('Mean Confidence Interval')
    plt.legend(handles, labels)
    plt.hist(returns, bins = bins, color = color, edgecolor= edgecolor)
    plt.subplots_adjust(left=0.125, bottom=0.1, right=0.9, top=0.9, hspace=0.5, wspace=0.5)

    plt.figure(figsize=(30,5))                                                                       #probability plot
    plt.subplot(132)
    plt.title("Normal Probability Plot")
    fig = probscale.probplot(symbol_data[position].Returns, plottype = 'prob', bestfit=True,line_kws=dict(label='Best-fit line'), probax='y', scatter_kws=dict(label='Returns', alpha = 0.5), problabel='Probabilities', datalabel='Returns')
    plt.legend(loc='best')
    seaborn.despine(fig=fig)

    x = np.arange(n)                                                                                 #Logreturns Regression
    y = returns
    fit = stats.linregress(x, y)
    slope, intercept = fit[0], fit[1]
    fitted = intercept + slope * x
    r2 = round(r2_score(y, fitted), 6)

    plt.subplot(133)                                                                                 #OLS scatter and fitted line
    plt.xlabel("Time in Months")
    plt.ylabel("Log Returns")
    plt.title("OLS Regression")
    plt.scatter(x,y, alpha=0.5, color = "RED")
    plt.ylim(top=0.75,bottom=-1)
    plt.text(20, 0.5, 'R\N{SUPERSCRIPT TWO}: '+ f'{r2}', horizontalalignment = 'center')
    print('\033[94m' "Slope:" '\033[0m',round(slope, 6), '\033[94m' "Intercept:" '\033[0m', round(intercept, 6))
    print('The R\N{SUPERSCRIPT TWO} value is:'  + str(r2))
    plt.plot(x,fitted, c = 'black')
    
    
    
    
    
# Controls for the log-return dashboard: histogram look, ticker choice and
# the two confidence levels.
b1 = widgets.IntSlider(
    min=10,
    max=30,
    value=10,
    description='Bins Slider',
)
color1 = widgets.Dropdown(
    options=["YELLOW", "GREEN", "BLUE"],
    description="Color",
)
edgecolor1 = widgets.Dropdown(
    options=["k"],
    description="Edge Color",
)
data1 = widgets.Dropdown(
    options=['BAC', 'WFC', 'C', 'BCS', 'PNC'],
    description='Stock Symbol',
)
confidence_mean = widgets.FloatSlider(
    min=0,
    max=1,
    value=0.99,
    step=0.05,
    description=r'$\mu:1-\alpha$',
)
confidence_variance = widgets.FloatSlider(
    min=0,
    max=1,
    value=0.99,
    step=0.05,
    description=r'$\sigma^2:1-\alpha$',
)


# Bind every control to the matching argument of stock_histogram. This must
# stay the last expression of the cell so that the notebook displays it.
widgets.interactive(
    stock_histogram,
    stock=data1,
    bins=b1,
    color=color1,
    edgecolor=edgecolor1,
    confidence_mean=confidence_mean,
    confidence_variance=confidence_variance,
)


    

interactive(children=(Dropdown(description='Stock Symbol', options=('BAC', 'WFC', 'C', 'BCS', 'PNC'), value='B…

## LINEAR REGRESSION FOR STOCKS

### RUN THIS CELL TO DISPLAY THE PLOTS

In [5]:
# Ticker symbols in `symbol_data` order, and a lookup from ticker to its frame.
files = ['BAC', 'WFC', 'C','BCS','PNC']
data_dict = {
    ticker: symbol_data[position]
    for position, ticker in enumerate(files)
}
def population_mean(stock1,stock2):
    """Regress the log returns of `stock2` on those of `stock1` and plot them.

    Both return series are cut to the length of the shorter one. The slope,
    intercept and R² of the OLS fit are printed; both series and the fitted
    values are drawn against the observation index.

    NOTE(review): the two series are paired by position, not by date. If the
    files do not start on the same date the pairs may be misaligned - confirm
    against the data.
    """
    first, second = data_dict[stock1], data_dict[stock2]

    # Use only as many observations as the shorter series provides.
    n = min(len(first.Returns), len(second.Returns))
    explanatory = first.Returns[:n]
    response = second.Returns[:n]

    slope, intercept, _r_value, _p_value, _std_err = stats.linregress(explanatory, response)
    print('\033[94m' "Slope:" '\033[0m',round(slope, 6), '\033[94m' "Intercept:" '\033[0m', round(intercept, 6))

    # Fitted response for every observed value of the explanatory series.
    predicted = intercept + slope * explanatory
    observation_index = np.array(list(range(n)))

    plt.figure(figsize=(12,7))
    plt.scatter(observation_index, explanatory, color = 'magenta', alpha=0.4, zorder=0)
    plt.scatter(observation_index, response, color = 'yellow', alpha=0.4, zorder=10)
    plt.plot(observation_index, predicted, c = 'Black', linewidth = 5, zorder=5)

    r2 = round(r2_score(response, predicted), 6)
    plt.ylim(top=0.75,bottom=-1)
    plt.text(30, 0.6, 'R\N{SUPERSCRIPT TWO}: '+ f'{r2}', horizontalalignment = 'center')
    print('\033[94m' 'The R\N{SUPERSCRIPT TWO} value is:' '\033[0m'  + str(r2))
    
    
# b1 = widgets.IntSlider(min = 10, max = 30, value = 10, description='Bins Slider')
# One dropdown per side of the regression; both offer the same tickers.
stock1, stock2 = (
    widgets.Dropdown(options=['BAC', 'WFC', 'C','BCS','PNC'], description=label)
    for label in ("Stock1", "Stock2")
)

# Re-run the regression whenever either dropdown changes. This must stay the
# last expression of the cell so that the notebook displays it.
widgets.interactive(
    population_mean,
    stock1=stock1,
    stock2=stock2,
)

interactive(children=(Dropdown(description='Stock1', options=('BAC', 'WFC', 'C', 'BCS', 'PNC'), value='BAC'), …

## TESTING POPULATION MEANS FOR STOCKS

### RUN THIS CELL TO DISPLAY THE TEST RESULTS

In [6]:
def hypo_test(df1, df2, alpha):
    """Two-sided large-sample test for equal mean log returns of two stocks.

    The two return series are cut to the length of the shorter one and the
    statistic ``T = (mean1 - mean2) / sqrt(s1²/n + s2²/n)`` is compared with
    the standard-normal critical value at significance level `alpha`.

    Parameters
    ----------
    df1, df2 : keys of `data_dict` (ticker symbols).
    alpha : significance level of the two-sided test, e.g. 0.05.

    Prints whether the hypothesis of equal means is kept or rejected.

    Versus the previous implementation, the variances are now the unbiased
    sample variances (ddof=1), and an undefined statistic is reported
    explicitly instead of being printed as "not equal".
    """
    first, second = data_dict[df1], data_dict[df2]

    # Use only as many observations as the shorter series provides.
    n = min(len(first.Returns), len(second.Returns))
    x = first.Returns[:n]
    y = second.Returns[:n]

    denom = np.sqrt(x.var(ddof=1) / n + y.var(ddof=1) / n)
    if n < 2 or not np.isfinite(denom) or denom == 0:
        # Fewer than two observations, or no variation at all: T is undefined.
        print('\033[94m' f'Not enough data to compare the means of {df1} and {df2}' '\033[0m')
        return

    T = (x.mean() - y.mean()) / denom
    Z = st.norm.ppf(1 - (alpha/2))     # two-sided critical value
    if -Z < T < Z:
        print ('\033[94m' f'The mean of {df1} is equal to the mean of {df2}' '\033[0m' )

    else:
        print('\033[94m' f'The mean of {df1} is not equal to the mean of {df2}' '\033[0m')

# Dropdowns selecting the two tickers whose mean log returns are compared.
stock1 = widgets.Dropdown(options =['BAC', 'WFC', 'C','BCS','PNC'], description="Stock1")
stock2 = widgets.Dropdown(options = ['BAC', 'WFC', 'C','BCS','PNC'], description = 'Stock2')

# Significance level of the test. It used to default to 0.99, which is a
# confidence level, not a significance level: at alpha = 0.99 the critical
# value is about 0.0125 and almost every pair is reported as "not equal".
# The endpoints 0 and 1 are excluded because they make the test degenerate
# (never reject / always reject).
alpha =  widgets.FloatSlider(min = 0.01, max = 0.99, value = 0.05, step = 0.01, description = r'$\mu:\alpha$')

# Re-run the hypothesis test whenever a control changes. This must stay the
# last expression of the cell so that the notebook displays it.
widgets.interactive(hypo_test, df1=stock1,df2=stock2, alpha=alpha)
    
# x ='BAC' 
# y = 'C'
# hypo_test(x, y, 0.95) 

interactive(children=(Dropdown(description='Stock1', options=('BAC', 'WFC', 'C', 'BCS', 'PNC'), value='BAC'), …

## TESTING THE HALLOWEEN EFFECT

### RUN THIS CELL TO DISPLAY THE PLOT & THE RESULTS

In [7]:
# Keys used to select a daily data set, and the files they are read from
# (both lists are in the same order).
files_daily = ['BAC_daily', 'WFC_daily', 'C_daily','BCS_daily','PNC_daily']
list_daily = ['BAC_daily.csv', 'WFC_daily.csv','C_daily.csv', 'BCS_daily.csv', 'PNC_daily.csv']

sym_data_daily = []    # One DataFrame of daily prices per file
pattern = r'[A-Z]+'
for csv_name in list_daily:
    # 'BAC_daily.csv' -> ['BAC']: the last upper-case run is the ticker.
    ticker = re.findall(pattern, csv_name)[-1]
    frame = pd.DataFrame(pd.read_csv(csv_name))

    # Tag every row with the ticker the file belongs to.
    frame['Symbol'] = [ticker] * frame.shape[0]
    sym_data_daily.append(frame)


# Lookup from data-set key (e.g. 'BAC_daily') to its DataFrame.
daily_data_dict = {
    key: sym_data_daily[position]
    for position, key in enumerate(files_daily)
}
# print(daily_data_dict)

def _pick_window_dates(trading_days, month, first_day):
    """Collect the first listed day inside each three-day window.

    Scans `trading_days` for dates in `month` whose day is `first_day`,
    `first_day + 1` or `first_day + 2`. When a match is found, the entries
    that follow it are removed from the list (two after a first-day match,
    one after a second-day match) so that the next day or two of the same
    window cannot be picked as well.

    `trading_days` is modified in place. Index 0 and the last 100 entries
    are never examined.
    NOTE(review): the 100-entry margin presumably leaves head-room for the
    in-place deletions - confirm.
    """
    picked = []
    for i in range(1, len(trading_days) - 100):
        day = trading_days[i]
        if day.month != month:
            continue
        offset = day.day - first_day
        if offset == 0:
            picked.append(day)
            del trading_days[i+1:i+3]
        elif offset == 1:
            picked.append(day)
            del trading_days[i+1]
        elif offset == 2:
            picked.append(day)
    return picked


def hall_effect(df):
    """Test and plot the "Halloween effect" for one daily data set.

    For every year the closing price on the first listed day in the 1-3 May
    window and in the 15-17 September window is selected. The seasonal log
    return is ``log(May close of the next year) - log(September close)``.
    A one-sided t-test (H0: mean return >= 0, H1: mean return < 0, 2.5 %
    level) is printed, and the returns are plotted per year together with a
    zero line.

    Parameters
    ----------
    df : key of `daily_data_dict`, e.g. 'BAC_daily'.

    Assumptions taken over from the original code:
    * the first column of the frame is 'Date', formatted as YYYY-MM-DD;
    * NOTE(review): rows are presumably in chronological order - the pairing
      of May and September prices relies on it; confirm against the files.

    Versus the previous implementation, the Year column is now sorted (it
    used to come from an unordered `set`), and the test statistic uses the
    number of actual returns and their sample standard deviation (the 'NA'
    placeholder used to be counted in `n`).
    """
    x = daily_data_dict[df]
    dates_ = [parse(i) for i in x['Date'].tolist()]

    may_dates = _pick_window_dates(dates_, 5, 1)
    # Sorted so that each year lines up with the May price of that year.
    years = sorted({date.year for date in may_dates})
    september_dates = _pick_window_dates(dates_, 9, 15)

    # Keep only the rows for the selected May / September dates.
    hall_str = [date.strftime("%Y-%m-%d") for date in may_dates + september_dates]
    x = x[x["Date"].isin(hall_str)]

    # The series must start with a May price; drop a leading September row.
    a = x.iloc[0,0]
    if a[6] == '9':
        x = x[1:x.shape[0]]

    # Rows now alternate May, September, May, September, ...
    closing_prices = x["Close"].tolist()
    may_price = closing_prices[0::2]
    hall_price = closing_prices[1::2]

    ndf = pd.DataFrame()
    ndf['Year'] = years
    ndf['May Price'] = may_price
    ndf['Halloween Price'] = hall_price

    # Return from September of one year to May of the following year.
    # The first year has no preceding September, hence the placeholder.
    season_returns = [np.log(next_may) - np.log(september)
                      for september, next_may in zip(hall_price, may_price[1:])]
    ndf['Returns'] = ['NA'] + season_returns

    # One-sided t-test.  H0: mu >= 0   H1: mu < 0
    n = len(season_returns)
    std = np.std(season_returns, ddof=1)
    mean = np.mean(season_returns)
    T = mean/std * np.sqrt(n)
    critical = stats.t.ppf(1-0.025, n-1)

    if T< -critical:
        print ( '\033[94m' 'For the given stock, Halloween Effect Hypothesis can be Rejected.\n' '\033[0m')
    else:
        print ( '\033[94m' 'For the given dataset, Halloween Effect Hypothesis holds true.\n' '\033[0m')

    plot_years = ndf['Year'][1:]
    fig, ax = plt.subplots(figsize = (15,6))
    plt.plot(plot_years, season_returns)
    plt.plot(plot_years, [0] * len(season_returns))     # zero-return reference line
    plt.xlabel('Year')
    plt.ylabel('Log-Return')
    plt.title('Halloween Effect')
    plt.xticks(ndf['Year'], rotation = 'vertical')
    plt.grid(True)
    plt.show()


# Dropdown selecting which daily data set is analysed.
stock1 = widgets.Dropdown(
    options=['BAC_daily', 'WFC_daily', 'C_daily','BCS_daily','PNC_daily'],
    description="Stock1",
)


# Re-run the Halloween-effect analysis whenever the selection changes. This
# must stay the last expression of the cell so that the notebook displays it.
widgets.interactive(
    hall_effect,
    df=stock1,
)
       


interactive(children=(Dropdown(description='Stock1', options=('BAC_daily', 'WFC_daily', 'C_daily', 'BCS_daily'…