## Overview: 

### This notebook scrapes the list of S&P 500 component companies from Wikipedia using Beautiful Soup, downloads their daily prices from Yahoo Finance, and produces several transformations of the stock data.

I. Get tickers and data of component companies in S&P500

1. Scrape the list of component companies in the S&P 500 from https://en.wikipedia.org/wiki/List_of_S%26P_500_companies
2. Obtain the sector information of components and store in a data frame.
3. Get daily prices of components from Yahoo Finance and calculate their daily price changes.  

Reference:   
http://www.thealgoengineer.com/2014/download_sp500_data/  
http://stackoverflow.com/questions/28174193/add-new-column-based-on-a-list-and-sort-date-by-newest/28210920#28210920

### Import of packages

In [1]:
import pandas as pd
import numpy as np
import requests
import requests_cache
requests_cache.install_cache('cache')

from bs4 import BeautifulSoup
from pandas import DataFrame
from yahoo_finance import Share

### Web scraping of the S&P 500 company tickers and sectors from the Wikipedia page using Beautiful Soup

In [2]:
response = requests.get('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
response.raise_for_status()  # stop here rather than parse an HTTP error page
soup = BeautifulSoup(response.content, 'html.parser')

# Read ticker and sector from the SAME table row so the two lists cannot drift
# out of alignment and no hard-coded row count is needed.
# NOTE(review): assumes the constituents are the first "wikitable" on the page
# and that every data row holds the ticker in its first <td> -- confirm against
# the live page layout.
constituents_table = soup.find('table', {'class': 'wikitable'})
if constituents_table is None:
    raise ValueError('Could not find the constituents table on the Wikipedia page')
table_rows = constituents_table.find_all('tr')
if not table_rows:
    raise ValueError('The constituents table on the Wikipedia page has no rows')

# Locate the sector column by its header text; fall back to the fourth column,
# which is the position the previous fixed-offset lookup relied on.
header_labels = [th.get_text(strip=True) for th in table_rows[0].find_all('th')]
sector_col = header_labels.index('GICS Sector') if 'GICS Sector' in header_labels else 3

tickers = []
sectors = []
for row in table_rows[1:]:
    cells = row.find_all('td')
    if len(cells) <= sector_col:
        continue  # not a data row (e.g. a nested header)
    symbol = cells[0].get_text(strip=True)
    # tickers with . inside names cannot be searched in Yahoo finance
    tickers.append(str(symbol.replace('.', '-')))
    sectors.append(str(cells[sector_col].get_text(strip=True)))

# Build the data frame of components information
sp500 = pd.DataFrame(data=list(zip(tickers, sectors)), columns=['tickers', 'sectors'])
sp500.to_csv('sp500.csv')

# Set up a dictionary of ticker -> sector for convenient lookups later on
sp500_dic = dict(zip(tickers, sectors))

### Download of data from Yahoo finance 

In [3]:
# Function to retrieve the daily close price for a specific stock from Yahoo Finance
def get_stockprice(ticker, start_date, end_date):
    """
    Fetch the daily adjusted close prices of a single stock from Yahoo Finance.

    Args:
    ticker(string): stock symbol of a company.
    start_date, end_date(string): bounds of the period, formatted as 'yyyy-mm-dd'.

    Returns:
    A numeric Series named after the ticker and indexed by the 'Date' values
    of the downloaded records.
    """
    quotes = pd.DataFrame(Share(ticker).get_historical(start_date, end_date))
    # Index by date (the 'Date' column itself is kept) and label the adjusted
    # close column with the ticker symbol.
    quotes = quotes.set_index(quotes['Date']).rename(columns={'Adj_Close': ticker})
    # The price column is converted to a numeric dtype before it is returned.
    return pd.to_numeric(quotes[ticker])

In [6]:
# Function to calculate the changes in price
def price_change(df):
    """
    Return the data frame of log price changes between consecutive rows.

    For every row i of df except the last one, the change is
    log(df.iloc[i]) - log(df.iloc[i+1]), computed for all columns at once
    instead of concatenating one row at a time.

    Arg:
    df(data frame): the daily price data for stocks, one row per date and
        one column per stock.

    Returns:
    A data frame with one row per column of df (the stocks) and one column
    per index label of df except the last (the dates). It has no columns
    when df has fewer than two rows.
    """
    log_prices = np.log(df)
    # diff(periods=-1) subtracts the following row from each row; the last row
    # has no successor, so it is dropped. The transpose gives one row per stock.
    return log_prices.diff(periods=-1).iloc[:-1].T

In [4]:
years = ['2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016']

start_dates = [y + '-01-01' for y in years]
end_dates = [y + '-12-31' for y in years]

# For every year: download the prices of all constituents, then derive the
# price changes and their correlation matrix, writing each result to a CSV file.
for year, start_date, end_date in zip(years, start_dates, end_dates):

    # Get prices for all constituents in the current year. The date range must
    # follow the loop variable; with a fixed range every yearly file would hold
    # the same data.
    yearly_prices = []
    for ticker in sp500['tickers']:
        try:
            yearly_prices.append(get_stockprice(ticker, start_date, end_date))
        except Exception as exc:
            # Best effort: report the ticker that could not be downloaded and
            # carry on with the rest (KeyboardInterrupt is no longer swallowed).
            print('{0}: {1}'.format(ticker, exc))

    # Concatenate once instead of growing the frame inside the loop.
    df = pd.concat(yearly_prices, axis=1) if yearly_prices else pd.DataFrame()

    # Get stock data for that year
    df.to_csv('price_' + year + '.csv')

    # Get stock changes
    change = price_change(df).transpose()
    change.to_csv('change_' + year + '.csv')

    # Calculate the correlation matrix of price changes
    corr = change.corr()
    corr.to_csv('corr_' + year + '.csv')




2008
2009
2010
2011
2012
2013
2014
2015
2016
