In [1]:
from iexfinance.stocks import Stock
import pandas as pd
import numpy as np
from pathlib import Path
from newsapi.newsapi_client import NewsApiClient
import ipywidgets as widgets
from IPython.display import display
from datetime import date, datetime, timedelta
import os 
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()
from iexfinance.stocks import get_historical_data
import pickle

In [2]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/Devin/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [25]:
def create_df(ticker, lag=0):
    
    sp500_csv = Path("Data/sp500_constituents.csv")
    sp500_df = pd.read_csv(sp500_csv)
    sp500_df['Ticker'] = sp500_df['Symbol']
    sp500_df.drop(columns='Sector',inplace=True)
    sp500_df.set_index('Ticker', inplace=True)
    
    stock_dict = sp500_df.T.to_dict('list')
    
    from datetime import date, datetime, timedelta
    newsapi = NewsApiClient(api_key=os.environ["NEWS_API_ID"])
    pickle.dump(newsapi,open('newsapi.pickle','wb'))
    current_date = date.today()
    past_date = date.today() - timedelta(days=30)
    
    def get_headlines(keyword):
        all_headlines = []  #empty list for all headlines
        all_dates = [] #empyt list for all dates
        date = current_date #today's date
        #print(f"Fetching news about '{keyword}'") #prints single string of function starting
        #print("*" * 30) #creating a line of stars for readability
        while date > past_date: #establishes length of dates being pulled by for the 
            #lenght of the difference between today and past_date 
            #print(f"retrieving news from: {date}")#printing string for loop readability  
            articles = newsapi.get_everything(   #pulling articles through API
                q=keyword,
                from_param=str(date),
                to=str(date),
                language="en",
                sort_by="relevancy",
                page=1,
            )
            headlines = []
            for i in range(0, len(articles["articles"])):

                #pulling the content part of the dict
                headlines.append(articles["articles"][i]["content"])
            all_headlines.append(headlines)
            all_dates.append(date)
            date = date - timedelta(days=1) #moving through the days            
        return all_headlines, all_dates
    
    headlines, dates = get_headlines(stock_dict[ticker][1])
    
    df = pd.DataFrame(headlines)
    df.index = pd.to_datetime(dates)
    
    concatinated_list = []
    i = 0
    while i < len(df):
        date = df.index[i]
        daily_data = df.iloc[i,:].dropna().to_list()
        daily_data = "".join(daily_data)
        concatinated_list.append({"Date":date,
                                  "articles":daily_data
                                  })
        i = i + 1
        
    df = pd.DataFrame(concatinated_list)
    
    def get_sentiment(dataframe):
    # list of sentiment objects observed
        df_sentiments = []
        i = 0 
        # for rows in df: # the next time you put a for-loop outside of a try fn 
        # there will be a muckduck
        # for loops are used for interating through one object ie. for every item in list 
        # while loops can be used for interating through multiple objects on the same 
        # index ie. 1st item in ...for 2nd item in... for 3rd item in.. 


        while i < len(dataframe["articles"]): # if we did len of df, then it would be 
            # the length of columns by doing len of df[col], then its length of rows

            # to get values for the sentiment parameters
            text = dataframe["articles"][i] 
            date = dataframe["Date"][i]

            # activate sentiment analysis
            sentiment = analyzer.polarity_scores(text)  ## Attribute Error is applied for 
                                                        ## NoneTypes in 
            compound = sentiment["compound"]
            pos = sentiment["pos"]
            neu = sentiment["neu"]
            neg = sentiment["neg"]

            # append results of sentiment analysis per row of sentiment parameters df
            df_sentiments.append({
                    "text": text,
                    "date": date,
                    "compound": compound,
                    "positive": pos,
                    "negative": neg,
                    "neutral": neu
                })
            i += 1

        # Create DataFrame
        final_df = pd.DataFrame(df_sentiments)

        # Reorder DataFrame columns
        cols = ["date", "text", "compound", "positive", "negative", "neutral"]
        final_list = final_df[cols]

        return final_list

    df = get_sentiment(df)
    df = df.set_index('date')
    df = df.sort_values(by='date',ascending=True)

    # setting start and end date for the past four weeks
    # 29 days needed instead of 28 days so that we get 28 days of return when we calculate
    end_date_stock = datetime.now()
    start_date_stock = end_date_stock + timedelta(-31)

    # getting data from the API and adding to DataFrame
    returns_df = get_historical_data(ticker, start_date_stock, end_date_stock,  
                                     output_format='pandas')
    returns_df.drop(columns=['open','high','low','volume'],inplace=True)
    returns_df = returns_df.pct_change() * 100
    returns_df.dropna(inplace=True)
    returns_df.isnull().sum()
    returns_df.rename(columns={'close':'return'}, inplace=True)
    
    combined_df = pd.concat([df, returns_df], axis=1)

    # Function takes in an NLP/stock returns dataframe and calculates the average polarity
    # scores over non-trading days and replaces scores of the first trading day after
    # non-trading days with average score of previous days.
    def cleaned_df(dataframe):
        compound = []
        positive = []
        negative = []
        neutral = []

        for index, row in dataframe.iterrows():

            # If daily return is null value for a given day, append polarity scores to their
            # respective lists.
            if pd.isnull(row['return']):
                compound.append(row['compound'])
                positive.append(row['positive'])
                negative.append(row['negative'])
                neutral.append(row['neutral'])
                dataframe.drop(index=index, inplace=True)

            elif pd.notnull(row['return']):
                
                # The list of compound polarity scores will be empty if the stock was traded
                # on the previous day; therefore, move along.
                if len(compound) == 0:
                    pass

                # If the list is not empty, then at least one day prior was a non-trading 
                # day. Append the current day's scores to the list and calculate the mean 
                # for each score. Then replace the current day's polarity scores with the 
                # average scores of today and previous non-trading days.
                else:
                    compound.append(row['compound'])
                    compound_mean = np.mean(compound)
                    compound = []

                    positive.append(row['positive'])
                    positive_mean = np.mean(positive)
                    positive = []

                    negative.append(row['negative'])
                    negative_mean = np.mean(negative)
                    negative = []

                    neutral.append(row['neutral'])
                    neutral_mean = np.mean(neutral)
                    neutral = []

                    dataframe.at[index, 'compound'] = compound_mean
                    dataframe.at[index, 'positive'] = positive_mean
                    dataframe.at[index, 'negative'] = negative_mean
                    dataframe.at[index, 'neutral'] = neutral_mean

            else:
                pass

        return dataframe.sort_index(ascending=True)
    
    # Shift the return column up to adjust for a lag in stock reaction to sentiments.
    final_df = cleaned_df(combined_df)
    final_df['return'] = final_df['return'].shift(-lag)
    final_df.dropna(inplace=True)
    
    return final_df

In [28]:
amzn_df = create_df('AMZN', 1)

In [29]:
amzn_df.head(10)

Unnamed: 0_level_0,text,compound,positive,negative,neutral,return
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-03-17,Amazon.com Inc. is prioritizing the stocking a...,0.9368,0.071,0.053,0.876,1.225772
2020-03-18,Markets are smoother with a buffer.\r\nFinanci...,-0.4588,0.077,0.076,0.847,2.78306
2020-03-19,Whole Foods Market on Wednesday joined a growi...,0.9917,0.089,0.026,0.885,-1.852275
2020-03-20,(Reuters) - A Whole Foods Market employee work...,0.9705,0.072,0.047,0.881,3.073523
2020-03-23,WASHINGTON (Reuters) - Walmart Inc said on Mon...,0.3549,0.063,0.066333,0.870333,1.958662
2020-03-24,Wall Street is seeing some green shoots on Tue...,0.9625,0.066,0.04,0.893,-2.796763
2020-03-25,Photographer: Joshua Lott/Bloomberg\r\nAttorne...,-0.9871,0.041,0.089,0.869,3.693314
2020-03-26,(Reuters) - Ford Motor Co (F.N) and General Mo...,0.9849,0.077,0.035,0.889,-2.832538
2020-03-27,(Reuters) - Amazon.com Inc (AMZN.O) is at the ...,0.8519,0.074,0.061,0.865,3.360349
2020-03-30,"LOS ANGELES (Reuters) - Kroger Co, Walmart Inc...",0.022367,0.048333,0.075667,0.876,-0.72456


In [3]:
# Just lag function
def df_lag(df, lag=0):
    df['return'] = df['return'].shift(-lag)
    df.dropna(inplace=True)
    
    return final_df