In [None]:
import pandas as pd
import numpy as np
import datetime as dt
import yfinance as yf
import plotly.express as px

# Data Exploration

In [None]:
data_1 = pd.read_csv("data/daily_financial_news/analyst_ratings_processed.csv", index_col=0)
data_1.head()

In [None]:
data_1['date'] = data_1['date'].str.split(' ', expand=True).iloc[:, 0]

In [None]:
data_1.head()

In [None]:
data_1['stock'].nunique()

In [None]:
data_1.info()

## Drop Null Values

In [None]:
print(data_1.isna().sum())

In [None]:
print(data_1.shape)

In [None]:
data_1[data_1['date'].isna()]

In [None]:
data_1.dropna(subset=['date'],inplace=True)
data_1.dropna(subset=['stock'],inplace=True)

In [None]:
print(data_1.shape)

In [None]:
print(data_1.isna().sum())

In [None]:
data_1.info()

## Filter stocks

In [None]:
# Filter dataset down to stocks with top 100 number of headlines
top_100_stocks_by_headlines = data_1.groupby('stock').size().reset_index(name='size').sort_values('size', ascending=False).reset_index(drop=True).iloc[:100]
top_100_stocks_by_headlines.head()

In [None]:
data_1 = data_1[data_1['stock'].isin(top_100_stocks_by_headlines.stock)]

In [None]:
data_1.shape

## Clean Text Data

In [None]:
data_1['title'] = data_1['title'].str.lower()

In [None]:
data_1.head()

### Remove Punctuations

In [None]:
import string

def remove_punctuation(text):
    return ''.join([char for char in text if char not in string.punctuation and not char.isdigit()])

data_1['title'] = data_1['title'].apply(remove_punctuation)

In [None]:
data_1.head()

In [None]:
data_1['tokens'] = data_1['title'].apply(lambda x: x.split())
data_1.head()

### Remove stop words

In [None]:
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

stop = stopwords.words('english')

data_1['tokens'] = data_1['tokens'].apply(lambda x: [word for word in x if word not in stop])

In [None]:
data_1.shape

### Lemmatize tokens

In [None]:
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

data_1['tokens'] = data_1['tokens'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])


In [None]:
data_1.head()

In [None]:
data_1['preprocessed_text'] = data_1['tokens'].apply(' '.join)
data_1.head()

In [None]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()
# NLTK Sentiment Intensity Analyzer uses a 'Bag of Words' approach
# it removes stop words and scores each word individually before compounding

In [None]:
data_duplicate = data_1.copy()
data_duplicate['sentiment_score'] = data_duplicate['preprocessed_text'].apply(lambda x: sia.polarity_scores(x)['compound'])
data_duplicate

In [None]:
data_duplicate['sentiment_score'].describe()

In [None]:
data_duplicate.info()

# Pull yfinance data

In [95]:
tickers = data_duplicate.stock.unique()

In [100]:
start_date, end_date = data_duplicate.date.sort_values().iloc[0], data_duplicate.date.sort_values().iloc[-1]

In [105]:
data = yf.download(list(tickers), start=start_date, end=end_date)

[*********************100%%**********************]  100 of 100 completed


13 Failed downloads:
['AGN', 'JCP', 'ZNGA', 'MON', 'APC', 'BBBY', 'MYL']: Exception('%ticker%: No timezone found, symbol may be delisted')
['PCLN', 'LNKD', 'BBRY', 'GMCR']: Exception('%ticker%: No price data found, symbol may be delisted (1d 2009-04-29 -> 2020-06-11)')
['CHK', 'SHLD']: Exception("%ticker%: Data doesn't exist for startDate = 1240977600, endDate = 1591848000")





In [106]:
data

Unnamed: 0_level_0,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,...,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume
Unnamed: 0_level_1,A,ADBE,AGN,AIG,APC,ATVI,AVGO,AXP,AZN,BABA,...,URBN,VMW,VZ,WDC,WFC,XLF,XRT,YELP,YUM,ZNGA
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2009-04-29,11.629640,26.750000,,18.591227,,9.411404,,20.127361,10.166079,,...,9728600,1814000,21459256,4270400,142750100,192158977,26430400,,7280216,
2009-04-30,11.777991,27.350000,,18.195669,,9.446487,,20.345181,9.726615,,...,9012600,2106300,23924686,7831600,115836200,215537513,29959000,,7283276,
2009-05-01,12.132751,27.459999,,18.195669,,9.507887,,19.594938,9.782243,,...,6598900,1435300,13002285,4485000,96730100,137526705,32675200,,6228203,
2009-05-04,12.526214,26.690001,,19.250494,,9.692082,,22.006989,10.010321,,...,5050000,2687900,16180380,4740400,294065000,260737864,31001000,,10258764,
2009-05-05,12.081153,26.889999,,22.810509,,9.621910,,21.434231,9.887935,,...,5925100,3689100,17482986,4889500,175576000,200012388,17197400,,8831181,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-06-04,89.064148,385.799988,,31.996899,,68.326012,277.644440,101.979034,49.753910,218.039993,...,2600700,1318100,16147300,6973000,61839200,97575100,5712900,1081200.0,2537000,
2020-06-05,88.321449,392.899994,,33.041962,,67.954094,285.005951,105.032455,49.735439,219.550003,...,1548000,1531100,17105600,7708200,107651200,135859300,3879200,2572200.0,3722800,
2020-06-08,88.233482,397.779999,,35.840897,,69.226448,285.401489,108.803780,48.562477,219.000000,...,1629800,1357900,14963000,5376700,69051300,80298400,4900800,1194200.0,2748000,
2020-06-09,87.725349,397.160004,,34.296036,,70.469452,283.424072,105.731216,48.701019,220.720001,...,2361900,1193800,14530800,4124000,51480400,92833100,7829400,1653800.0,2063200,
