In [1]:
# General imports
import numpy as np
import os
import pandas as pd

In [2]:
# This is the config file in the same directory as the analysis script
import config

# Parameters

In [34]:
# Notebook-level defaults.
# Any key that also appears in the config is overwritten by the config's value.
pm = dict(
    # Upper limit on the number of articles requested from the API
    n_articles=100000,

    # Relevance thresholds: anything below these is discarded
    required_rank=5,
    required_mentions=0,

    # Article fields that get stitched together into the analysed text
    text_columns=[
        'abstract',
        'lead_paragraph',
        'snippet',
        'headline.main',
    ],
)

In [35]:
# Update with global parameters from config.py. dict.update overwrites any key
# defined above that also appears in config.pm, and adds every other key.
# NOTE(review): later cells read pm['organizations'], pm['start_date'],
# pm['end_date'], pm['keyword'], pm['start_date_test'] and pm['ticker'], none of
# which are set in this notebook — presumably they come from config.pm; confirm.
pm.update( config.pm )

# New York Times data

In [5]:
from pynytimes import NYTAPI

## Retrieve data

In [6]:
# Create the NYT API client. The key is read from the NYTIMES_KEY environment
# variable; os.environ.get returns None when that variable is not set.
nytapi = NYTAPI( os.environ.get( 'NYTIMES_KEY' ), parse_dates=True )

In [7]:
# Assemble the filter query for the API: one organizations:("...") clause per
# tracked organization, OR-ed together
filter_query_prompt = ' OR '.join(
    'organizations:("{}")'.format( organization )
    for organization in pm['organizations']
)

In [8]:
# Fetch every article matching the organization filter inside the configured
# date window, capped at pm['n_articles'] results
search_dates = { 'begin': pm['start_date'], 'end': pm['end_date'] }
search_options = { 'fq': filter_query_prompt }
all_results = nytapi.article_search(
    query='',
    results=pm['n_articles'],
    dates=search_dates,
    options=search_options,
)



In [9]:
# Report how many articles the search returned
print( f'Retrieved {len( all_results )} results' )

Retrieved 639 results


In [10]:
# Keep only the articles tagged with one of our organizations at a prominent
# enough keyword rank, and remember each kept article's best rank.
results = []
ranks = []
for article in all_results:

    # Ranks of every keyword that names a tracked organization and meets the
    # rank threshold
    relevant_ranks = [
        kw['rank']
        for kw in article['keywords']
        if kw['value'] in pm['organizations']
        and kw['name'] == 'organizations'
        and kw['rank'] <= pm['required_rank']
    ]

    # No qualifying keyword: the article is discarded
    if not relevant_ranks:
        continue

    # The lowest rank is the most relevant one
    results.append( article )
    ranks.append( min( relevant_ranks ) )

In [11]:
# Report how many articles survived the rank filter
print( f'Filtered down to {len( results )} retrieved results' )

Filtered down to 270 retrieved results


In [12]:
# One empty list per output column: the fixed metadata fields first, followed
# by the configured text columns
metadata_columns = [ 'pub_date', 'word_count', 'type_of_material', '_id' ]
nyt_data = {
    column: []
    for column in metadata_columns + list( pm['text_columns'] )
}

In [13]:
# Collect: copy each wanted field of every kept article into its column list.
# Note that this appends to the existing lists, so re-running this cell without
# first re-creating nyt_data duplicates every row.
for result in results:
    for column, column_values in nyt_data.items():

        # A dotted column name such as 'headline.main' is a path into nested
        # dictionaries; walk it one key at a time (any depth is supported).
        column_val = result
        for key in column.split( '.' ):
            column_val = column_val[key]

        # Store
        column_values.append( column_val )

In [14]:
# Turn the column lists into a dataframe: one row per kept article, one column
# per key of nyt_data
nyt = pd.DataFrame( nyt_data )

In [15]:
# Collect the full string: append a space to every text column, then sum the
# columns along axis=1 so each row's strings are concatenated into one field.
# NOTE(review): assumes every text column holds a string in every row — confirm
# how missing values (None/NaN) returned by the API behave here.
nyt['text'] = ( nyt[pm['text_columns']] + ' ' ).sum( axis=1 )

In [16]:
# Store relevancy: `ranks` was filled in step with `results`, so entry i is the
# best organization-keyword rank of row i
nyt['relevance_rank'] = ranks

In [17]:
# Show the assembled dataframe (the rendered output is truncated by pandas)
nyt

Unnamed: 0,pub_date,word_count,type_of_material,_id,abstract,lead_paragraph,snippet,headline.main,text,relevance_rank
0,2020-10-15 20:10:10+00:00,438,News,nyt://article/02669a7b-5dfa-5c38-92bc-10ebcc39...,Facebook and Twitter clamped down on an unsubs...,In all the uproar over how tech companies have...,Facebook and Twitter clamped down on an unsubs...,"In Hubbub Over New York Post Report, YouTube S...",Facebook and Twitter clamped down on an unsubs...,4
1,2021-08-31 17:07:21+00:00,191,News,nyt://article/026a2f96-48f3-5896-916e-badc358d...,It is the latest company to push back plans fo...,Google is pushing back its return-to-office da...,It is the latest company to push back plans fo...,Google delays its return to office until January.,It is the latest company to push back plans fo...,4
2,2021-04-12 16:50:18+00:00,607,Letter,nyt://article/02b7abdf-e925-5941-b577-9c2737ba...,Readers discuss formative experiences in the w...,To the Editor:,Readers discuss formative experiences in the w...,My Job and I: Is This a Love Letter?,Readers discuss formative experiences in the w...,1
3,2020-10-21 11:56:39+00:00,2057,News,nyt://article/0322b0d5-4d0a-50bf-93f5-2fe37e9d...,What you need to know about the antitrust case...,"This Nov. 17 and 18, DealBook opens its doors ...",What you need to know about the antitrust case...,What Did Google Do?,What you need to know about the antitrust case...,2
4,2020-06-04 22:04:38+00:00,1268,News,nyt://article/044a14cb-41a2-54a5-aef6-273f5b33...,"To placate European regulators, the company st...","OAKLAND, Calif. — For the last few months, som...","To placate European regulators, the company st...",Google’s European Search Menu Draws Interest o...,"To placate European regulators, the company st...",1
...,...,...,...,...,...,...,...,...,...,...
265,2020-07-29 20:45:40+00:00,0,Video,nyt://video/986e21d2-1b3b-5fc5-9f3c-fa9509244c78,"Sundar Pichai, Google’s chief executive, faced...","Sundar Pichai, Google’s chief executive, faced...","Sundar Pichai, Google’s chief executive, faced...",‘Very Easy For Users to Be In Control of Their...,"Sundar Pichai, Google’s chief executive, faced...",3
266,2020-11-17 14:29:45+00:00,0,Video,nyt://video/99e855a0-bb31-5542-8cfd-305d00cd34c1,"The executives, who have now testified several...","The executives, who have now testified several...","The executives, who have now testified several...",Watch Full Video: Tech C.E.O.’s Testify Before...,"The executives, who have now testified several...",3
267,2020-10-28 18:44:42+00:00,0,Video,nyt://video/a3d76cb1-e166-5616-b6db-8d49b14a1a4c,"On Wednesday, the chief executives of Facebook...","On Wednesday, the chief executives of Facebook...","On Wednesday, the chief executives of Facebook...","Facebook, Google and Twitter C.E.O.s Testify a...","On Wednesday, the chief executives of Facebook...",2
268,2020-07-29 15:16:19+00:00,0,Video,nyt://video/be70fdbe-80fb-5683-9758-8ef052cf7dbb,"Jeff Bezos of Amazon, Tim Cook of Apple, Mark ...","Jeff Bezos of Amazon, Tim Cook of Apple, Mark ...","Jeff Bezos of Amazon, Tim Cook of Apple, Mark ...",Watch Live: Tech C.E.O.s Testify Before Congress,"Jeff Bezos of Amazon, Tim Cook of Apple, Mark ...",3


## Filter data
Some articles don't mention Google *enough* times in their compiled text. This portion of the notebook removes the rows for those articles.

We need to import TextBlob in the cell below in order to use its `word_counts` attribute.

In [18]:
from textblob import TextBlob

In [19]:
import nltk
# Fetch the NLTK 'punkt' tokenizer data; the recorded output shows this does
# nothing further when the package is already up to date.
# NOTE(review): presumably required by TextBlob's tokenization — confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/zhafen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [20]:
# Count keyword mentions in each article's compiled text and flag the rows
# that fall short of pm['required_mentions'].
# Iterating over (index label, text) pairs rather than positions 0..n-1 keeps
# the flagged labels valid even when the index is not a contiguous range.
# NOTE(review): TextBlob appears to lower-case the words in word_counts, so
# pm['keyword'] presumably has to be lower-case to match anything — confirm.
inds_to_drop = []
keyword_counts = []
for index_label, text in nyt['text'].items():
    keyword_count = TextBlob( text ).word_counts[pm['keyword']]
    keyword_counts.append( keyword_count )
    if keyword_count < pm['required_mentions']:
        inds_to_drop.append( index_label )
nyt['keyword_counts'] = keyword_counts

In [21]:
# Drop articles (rows) with an insufficient number of keyword mentions.
# Rows are removed in place by index label; the remaining labels are not
# renumbered, so the index can have gaps from here on.
nyt.drop( inds_to_drop, inplace=True )

## Sentiment analysis


### TextBlob
Here we use TextBlob as our sentiment analysis tool. We take the `text` column of the dataframe and compute both polarity and subjectivity for each article. At the end, we add these scores to the main dataframe as new columns.

In [22]:
from textblob import TextBlob

In [23]:
# Score every article's compiled text with TextBlob.
# Iterate over the values directly: looking rows up as nyt['text'][i] for
# i in 0..n-1 is label-based and raises KeyError once rows have been dropped
# and the index has gaps.
pol_vec = []
subj_vec = []
for text in nyt['text']:
    # Evaluate the sentiment once and read both scores from it
    sentiment = TextBlob( text ).sentiment
    pol_vec.append( sentiment.polarity )
    subj_vec.append( sentiment.subjectivity )
    
    

In [24]:
# Gather the TextBlob scores into a dataframe that shares nyt's index.
# Without index=nyt.index the frame gets a fresh 0..n-1 index, and a later
# column-wise concat (which aligns on index labels) would pair scores with the
# wrong rows whenever nyt's index has gaps.
d = {'polarity': pol_vec, 'subjectivity': subj_vec}
t = pd.DataFrame(data=d, index=nyt.index)

In [25]:
# Preview the TextBlob polarity and subjectivity scores
display(t)

Unnamed: 0,polarity,subjectivity
0,0.032983,0.455107
1,0.228000,0.482000
2,0.500000,0.600000
3,0.306250,0.682292
4,-0.023580,0.137027
...,...,...
265,0.563333,1.000000
266,0.075909,0.266364
267,0.400000,0.800000
268,0.136364,0.500000


In [26]:
# Attach the TextBlob scores as new columns. With axis=1, pd.concat lines rows
# up by index label, so `t` needs to carry the same index as `nyt`.
nyt = pd.concat([nyt,t], axis=1)

### distilRoberta-financial-sentiment
See https://huggingface.co/mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis

In [None]:

## Load the tokenizer and the classification model for the financial-news checkpoint

from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Both objects are loaded from the same pretrained checkpoint
fin_checkpoint = "mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis"

tokenizer_fin = AutoTokenizer.from_pretrained( fin_checkpoint )
model_fin = AutoModelForSequenceClassification.from_pretrained( fin_checkpoint )

In [None]:
# Sentiment analysis pipeline with the new model and tokenizer
from transformers import pipeline

# Wrap the fine-tuned model and its tokenizer in a callable "sentiment-analysis" pipeline
analyzer_fin = pipeline("sentiment-analysis", model= model_fin , tokenizer = tokenizer_fin)

In [None]:
## Apply the analyzer_fin pipeline to the text of each article and record the score in a new column

# Run the pipeline on every article's compiled text and keep the 'score' field
# of its first prediction. The pipeline is named analyzer_fin; no plain
# `analyzer` is defined in this notebook.
# NOTE(review): the predicted label is discarded here, so the stored number is
# presumably the model's confidence in whichever label it chose rather than a
# signed sentiment value — confirm this is what downstream analysis expects.
sentiment_scores = [
    analyzer_fin( text )[0].get( 'score' )
    for text in nyt['text']
]

# add the sentiment scores to the media data
nyt['NLP_fin-sentiment-text'] = sentiment_scores


nyt.describe()

In [None]:
## Report the headlines of the articles with the highest and lowest NLP_fin-sentiment-text scores
fin_scores = nyt['NLP_fin-sentiment-text']
max_score = fin_scores.max()
min_score = fin_scores.min()

## Rows attaining each extreme (the first such row is the one reported)
articles_with_max_score = nyt.loc[fin_scores == max_score]
articles_with_min_score = nyt.loc[fin_scores == min_score]

summary_template = "Article with the highest sentiment score was: \n'{}' with score {}, \n and the lowest sentiment score was: \n'{}' with score {}"
print(
    summary_template.format(
        articles_with_max_score['headline.main'].values[0],
        max_score,
        articles_with_min_score['headline.main'].values[0],
        min_score,
    )
)

In [None]:
## Plot a histogram of the NLP_fin-sentiment-text column
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style( 'whitegrid' )

# Create the figure and axes explicitly so the plot and its labels share one target
fig, ax = plt.subplots( figsize=(10, 6) )

# sns.distplot is deprecated; histplot is its replacement. It draws counts by
# default, which matches the 'Frequency' axis label, and kde=True keeps the
# smoothed curve that distplot used to overlay.
sns.histplot( nyt['NLP_fin-sentiment-text'], kde=True, ax=ax )

# set the title and labels
ax.set_title( 'Histogram of NLP_fin sentiment score' )
ax.set_xlabel( 'NLP_fin sentiment score' )
ax.set_ylabel( 'Frequency' )

In [None]:
## Rescale the TextBlob polarity scores linearly so that -1 maps to 0 and +1 maps to 1

# The column is created in lower case ('polarity') when the TextBlob scores
# are attached; looking it up as 'Polarity' raises KeyError.
# The arithmetic is applied to the whole column at once instead of row by row.
adjusted_polarity_scores = ( nyt['polarity'] + 1 ) / 2

# add the adjusted sentiment scores to the media data
nyt['adjusted_polarity'] = adjusted_polarity_scores

## Save data

In [49]:
# Articles published before the test start date form the training set;
# everything else is the test set
test_start = pd.to_datetime( pm['start_date_test'], utc=True )
is_training = nyt['pub_date'] < test_start
is_test = ~is_training

In [50]:
# Write each split of the media data to its own CSV file
media_outputs = {
    '../data/train/media.csv': is_training,
    '../data/test/media.csv': is_test,
}
for output_path, row_mask in media_outputs.items():
    nyt.loc[row_mask].to_csv( output_path )

# YFinance data

In [27]:
import yfinance as yf

## Retrieve data

In [36]:
# Handle for the ticker symbol configured in pm['ticker']
yticker = yf.Ticker( pm['ticker'] )

In [38]:
# Download the price history between the configured start and end dates
history = yticker.history(
    start = pm['start_date'],
    end = pm['end_date'],
)

## Add an adjusted close column
This more closely tracks the actual stock value. In many cases it is identical to `Close`.

In [39]:
# Derive an adjusted close by subtracting the dividend and stock-split columns
# from the closing price.
# NOTE(review): 'Stock Splits' looks like a split ratio rather than a currency
# amount, and 'Close' may already be adjusted by the data provider — confirm
# that this subtraction is really the intended adjustment.
history['AdjClose'] = history['Close'] - history['Dividends'] - history['Stock Splits']

## Save data

In [42]:
# Rows dated before the test start date form the training set; everything else
# is the test set
test_start = pd.to_datetime( pm['start_date_test'], utc=True )
is_training = history.index < test_start
is_test = ~is_training

In [43]:
# Write each split of the market data to its own CSV file
markets_outputs = {
    '../data/train/markets.csv': is_training,
    '../data/test/markets.csv': is_test,
}
for output_path, row_mask in markets_outputs.items():
    history.loc[row_mask].to_csv( output_path )