In [None]:
# General imports
import numpy as np
import os
import pandas as pd

In [None]:
# This is the config file in the same directory as the analysis script
import config

# Parameters

In [None]:
# Notebook-level default parameters.
# Any key that is also present in the config's `pm` is overwritten by the
# config value when the two are merged.
pm = dict(
    # Upper bound on the number of articles requested
    n_articles = 100000,

    # Relevance thresholds; articles that fail these are discarded
    required_rank = 5,
    required_mentions = 0,

    # Columns whose contents are compiled into the analysis text
    text_columns = [ 'abstract', 'lead_paragraph', 'snippet', 'headline.main', ],
)

In [None]:
# Merge in the global parameters; config values win on key collisions
pm = { **pm, **config.pm }

# New York Times data

In [None]:
from pynytimes import NYTAPI

## Retrieve data

In [None]:
# Read the API key from the environment (None when the variable is unset)
nyt_api_key = os.environ.get( 'NYTIMES_KEY' )
nytapi = NYTAPI( nyt_api_key, parse_dates=True )

In [None]:
# Build the filter query: one organizations clause per entry, OR-ed together
organization_clauses = [
    'organizations:("{}")'.format( organization )
    for organization in pm['organizations']
]
filter_query_prompt = ' OR '.join( organization_clauses )

In [None]:
# Request up to pm['n_articles'] articles matching the filter query
# within the configured date range
search_dates = { 'begin': pm['start_date'], 'end': pm['end_date'] }
search_options = { 'fq': filter_query_prompt }
all_results = nytapi.article_search(
    query = '',
    results = pm['n_articles'],
    dates = search_dates,
    options = search_options,
)

In [None]:
# Report how many articles the search returned
n_retrieved = len( all_results )
print( 'Retrieved {} results'.format( n_retrieved ) )

In [None]:
# Keep only articles tagged with one of our organizations at a sufficient rank,
# remembering the lowest (most-relevant) qualifying rank for each kept article
results = []
ranks = []
for article in all_results:

    qualifying_ranks = [
        tag['rank']
        for tag in article['keywords']
        if (
            ( tag['value'] in pm['organizations'] )
            and ( tag['name'] == 'organizations' )
            and ( tag['rank'] <= pm['required_rank'] )
        )
    ]

    # Articles without any qualifying keyword are discarded
    if not qualifying_ranks:
        continue

    results.append( article )
    ranks.append( min( qualifying_ranks ) )

In [None]:
# Report how many articles survived the rank filter
n_filtered = len( results )
print( 'Filtered down to {} retrieved results'.format( n_filtered ) )

In [None]:
# Prepare one empty list per output column: the fixed metadata fields first,
# followed by the configured text columns
metadata_columns = [ 'pub_date', 'word_count', 'type_of_material', '_id', ]
nyt_data = {
    column: []
    for column in [ *metadata_columns, *pm['text_columns'] ]
}

In [None]:
# Collect each column's value from every retained article
for article in results:
    for column, values in nyt_data.items():

        if '.' in column:
            # Dotted names address a nested field, e.g. 'headline.main'
            outer_key, inner_key = column.split( '.' )[:2]
            value = article[outer_key][inner_key]
        else:
            value = article[column]

        values.append( value )

In [None]:
# Build the articles frame: one column per collected field
nyt = pd.DataFrame( data=nyt_data )

In [None]:
# Concatenate the text columns row-wise, each one followed by a single space
text_parts = nyt[pm['text_columns']].add( ' ' )
nyt['text'] = text_parts.sum( axis=1 )

In [None]:
# Store relevancy: the lowest qualifying keyword rank recorded for each article
nyt = nyt.assign( relevance_rank=ranks )

In [None]:
# Display the assembled articles frame
nyt

## Filter data
Some articles don't mention the keyword (e.g. Google) *enough* times in their compiled text. This portion of the notebook removes those rows.

We need to import TextBlob in the cell below in order to use its `word_counts` property.

In [None]:
from textblob import TextBlob

In [None]:
import nltk
# Fetch NLTK's 'punkt' resource — presumably needed by the TextBlob tokenization used below; TODO confirm
nltk.download('punkt')

In [None]:
# Count keyword mentions in each article's compiled text.
# TextBlob's `word_counts` keys are lower-cased, so the keyword is lower-cased
# before the lookup; otherwise a keyword containing capitals would always
# count as zero.
keyword_lower = pm['keyword'].lower()
inds_to_drop = []
keyword_counts = []
for ind, text in nyt['text'].items():
    keyword_count = TextBlob( text ).word_counts[keyword_lower]
    keyword_counts.append( keyword_count )

    # Record the index *label* so it remains valid for a label-based drop
    if keyword_count < pm['required_mentions']:
        inds_to_drop.append( ind )
nyt['keyword_counts'] = keyword_counts

In [None]:
# Drop articles with an insufficient number of keyword mentions.
# Filtering on the stored counts (instead of dropping labels in place) makes
# this cell safe to re-run, and resetting the index keeps it contiguous so the
# integer lookups and index-aligned concatenation further down still line up.
has_enough_mentions = nyt['keyword_counts'] >= pm['required_mentions']
nyt = nyt.loc[has_enough_mentions].reset_index( drop=True )

## Sentiment analysis


### TextBlob
Here, we use TextBlob as our sentiment analysis tool. We take the data from the `text` column of the data frame and compute both polarity and subjectivity for each article. At the end, we combine the scores with the article data into a single dataframe.

In [None]:
from textblob import TextBlob

In [None]:
# Score every article's text with TextBlob.
# Iterating over the values (instead of looking rows up by integer label)
# keeps this working when the index is not a contiguous 0..n-1 range,
# e.g. after rows have been dropped.
pol_vec = []
subj_vec = []
for text in nyt['text']:
    sentiment = TextBlob( text ).sentiment
    pol_vec.append( sentiment.polarity )
    subj_vec.append( sentiment.subjectivity )
    
    

In [None]:
# Assemble the sentiment scores into a frame that shares nyt's index, so that
# a column-wise concatenation with nyt pairs each score with its own article
d = {'polarity': pol_vec, 'subjectivity': subj_vec}
t = pd.DataFrame(data=d, index=nyt.index)

In [None]:
# Show the per-article polarity and subjectivity scores
display(t)

In [None]:
# Attach the sentiment columns next to the article columns (aligned on index).
# NOTE: re-running this cell appends the sentiment columns again.
nyt = pd.concat( objs=[ nyt, t ], axis='columns' )

## Save data

In [None]:
# Split into train and test: articles published before the test start date
# are training rows, everything else is test
test_start = pd.to_datetime( pm['start_date_test'], utc=True )
is_training = nyt['pub_date'] < test_start
is_test = ~is_training

In [None]:
# Write the train and test article splits to CSV (the first call was missing
# its closing parenthesis).
# NOTE(review): the file names use pm['start_date_train'] as the boundary date
# while the split itself is made at pm['start_date_test'] — confirm intended.
nyt.loc[is_training].to_csv( '../data/train/media_{}_{}.csv'.format( pm['start_date'].date(), pm['start_date_train'].date() ) )
nyt.loc[is_test].to_csv( '../data/test/media_{}_{}.csv'.format( pm['start_date_train'].date(), pm['end_date'].date() ) )

# YFinance data

In [None]:
import yfinance as yf

## Retrieve data

In [None]:
# Handle for the ticker symbol configured in pm['ticker']
yticker = yf.Ticker( pm['ticker'] )

In [None]:
# Fetch the price history over the configured date range
history = yticker.history( start=pm['start_date'], end=pm['end_date'] )

## Add an adjusted close column
This more-closely tracks the actual stock value. In many cases it's identical to close.

In [None]:
# AdjClose = Close - Dividends - Stock Splits, computed row by row.
# NOTE(review): 'Stock Splits' looks like it holds split ratios rather than
# prices — confirm that subtracting it from the close is intended.
history['AdjClose'] = (
    history['Close']
    .sub( history['Dividends'] )
    .sub( history['Stock Splits'] )
)

## Save data

In [None]:
# Split into train and test: rows dated before the test start date are
# training rows, everything else is test
test_start = pd.to_datetime( pm['start_date_test'], utc=True )
is_training = history.index < test_start
is_test = ~is_training

In [None]:
# Path of the test-split markets file.
# NOTE(review): `fp` is not referenced by the cells visible below. The original
# line was an unterminated `.format(` call on a string with no placeholders, so
# it is reduced to the plain path to make the cell valid Python.
fp = '../data/test/markets.csv'

In [None]:
# Write the train and test market splits to CSV (the first call was missing
# its closing parenthesis).
# NOTE(review): the file names use pm['start_date_train'] as the boundary date
# while the split itself is made at pm['start_date_test'] — confirm intended.
history.loc[is_training].to_csv( '../data/train/markets_{}_{}.csv'.format( pm['start_date'].date(), pm['start_date_train'].date() ) )
history.loc[is_test].to_csv( '../data/test/markets_{}_{}.csv'.format( pm['start_date_train'].date(), pm['end_date'].date() ) )