In [None]:
# General imports
import numpy as np
import os
import pandas as pd

In [None]:
# This is the config file in the same directory as the analysis script
import config

# Parameters

In [None]:
# Notebook-level defaults
# Any of these that also appear in the config will be overwritten by it
pm = dict(
    # Upper bound on the number of articles to retrieve
    n_articles = 100000,

    # Thresholds used to discard irrelevant articles
    required_rank = 5,
    required_mentions = 0,

    # Columns whose contents are compiled into the article text
    text_columns = [ 'abstract', 'lead_paragraph', 'snippet', 'headline.main', ],
)

In [None]:
# Let the global parameters from the config take precedence over the notebook defaults
for key, value in config.pm.items():
    pm[key] = value

# New York Times data

In [None]:
from pynytimes import NYTAPI

## Retrieve data

In [None]:
# API client; the key is read from the NYTIMES_KEY environment variable
nyt_api_key = os.environ.get( 'NYTIMES_KEY' )
nytapi = NYTAPI( nyt_api_key, parse_dates=True )

In [None]:
# Build the filter query for the API: one clause per organization, joined with OR
organization_clauses = [
    'organizations:("{}")'.format( organization )
    for organization in pm['organizations']
]
filter_query_prompt = ' OR '.join( organization_clauses )

In [None]:
# Query the article search endpoint for everything matching the filter
search_dates = { 'begin': pm['start_date'], 'end': pm['end_date'] }
search_options = { 'fq': filter_query_prompt }
all_results = nytapi.article_search(
    query='',
    results=pm['n_articles'],
    dates=search_dates,
    options=search_options,
)

In [None]:
# Report how many articles the search returned
n_retrieved = len( all_results )
print( 'Retrieved {} results'.format( n_retrieved ) )

In [None]:
# Keep only results tagged with one of our organizations at a sufficient rank
results = []
ranks = []
for result in all_results:

    # Ranks of every keyword that qualifies this result
    relevant_ranks = [
        keyword['rank']
        for keyword in result['keywords']
        if (
            ( keyword['value'] in pm['organizations'] )
            and ( keyword['name'] == 'organizations' )
            and ( keyword['rank'] <= pm['required_rank'] )
        )
    ]

    # No qualifying keyword: the result is discarded
    if not relevant_ranks:
        continue

    results.append( result )
    # Keep the lowest (most-relevant) rank
    ranks.append( min( relevant_ranks ) )

In [None]:
# Report how many articles survived the rank filter
n_filtered = len( results )
print( 'Filtered down to {} retrieved results'.format( n_filtered ) )

In [None]:
# Create storage dictionary: one empty list per column to extract.
# The fixed metadata columns come first, followed by the configured text columns.
metadata_columns = [ 'pub_date', 'word_count', 'type_of_material', '_id' ]
nyt_data = {
    column: []
    for column in metadata_columns + list( pm['text_columns'] )
}

In [None]:
# Collect the value of every storage column from every filtered result
for result in results:
    for column, values in nyt_data.items():

        # Parse column: a dotted name such as 'headline.main' is a path into
        # nested dictionaries. Walking the path key by key supports any depth;
        # a name without dots is a single top-level lookup.
        column_val = result
        for key in column.split( '.' ):
            column_val = column_val[key]

        # Store
        values.append( column_val )

In [None]:
# Build the dataframe: each key of the storage dictionary becomes a column
nyt = pd.DataFrame( data=nyt_data )

In [None]:
# Collect the full string: the text columns concatenated row by row,
# each followed by a single space.
# Missing entries are replaced with empty strings first so that one absent
# field cannot break the row-wise concatenation.
text_parts = nyt[pm['text_columns']].fillna( '' )
nyt['text'] = ( text_parts + ' ' ).sum( axis=1 )

In [None]:
# Store relevancy: the best organization rank found for each kept article
nyt = nyt.assign( relevance_rank=ranks )

In [None]:
# Display the assembled dataframe as the cell output
nyt

## Filter Data
Some articles don't mention the keyword *enough* times in their compiled text. This portion of the notebook removes those rows.

We need to import the library in the cell below in order to use TextBlob's `word_counts` property.

In [None]:
from textblob import TextBlob

In [None]:
import nltk

# Tokenizer data needed for the word counting below.
# Recent NLTK releases (3.8.2 and later) load `punkt_tab` instead of the
# pickled `punkt` model, so both are fetched to work across versions.
for resource in ( 'punkt', 'punkt_tab' ):
    nltk.download( resource )

In [None]:
# Count keyword mentions
# TextBlob lower-cases the words it counts, so the keyword is lower-cased as
# well; a capitalised keyword would otherwise always be counted zero times.
keyword = pm['keyword'].lower()
keyword_counts = [
    TextBlob( text ).word_counts[keyword]
    for text in nyt['text']
]
nyt['keyword_counts'] = keyword_counts

# Index labels of the rows with too few mentions. Using the actual labels
# (rather than positions) keeps this correct for any dataframe index.
has_too_few_mentions = nyt['keyword_counts'] < pm['required_mentions']
inds_to_drop = list( nyt.index[has_too_few_mentions] )

In [None]:
# Remove the articles flagged as having an insufficient number of keyword mentions
nyt = nyt.drop( index=inds_to_drop )

## Sentiment Analysis

Using distilRoberta-financial-sentiment.
See https://huggingface.co/mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis


In [None]:

## Load the tokenizer and the classification model from the same checkpoint

from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Hugging Face model identifier shared by the tokenizer and the model
FIN_MODEL_NAME = "mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis"

tokenizer_fin = AutoTokenizer.from_pretrained( FIN_MODEL_NAME )

model_fin = AutoModelForSequenceClassification.from_pretrained( FIN_MODEL_NAME )

In [None]:
# Wrap the model and tokenizer loaded above in a sentiment analysis pipeline
from transformers import pipeline

analyzer_fin = pipeline(
    task="sentiment-analysis",
    model=model_fin,
    tokenizer=tokenizer_fin,
)

In [None]:
## Apply the analyzer_fin pipeline to the text of each article and record the sentiment scores in a new column

# One score per article, in row order.
# `truncation=True` cuts inputs that are longer than the model accepts
# instead of letting the pipeline fail on them.
# NOTE(review): the stored value is the 'score' field of the first pipeline
# result; presumably this is the confidence of the predicted label rather
# than a signed polarity — confirm before interpreting it as sentiment.
sentiment_scores = [
    analyzer_fin( text, truncation=True )[0].get( 'score' )
    for text in nyt['text']
]

# add the sentiment scores to the media data
nyt['NLP_fin-sentiment-text'] = sentiment_scores


nyt.describe()

In [None]:
## Report the articles at the two extremes of the NLP_fin-sentiment-text column
fin_scores = nyt['NLP_fin-sentiment-text']
max_score = fin_scores.max()
min_score = fin_scores.min()

## Rows whose score equals the maximum / minimum (ties are all kept)
articles_with_max_score = nyt[fin_scores == max_score]
articles_with_min_score = nyt[fin_scores == min_score]

# Only the first headline of each group is printed
report_template = "Article with the highest sentiment score was: \n'{}' with score {}, \n and the lowest sentiment score was: \n'{}' with score {}"
print( report_template.format(
    articles_with_max_score['headline.main'].values[0],
    max_score,
    articles_with_min_score['headline.main'].values[0],
    min_score,
) )

In [None]:
## plot a histogram of the NLP_fin-sentiment-text column vs its frequency
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style( 'whitegrid' )

# set the plot size
fig, ax = plt.subplots( figsize=(10, 6) )

# plot a histogram of the NLP_fin-sentiment-text column.
# `sns.distplot` is deprecated; `histplot` with a KDE overlay replaces it.
# Its default count statistic matches the 'Frequency' axis label.
sns.histplot( nyt['NLP_fin-sentiment-text'], kde=True, ax=ax )

# set the title and labels
ax.set_title( 'Histogram of NLP_fin sentiment score' )
ax.set_xlabel( 'NLP_fin sentiment score' )
ax.set_ylabel( 'Frequency' )

In [None]:
## adjusting the polarity scores to be between 0 and 1 (assume uniform distribution)

# No earlier cell of this notebook creates a 'Polarity' column, so on a fresh
# kernel the lookup below would fail. Compute it here when it is missing.
# NOTE(review): presumably 'Polarity' was TextBlob's sentiment polarity
# (range -1 to 1), which is what the (score + 1) / 2 rescaling suggests —
# confirm against the cell that originally produced it.
if 'Polarity' not in nyt.columns:
    nyt['Polarity'] = [
        TextBlob( text ).sentiment.polarity
        for text in nyt['text']
    ]

# map each score from [-1, 1] onto [0, 1]
adjusted_polarity_scores = ( ( nyt['Polarity'] + 1 ) / 2 ).tolist()

# add the adjusted sentiment scores to the media data
nyt['adjusted_polarity'] = adjusted_polarity_scores

## Save Data

In [None]:
# Boolean masks for the split: articles published before the test start date
# are training data, everything else is test data
test_start = pd.to_datetime( pm['start_date_test'], utc=True )
is_training = nyt['pub_date'] < test_start
is_test = ~is_training

In [None]:
# Write each partition to its own CSV file
for row_mask, csv_path in (
    ( is_training, '../data/train/media.csv' ),
    ( is_test, '../data/test/media.csv' ),
):
    nyt.loc[row_mask].to_csv( csv_path )