In [1]:
import os
from datetime import datetime
import pandas as pd

In [2]:
from textblob import TextBlob

# Parameters

In [3]:
# Notebook-wide parameters: the company to search for, the train/test date
# windows, the number of articles to request, and the article fields that are
# merged into the text used for sentiment analysis.
pm = dict(
    organization='Google Inc',
    start_date_train=datetime( year=2022, month=1, day=1 ),
    end_date_train=datetime( year=2022, month=11, day=1 ),
    start_date_test=datetime( year=2022, month=11, day=1 ),
    end_date_test=datetime( year=2023, month=1, day=1 ),
    n_articles=10000,
    # A dotted name such as 'headline.main' addresses a nested field of an article record.
    text_columns=[ 'abstract', 'lead_paragraph', 'snippet', 'headline.main' ],
)

⭕ **Possible Improvements:**

* The date range for the market data (dependent variable) could be larger than the date range for the news, since there may be a time lag.
* Test for a number of years.
* Play with what text columns are included or not.
* Test if weighting articles by how much a company is mentioned in the article improves predictions.
* Inspect how the number of articles published changes things.

# NYTimes Data

## Retrieve Data
To access the NYTimes API we will be using the `pynytimes` package, for which the BibTeX citation is:
```
@software{Den_Heijer_pynytimes_2023,
    author = {Den Heijer, Micha},
    license = {MIT},
    title = {{pynytimes}},
    url = {https://github.com/michadenheijer/pynytimes},
    version = {0.10.0},
    year = {2023},
    doi = {10.5281/zenodo.7821090}
}
```

Our API key is stored in the environment variable `NYTIMES_KEY`, which is set in e.g. `~/.bash_profile` or `~/.zshrc`.

In [4]:
from pynytimes import NYTAPI

In [7]:
# Read the API key from the environment and fail early, naming the variable,
# when it is missing or empty (os.environ.get would otherwise hand None to the client).
api_key = os.environ.get( 'NYTIMES_KEY' )
if not api_key:
    raise RuntimeError(
        'The NYTIMES_KEY environment variable is not set; '
        'export it (e.g. in ~/.bash_profile or ~/.zshrc) and restart the kernel.'
    )

# parse_dates=True asks pynytimes to return date fields as datetime objects.
nytapi = NYTAPI( api_key, parse_dates=True )

In [8]:
# Keyword search for the configured organization, restricted to the training
# date window and asking for pm['n_articles'] results.
results = nytapi.article_search(
    query = pm['organization'],
    results = pm['n_articles'],
    dates = dict( begin=pm['start_date_train'], end=pm['end_date_train'] ),
)

⭕ **Possible Improvement:**

Currently searching with keywords. An advanced option is to use the filter query feature of the NYTimes API, e.g.
```
options={
    'fq': 'organizations:("Google Inc")',
},
```
This also requires filtering on the "rank" of the organization with regard to the article, as found in e.g. `article['keywords']['rank']`. Otherwise we'll get articles that are only tangentially related to the target company.

## Format Data

In [9]:
# Column store for the collected articles: one empty list per output column,
# with the publication date first and the configured text columns after it.
nyt_data = { column: [] for column in [ 'pub_date', *pm['text_columns'] ] }

In [10]:
# Collect: append one value per article to every column list in nyt_data.
for result in results:
    for column, column_values in nyt_data.items():

        # A dotted column name is a path into the nested article record
        # ('headline.main' -> result['headline']['main']). Walk every segment
        # so that paths of any depth resolve fully instead of stopping after
        # the second key.
        column_val = result
        for key in column.split( '.' ):
            column_val = column_val[key]

        # Store
        column_values.append( column_val )

In [11]:
# Convert the column lists into a dataframe (one row per article).
nyt = pd.DataFrame( data=nyt_data )

In [12]:
# Build the full string for each article: every text column gets a trailing
# space appended, then the columns are concatenated left to right.
nyt['text'] = nyt[pm['text_columns']].add( ' ' ).sum( axis='columns' )

In [13]:
# Preview the first five collected articles.
nyt.head( n=5 )

Unnamed: 0,pub_date,abstract,lead_paragraph,snippet,headline.main,text
0,2022-10-25 20:37:03+00:00,Google’s parent company reported earnings that...,"Even Alphabet, the parent company of Google an...",Google’s parent company reported earnings that...,Alphabet’s Profit Drops 27 Percent From a Year...,Google’s parent company reported earnings that...
1,2022-10-26 22:47:44+00:00,A series of quarterly earnings reports is show...,Google this week reported a steep decline in p...,A series of quarterly earnings reports is show...,Tech’s Biggest Companies Are Sending Worrying ...,A series of quarterly earnings reports is show...
2,2022-10-20 15:05:58+00:00,"Ken Paxton, the state attorney general, said p...",The Texas attorney general filed a privacy law...,"Ken Paxton, the state attorney general, said p...",Texas Sues Google for Collecting Biometric Dat...,"Ken Paxton, the state attorney general, said p..."
3,2022-10-28 12:06:38+00:00,The social network’s new owner has just a few ...,Elon Musk closes his purchase of Twitter and f...,The social network’s new owner has just a few ...,Elon Musk Faces Another Big Decision at Twitter,The social network’s new owner has just a few ...
4,2022-10-25 18:24:56+00:00,Apple has rejected Spotify’s new app three tim...,"Daniel Ek, the chief executive of Spotify, wan...",Apple has rejected Spotify’s new app three tim...,Spotify Wants to Get Into Audiobooks but Says ...,Apple has rejected Spotify’s new app three tim...


## Sentiment Analysis
Here, we use TextBlob as our sentiment analysis tool. We take the `text` column of the dataframe and compute both the polarity and the subjectivity of each article. At the end, we combine the scores with the article data into a single dataframe.

In [14]:
# Score the combined text of every article with TextBlob, keeping the
# polarity and subjectivity values in two parallel lists.
pol_vec = []
subj_vec = []

# Iterate over the column's values directly: nyt['text'][i] is a label-based
# lookup, which only matches row positions while the index is the default 0..n-1.
for text in nyt['text']:
    sentiment = TextBlob( text ).sentiment
    pol_vec.append( sentiment.polarity )
    subj_vec.append( sentiment.subjectivity )
    
    

In [21]:
# Pair the two score lists up as the columns of a dataframe.
d = dict( polarity=pol_vec, subjectivity=subj_vec )
t = pd.DataFrame( d )

In [22]:
# Show the polarity/subjectivity table built from the sentiment scores.
display(t)

Unnamed: 0,polarity,subjectivity
0,0.075000,0.410000
1,0.229167,0.558333
2,0.037500,0.375000
3,0.093939,0.274242
4,0.292532,0.487013
...,...,...
356,0.216959,0.586908
357,0.359091,0.613636
358,0.041818,0.312727
359,-0.178571,0.464286


In [23]:
# Put the sentiment scores next to the article columns; pd.concat lines the
# rows of the two frames up by index.
f = pd.concat( objs=[ nyt, t ], axis='columns' )

In [24]:
# Show the combined table: article columns followed by polarity and subjectivity.
display(f)

Unnamed: 0,pub_date,abstract,lead_paragraph,snippet,headline.main,text,polarity,subjectivity
0,2022-10-25 20:37:03+00:00,Google’s parent company reported earnings that...,"Even Alphabet, the parent company of Google an...",Google’s parent company reported earnings that...,Alphabet’s Profit Drops 27 Percent From a Year...,Google’s parent company reported earnings that...,0.075000,0.410000
1,2022-10-26 22:47:44+00:00,A series of quarterly earnings reports is show...,Google this week reported a steep decline in p...,A series of quarterly earnings reports is show...,Tech’s Biggest Companies Are Sending Worrying ...,A series of quarterly earnings reports is show...,0.229167,0.558333
2,2022-10-20 15:05:58+00:00,"Ken Paxton, the state attorney general, said p...",The Texas attorney general filed a privacy law...,"Ken Paxton, the state attorney general, said p...",Texas Sues Google for Collecting Biometric Dat...,"Ken Paxton, the state attorney general, said p...",0.037500,0.375000
3,2022-10-28 12:06:38+00:00,The social network’s new owner has just a few ...,Elon Musk closes his purchase of Twitter and f...,The social network’s new owner has just a few ...,Elon Musk Faces Another Big Decision at Twitter,The social network’s new owner has just a few ...,0.093939,0.274242
4,2022-10-25 18:24:56+00:00,Apple has rejected Spotify’s new app three tim...,"Daniel Ek, the chief executive of Spotify, wan...",Apple has rejected Spotify’s new app three tim...,Spotify Wants to Get Into Audiobooks but Says ...,Apple has rejected Spotify’s new app three tim...,0.292532,0.487013
...,...,...,...,...,...,...,...,...
356,2022-04-07 09:00:21+00:00,Forbes thinks there are 735 of them in America...,"In 1981, Malcolm Forbes, the eccentric and fab...",Forbes thinks there are 735 of them in America...,"How Many Billionaires Are There, Anyway?",Forbes thinks there are 735 of them in America...,0.216959,0.586908
357,2022-01-28 10:00:23+00:00,A Times investigation reveals how Israel reape...,"In June 2019, three Israeli computer engineers...",A Times investigation reveals how Israel reape...,The Battle for the World’s Most Powerful Cyber...,A Times investigation reveals how Israel reape...,0.359091,0.613636
358,2022-10-05 16:04:59+00:00,"From our critics, reviews of closed gallery sh...",Sonia Gomes didn’t go to art school until age ...,"From our critics, reviews of closed gallery sh...",Art We Saw This Fall,"From our critics, reviews of closed gallery sh...",0.041818,0.312727
359,2022-04-23 04:11:44+00:00,"The committee alleged that Mark Meadows, the f...","The committee alleged that Mark Meadows, the f...","The committee alleged that Mark Meadows, the f...",Read the Jan. 6 Committee’s Filing in Its Laws...,"The committee alleged that Mark Meadows, the f...",-0.178571,0.464286


## Save in a CSV

In [25]:
# Write the combined articles-plus-sentiment table to disk as CSV
# (the index is written as well, which is the pandas default).
f.to_csv( path_or_buf='../test_data/media_large.csv' )