In [None]:
import os
from datetime import datetime
import pandas as pd

# Parameters

In [None]:
# All tunable notebook parameters live in this single dictionary.
pm = dict(
    # Search term passed to the NYTimes article search
    organization='Google Inc',
    # Date window labelled "train" (used for the article search)
    start_date_train=datetime( 2022, 1, 1 ),
    end_date_train=datetime( 2022, 11, 1 ),
    # Date window labelled "test"
    start_date_test=datetime( 2022, 11, 1 ),
    end_date_test=datetime( 2023, 1, 1 ),
    # Number of articles requested from the search
    n_articles=100,
    # Article fields holding text; a dotted name ('headline.main') refers to
    # a nested field.
    text_columns=[ 'abstract', 'lead_paragraph', 'snippet', 'headline.main' ],
)

⭕ **Possible Improvements:**

* The date range for the market data (dependent variable) could be larger than the date range for the news, since there may be a time lag.
* Test for a number of years.
* Experiment with which text columns are included.
* Test if weighting articles by how much a company is mentioned in the article improves predictions.
* Inspect how the number of articles published changes things.

# NYTimes Data

## Retrieve Data
To access the NYTimes API we will be using the `pynytimes` repository, for which the BibTeX citation is:
```
@software{Den_Heijer_pynytimes_2023,
    author = {Den Heijer, Micha},
    license = {MIT},
    title = {{pynytimes}},
    url = {https://github.com/michadenheijer/pynytimes},
    version = {0.10.0},
    year = {2023},
    doi = {10.5281/zenodo.7821090}
}
```

Our API key is stored in the environment variable `NYTIMES_KEY`, which is set in e.g. `~/.bash_profile` or `~/.zshrc`.

In [None]:
from pynytimes import NYTAPI

In [None]:
# Create the NYTimes API client. The key is read from the NYTIMES_KEY
# environment variable (None is passed through if it is not set).
# NOTE(review): parse_dates=True presumably makes the client return dates as
# datetime objects -- confirm against the pynytimes documentation.
nytapi = NYTAPI( os.getenv( 'NYTIMES_KEY' ), parse_dates=True )

In [None]:
# Keyword search for the organization within the training date window,
# requesting pm['n_articles'] results.
train_window = dict( begin=pm['start_date_train'], end=pm['end_date_train'] )
results = nytapi.article_search(
    query=pm['organization'],
    results=pm['n_articles'],
    dates=train_window,
)

⭕ **Possible Improvement:**

We currently search by keyword. A more advanced option is to use the filter query (`fq`) feature of the NYTimes API, e.g.
```
options={
    'fq': 'organizations:("Google Inc")',
},
```
This also requires filtering on the "rank" of the organization with regard to the article, as found in e.g. `article['keywords']['rank']`. Otherwise we'll get articles that are only tangentially related to the target company.

## Format Data

In [None]:
# Storage dictionary: one empty list per field to collect, with the
# publication date first, followed by every configured text column.
nyt_data = { column: [] for column in [ 'pub_date', *pm['text_columns'] ] }

In [None]:
# Collect the requested fields from every article into the storage dictionary.
# Start from empty lists so that re-running this cell does not append the
# same articles a second time.
nyt_data = { column: [] for column in nyt_data }
for result in results:
    for column, column_vals in nyt_data.items():

        # Parse column: a dotted name such as 'headline.main' is a path into
        # nested dictionaries and is followed one key at a time, to any depth.
        # A plain name is simply a path of length one.
        column_val = result
        for column_key in column.split( '.' ):
            column_val = column_val[column_key]

        # Store
        column_vals.append( column_val )

In [None]:
# Build a DataFrame with one column per collected field
nyt = pd.DataFrame( data=nyt_data )

In [None]:
# Build one combined string per row: append a space to every text field,
# then concatenate the fields across each row.
text_fields = nyt[pm['text_columns']] + ' '
nyt['text'] = text_fields.sum( axis=1 )

In [None]:
# Preview the first five rows of the assembled table
nyt.head( n=5 )

## Mock the Addition of Sentiment Data

In [None]:
import numpy as np

In [None]:
# Random generator for the mocked sentiment columns. A fixed seed keeps the
# mocked values (and the CSV written from them) identical across
# Restart & Run All; change the seed to get a different mock sample.
rng = np.random.default_rng( seed=42 )

In [None]:
# Fill in placeholder values with one uniform random draw per row:
# 'Polarity' from [-1, 1) and 'Sentiment' from [0, 1).
n_rows = len( nyt )
nyt['Polarity'] = rng.uniform( low=-1, high=1, size=n_rows )
nyt['Sentiment'] = rng.uniform( low=0, high=1, size=n_rows )

## Save in a CSV

In [None]:
# Write the table (including the mocked sentiment columns) to the test data
# folder; the path is relative to the notebook's working directory.
media_csv_path = '../test_data/media.csv'
nyt.to_csv( media_csv_path )