1. Fetch data from Polygon
2. Validate and clean data, including new sentiment columns
3. Send data to model
4. Store data in GCS
5. Create API end point and serve data

# Scratch

In [None]:
!pip install fsspec

In [None]:
!pip install gcsfs

In [None]:
pip install --upgrade google-cloud-storage

In [None]:
!pip install gcloud

In [None]:
import gcloud

In [None]:
storage_client = storage.Client()
buckets = list(storage_client.list_buckets())
print(buckets)

In [None]:
!echo "This is a test of gcloud" > testing.txt

In [None]:
>>> import gcsfs
>>> fs = gcsfs.GCSFileSystem(project='my-google-project')
>>> fs.ls('my-bucket')
['my-file.txt']
>>> with fs.open('my-bucket/my-file.txt', 'rb') as f:
... print(f.read())
b'Hello, world'

>>> with fs.open('mybucket/new-file', 'wb') as f:
... f.write(2*2**20 * b'a')
... f.write(2*2**20 * b'a') # data is flushed and file closed
>>> fs.du('mybucket/new-file')
{'mybucket/new-file': 4194304}

In [None]:
import gcsfs

In [None]:
fs = gcsfs.GCSFileSystem(project='mlops-3')

In [None]:
fs.ls('polygonio-news-sentiment-test')

In [None]:
with fs.open('polygonio-news-sentiment-test/testing.txt', 'wb') as f:
    f.write(2*2**20 * b'a')

fs.du('polygonio-news-sentiment-test/testing.txt')

In [None]:
%%sh
ls

In [None]:
!cat testing.txt

In [None]:
destination_file_name= 'testing.txt'
source_file_name = 'testing.txt'
bucket_name = 'polygonio-news-sentiment-test'

In [None]:
bucket = storage_client.bucket(bucket_name)

In [None]:
from gcloud import storage
# create storage client

storage_client = storage.Client()

# give blob credentials
destination_blob_name= 'testing.txt'
source_file_name = 'testing.txt'
bucket_name = 'polygonio-news-sentiment-test'
# get bucket object 

try:
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)
    blob.upload_from_filename(source_file_name)
    print('file: ',source_file_name,' uploaded to bucket: ',bucket_name,' successfully')
except Exception as e:
    print(e)

# Setup

In [1]:
import requests
import os
import sys
import pandas as pd
import json
import numpy as np

from datetime import datetime, timedelta

# Fetch data from Polygon API

In [2]:
# Get previous hour (UTC)
published_utc = (datetime.now() - timedelta(hours = 1)).strftime('%Y-%m-%dT%H:00:00Z')
print(published_utc)

2022-06-11T00:00:00Z


In [3]:
api_key = os.getenv("POLYGON_API_KEY")
next_url = f"https://api.polygon.io/v2/reference/news?published_utc.gte={published_utc}&limit=1000&apiKey={api_key}"
print(next_url)

https://api.polygon.io/v2/reference/news?published_utc.gte=2022-06-11T00:00:00Z&limit=1000&apiKey=zUqayLy0QMKE4MfqABhB_jophZQFjrWZ


In [4]:
news = []
count = 0
resp = requests.get(next_url)

#while resp.json()["next_url"]:
while True:
    resp = requests.get(next_url)
    if resp.ok:
        news += resp.json()
        count += resp.json()["count"]
        print(f"Count: {count}")
        if "next_url" in resp.json().keys():
            next_url = resp.json()["next_url"] + f"&apiKey={api_key}"
        else:
            break
    else:
        print(f"Request failed with {resp.status_code}")
        sys.exit(1)



Count: 2


In [5]:
print(type(resp.json()))
#print(resp.json())
df = pd.DataFrame(resp.json()['results'])

<class 'dict'>


In [6]:
df.head()

Unnamed: 0,id,publisher,title,author,published_utc,article_url,tickers,amp_url,image_url,description,keywords
0,NzrtYhEH5sBfiDPazBCZdzBIV2xgk4_SM3Tm7sKEDE4,"{'name': 'MarketWatch', 'homepage_url': 'https...","The wild, hair-raising ride of a rookie invest...",MarketWatch,2022-06-11T01:03:00Z,https://www.marketwatch.com/story/the-wild-hai...,"[VOO, VTI, TWTR, GOOG, META]",https://www.marketwatch.com/amp/story/the-wild...,https://images.mktw.net/im-561238/social,"‘My $25,000 ballooned to $63,000 super-fast. I...",
1,jKWGPXAvP7Klrj4iWYt16RUApPMywM2TMh00cY-4UJ4,"{'name': 'The Motley Fool', 'homepage_url': 'h...",Tesla Files for a 3-for-1 Stock Split. Will It...,newsfeedback@fool.com (James Brumley),2022-06-11T00:17:52Z,https://www.fool.com/investing/2022/06/10/tesl...,"[TSLA, GOOGL, AMZN, ORCL, GOOG, SHOP]",,https://g.foolcdn.com/editorial/images/684627/...,CEO Elon Musk may be looking to light a bullis...,[investing]


In [7]:
df['sentiment'] = np.nan
df.fillna('', inplace=True)

In [17]:
df.head()

Unnamed: 0,id,publisher,title,author,published_utc,article_url,tickers,amp_url,image_url,description,keywords,sentiment
0,NzrtYhEH5sBfiDPazBCZdzBIV2xgk4_SM3Tm7sKEDE4,"{'name': 'MarketWatch', 'homepage_url': 'https...","The wild, hair-raising ride of a rookie invest...",MarketWatch,2022-06-10T22:33:00Z,https://www.marketwatch.com/story/the-wild-hai...,"[VOO, VTI, TWTR, GOOG, META]",https://www.marketwatch.com/amp/story/the-wild...,https://images.mktw.net/im-561238/social,"‘My $25,000 ballooned to $63,000 super-fast. I...",,
1,yQGhHCPgQ3rZqnVK6G4qSeGu2RTEp-vi9sDhj4hhF0Q,"{'name': 'MarketWatch', 'homepage_url': 'https...",Blue Nile plans to go public through merger wi...,MarketWatch,2022-06-10T22:30:00Z,https://www.marketwatch.com/story/blue-nile-pl...,[MUDS],https://www.marketwatch.com/amp/story/blue-nil...,https://images.mktw.net/im-562120/social,Blue Nile Inc. plans to become a publicly trad...,,
2,fDT4jYXxEqWNbo3f5SlrvoKVy3aUPDzMI_uGmySvync,"{'name': 'Benzinga', 'homepage_url': 'https://...",Web3 Just Got Here — And Now Ex-Twitter Boss J...,AJ Fabino,2022-06-10T22:11:36Z,https://www.benzinga.com/markets/cryptocurrenc...,"[TWTR, SQ]",https://www.benzinga.com/amp/content/27657693,https://cdn.benzinga.com/files/images/story/20...,"Jack Dorsey, co-founder of Twitter Inc (NYSE: ...","[News, Cryptocurrency, Top Stories, Markets, T...",
3,LhaSJotGj7zJvx-F1f6RvF2qwWH2HNxgOIzReA6OAT8,"{'name': 'The Motley Fool', 'homepage_url': 'h...",Why MGM Stock Got Rocked Today,newsfeedback@fool.com (Eric Volkman),2022-06-10T22:10:01Z,https://www.fool.com/investing/2022/06/10/why-...,[MGM],,https://g.foolcdn.com/editorial/images/684598/...,Investors reacted sharply to news of the compa...,[investing],
4,xX64VaogRU7SOvDXrGInsbpDWkiclc9yjA9D6RD4CoI,"{'name': 'MarketWatch', 'homepage_url': 'https...","After petition for leadership changes, Coinbas...",MarketWatch,2022-06-10T22:09:00Z,https://www.marketwatch.com/story/after-petiti...,"[COIN, TWTR]",https://www.marketwatch.com/amp/story/after-pe...,https://images.mktw.net/im-542990/social,Coinbase Global Inc. has had a rough stretch l...,,


In [8]:
df.to_csv('gs://polygonio-news-sentiment-test/test3.csv')

In [9]:
publisher = set()
for pub in df['publisher']:
    publisher.add(pub['name'])

print(publisher)

{'MarketWatch', 'The Motley Fool', 'Benzinga', 'GlobeNewswire Inc.', 'Invezz'}


# Validate data

# ML Model

In [8]:
from transformers import BertTokenizer, BertForSequenceClassification, pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
import transformers
transformers.__version__

'4.19.4'

In [10]:
finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone',num_labels=3)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')

Downloading: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 533/533 [00:00<00:00, 305kB/s]
Downloading: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 419M/419M [00:06<00:00, 71.6MB/s]
Downloading: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 221k/221k [00:00<00:00, 1.88MB/s]


In [11]:
nlp = pipeline("text-classification", model=finbert, tokenizer=tokenizer)
results = nlp(['growth is strong and we have plenty of liquidity.', 
               'there is a shortage of capital, and we need extra financing.',
              'formulation patents might protect Vasotec to a limited extent.'])

In [12]:
results

[{'label': 'Positive', 'score': 1.0},
 {'label': 'Negative', 'score': 0.9952379465103149},
 {'label': 'Neutral', 'score': 0.9979718327522278}]

In [13]:
headlines = df['title'].tolist()

In [14]:
results = nlp(headlines)

In [15]:
results

[{'label': 'Neutral', 'score': 0.9563222527503967},
 {'label': 'Positive', 'score': 0.9543068408966064}]

In [16]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [17]:
tokenizer2 = AutoTokenizer.from_pretrained("ProsusAI/finbert",num_labels=3)
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

Downloading: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 252/252 [00:00<00:00, 67.8kB/s]
Downloading: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 758/758 [00:00<00:00, 350kB/s]
Downloading: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 226k/226k [00:00<00:00, 1.60MB/s]
Downloading: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 112/112 [00:00<00:00, 32.8kB/s]
Downloading: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 418M/418M [00:05<00:00, 87.0MB/s]


In [18]:
nlp2 = pipeline("text-classification", model=model, tokenizer=tokenizer)
results2 = nlp2(['growth is strong and we have plenty of liquidity.', 
               'there is a shortage of capital, and we need extra financing.',
              'formulation patents might protect Vasotec to a limited extent.'])

In [19]:
results2

[{'label': 'neutral', 'score': 0.7334730625152588},
 {'label': 'neutral', 'score': 0.7869897484779358},
 {'label': 'neutral', 'score': 0.7128838300704956}]

# Deploy API