In [16]:
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime, timedelta
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from torch.nn.functional import softmax
import torch
import yfinance as yf
import psycopg2

In [19]:
# Input DB params
params = {
    'dbname':
    'user': 
    'password': 
    'host': 
    'port': 
}

conn = psycopg2.connect(**params)
cur = conn.cursor()

tweet_query = """
INSERT INTO Tweet_Headline (company_id, content, date) VALUES (%s, %s, %s) RETURNING tweet_id;
"""
analysis_query = """
INSERT INTO Analysis (tweet_id, sentiment_score, sentiment_label) VALUES (%s, %s, %s);
"""

In [3]:
# Get the company_id if it exists, add the company_id if not
def get_or_create_company_id(ticker_symbol, cur):
    # Check if the company exists
    cur.execute("SELECT company_id FROM company WHERE ticker_symbol = %s", (ticker_symbol,))
    result = cur.fetchone()
    
    if result:
        return result  # Return existing company_id
    else:
        # Get company info
        company_info = get_stock_info(ticker_symbol)
        cur.execute("INSERT INTO company (company_name, ticker_symbol, sector) VALUES (%s, %s, %s) RETURNING company_id",
                    (company_info['Name'], ticker_symbol, company_info['Sector']))
        company_id = cur.fetchone()[0]
        return company_id

In [5]:
# include stocks and news you want to extract
stocks = ['AMZN', 'TSLA', 'NVDA', 'AAPL', 'MSFT', 'META']
news = {}

for stock in stocks:
    url = f'https://finviz.com/quote.ashx?t={stock}&p=d'
    request = Request(url=url, headers={'user-agent': 'news_scraper'})
    response = urlopen(request)
    
    # parse the HTML content
    html = BeautifulSoup(response, features='html.parser')
    finviz_news_table = html.find(id='news-table')
    news[stock] = finviz_news_table

In [6]:
# helper function to parse datetime to a standardized format
def parse_datetime(datetime_str, current_date):
    if 'Today' in datetime_str or datetime_str.count('-') == 0:
        date_part = current_date.strftime('%b-%d-%y')
        time_part = datetime_str.replace('Today', '').strip()
    else:
        # Split into date and time parts for explicit dates
        date_part, time_part = datetime_str.split(' ')
        # Update the current date to this new date
        current_date = datetime.strptime(date_part, '%b-%d-%y')  # Assuming year 2024 for example purposes

    # Convert AM/PM times to 24-hour format and return standardized datetime string
    full_datetime_str = f"{date_part} {time_part}"
    full_datetime = datetime.strptime(full_datetime_str, '%b-%d-%y %I:%M%p')
    
    return full_datetime.strftime('%b-%d-%y %H:%M'), current_date

In [7]:
# extract headers and timestamp
news_extracted = []
current_date = datetime.now()

for stock, news_item in news.items():
    for row in news_item.findAll('tr'):
        try:
            headline = row.find('a', class_='tab-link-news').getText().strip() # headline
            datetime_str = row.find('td', align='right').text.strip() # date of article
            standardized_datetime, current_date = parse_datetime(datetime_str, current_date)
            source = row.find('div', class_='news-link-right').span.text.strip('()') # source of article
            news_extracted.append([stock, standardized_datetime, headline, source])
        except Exception as e:
            print(f"Error parsing: {e}")
        
# convert to dataframe
df = pd.DataFrame(news_extracted, columns=['Stock', 'Date', 'Headline', 'Source'])
df

Unnamed: 0,Stock,Date,Headline,Source
0,AMZN,Apr-12-24 04:00,A Once-in-a-Generation Investment Opportunity:...,Motley Fool
1,AMZN,Apr-12-24 03:22,"Amazon, eyeing up AI, adds Andrew Ng to its bo...",TechCrunch
2,AMZN,Apr-11-24 20:53,Amazon hosts pet extravaganza for third year,Chain Store Age
3,AMZN,Apr-11-24 17:53,Snowflake Stock Jumps as KeyBanc Initiates Cov...,Investopedia
4,AMZN,Apr-11-24 17:45,Amazon CEO Andy Jassy touts AI push in shareho...,Fox Business
...,...,...,...,...
595,META,Apr-04-24 12:14,Why Meta Platforms Stock Was Climbing Today,Motley Fool
596,META,Apr-04-24 10:23,"Meta Platforms Stock Has 15% Upside, According...",Motley Fool
597,META,Apr-04-24 09:08,5 Stocks Hedge Funds Are Investing In,Insider Monkey
598,META,Apr-04-24 07:05,Meta Platforms (META) Rose on Better-Than-Expe...,Insider Monkey


In [8]:
model_name = "ahmedrachid/FinancialBERT-Sentiment-Analysis"
tokenizer = AutoTokenizer.from_pretrained(model_name)
sentiment_pipeline = pipeline("sentiment-analysis", model=model_name, tokenizer=tokenizer)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
print(device)

cuda


In [9]:
df['Sentiment'] = df['Headline'].apply(lambda x: sentiment_pipeline(x))
df.head()

Unnamed: 0,Stock,Date,Headline,Source,Sentiment
0,AMZN,Apr-12-24 04:00,A Once-in-a-Generation Investment Opportunity:...,Motley Fool,"[{'label': 'neutral', 'score': 0.9996267557144..."
1,AMZN,Apr-12-24 03:22,"Amazon, eyeing up AI, adds Andrew Ng to its bo...",TechCrunch,"[{'label': 'neutral', 'score': 0.9979020357131..."
2,AMZN,Apr-11-24 20:53,Amazon hosts pet extravaganza for third year,Chain Store Age,"[{'label': 'neutral', 'score': 0.9998151659965..."
3,AMZN,Apr-11-24 17:53,Snowflake Stock Jumps as KeyBanc Initiates Cov...,Investopedia,"[{'label': 'neutral', 'score': 0.9930877089500..."
4,AMZN,Apr-11-24 17:45,Amazon CEO Andy Jassy touts AI push in shareho...,Fox Business,"[{'label': 'neutral', 'score': 0.9988156557083..."


In [12]:
def extract_label(row):
    return row[0]['label']

def extract_score(row):
    return row[0]['score']

df[['sentiment_label']] = df['Sentiment'].apply(lambda x: pd.Series(extract_label(x)))
df[['sentiment_score']] = df['Sentiment'].apply(lambda x: pd.Series(extract_score(x)))
df.head()

Unnamed: 0,Stock,Date,Headline,Source,Sentiment,sentiment_label,sentiment_score
0,AMZN,Apr-12-24 04:00,A Once-in-a-Generation Investment Opportunity:...,Motley Fool,"[{'label': 'neutral', 'score': 0.9996267557144...",neutral,0.999627
1,AMZN,Apr-12-24 03:22,"Amazon, eyeing up AI, adds Andrew Ng to its bo...",TechCrunch,"[{'label': 'neutral', 'score': 0.9979020357131...",neutral,0.997902
2,AMZN,Apr-11-24 20:53,Amazon hosts pet extravaganza for third year,Chain Store Age,"[{'label': 'neutral', 'score': 0.9998151659965...",neutral,0.999815
3,AMZN,Apr-11-24 17:53,Snowflake Stock Jumps as KeyBanc Initiates Cov...,Investopedia,"[{'label': 'neutral', 'score': 0.9930877089500...",neutral,0.993088
4,AMZN,Apr-11-24 17:45,Amazon CEO Andy Jassy touts AI push in shareho...,Fox Business,"[{'label': 'neutral', 'score': 0.9988156557083...",neutral,0.998816


In [20]:
for index, row in df.iterrows():
    company_id = get_or_create_company_id(row['Stock'], cur)
    cur.execute(tweet_query, (company_id, row['Headline'], row['Date']))
    tweet_id = cur.fetchone()[0]  # Get the generated tweet_id
    
    # Insert into Analysis table
    cur.execute(analysis_query, (tweet_id, row['sentiment_score'], row['sentiment_label']))

conn.commit()

In [22]:
cur.execute("SELECT * FROM analysis")
result = cur.fetchall()
print(result)

[(1, 2, 0.9996267557144165, 'neutral'), (2, 3, 0.9979020357131958, 'neutral'), (3, 4, 0.9998151659965515, 'neutral'), (4, 5, 0.9930877089500427, 'neutral'), (5, 6, 0.998815655708313, 'neutral'), (6, 7, 0.9996019005775452, 'positive'), (7, 8, 0.9260686635971069, 'positive'), (8, 9, 0.9994989633560181, 'negative'), (9, 10, 0.9997492432594299, 'neutral'), (10, 11, 0.999708354473114, 'neutral'), (11, 12, 0.9996335506439209, 'neutral'), (12, 13, 0.9988706707954407, 'neutral'), (13, 14, 0.9994862079620361, 'neutral'), (14, 15, 0.9991676807403564, 'positive'), (15, 16, 0.998798131942749, 'positive'), (16, 17, 0.9997796416282654, 'neutral'), (17, 18, 0.9997734427452087, 'neutral'), (18, 19, 0.9992006421089172, 'neutral'), (19, 20, 0.999681830406189, 'neutral'), (20, 21, 0.9996622800827026, 'neutral'), (21, 22, 0.9994404911994934, 'neutral'), (22, 23, 0.999573290348053, 'neutral'), (23, 24, 0.97506183385849, 'neutral'), (24, 25, 0.9978627562522888, 'neutral'), (25, 26, 0.9996750354766846, 'neut

In [None]:
conn.close()