# To develop and test the app

In [1]:
%load_ext autoreload
%autoreload 2


In [None]:
!pip install yfinance pandas numpy networkx nltk scikit-learn statsmodels transformers newspaper3k


In [41]:
# News-Driven Cointegrated Stock Pairs Finder

# =======================================
# STEP 1: Install & Import Dependencies
# =======================================

import yfinance as yf
import pandas as pd
import numpy as np
import networkx as nx
import re
import nltk

from datetime import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import SpectralClustering
from statsmodels.tsa.stattools import adfuller, coint
from newspaper import Article
from newspaper import build
nltk.download('punkt')


[nltk_data] Downloading package punkt to /Users/nemesis/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [15]:
# =======================================
# STEP 2: Scrape and Process News Articles
# =======================================
def scrape_news_articles(source_url, max_articles=30):
    paper = build(source_url, memoize_articles=False)
    articles = []
    for article in paper.articles[:max_articles]:
        try:
            article.download()
            article.parse()
            article.nlp()
            articles.append({"title": article.title, "text": article.text})
        except:
            continue
    return articles

# Example: Reuters
news_articles = scrape_news_articles('https://www.reuters.com')

# =======================================
# STEP 3: Extract Tickers / Company Mentions
# =======================================
# Simplified static list for demo; replace with Named Entity Linking later
company_tickers = {
    'Apple': 'AAPL', 'Microsoft': 'MSFT', 'Google': 'GOOGL',
    'Amazon': 'AMZN', 'Meta': 'META', 'Tesla': 'TSLA',
    'Nvidia': 'NVDA', 'AMD': 'AMD', 'Intel': 'INTC', 'Netflix': 'NFLX'
}

def extract_mentions(text):
    mentioned = []
    for name in company_tickers:
        if re.search(rf'\b{name}\b', text, re.IGNORECASE):
            mentioned.append(company_tickers[name])
    return mentioned

co_mentions = []
for article in news_articles:
    mentions = extract_mentions(article['text'])
    if len(mentions) > 1:
        co_mentions.append(mentions)
        


# =======================================
# STEP 4: Build Co-occurrence Graph
# =======================================
G = nx.Graph()
for pair_list in co_mentions:
    for i in range(len(pair_list)):
        for j in range(i+1, len(pair_list)):
            a, b = pair_list[i], pair_list[j]
            if G.has_edge(a, b):
                G[a][b]['weight'] += 1
            else:
                G.add_edge(a, b, weight=1)

# Visualize top co-occurrences
print("Top connected pairs:")
print(sorted(G.edges(data=True), key=lambda x: -x[2]['weight'])[:5])



Top connected pairs:
[]


In [16]:
# =======================================
# STEP 5: Cluster Related Stocks
# =======================================
# Convert to adjacency matrix
adj_matrix = nx.to_numpy_array(G)
tickers = list(G.nodes)

n_clusters = min(3, len(tickers))
clustering = SpectralClustering(n_clusters=n_clusters, affinity='precomputed')
labels = clustering.fit_predict(adj_matrix)

clusters = {}
for i, label in enumerate(labels):
    clusters.setdefault(label, []).append(tickers[i])

print("\nIdentified Clusters:")
print(clusters)



InvalidParameterError: The 'n_clusters' parameter of SpectralClustering must be an int in the range [1, inf). Got 0 instead.

In [17]:



# =======================================
# STEP 6: Test Cointegration in Clusters
# =======================================
def test_cointegration(ticker1, ticker2):
    data = yf.download([ticker1, ticker2], start="2022-01-01")['Adj Close'].dropna()
    if len(data.columns) < 2:
        return None
    series1, series2 = data.iloc[:, 0], data.iloc[:, 1]
    score, pvalue, _ = coint(series1, series2)
    return pvalue

results = []
for cluster in clusters.values():
    for i in range(len(cluster)):
        for j in range(i+1, len(cluster)):
            t1, t2 = cluster[i], cluster[j]
            pval = test_cointegration(t1, t2)
            if pval is not None:
                results.append((t1, t2, pval))

cointegrated_pairs = [(a, b, p) for (a, b, p) in results if p < 0.05]

print("\nLikely Cointegrated Pairs:")
print(cointegrated_pairs)

# =======================================
# (Optional) STEP 7: Sentiment Analysis on Articles
# =======================================
# For simplicity, we skip this here — in the repo we will use FinBERT

# END OF NOTEBOOK


NameError: name 'clusters' is not defined

# Main

In [50]:
# import modules
import logging

# import my modules
import etl.extract as ext
import etl.load as ld
import etl.transform as tfm
from utils.logger import setup_logger


# Extract
df_raw = ext.extract_tweets("data/stock_tweets/tweets_small.csv")

# Transform
df_valid, df_clean = tfm.transform(df_raw)
# tfm.transform(df_raw)

# Load
ld.load_to_sqlite(df_clean, db_path="data/tweets.db")


In [52]:
# read from database
import pandas as pd
import sqlite3
import json

def safe_json_load(x):
    try:
        if isinstance(x, str) and x.strip().startswith('['):
            return json.loads(x)
        return x
    except json.JSONDecodeError:
        return [] 
    
# Read sqlite query results into a pandas DataFrame
conn = sqlite3.connect("data/tweets.db")
df = pd.read_sql_query("SELECT * from processed_tweets", conn)

# Apply to column
df['tokens'] = df['tokens'].apply(safe_json_load)

conn.close()
# Verify that result of SQL query is stored in the dataframe
df.head()



Unnamed: 0,id,author,post_date,body,comment_num,retweet_num,like_num,clean_text,tokens
0,550441509175443456,VisualStockRSRC,2015-01-01T00:00:57,"lx21 made $10,008 on $AAPL -Check it out! htt...",0,0,1,"lx21 made $10,008 on $aapl -check it out! le...","[lx21, made, 10,008, aapl, -check, learn, exe,..."
1,550441672312512512,KeralaGuy77,2015-01-01T00:01:36,Insanity of today weirdo massive selling. $aap...,0,0,0,insanity of today weirdo massive selling. $aap...,"[insanity, today, weirdo, massive, selling, aa..."
2,550441732014223360,DozenStocks,2015-01-01T00:01:50,S&P100 #Stocks Performance $HD $LOW $SBUX $TGT...,0,0,0,s&p100 performance $hd $low $sbux $tgt $dvn $...,"[p100, performance, low, sbux, tgt, dvn, ibm, ..."
3,550442977802207232,ShowDreamCar,2015-01-01T00:06:47,$GM $TSLA: Volkswagen Pushes 2014 Record Recal...,0,0,1,$gm $tsla: volkswagen pushes 2014 record recal...,"[tsla, volkswagen, pushes, 2014, record, recal..."
4,550443807834402816,i_Know_First,2015-01-01T00:10:05,Swing Trading: Up To 8.91% Return In 14 Days h...,0,0,1,swing trading: up to 8.91% return in 14 days ...,"[swing, trading, 8.91, return, days, mww, aapl..."
