# Cointegrated stock pairs finder

In [None]:
!pip install yfinance pandas numpy networkx nltk scikit-learn statsmodels transformers newspaper3k

In [None]:
# News-Driven Cointegrated Stock Pairs Finder

# =======================================
# STEP 1: Install & Import Dependencies
# =======================================

import yfinance as yf
import pandas as pd
import numpy as np
import networkx as nx
import re
import nltk

from datetime import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import SpectralClustering
from statsmodels.tsa.stattools import adfuller, coint
from newspaper import Article
from newspaper import build
nltk.download('punkt')


# =======================================
# STEP 2: Scrape and Process News Articles
# =======================================
def scrape_news_articles(source_url, max_articles=30):
    """Download and parse articles discovered on a news site.

    Args:
        source_url: URL of the news source passed to newspaper's ``build``.
        max_articles: Upper bound on how many discovered articles are
            attempted (the first ``max_articles`` entries of
            ``paper.articles``).

    Returns:
        A list of ``{"title": ..., "text": ...}`` dicts, one for each article
        whose download, parse and NLP steps all completed without error.
        Articles that fail any step are skipped.
    """
    paper = build(source_url, memoize_articles=False)
    articles = []
    for article in paper.articles[:max_articles]:
        try:
            article.download()
            article.parse()
            article.nlp()
        except Exception:
            # Best-effort scraping: one broken article must not abort the
            # run. Catching Exception (rather than a bare except) still lets
            # KeyboardInterrupt / SystemExit stop a long scrape.
            continue
        articles.append({"title": article.title, "text": article.text})
    return articles

# Example run: scrape the Reuters home page with the default article cap
# (max_articles=30) and keep the parsed articles for the later steps.
news_articles = scrape_news_articles('https://www.reuters.com')

# =======================================
# STEP 3: Extract Tickers / Company Mentions
# =======================================
# Simplified static list for demo; replace with Named Entity Linking later
# Company display name -> ticker symbol. Names are matched as whole words,
# case-insensitively, by extract_mentions().
company_tickers = {
    'Apple': 'AAPL', 'Microsoft': 'MSFT', 'Google': 'GOOGL',
    'Amazon': 'AMZN', 'Meta': 'META', 'Tesla': 'TSLA',
    'Nvidia': 'NVDA', 'AMD': 'AMD', 'Intel': 'INTC', 'Netflix': 'NFLX'
}

def extract_mentions(text):
    """Return the tickers of all known companies named in ``text``.

    Each key of ``company_tickers`` is searched for as a whole word,
    ignoring case.

    Args:
        text: Article text to scan.

    Returns:
        A list of ticker symbols in ``company_tickers`` order, each ticker
        appearing at most once even if several names map to it.
    """
    mentioned = []
    for name, ticker in company_tickers.items():
        # re.escape keeps names containing regex metacharacters (e.g. a '.')
        # literal instead of letting them act as wildcards.
        pattern = rf'\b{re.escape(name)}\b'
        if ticker in mentioned:
            # Another alias of the same company already matched; a duplicate
            # ticker would later create a self-loop in the co-occurrence graph.
            continue
        if re.search(pattern, text, re.IGNORECASE):
            mentioned.append(ticker)
    return mentioned

# One ticker list per article, kept only when the article names at least two
# tracked companies (a single mention cannot form a pair).
co_mentions = [
    found
    for found in (extract_mentions(item['text']) for item in news_articles)
    if len(found) > 1
]
        


# =======================================
# STEP 4: Build Co-occurrence Graph
# =======================================
# Undirected graph whose nodes are tickers; an edge's weight counts the
# articles in which both of its tickers were mentioned.
G = nx.Graph()
for mentioned in co_mentions:
    for pos, first in enumerate(mentioned):
        # Pair each ticker only with the ones after it, so every unordered
        # pair in the article is visited exactly once.
        for second in mentioned[pos + 1:]:
            if not G.has_edge(first, second):
                G.add_edge(first, second, weight=0)
            G[first][second]['weight'] += 1

# Show the five heaviest co-occurrence edges
print("Top connected pairs:")
print(sorted(G.edges(data=True), key=lambda edge: -edge[2]['weight'])[:5])

# =======================================
# STEP 5: Cluster Related Stocks
# =======================================
# The node order fixes both the row/column order of the adjacency matrix and
# the index used to map cluster labels back to tickers.
tickers = list(G.nodes)
clusters = {}

if tickers:
    # Edge weights (co-mention counts) act as the precomputed affinity.
    adj_matrix = nx.to_numpy_array(G, nodelist=tickers)

    # Never ask for more clusters than there are tickers.
    n_clusters = min(3, len(tickers))
    clustering = SpectralClustering(n_clusters=n_clusters, affinity='precomputed')
    labels = clustering.fit_predict(adj_matrix)

    for ticker, label in zip(tickers, labels):
        # int() turns the NumPy label into a plain Python int dict key.
        clusters.setdefault(int(label), []).append(ticker)
else:
    # No article mentioned two tracked companies, so the graph is empty and
    # spectral clustering cannot run; continue with no clusters.
    print("\nNo co-mentioned tickers found; skipping clustering.")

print("\nIdentified Clusters:")
print(clusters)



# =======================================
# STEP 6: Test Cointegration in Clusters
# =======================================
def test_cointegration(ticker1, ticker2, start="2022-01-01", min_obs=30):
    """Run an Engle-Granger cointegration test on two tickers' prices.

    Args:
        ticker1: First ticker symbol.
        ticker2: Second ticker symbol.
        start: First date of the price history to download.
        min_obs: Minimum number of rows with prices for both tickers that
            must remain after dropping missing values for the test to run.

    Returns:
        The p-value reported by ``statsmodels`` ``coint``, or ``None`` when
        fewer than two price columns or fewer than ``min_obs`` complete rows
        are available.
    """
    # auto_adjust=False is passed explicitly so the 'Adj Close' column is
    # present; newer yfinance releases default to auto_adjust=True, which
    # drops that column.
    prices = yf.download(
        [ticker1, ticker2], start=start, auto_adjust=False
    )['Adj Close'].dropna()

    # A ticker with no data leaves nothing after dropna(); coint() would
    # raise on an empty or very short sample, so report "no result" instead.
    if len(prices.columns) < 2 or len(prices) < min_obs:
        return None

    series1, series2 = prices.iloc[:, 0], prices.iloc[:, 1]
    _, pvalue, _ = coint(series1, series2)
    return pvalue

# Test every unordered pair of tickers that share a cluster; pairs for which
# no p-value could be computed are left out.
results = []
for members in clusters.values():
    for idx, t1 in enumerate(members):
        for t2 in members[idx + 1:]:
            pval = test_cointegration(t1, t2)
            if pval is None:
                continue
            results.append((t1, t2, pval))

# Keep the pairs whose p-value is below the 5% significance level.
cointegrated_pairs = [entry for entry in results if entry[2] < 0.05]

print("\nLikely Cointegrated Pairs:")
print(cointegrated_pairs)

# =======================================
# (Optional) STEP 7: Sentiment Analysis on Articles
# =======================================
# For simplicity, we skip this here — in the repo we will use FinBERT

# END OF NOTEBOOK