Tải thư viện chưa có trên Colab

In [None]:
# Install the third-party packages imported by the cells below.
!pip install yfinance
!pip install feedparser
!pip install ta
!pip install transformers
!pip install joblib
# NOTE(review): upgrading numpy inside an already-running kernel usually needs a
# runtime restart before the new version is picked up — confirm on Colab.
!pip install --upgrade numpy



# Import thư viện

In [None]:
import yfinance as yf
import numpy as np
import feedparser
import requests
import pandas as pd
import time
import ta
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from transformers import AutoTokenizer, AutoModel
import torch
import joblib

# Cào dữ liệu

### cào stock data

In [None]:
# Destination of the price-history CSV.
output_path = 'AAPL.csv'

# Fetch daily AAPL bars for the fixed 1980-12-12 .. 2025-06-07 window.
aapl = yf.Ticker("AAPL")
df = aapl.history(interval="1d", start="1980-12-12", end="2025-06-07")

# Drop the two columns 'Dividends' and 'Stock Splits'.
data_cleaned = df.drop(['Dividends', 'Stock Splits'], axis=1)

# Save to CSV (the frame's index is written as the first column).
data_cleaned.to_csv(output_path)


### cào dữ liệu tin tức tài chính

In [None]:
def fetch_yahoo_finance_rss(ticker="AAPL"):
    """Download the Yahoo Finance RSS headlines for ``ticker`` and save them as CSV.

    Parameters
    ----------
    ticker : str
        Symbol interpolated into the feed URL and into the output file name.

    Returns
    -------
    pandas.DataFrame
        One row per feed entry with the columns ``title``, ``link``,
        ``published`` and ``summary``. The same frame is written to
        ``{ticker}_yahoo_news.csv`` (without the index).
    """
    columns = ["title", "link", "published", "summary"]
    rss_url = f"https://feeds.finance.yahoo.com/rss/2.0/headline?s={ticker}&region=US&lang=en-US"
    feed = feedparser.parse(rss_url)

    # A feed item is not guaranteed to carry every field; attribute access
    # raises AttributeError on a missing one, so fall back to an empty string.
    articles = [
        {field: entry.get(field, "") for field in columns}
        for entry in feed.entries
    ]

    # Passing the columns explicitly keeps the CSV header even when the feed
    # returned no entries, so a later read of the file still finds 'summary'.
    df = pd.DataFrame(articles, columns=columns)
    df.to_csv(f"{ticker}_yahoo_news.csv", index=False)
    print(f"✅ Lưu {len(df)} bài viết vào file: {ticker}_yahoo_news.csv")
    return df

# Smoke test: run the scraper for AAPL (writes AAPL_yahoo_news.csv).
fetch_yahoo_finance_rss("AAPL")

# Đọc dữ liệu

In [None]:
# Reload the price history from 'AAPL.csv' and preview the first rows.
stock = pd.read_csv('AAPL.csv')
stock.head()

# DATA PREPROCESSING

### Thống kê missing & invalid.

In [None]:
# Rows that contain at least one missing (NaN) value.
has_missing = stock.isna().any(axis='columns')
rows_with_missing = stock.loc[has_missing]
missing_count = rows_with_missing.shape[0]

# Rows that contain at least one invalid value (an empty string).
has_invalid = stock.eq('').any(axis='columns')
rows_with_invalid = stock.loc[has_invalid]
invalid_count = rows_with_invalid.shape[0]

# Summary counts.
print("Missing Count:", missing_count)
print("Invalid Count:", invalid_count)

# Print the offending rows, but only for the categories that have any.
for label, count, rows in (
    ("Rows with Missing:", missing_count, rows_with_missing),
    ("Rows with Invalid:", invalid_count, rows_with_invalid),
):
    if count > 0:
        print(label)
        print(rows)


### SMA50 và SMA200

In [None]:
# Simple moving averages of Close over 50-row and 200-row windows.
for window in (50, 200):
    stock[f'SMA{window}'] = stock['Close'].rolling(window=window).mean()

# Write the updated frame back to the CSV file (overwriting it).
stock.to_csv('AAPL.csv', index=False)
stock.tail()

### RSI

In [None]:
# RSI of Close with a 14-row window, computed by the `ta` package.
rsi_indicator = ta.momentum.RSIIndicator(close=stock['Close'], window=14)
stock['RSI'] = rsi_indicator.rsi()

# Persist the augmented frame, then preview the last rows.
stock.to_csv('AAPL.csv', index=False)
stock.tail()

### Lag-1 và Lag-2

In [None]:
# Close shifted down by 1 and by 2 rows (the first rows become NaN).
for lag in (1, 2):
    stock[f'Close_Lag_{lag}'] = stock['Close'].shift(lag)

# Persist the augmented frame, then preview the first rows.
stock.to_csv('AAPL.csv', index=False)
stock.head()

### Rolling Mean và Rolling Std

In [None]:
# One shared 5-row rolling window over Close feeds both statistics.
close_rolling_5 = stock['Close'].rolling(window=5)
stock['Close_Rolling_Mean_5'] = close_rolling_5.mean()
stock['Close_Rolling_Std_5'] = close_rolling_5.std()

# Persist the augmented frame; 7 rows are shown so the first filled values are visible.
stock.to_csv('AAPL.csv', index=False)
stock.head(7)

### Fill Nan (sử dụng KNN imputation)

In [None]:
# Only the float64/int64 columns are handed to the imputer.
numeric_df = stock.select_dtypes(include=['float64', 'int64'])

# Apply KNN imputation with k = 5
# NOTE(review): the NaNs present at this point come from the rolling/shift
# features' warm-up rows, so they are filled from other rows' values —
# confirm this is intended rather than dropping those rows.
imputer = KNNImputer(n_neighbors=5)
imputed_data = imputer.fit_transform(numeric_df)

# Assign the imputed values back into the original dataframe; columns that
# were not selected above (presumably the Date column) are left unchanged.
stock[numeric_df.columns] = imputed_data
stock.to_csv('AAPL.csv', index=False)

stock.head()


In [None]:
# Load the FinBERT tokenizer and base model, then switch the model to eval mode.
model_name = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model.eval()

# Load the news data. Note that `df` is rebound here: it previously held the
# price history returned by yfinance.
df = pd.read_csv("AAPL_yahoo_news.csv")
# Missing summaries become empty strings so every row can be tokenized.
texts = df['summary'].fillna("").tolist()

# FinBERT embedding helper (768-dimensional according to the original note).
def get_finbert_embedding(text):
    """Return the attention-masked mean of the model's last hidden states.

    Uses the module-level ``tokenizer`` and ``model``. The text is truncated
    to at most 512 tokens and the forward pass runs without gradient tracking.

    Parameters
    ----------
    text : str
        Text to embed.

    Returns
    -------
    numpy.ndarray
        Mean over the token axis of the last hidden state, weighted by the
        attention mask.
    """
    encoded = tokenizer(text, truncation=True, max_length=512, return_tensors="pt")
    with torch.no_grad():
        # squeeze(0) removes the leading batch axis of the single input.
        hidden = model(**encoded).last_hidden_state.squeeze(0)

    mask = encoded['attention_mask'].squeeze(0)
    # Zero out masked positions, then average over the attended tokens only.
    summed = (hidden * mask.unsqueeze(1)).sum(0)
    token_count = mask.sum()
    return (summed / token_count).numpy()

# Compute one embedding per summary (one model forward pass per text).
embeddings = list(map(get_finbert_embedding, texts))

# Save a CSV file that contains only the embeddings.
# NOTE(review): this is the same path the summaries were read from, so the raw
# news CSV is replaced by the embedding matrix — confirm this is intended.
embedding_df = pd.DataFrame(embeddings)
embedding_df.to_csv("AAPL_yahoo_news.csv", index=False)

# Save a .pkl file holding the embedding <-> original summary mapping via joblib.
encoding_key = dict(embedding=embeddings, original_summary=texts)
joblib.dump(encoding_key, "AAPL_yahoo_news_key.pkl")

In [None]:
# Extra packages imported by the next cell (captum.attr, wordcloud).
!pip install captum
!pip install wordcloud

In [None]:
import os
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from captum.attr import IntegratedGradients
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

# Output folder for the saved figures; exist_ok=True makes re-running this cell safe.
os.makedirs('visuals', exist_ok=True)

# DAVOTS Analysis Functions
Implementing Data Attribution Visualization Over Time Series using Integrated Gradients

In [None]:
def calculate_feature_attributions(model, features, target):
    """Compute Integrated Gradients attributions of ``model`` for ``features``.

    Parameters
    ----------
    model : callable
        Forward function / module wrapped by ``IntegratedGradients``.
    features : torch.Tensor or tuple of torch.Tensor
        Inputs passed to ``IntegratedGradients.attribute``.
    target :
        Forwarded unchanged as the ``target`` argument of ``attribute``.

    Returns
    -------
    numpy.ndarray or tuple of numpy.ndarray
        The attributions converted to NumPy; a tuple is returned when
        ``attribute`` itself returns a tuple.
    """
    ig = IntegratedGradients(model)
    attributions = ig.attribute(features, target=target)

    def _to_numpy(tensor):
        # .cpu() is a no-op for CPU tensors but is required before .numpy()
        # when the tensor lives on an accelerator.
        return tensor.detach().cpu().numpy()

    # A tuple of inputs yields a tuple of attributions, which has no .detach().
    if isinstance(attributions, tuple):
        return tuple(_to_numpy(a) for a in attributions)
    return _to_numpy(attributions)

def create_davots_heatmap(attributions, feature_names, timestamps):
    """Build the DAVOTS attribution heatmap as a Plotly figure.

    Parameters
    ----------
    attributions : array-like, 2-D
        Attribution values; they are fed to ``MinMaxScaler.fit_transform``.
    feature_names : sequence
        Labels placed on the x axis.
    timestamps : sequence
        Labels placed on the y axis.

    Returns
    -------
    plotly.graph_objects.Figure
        Heatmap whose cells show the scaled values rounded to one decimal.
    """
    # MinMaxScaler rescales each column independently to [0, 1]; multiplying
    # by 100 puts the values on a 0-100 scale for display.
    attr_pct = MinMaxScaler().fit_transform(attributions) * 100

    heatmap = go.Heatmap(
        z=attr_pct,
        x=feature_names,
        y=timestamps,
        colorscale='Viridis',
        text=np.round(attr_pct, 1),
        texttemplate='%{text}%',
        textfont=dict(size=10),
        colorbar={'title': 'Attribution %'},
    )

    layout = {
        'title': 'DAVOTS Feature Attribution Heatmap',
        'xaxis_title': 'Features',
        'yaxis_title': 'Timestamp',
        'height': 800,
    }

    fig = go.Figure(data=heatmap)
    fig.update_layout(**layout)
    return fig

# ICFTS Analysis Functions
Implementing Interventional Counterfactual Time Series analysis

In [None]:
def generate_counterfactuals(sentiment_series, perturbations=(0.05, 0.10)):
    """Scale a series up and down by each relative perturbation.

    Parameters
    ----------
    sentiment_series : array-like
        Values to perturb. NumPy arrays and pandas Series are multiplied as
        they are; a plain list or tuple is first converted to a float array.
    perturbations : iterable of float
        Relative changes, e.g. ``0.05`` for 5 %.

    Returns
    -------
    dict
        For every ``p`` two entries, inserted in this order:
        ``'+<pct>%'`` -> ``sentiment_series * (1 + p)`` and
        ``'-<pct>%'`` -> ``sentiment_series * (1 - p)``.
    """
    # Plain Python sequences do not support multiplication by a float.
    if isinstance(sentiment_series, (list, tuple)):
        sentiment_series = np.asarray(sentiment_series, dtype=float)

    counterfactuals = {}
    for p in perturbations:
        # int(p * 100) truncates floating-point noise (0.29 * 100 is
        # 28.999...) and collapses fractional percentages such as 2.5 onto 2,
        # which mislabels series and can make keys collide. The 'g' format
        # rounds instead and leaves whole percentages looking the same.
        label = f'{p * 100:g}'
        counterfactuals[f'+{label}%'] = sentiment_series * (1 + p)
        counterfactuals[f'-{label}%'] = sentiment_series * (1 - p)
    return counterfactuals

def create_icfts_plot(base_prices, counterfactual_results, timestamps):
    """Plot the base price together with every counterfactual price series.

    Parameters
    ----------
    base_prices : sequence
        y values of the black 'Base Price' line.
    counterfactual_results : dict
        Mapping of scenario name -> y values; each entry becomes one line
        named ``'Sentiment <name>'``.
    timestamps : sequence
        x values shared by all lines.

    Returns
    -------
    plotly.graph_objects.Figure
    """
    from itertools import cycle  # local import keeps this cell self-contained

    fig = go.Figure()

    # Plot base price
    fig.add_trace(go.Scatter(
        x=timestamps,
        y=base_prices,
        name='Base Price',
        line=dict(color='black')
    ))

    # zip() stops at its shortest input, so pairing the scenarios with the
    # bare 4-colour list would silently drop every scenario after the fourth.
    # Cycling the palette plots all of them (colours repeat beyond four).
    colors = ['red', 'pink', 'lightblue', 'blue']
    for (name, prices), color in zip(counterfactual_results.items(), cycle(colors)):
        fig.add_trace(go.Scatter(
            x=timestamps,
            y=prices,
            name=f'Sentiment {name}',
            line=dict(color=color)
        ))

    fig.update_layout(
        title='ICFTS: Price Changes with Sentiment Perturbations',
        xaxis_title='Time',
        yaxis_title='Price',
        height=600
    )

    return fig


# Execute Visualizations
Create and save all required visualizations

In [None]:
# Example data preparation (you'll need to adapt this to your actual model data)
feature_names = ['Price', 'Volume', 'Sentiment', 'Macro']
timestamps = pd.to_datetime(stock['Date'].iloc[-30:]).dt.strftime('%Y-%m-%d')

# Placeholder attributions (replace with actual model outputs).
# A seeded generator makes the saved heatmap identical on every run, and the
# shape is derived from the labels so it cannot disagree with the axes
# (e.g. when fewer than 30 rows are available).
rng = np.random.default_rng(42)
attributions = rng.random((len(timestamps), len(feature_names)))

# Create and save DAVOTS heatmap
davots_fig = create_davots_heatmap(attributions, feature_names, timestamps)
davots_fig.write_html('visuals/davots_heatmap.html')

# Create and save ICFTS causal plot
# NOTE(review): the closing prices themselves are passed as the series to
# perturb, i.e. the "counterfactual" lines are the prices scaled by ±5 %/±10 %
# rather than model re-predictions — presumably a placeholder; confirm.
base_prices = stock['Close'].iloc[-30:].values
counterfactual_results = generate_counterfactuals(base_prices)
icfts_fig = create_icfts_plot(base_prices, counterfactual_results, timestamps)
icfts_fig.write_html('visuals/icfts_causal_plot.html')

# Line chart of Close and RSI for the last 30 rows, plotted against the row index.
recent_stock = stock[-30:]
line_fig = px.line(
    recent_stock,
    x=recent_stock.index,
    y=['Close', 'RSI'],
    title='Price vs Sentiment Indicator',
)
line_fig.write_html('visuals/line_price_sentiment.html')

# Row-over-row percentage change of Close, stored on the frame as 'Returns'.
stock['Returns'] = stock['Close'].pct_change()

# Histogram of the returns, drawn on an explicit figure/axes pair.
returns_fig, returns_ax = plt.subplots(figsize=(10, 6))
stock['Returns'].hist(bins=50, ax=returns_ax)
returns_ax.set_title('Distribution of Returns')
returns_ax.set_xlabel('Returns')
returns_ax.set_ylabel('Frequency')
returns_fig.savefig('visuals/histogram_returns.png')
plt.close(returns_fig)

# Join all news summaries (missing ones count as empty) into a single document.
text = ' '.join(df['summary'].fillna('').tolist())
cloud_builder = WordCloud(background_color='white', width=800, height=400)
wordcloud = cloud_builder.generate(text)

# Render the cloud without axes and save it as a PNG.
cloud_fig, cloud_ax = plt.subplots(figsize=(10, 5))
cloud_ax.imshow(wordcloud, interpolation='bilinear')
cloud_ax.axis('off')
cloud_fig.savefig('visuals/wordcloud_news.png')
plt.close(cloud_fig)