# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import nltk
import yfinance as yf
import requests
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from transformers import pipeline
import xgboost as xgb
import re


  from .autonotebook import tqdm as notebook_tqdm





# Data Collection

In [None]:
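# TODO: load the data here (either the full or the filtered dataset). The cells below expect `news_df` (with a 'title' column) and `stock_df` (with a 'Benchmark' column) to already exist.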

# Process News Articles

In [None]:
# clean_text() relies on two NLTK resources: the English stopword list and
# the Punkt tokenizer data that word_tokenize() loads. Fetch both up front so
# a fresh environment does not fail with LookupError on the first tokenize.
nltk.download('stopwords')
nltk.download('punkt')
# Newer NLTK releases look up 'punkt_tab' instead of 'punkt'; on older
# releases this call just reports an unknown package and returns False.
nltk.download('punkt_tab')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# A set gives O(1) membership tests when filtering tokens.
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalise a headline into a space-separated string of tokens.

    The text is lowercased, every non-word character is replaced by a space,
    runs of digits are removed, the result is tokenized with
    ``word_tokenize`` and tokens found in ``stop_words`` are dropped.

    Args:
        text: Raw headline. Anything that is not a ``str`` (e.g. ``None`` or
            a NaN coming from a missing DataFrame cell) is treated as missing.

    Returns:
        The cleaned text, or an empty string when the input is missing.
    """
    # Missing titles arrive as None/NaN; calling .lower() on them would raise.
    if not isinstance(text, str):
        return ""
    text = re.sub(r'\W', ' ', text.lower())  # Remove special characters
    text = re.sub(r'\d+', '', text)  # Remove numbers
    # Remove stopwords
    return " ".join(tok for tok in word_tokenize(text) if tok not in stop_words)

# Store the normalised headline next to the raw one, then preview the frame.
news_df['clean_title'] = news_df['title'].apply(func=clean_text)
print(news_df.head(n=5))


In [None]:
# Text-classification pipeline backed by the ProsusAI/finbert checkpoint;
# get_sentiment() calls it once per headline.
sentiment_pipeline = pipeline(
    task="text-classification",
    model="ProsusAI/finbert",
)

def get_sentiment(text):
    """Map the sentiment pipeline's top label for *text* to a signed score.

    Returns 1 for a 'positive' label, -1 for a 'negative' label and 0 for
    any other label.
    """
    top_label = sentiment_pipeline(text)[0]['label']
    score_by_label = {'positive': 1, 'negative': -1}
    # Any label not listed above falls back to 0.
    return score_by_label.get(top_label, 0)

# Score every cleaned headline (one pipeline call per row) and preview.
news_df['sentiment'] = news_df['clean_title'].apply(func=get_sentiment)
print(news_df.head(n=5))

# Feature Engineering

In [None]:
# Fit TF-IDF on the cleaned headlines, capping the vocabulary at 100 terms.
vectorizer = TfidfVectorizer(max_features=100)
X_tfidf = vectorizer.fit_transform(news_df['clean_title']).toarray()

# Convert to DataFrame. Reuse news_df's index: pd.concat(axis=1) aligns rows
# by index label, so a default RangeIndex here would misalign the features
# (and introduce NaN rows) whenever news_df is indexed by anything else,
# e.g. dates or a filtered subset.
# NOTE(review): a vocabulary term equal to an existing column name (such as
# 'title' or 'sentiment') would create duplicate columns — confirm this
# cannot happen with the real data.
tfidf_df = pd.DataFrame(
    X_tfidf,
    columns=vectorizer.get_feature_names_out(),
    index=news_df.index,
)
news_df = pd.concat([news_df, tfidf_df], axis=1)

In [None]:
# Inner-join news rows with the stock 'Benchmark' column on the index.
# This assumes both frames share the same timestamps as index; otherwise the
# news dates must first be mapped to the closest stock date.
final_df = pd.merge(
    left=news_df,
    right=stock_df[['Benchmark']],
    how='inner',
    left_index=True,
    right_index=True,
)

# Model Training

In [None]:
# Target is the 'Benchmark' column; features are everything except the
# target and the two raw-text columns.
y = final_df['Benchmark']
X = final_df.drop(['title', 'clean_title', 'Benchmark'], axis=1)

# Hold out 20% of the rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Gradient-boosted classifier: 100 trees of depth 3, learning rate 0.1.
xgb_model = xgb.XGBClassifier(
    learning_rate=0.1,
    max_depth=3,
    n_estimators=100,
)
xgb_model.fit(X_train, y_train)

# Model Evaluation

In [None]:
# Score the fitted model on the held-out split.
y_pred = xgb_model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

# Predict on New Articles

In [None]:
def predict_stock_movement(news_title):
    """Predict the stock direction implied by a single news headline.

    The headline goes through the same steps as the training data:
    ``clean_text``, ``get_sentiment`` and the fitted TF-IDF ``vectorizer``.
    The resulting features are then arranged in the column order of
    ``X_train`` before being passed to ``xgb_model``.

    Args:
        news_title: Raw headline text.

    Returns:
        "Stock is likely to go UP" when the model predicts class 1,
        otherwise "Stock is likely to go DOWN".
    """
    cleaned_text = clean_text(news_title)
    sentiment = get_sentiment(cleaned_text)

    # Convert to TF-IDF vector, keeping the term names as column labels.
    features = pd.DataFrame(
        vectorizer.transform([cleaned_text]).toarray(),
        columns=vectorizer.get_feature_names_out(),
    )
    features['sentiment'] = sentiment

    # In the training frame 'sentiment' comes before the TF-IDF columns, so
    # appending it last would feed values to the wrong features. Reorder by
    # name to match X_train exactly.
    # NOTE(review): any training column that cannot be derived from a
    # headline alone is filled with 0 here — confirm that is acceptable.
    input_data = features.reindex(columns=X_train.columns, fill_value=0)

    prediction = xgb_model.predict(input_data)[0]
    return "Stock is likely to go UP" if prediction == 1 else "Stock is likely to go DOWN"

# Example Prediction: run the full pipeline on one sample headline.
print(
    predict_stock_movement(
        "Apple releases record-breaking earnings report"
    )
)
