In [9]:
import requests
from transformers import pipeline

# Set up your News API key here
news_api_key = 'bcae1bfec35f4e78a3cdc7accdbffd86'

# Function to get financial news from NewsAPI
def get_financial_news():
    url = f'https://newsapi.org/v2/everything?q=financial&apiKey={news_api_key}'
    response = requests.get(url)
    data = response.json()
    headlines = [article['title'] for article in data['articles']]  # Correct key to fetch titles
    return headlines

# Set up sentiment analysis pipeline from Hugging Face
sentiment_pipeline = pipeline('sentiment-analysis')

# Function to analyze sentiment of financial news
def analyze_sentiment(headlines):
    sentiments = []
    for headline in headlines:
        sentiment = sentiment_pipeline(headline)
        sentiments.append((headline, sentiment[0]['label'], sentiment[0]['score']))
    return sentiments

# Combine news data with sentiment analysis
def get_combined_data():
    # Get financial headlines
    headlines = get_financial_news()
    
    if not headlines:
        print("No headlines available.")
        return
    
    # Analyze sentiment
    sentiments = analyze_sentiment(headlines)
    
    # Combine headlines with their sentiment
    combined_data = [{"headline": headline, "sentiment": sentiment, "score": score} 
                     for headline, sentiment, score in sentiments]
    
    return combined_data

# Example usage
combined_data = get_combined_data()
if combined_data:
    for data in combined_data:
        print(f"Headline: {data['headline']} | Sentiment: {data['sentiment']} | Score: {data['score']}")


No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Headline: The US Consumer Financial Protection Bureau sues Zelle and four of its partner banks | Sentiment: NEGATIVE | Score: 0.9979087114334106
Headline: They Went After the Hawk Tuah Crypto Promoters. Now They're Suing Pump.Fun | Sentiment: NEGATIVE | Score: 0.9963162541389465
Headline: RFK Jr. Made a Million Dollars From His Anti-Vax Work That He Previously Claimed Was ‘Unpaid’ | Sentiment: NEGATIVE | Score: 0.9959858059883118
Headline: I Have No Idea What Peter Thiel Is Trying to Say and It’s Making Me Really Uncomfortable | Sentiment: NEGATIVE | Score: 0.9994261264801025
Headline: I'm a financial writer, but I'm not teaching my teenagers about investing. It's hard to get excited about stocks before you have an income. | Sentiment: NEGATIVE | Score: 0.9983502626419067
Headline: Yes, You Can Get Up to $4,000 Now With a Tax Refund Advance -- but It's Risky | Sentiment: NEGATIVE | Score: 0.9959114789962769
Headline: Mark Carney, the 'unreliable boyfriend' who ran UK's central bank | S

In [15]:
import requests
import pandas as pd
from datetime import datetime

# API Setup
API_KEY = 'bcae1bfec35f4e78a3cdc7accdbffd86'
BASE_URL = 'https://newsapi.org/v2/everything'

def fetch_news(query, from_date, to_date, page=1):
    params = {
        'q': query,
        'from': from_date,
        'to': to_date,
        'sortBy': 'publishedAt',
        'apiKey': API_KEY,
        'pageSize': 100,
        'page': page
    }
    response = requests.get(BASE_URL, params=params)
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Error: {response.status_code}, {response.text}")
        return None

# Fetching data
query = "stock market"
from_date = "2025-01-01"
to_date = datetime.now().strftime('%Y-%m-%d')

news_data = []
for page in range(1, 6):  # Fetch multiple pages
    data = fetch_news(query, from_date, to_date, page)
    if data and 'articles' in data:
        news_data.extend(data['articles'])
    else:
        break

# Save to DataFrame
df = pd.DataFrame(news_data)
df = df[['publishedAt', 'title', 'description', 'content']]
df.to_csv("news_data.csv", index=False)
print("News data saved to 'news_data.csv'.")


Error: 426, {"status":"error","code":"maximumResultsReached","message":"You have requested too many results. Developer accounts are limited to a max of 100 results. You are trying to request results 100 to 200. Please upgrade to a paid plan if you need more results."}
News data saved to 'news_data.csv'.


In [17]:
import pandas as pd
import re

# Load the news data
df = pd.read_csv("news_data.csv")

# Combine title, description, and content for sentiment analysis
df['text'] = df[['title', 'description', 'content']].fillna('').agg(' '.join, axis=1)

# Clean text data (remove special characters, URLs, and extra whitespace)
def clean_text(text):
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'[^\w\s]', '', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespaces
    return text

df['clean_text'] = df['text'].apply(clean_text)

# Drop unnecessary columns
df = df[['publishedAt', 'clean_text']]

# Save the preprocessed data
df.to_csv("preprocessed_news_data.csv", index=False)
print("Preprocessed data saved to 'preprocessed_news_data.csv'.")


Preprocessed data saved to 'preprocessed_news_data.csv'.


In [19]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk

nltk.download('vader_lexicon')

# Initialize VADER Sentiment Analyzer
sia = SentimentIntensityAnalyzer()

# Calculate sentiment scores and determine labels
def label_sentiment(text):
    scores = sia.polarity_scores(text)
    if scores['compound'] >= 0.05:
        return 'positive'
    elif scores['compound'] <= -0.05:
        return 'negative'
    else:
        return 'neutral'

df['sentiment'] = df['clean_text'].apply(label_sentiment)

# Save the labeled data
df.to_csv("labeled_news_data.csv", index=False)
print("Labeled data saved to 'labeled_news_data.csv'.")


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/udaysinghshekhawat/nltk_data...


Labeled data saved to 'labeled_news_data.csv'.


In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Load labeled data
df = pd.read_csv("labeled_news_data.csv")

# Features and labels
X = df['clean_text']
y = df['sentiment']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize text data
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train Logistic Regression model
model = LogisticRegression()
model.fit(X_train_vec, y_train)

# Evaluate the model
y_pred = model.predict(X_test_vec)
print(classification_report(y_test, y_pred))

# Save the model and vectorizer
import pickle
with open("sentiment_model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)
with open("vectorizer.pkl", "wb") as vec_file:
    pickle.dump(vectorizer, vec_file)

print("Model and vectorizer saved!")


              precision    recall  f1-score   support

    negative       0.00      0.00      0.00         1
    positive       0.95      1.00      0.97        19

    accuracy                           0.95        20
   macro avg       0.47      0.50      0.49        20
weighted avg       0.90      0.95      0.93        20

Model and vectorizer saved!


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
