1. Dataset extraction

In [None]:
import requests
import pandas as pd
import re
from tqdm import tqdm
from transformers import pipeline

import os

# The NewsAPI key is read from the environment so the secret is never stored
# in (or committed with) the notebook.
API_KEY = os.environ.get("NEWSAPI_KEY")
if not API_KEY:
    raise RuntimeError("Set the NEWSAPI_KEY environment variable before running this cell.")

query = "economy OR politics OR technology"
NEWSAPI_URL = "https://newsapi.org/v2/everything"

# Download up to 5 pages of 100 English-language articles each.
all_articles = []
for page in range(1, 6):
    try:
        # `params` lets requests URL-encode the query; the key travels in a
        # header so it cannot leak through URLs embedded in error messages.
        http_response = requests.get(
            NEWSAPI_URL,
            params={"q": query, "language": "en", "pageSize": 100, "page": page},
            headers={"X-Api-Key": API_KEY},
            timeout=30,
        )
        response = http_response.json()
    except (requests.RequestException, ValueError) as exc:
        # Network failure or a body that is not valid JSON: stop paging and
        # keep whatever was collected so far.
        print("Eroare API:", exc)
        break

    if response.get("status") == "ok" and "articles" in response:
        all_articles.extend(response["articles"])
    else:
        print("Eroare API:", response.get("message"))
        break

# Keep only the fields used by the later cells; `.get` tolerates articles
# that lack one of them instead of aborting the whole export.
df = pd.DataFrame([
    {
        "timestamp": a.get("publishedAt"),
        "title": a.get("title"),
        "description": a.get("description"),
        "content": a.get("content"),
        "source": (a.get("source") or {}).get("name"),
        "source_url": a.get("url"),
    }
    for a in all_articles
])
df.to_csv("news_raw.csv", index=False)
print(f"Am salvat {len(df)} articole în news_raw.csv ✅")


2. Functions for subinterval classification and text cleaning

In [None]:
import re
from bs4 import BeautifulSoup

def classify_subinterval(score):
    """Map a numeric sentiment score to the name of its sub-interval.

    The negative cut-offs (-0.7, -0.4, -0.1) are inclusive upper bounds and
    the positive cut-offs (0.1, 0.4, 0.7) are exclusive upper bounds. A score
    that matches no band (0.7 and above, or NaN, for which every comparison
    is false) is labelled "Strong confidence".
    """
    # (upper bound, is the bound inclusive?, label), checked from most negative up.
    bands = (
        (-0.7, True, "Panic"),
        (-0.4, True, "Risk"),
        (-0.1, True, "Mildly negative sentiment"),
        (0.1, False, "Stable outlook"),
        (0.4, False, "Mildly optimistic sentiment"),
        (0.7, False, "Growth"),
    )
    for upper, inclusive, label in bands:
        within = (score <= upper) if inclusive else (score < upper)
        if within:
            return label
    return "Strong confidence"

def clean_text(text):
    """Strip markup, URLs and slash-delimited tokens from an article body.

    Args:
        text: Raw article text. Anything that is not a ``str`` (for example
            the NaN pandas uses for missing cells) is treated as empty.

    Returns:
        The cleaned text with every run of whitespace collapsed to a single
        space and no leading or trailing whitespace; ``""`` for non-string
        input.
    """
    if not isinstance(text, str):
        return ""

    # Drop HTML tags, separating the text of adjacent elements with a space.
    text = BeautifulSoup(text, "html.parser").get_text(separator=" ")
    # Remove URLs (anything from "http" up to the next whitespace).
    text = re.sub(r"http\S+", "", text)
    # Replace tokens of the form /letters/ with a space.
    text = re.sub(r"/[a-zA-Z]+/", " ", text)
    # Collapse whitespace last: the substitutions above can themselves leave
    # runs of spaces, and \s+ already covers \r, \n and \t.
    text = re.sub(r"\s+", " ", text)

    return text.strip()



3. Dataset Annotation

In [None]:
# Reload the raw articles and normalise the body text before scoring it.
df = pd.read_csv("news_raw.csv")
df["content"] = df["content"].fillna('').apply(clean_text)

# Register `progress_apply` on pandas objects so the scoring pass shows a bar.
tqdm.pandas()
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
sentiment_pipeline = pipeline("sentiment-analysis", model=model_name)

# Star rating emitted by the model -> signed value in [-1, 1].
label_to_value = {
    "1 star": -1.0,
    "2 stars": -0.5,
    "3 stars": 0.0,
    "4 stars": 0.5,
    "5 stars": 1.0
}


def _first_prediction(text):
    """Run the sentiment model on the first 512 characters and return its first result."""
    return sentiment_pipeline(text[:512])[0]


def _signed_score(prediction):
    """Scale the signed value of the predicted label by the model's score for it."""
    return label_to_value[prediction["label"]] * prediction["score"]


df["sentiment_result"] = df["content"].progress_apply(_first_prediction)
df["sentiment_numeric"] = df["sentiment_result"].apply(_signed_score)
df["sentiment_sublabel"] = df["sentiment_numeric"].apply(classify_subinterval)

df.to_csv("news_with_sentiment.csv", index=False)
print(df[["title", "sentiment_numeric", "sentiment_sublabel"]].head())

4. Company extraction from content

In [None]:
!pip install keybert

In [None]:
# Register `Series.progress_apply` so the NER pass below shows a progress bar.
tqdm.pandas()

# `aggregation_strategy="simple"` is the current spelling of the deprecated
# `grouped_entities=True`: word pieces are merged into whole entity spans,
# each reported with an `entity_group` and a `word`.
ner_pipeline = pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple")

def extract_companies(text):
    """Return the words of all entities the NER pipeline tags as ``ORG`` in `text`.

    Non-string or blank input yields an empty list without calling the model.
    Names are returned in the order the pipeline reports them; duplicates are
    kept.
    """
    if not (isinstance(text, str) and text.strip()):
        return []

    org_names = []
    for entity in ner_pipeline(text):
        if entity['entity_group'] == 'ORG':
            org_names.append(entity['word'])
    return org_names

# Run NER over every article body (with a progress bar) and attach the result.
company_lists = df["content"].progress_apply(extract_companies)
df["companies"] = company_lists

# Checkpoint: this CSV is the input of the domain-extraction and Prophet cells.
df.to_csv("news_with_sentiment_and_companies.csv", index=False)

5. Domain extraction

In [None]:
from transformers import pipeline
from tqdm import tqdm
import pandas as pd
import re

# Resume from the checkpoint that already carries the sentiment and company columns.
df = pd.read_csv("news_with_sentiment_and_companies.csv")

# Zero-shot classifier; each article is scored against `candidate_labels`.
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Sector names offered to the classifier, one per line for easier diffing.
candidate_labels = [
    "technology",
    "finance",
    "healthcare",
    "energy",
    "industrials",
    "consumer_discretionary",
    "materials",
    "communication_services",
    "consumer_staples",
    "utilities",
    "real_estate",
]

def extract_domain(text):
    """Return the top-ranked candidate label for `text`, or ``"Unknown"``.

    ``"Unknown"`` is returned for non-string or blank input, and also when the
    classifier raises (the error is printed, not propagated). URLs are removed
    and only the first 512 characters are classified.
    """
    if not (isinstance(text, str) and text.strip()):
        return "Unknown"

    without_urls = re.sub(r"http\S+", "", text)
    try:
        ranking = classifier(without_urls[:512], candidate_labels)
        top_label = ranking["labels"][0]
    except Exception as e:
        # Best effort: one failing article must not abort the whole pass.
        print("EROARE clasificare domeniu:", e)
        top_label = "Unknown"
    return top_label

# Classify every article body, showing a progress bar while the model runs.
tqdm.pandas()
predicted_domains = df["content"].progress_apply(extract_domain)
df["domain"] = predicted_domains

# Checkpoint consumed by the JSON-conversion cell.
df.to_csv("news_with_sentiment_companies_and_domain.csv", index=False)
print("✅ CSV final salvat cu domenii incluse!")

preview_columns = ["title", "domain", "sentiment_sublabel"]
print(df[preview_columns].head())


6. JSON conversion

In [None]:

import ast
import uuid
import pandas as pd


def _parse_company_list(value):
    """Turn a CSV-serialised list such as "['Apple', 'IMF']" back into a real list.

    Values that are already lists are returned unchanged; missing cells and
    strings that are not a valid list/tuple literal become an empty list.
    """
    if isinstance(value, list):
        return value
    if not isinstance(value, str):
        return []
    try:
        parsed = ast.literal_eval(value)
    except (ValueError, SyntaxError):
        return []
    return list(parsed) if isinstance(parsed, (list, tuple)) else []


df = pd.read_csv("news_with_sentiment_companies_and_domain.csv")

# Give every article a random UUID, exported as its `id` field.
df["id"] = [str(uuid.uuid4()) for _ in range(len(df))]

# The NER cell stored Python lists in `companies`; writing them to CSV kept
# only their repr string. Restore real lists so the JSON holds arrays.
if "companies" in df.columns:
    df["companies"] = df["companies"].apply(_parse_company_list)

# The article body is not exported.
df_export = df.drop(columns=["content"])

df_export.to_json(
    "news_with_sentiment_companies_and_domain.json",
    orient="records",
    force_ascii=False,
    indent=2
)

print("✅ Am salvat news_with_sentiment_companies_and_domain.json")


7. Change domain spelling

In [None]:
import pandas as pd

# convert_dates=False keeps the ISO-8601 `timestamp` strings exactly as they
# are. By default read_json parses columns whose name starts with "timestamp"
# into datetimes, and to_json would then write them back as epoch
# milliseconds instead of the original strings.
df = pd.read_json("news_with_sentiment_companies_and_domain.json", convert_dates=False)

# Normalise labels to lower_snake_case (e.g. "Real Estate" -> "real_estate").
df["domain"] = df["domain"].str.lower().str.replace(" ", "_")

# Rewrite the same file in place with the normalised labels.
df.to_json(
    "news_with_sentiment_companies_and_domain.json",
    orient="records",
    force_ascii=False,
    indent=2
)

print("✅ Domeniile au fost normalizate în JSON")


8. Send json to Firestore Database

In [None]:
import json
import firebase_admin
from firebase_admin import credentials, firestore

# Initialise the Firebase Admin SDK only when no app is registered yet, so the
# cell can be re-run in the same kernel.
if not firebase_admin._apps:
    cred = credentials.Certificate("prevently-cdae1-firebase-adminsdk-fbsvc-2686544549.json")
    firebase_admin.initialize_app(cred)

db = firestore.client()

# NOTE(review): news_timestamped.json is written by the "Timestamp conversion"
# cell that appears later in this notebook; that cell has to run first.
with open("news_timestamped.json", "r", encoding="utf-8") as f:
    data = json.load(f)

collection_name = "news_datastore"
target_collection = db.collection(collection_name)

for record in data:
    record_key = str(record.get("id")) if "id" in record else None

    if not record_key:
        # No usable id: let Firestore generate the document id.
        target_collection.add(record)
        continue

    # Reuse the article id as document id; set() overwrites any existing
    # document with that id.
    target_collection.document(record_key).set(record)

print(f"✅ Am încărcat {len(data)} articole în colecția '{collection_name}' din Firestore")


9. Sentiment prediction - function

In [None]:
from prophet import Prophet
import pandas as pd
from fastapi import FastAPI
from pydantic import BaseModel

def predict_sentiment_future(timestamps, sentiment_scores, periods=7):
    """Fit a Prophet model on a sentiment history and forecast beyond it.

    Args:
        timestamps: Date-like values accepted by ``pandas.to_datetime``; they
            are converted to UTC and then made timezone-naive.
        sentiment_scores: One numeric score per timestamp.
        periods: Number of future steps requested from Prophet.

    Returns:
        A dict with ``future_dates`` (``YYYY-MM-DD`` strings) and
        ``predicted_scores`` (the matching ``yhat`` values), both taken from
        the last `periods` rows of the forecast.
    """
    # Prophet reads its input from columns named `ds` (time) and `y` (value).
    naive_utc = pd.to_datetime(timestamps, utc=True).tz_localize(None)
    history = pd.DataFrame({"ds": naive_utc, "y": sentiment_scores})

    model = Prophet()
    model.fit(history)

    horizon = model.make_future_dataframe(periods=periods)
    forecast = model.predict(horizon)

    # The forecast also covers the history; keep only the trailing rows.
    upcoming = forecast[["ds", "yhat"]].tail(periods)
    date_strings = upcoming["ds"].dt.strftime("%Y-%m-%d").tolist()
    score_values = upcoming["yhat"].tolist()

    return {
        "future_dates": date_strings,
        "predicted_scores": score_values
    }


# ASGI application exposing the forecasting helper over HTTP.
app = FastAPI()


# Request body of POST /predict_sentiment. Comments are used instead of
# docstrings so the generated API schema stays exactly as it was.
class SentimentInput(BaseModel):
    # Observation times, passed through to predict_sentiment_future unchanged.
    timestamps: list[str]
    # One score per entry in `timestamps`.
    sentiment_scores: list[float]
    # Number of future steps to forecast; 7 when the client omits it.
    periods: int = 7


# Thin endpoint: forwards the validated payload and returns the helper's dict
# as the response.
@app.post("/predict_sentiment")
def predict_sentiment(data: SentimentInput):
    return predict_sentiment_future(
        timestamps=data.timestamps,
        sentiment_scores=data.sentiment_scores,
        periods=data.periods,
    )




In [None]:
from prophet import Prophet
import pandas as pd

# Load the checkpoint, make the timestamps timezone-naive UTC and order the
# rows chronologically, all in one chain.
df = (
    pd.read_csv("news_with_sentiment_and_companies.csv")
    .assign(
        timestamp=lambda frame: pd.to_datetime(frame["timestamp"], utc=True).dt.tz_localize(None)
    )
    .sort_values("timestamp")
)

# Prophet reads its input from columns named `ds` (time) and `y` (value).
prophet_columns = {"timestamp": "ds", "sentiment_numeric": "y"}
df_prophet = df[list(prophet_columns)].rename(columns=prophet_columns)

model = Prophet()
model.fit(df_prophet)

# Extend the history by 7 periods and predict over the whole range.
future = model.make_future_dataframe(periods=7)
forecast = model.predict(future)

# Forecast plot followed by the component breakdown.
model.plot(forecast)
model.plot_components(forecast)



10. General tag extraction

In [None]:
from keybert import KeyBERT

# Announce the start of tag extraction (the message is Romanian for
# "Extracting tags from content...").
print("🔍 Extragem tag-uri din content...")

# Model identifier handed to KeyBERT as its backing model; `kw_model` is the
# extractor used by `extract_tags` below.
tag_model_name = "ml6team/keyphrase-extraction-distilbert-inspec"
kw_model = KeyBERT(model=tag_model_name)

def extract_tags(text):
    """Return up to five key phrases (1-3 words each) extracted from `text`.

    Non-string or blank input yields an empty list. The text is cleaned with
    `clean_text` and cut to its first 1000 characters before extraction. If
    the extractor raises, the error is printed and an empty list is returned.
    """
    if not (isinstance(text, str) and text.strip()):
        return []

    snippet = clean_text(text)[:1000]
    try:
        ranked = kw_model.extract_keywords(
            snippet,
            keyphrase_ngram_range=(1, 3),
            stop_words="english",
            top_n=5,
        )
        # Each result is a (phrase, score) pair; only the phrase is kept.
        return [phrase for phrase, _ in ranked]
    except Exception as e:
        # Best effort: one failing article must not abort the whole pass.
        print("Eroare la extragerea tag-urilor:", e)
        return []

# Compute the tag list for every article body, with a progress bar.
# NOTE(review): `df` is whatever frame the kernel assigned last (the Prophet
# cell above reloads news_with_sentiment_and_companies.csv), and no later cell
# visible in this notebook writes the `tags` column to disk — confirm that is
# intended.
df["tags"] = df["content"].progress_apply(extract_tags)


11. Timestamp conversion

In [None]:
import json
from datetime import datetime, timezone

# Epoch values at or above this magnitude are treated as milliseconds: as
# seconds they would lie beyond the year 5000, whereas pandas' to_json writes
# dates as epoch milliseconds by default. (Millisecond values this small
# would correspond to dates before 1973, which do not occur in this dataset.)
_MS_THRESHOLD = 1e11


def _epoch_to_iso(ts):
    """Format an epoch timestamp (seconds or milliseconds) as ISO-8601 UTC."""
    seconds = ts / 1000 if abs(ts) >= _MS_THRESHOLD else ts
    # Timezone-aware replacement for the deprecated datetime.utcfromtimestamp.
    return datetime.fromtimestamp(seconds, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")


with open("news_with_sentiment_companies_and_domain.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Only numeric timestamps are rewritten; values that are already strings are
# left untouched.
for item in data:
    ts = item.get("timestamp")
    if isinstance(ts, (int, float)):
        item["timestamp"] = _epoch_to_iso(ts)

with open("news_timestamped.json", "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

print("✅ Timestamp numeric transformat în ISO 8601 în JSON")
