In [2]:
import os
import pandas as pd
import numpy as np
from datetime import timedelta, datetime
import datetime as dt
import pandas_ta as ta
import yfinance as yf
import copy
import random
from pymongo import MongoClient
import sys
from dotenv import load_dotenv
import warnings

import feedparser
import requests
from bs4 import BeautifulSoup
import google.generativeai as genai

# Suppress every warning for the rest of the session. The blanket "ignore"
# filter already covers FutureWarning; the second call just makes that explicit.
# NOTE(review): this also hides warnings that point at real problems.
warnings.filterwarnings("ignore")
warnings.simplefilter('ignore', category=FutureWarning)
# Turn off pandas' chained-assignment (SettingWithCopy) warning.
pd.options.mode.chained_assignment = None

# Load variables from a .env file into the process environment; this must run
# before MONGO_URI is read below.
load_dotenv()
# NOTE(review): os.environ.get returns None when MONGO_URI is unset, and
# MongoClient is then presumably left to use its own default host — confirm
# that a silent fallback is acceptable here.
mongo_client = MongoClient(os.environ.get("MONGO_URI"))
# Handle to the "stock_db" database on that client.
stock_db = mongo_client["stock_db"]

In [None]:
def get_article_content(url, timeout=10):
    """Download an article page and return its body text.

    The page is fetched with a browser-like User-Agent, parsed with
    BeautifulSoup, and the text of every ``<p>`` inside the
    ``<div itemprop="articleBody" id="vst_detail">`` container is collected,
    one paragraph per line.

    Args:
        url: Address of the article page.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single unresponsive server would block the notebook
            indefinitely.

    Returns:
        The article text with paragraphs separated by newlines, or an empty
        string when the URL is empty, the request fails, the container is
        missing, or any other error occurs (deliberate best-effort contract:
        this function never raises).
    """
    # Nothing to fetch; honour the "empty string on failure" contract.
    if not url:
        return ""

    # Browser-like User-Agent sent with the request.
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"}

    # Paragraphs whose class list is exactly one of these carry author /
    # publish-time metadata and are not part of the article body.
    # NOTE(review): this is an exact match on the full class list, so a
    # different class order or an extra class would not be skipped.
    skipped_classes = (["pAuthor"], ["pPublishTimeSource", "right"])

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, "html.parser")

        content_div = soup.find("div", {"itemprop": "articleBody", "id": "vst_detail"})
        if not content_div:
            return ""

        lines = []
        for p in content_div.find_all("p"):
            if p.get("class") in skipped_classes:
                continue

            text = p.get_text(strip=True)
            if text:
                lines.append(text)

        return "\n".join(lines).strip()

    except Exception:
        # Best-effort scraper: any network or parsing failure yields "".
        return ""

def get_summary(content):
    """Ask Gemini for a short summary of an article.

    The prompt (in Vietnamese) asks the model to summarise the article as a
    single paragraph of roughly 50 words.

    Args:
        content: Full article text. May be empty or ``None`` when scraping
            failed upstream.

    Returns:
        The model's response text, or an empty string when ``content`` is
        empty or whitespace-only (no API call is made in that case, since
        there is nothing to summarise).
    """
    # Skip the API call entirely when there is no article text; otherwise the
    # model would be asked to summarise nothing.
    if not content or not content.strip():
        return ""

    # The API key is read from the GEMINI_API environment variable.
    genai.configure(api_key=os.environ.get("GEMINI_API"))
    model = genai.GenerativeModel('gemini-2.0-flash')
    prompt = f'Tóm tắt lại nội dung bài báo sau {content}. Yêu cầu viết liền thành 1 đoạn văn khoảng 50 từ'
    response = model.generate_content(prompt)
    # NOTE(review): `response.text` may raise if the model returns no usable
    # text (e.g. a blocked response) — confirm against the SDK docs.
    return response.text

# Alternative feed (stock news), kept for quick switching:
# rss_url = "https://vietstock.vn/830/chung-khoan/co-phieu.rss"
rss_url = "https://vietstock.vn/761/kinh-te/vi-mo.rss"

# How many entries from the start of the feed to scrape and summarise.
MAX_ARTICLES = 3

feed = feedparser.parse(rss_url)

# Maps article title -> {'summary': ..., 'link': ...}.
# NOTE(review): entries sharing a title overwrite each other in this dict.
co_phieu = {}
for entry in feed.entries[:MAX_ARTICLES]:
    title = entry['title']
    # The entry's 'id' field is used as the article URL.
    content = get_article_content(entry['id'])
    # An empty string means the scrape failed; do not spend an LLM call on
    # summarising nothing.
    summary = get_summary(content) if content else ""
    co_phieu[title] = {'summary': summary, 'link': entry['id']}

# One row per article with columns: title, summary, link. Passing `columns`
# keeps the schema stable even when the feed returned no entries.
df = (
    pd.DataFrame.from_dict(co_phieu, orient='index', columns=['summary', 'link'])
    .reset_index()
    .rename(columns={'index': 'title'})
)