In [1]:
from IPython import get_ipython
from IPython.display import display

In [2]:
!pip install transformers newspaper3k nltk jinja2 ipywidgets langchain PyPDF2
!pip install transformers newspaper3k nltk Jinja2 lxml_html_clean

Collecting newspaper3k
  Downloading newspaper3k-0.2.8-py3-none-any.whl.metadata (11 kB)
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting cssselect>=0.9.2 (from newspaper3k)
  Downloading cssselect-1.3.0-py3-none-any.whl.metadata (2.6 kB)
Collecting feedparser>=5.2.1 (from newspaper3k)
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting tldextract>=2.0.1 (from newspaper3k)
  Downloading tldextract-5.3.0-py3-none-any.whl.metadata (11 kB)
Collecting feedfinder2>=0.0.4 (from newspaper3k)
  Downloading feedfinder2-0.0.4.tar.gz (3.3 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting jieba3k>=0.35.1 (from newspaper3k)
  Downloading jieba3k-0.35.1.zip (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m37.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tinysegmenter==0.3 (from newspaper3k)
  Downloading tinysegmente

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from IPython.display import display, HTML, clear_output
from ipywidgets import interact_manual, Text, Output, HBox, VBox, Button, FileUpload

In [4]:
import nltk
nltk.download('punkt')
nltk.download('vader_lexicon')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [5]:
from transformers import pipeline
from newspaper import Article
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from jinja2 import Template
import PyPDF2
import io

In [6]:
def extract_text_from_pdf(pdf_file):
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    text = ""
    for page_num in range(len(pdf_reader.pages)):
        page = pdf_reader.pages[page_num]
        text += page.extract_text()
    return text

In [10]:
def get_financial_news_summary(text_or_url, is_pdf=False):
    if is_pdf:
        text = text_or_url
    else:
        article = Article(text_or_url)
        article.download()
        article.parse()
        article.nlp()
        text = article.text

    max_text_length = 1024
    text = text[:max_text_length]

    summarizer = pipeline("summarization")
    summary = summarizer(text, max_length=150, min_length=30, do_sample=False)

    analyzer = SentimentIntensityAnalyzer()
    sentiment = analyzer.polarity_scores(text)

    max_text_length = 1024
    text = text[:max_text_length]

    summarizer = pipeline("summarization")
    summary = summarizer(text, max_length=150, min_length=30, do_sample=False)

    analyzer = SentimentIntensityAnalyzer()
    sentiment = analyzer.polarity_scores(text)
    overall_sentiment = "Neutral"
    if sentiment['compound'] >= 0.05:
        overall_sentiment = "Positive"
    elif sentiment['compound'] <= -0.05:
        overall_sentiment = "Negative"

    source = text_or_url if not is_pdf else "Uploaded PDF"

    return {
        "summary": summary[0]['summary_text'],
        "sentiment": sentiment,
        "overall_sentiment": overall_sentiment,
        "source": source,
    }


In [11]:
def display_summary(text_or_url="", is_pdf=False):
    clear_output(wait=True)
    global news_summary
    news_summary = get_financial_news_summary(text_or_url, is_pdf)

    html_template = Template("""
    <div style="border: 2px solid #007bff; padding: 20px; border-radius: 10px; background-color: #f8f9fa; box-shadow: 2px 2px 5px #888888;">
        <h2 style="color: #007bff; text-align: center;">Financial News Summary</h2>
        <p style="font-weight: bold; color: black;">Source: <span style="color: black;">{{ news_summary.source }}</span></p>
        <h3 style="color: #343a40;">Summary:</h3>
        <p style="text-align: justify; color: black;">{{ news_summary.summary }}</p>
        <h3 style="color: #343a40;">Detailed Sentiment Analysis:</h3>
        <ul style="color: black;">
            <li>Positive: {{ '{:.0%}'.format(news_summary.sentiment.pos) }}</li>
            <li>Negative: {{ '{:.0%}'.format(news_summary.sentiment.neg) }}</li>
            <li>Neutral: {{ '{:.0%}'.format(news_summary.sentiment.neu) }}</li>
        </ul>
    </div>
    """)

    html_content = html_template.render(news_summary=news_summary)
    display(HTML(html_content))

In [12]:
url_input = Text(placeholder="Paste article URL here", layout={'width': '50%'})
pdf_upload = FileUpload(accept='.pdf', multiple=False, layout={'width': '50%'})

def on_submit_url(b):
    display_summary(url_input.value)

def on_upload_pdf(change):
    if change['new']:
        uploaded_file = list(change['new'].values())[0]
        pdf_text = extract_text_from_pdf(io.BytesIO(uploaded_file['content']))
        display_summary(pdf_text, is_pdf=True)


url_button = Button(description="Submit URL")
url_button.on_click(on_submit_url)

pdf_upload.observe(on_upload_pdf, names='value')

display(HBox([url_input, url_button]))
display(VBox([pdf_upload]))

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Device set to use cpu
No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu
