In [1]:
! pip install transformers torch newspaper3k sentencepiece sacremoses
! pip install gradio --upgrade
! pip install newspaper3k
! pip install lxml-html-clean
! pip install edge-tts
! pip install nltk

Collecting newspaper3k
  Downloading newspaper3k-0.2.8-py3-none-any.whl.metadata (11 kB)
Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Collecting cssselect>=0.9.2 (from newspaper3k)
  Downloading cssselect-1.2.0-py2.py3-none-any.whl.metadata (2.2 kB)
Collecting feedparser>=5.2.1 (from newspaper3k)
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting tldextract>=2.0.1 (from newspaper3k)
  Downloading tldextract-5.1.3-py3-none-any.whl.metadata (11 kB)
Collecting feedfinder2>=0.0.4 (from newspaper3k)
  Downloading feedfinder2-0.0.4.tar.gz (3.3 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting jieba3k>=0.35.1 (from newspaper3k)
  Downloading jieba3k-0.35.1.zip (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m26.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tinysegmenter==0.3 (from newspaper3k)
  Downloading 

In [2]:
import gradio as gr
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
import torch
from newspaper import Article
from transformers import pipeline
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
import asyncio
import edge_tts
import tempfile
import warnings
import newspaper
import nltk
import re

In [3]:
# Install a catch-all "ignore" filter so Python warnings are suppressed from here on.
warnings.filterwarnings("ignore")

In [4]:
# Pretrained Pegasus checkpoint used for summarizing news articles.
pegasus_checkpoint = 'google/pegasus-cnn_dailymail'
tokenizer = PegasusTokenizer.from_pretrained(pegasus_checkpoint)
model = PegasusForConditionalGeneration.from_pretrained(pegasus_checkpoint)

tokenizer_config.json:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

In [5]:
# Pretrained M2M100 checkpoint used for English-to-Urdu text conversion.
# The tokenizer and the model are both loaded from the same checkpoint name.
ur_model_name = "facebook/m2m100_418M"
ur_tokenizer = M2M100Tokenizer.from_pretrained(ur_model_name)
ur_model = M2M100ForConditionalGeneration.from_pretrained(ur_model_name)

tokenizer_config.json:   0%|          | 0.00/298 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/3.71M [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.14k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/908 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.94G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/233 [00:00<?, ?B/s]

In [6]:
# Pick the compute device: CUDA when torch reports it as available, CPU otherwise.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Place the Pegasus summarization model on the selected device.
model = model.to(device)

In [18]:
# Function for capturing article from the URL

def get_article_text(url):
    """Download and parse the page at ``url`` and return its extracted text.

    Args:
        url: Address of the article, passed straight to ``Article``.

    Returns:
        The ``text`` attribute of the parsed ``Article``.
    """
    fetched_page = Article(url)
    fetched_page.download()
    fetched_page.parse()
    body_text = fetched_page.text
    return body_text

In [27]:
#Function for Summarizing news article

def summarize_article(url):
    """Fetch the article at ``url`` and return a Pegasus-generated summary.

    Args:
        url: Address of the news article to summarize.

    Returns:
        The decoded summary string (special tokens skipped).

    Raises:
        ValueError: If no article text could be extracted from the page.
    """
    article_text = get_article_text(url)

    # Fetch/parse failures surface as exceptions from get_article_text, so the
    # only "soft" failure left is an empty extraction. Reject it explicitly
    # instead of keyword-matching the article body, which would misfire on any
    # article that merely mentions the word "Error".
    if not article_text or not article_text.strip():
        raise ValueError(f"No article text could be extracted from: {url}")

    # Tokenize (truncated to at most 1024 tokens) and move the tensors to the
    # same device as the summarization model.
    inputs = tokenizer(
        article_text,
        max_length=1024,
        return_tensors='pt',
        truncation=True,
    ).to(device)

    # Beam search; the attention mask is passed explicitly so generation does
    # not have to infer it from the input ids.
    summary_ids = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=700,
        num_beams=8,
        no_repeat_ngram_size=3,
        length_penalty=3,
        early_stopping=True,
        num_return_sequences=1,
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

#Function for cleaning news article

# Opening, closing and self-closing forms (with or without attributes) of the
# HTML tags that are stripped from summaries.
_HTML_TAG_RE = re.compile(r'</?(?:br|p|div|a|span|b)\b[^<>]*>', re.IGNORECASE)


def clean_text(text):
    """Strip markup leftovers from a summary so it reads as plain text.

    The ``<n>`` marker is replaced by a single space so that neighbouring
    sentences are not glued together, the HTML tags ``br``, ``p``, ``div``,
    ``a``, ``span`` and ``b`` are removed, and runs of spaces/tabs are
    collapsed. Ordinary text containing ``/n`` (for example a URL path such as
    ``example.com/news``) is left untouched.

    Args:
        text: Summary text to clean. ``None`` or an empty string is accepted.

    Returns:
        The cleaned text with surrounding whitespace removed, or ``''`` when
        ``text`` is empty.
    """
    if not text:
        return ''

    # Sentence separator emitted in the summary: keep a word boundary in its place.
    text = text.replace('<n>', ' ')
    # Drop the HTML tags themselves; their inner text is preserved.
    text = _HTML_TAG_RE.sub('', text)
    # Collapse the doubled spaces that the substitutions above can leave behind.
    return re.sub(r'[ \t]{2,}', ' ', text).strip()

In [50]:
# Function for converting English text to Urdu text
def translate_to_urdu(text):
    """Translate English ``text`` to Urdu with the M2M100 model.

    Args:
        text: English text to translate.

    Returns:
        The Urdu translation, or an empty string when ``text`` is empty or
        contains only whitespace (the model is not invoked in that case).
    """
    # Nothing to translate: skip tokenization and generation entirely.
    if not text or not text.strip():
        return ""

    # Source language: English
    ur_tokenizer.src_lang = "en"
    # Truncate to the tokenizer's configured maximum length and keep the
    # tensors on whichever device the translation model currently lives on.
    encoded_input = ur_tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
    ).to(ur_model.device)

    # Deterministic beam search. Sampling knobs (temperature / top_k / top_p)
    # are intentionally not passed: they only take effect with do_sample=True.
    generated_tokens = ur_model.generate(
        **encoded_input,
        forced_bos_token_id=ur_tokenizer.get_lang_id("ur"),  # Target language: Urdu
        max_length=600,  # Upper bound on the generated sequence length
        num_beams=8,
        length_penalty=3,  # Values > 0 favour longer beam hypotheses
        no_repeat_ngram_size=3,  # No 3-gram may appear twice in the output
        repetition_penalty=1.5,
        num_return_sequences=1,
        early_stopping=False,
    )

    # Decode the single returned sequence, dropping language/special tokens.
    translated_text = ur_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
    return translated_text

In [51]:

async def urdu_text_to_speech(text):
    """Synthesize ``text`` with the "ur-PK-AsadNeural" voice into an MP3 file.

    The audio is written to a named temporary file created with
    ``delete=False``, so the file stays on disk after this coroutine returns.

    Args:
        text: Text handed to ``edge_tts.Communicate``.

    Returns:
        Path of the temporary ``.mp3`` file.
    """
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as mp3_handle:
        mp3_path = mp3_handle.name
        speaker = edge_tts.Communicate(text, "ur-PK-AsadNeural")
        # edge-tts writes the synthesized audio to the given path.
        await speaker.save(mp3_path)
    # Hand the path back so the caller (the Gradio audio output) can load it.
    return mp3_path

In [52]:

def process_article(url):
    """Run the whole pipeline for one URL: summarize, clean, translate, speak.

    Args:
        url: Article URL as entered by the user.

    Returns:
        A 2-tuple ``(audio_file_path, summary_text)``. On success the first
        item is the path of the generated Urdu audio file and the second is the
        cleaned English summary. On failure (or when no URL was given) the
        first item is ``None`` and the second carries the message to display.
    """
    url = (url or "").strip()
    if not url:
        return None, "Please enter an article URL."

    try:
        summary = summarize_article(url)
        clean_summary = clean_text(summary)
        translated_summary = translate_to_urdu(clean_summary)
        # The TTS helper is a coroutine; drive it to completion here.
        audio_file_path = asyncio.run(urdu_text_to_speech(translated_summary))
        return audio_file_path, clean_summary
    except Exception as e:
        # Always return one value per output slot: returning a bare string here
        # would not match the (audio, text) pair the caller expects.
        return None, f"Error: {e}"

# Describe the UI pieces first: one URL textbox in, Urdu audio plus the
# English summary text out.
url_input = gr.Textbox(label="Enter Article URL")
summary_outputs = [
    gr.Audio(label="Urdu Translated Summary"),
    gr.Textbox(label="Article Summary"),
]

# Wire the components to the processing function.
interface = gr.Interface(
    fn=process_article,
    inputs=url_input,
    outputs=summary_outputs,
    title="Article Summarizer & Urdu TTS",
)

# Start serving the interface.
interface.launch()

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://1e1acfb2ecf5ecb5a7.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [53]:
# prompt: generate me a markdown  report of this whole code with bit more professional detail for github

# News Article Summarizer and Urdu Text-to-Speech

This project provides a user-friendly interface for summarizing news articles and translating them into Urdu, along with generating an audio output of the Urdu translation.

## Technologies Used

* **Python:** The core programming language for building the application.
* **Gradio:** Used to create a web-based user interface for interaction.
* **Transformers:** A library from Hugging Face that provides pre-trained models for tasks like text summarization and translation.
* **Pegasus:** A transformer-based model used for abstractive text summarization.
* **M2M100:** A multilingual machine translation model for converting English to Urdu.
* **Newspaper3k:** A library for extracting articles from URLs.
* **Edge TTS:** Used for generating speech from text in Urdu.
* **SentencePiece:** A library for subword tokenization, essential for working with multilingual models.
* **Sacremoses:** A library for tokenization and detokenization of text.

## Functionality

1. **Article Extraction:** The user provides a URL to a news article.
2. **Summarization:** The article is processed, and a concise summary is generated using the Pegasus model.
3. **Translation:** The English summary is translated into Urdu using the M2M100 model.
4. **Text-to-Speech:** The Urdu translated text is converted into an audio file using Edge TTS.
5. **Output:** The audio file of the Urdu summary and the English summary are presented to the user.


## Code Structure

The code is organized into several functions:

* **`get_article_text(url)`:** Retrieves the text content of the article from the given URL.
* **`summarize_article(url)`:** Summarizes the article text using the Pegasus model.
* **`clean_text(text)`:** Removes a fixed set of HTML tags and the `<n>` marker from the generated summary so that it reads as plain text.
* **`translate_to_urdu(text)`:** Translates the English text to Urdu using the M2M100 model.
* **`urdu_text_to_speech(text)`:** Converts the Urdu text into an audio file using Edge TTS.
* **`process_article(url)`:** Orchestrates the entire process by calling the above functions.

## Interface

The Gradio interface provides a user-friendly input field for entering a news article's URL. The output consists of:

* Audio output: An audio file of the translated Urdu summary.
* Text output: The English summary of the news article.

## Installation

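Before running the code, install the required libraries with `pip install transformers torch newspaper3k sentencepiece sacremoses lxml-html-clean edge-tts nltk gradio`.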


SyntaxError: unterminated string literal (detected at line 41) (<ipython-input-53-590facb136ad>, line 41)