In [1]:
! pip install transformers torch newspaper3k sentencepiece sacremoses
! pip install gradio --upgrade
! pip install newspaper3k
! pip install lxml-html-clean
! pip install edge-tts
! pip install nltk



In [2]:
import gradio as gr
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
import torch
from newspaper import Article
from transformers import pipeline
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
import asyncio
import edge_tts
import tempfile
import warnings
import newspaper
import nltk


In [3]:
warnings.filterwarnings("ignore")

In [4]:
# Pretrained model used for summarizing news articles: the
# `google/pegasus-cnn_dailymail` checkpoint, loaded as a tokenizer/model pair.
# NOTE(review): loading reports the encoder/decoder `embed_positions.weight`
# tensors as "newly initialized" (see this cell's output). Presumably these are
# Pegasus's fixed position tables and the warning is benign — confirm.
tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-cnn_dailymail')
model = PegasusForConditionalGeneration.from_pretrained('google/pegasus-cnn_dailymail')

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Pretrained model used for English-to-Urdu text conversion.
# The same checkpoint name is used for both the tokenizer and the model so the
# two always stay in sync.
ur_model_name = "facebook/m2m100_418M"
ur_tokenizer = M2M100Tokenizer.from_pretrained(ur_model_name)
ur_model = M2M100ForConditionalGeneration.from_pretrained(ur_model_name)

In [6]:
# Choose the compute device: CUDA when torch reports it as available, CPU otherwise.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Place the Pegasus summarizer (`model`) on the chosen device.
model = model.to(device)

In [7]:
# Function for capturing article text from the URL

def get_article_text(url):
    """Download the page at ``url`` and return its extracted article text.

    Args:
        url: Address of the news article.

    Returns:
        The non-empty text extracted by newspaper's ``Article`` parser.

    Raises:
        ValueError: If ``url`` is empty/blank, or if parsing produced no text.
        Exception: Anything raised by ``Article.download()`` or
            ``Article.parse()`` propagates unchanged.
    """
    # Reject blank input up front instead of attempting a download.
    if not url or not url.strip():
        raise ValueError("Article URL must not be empty.")

    article = Article(url.strip())
    article.download()
    article.parse()

    # An empty extraction would otherwise flow into the summarizer, which
    # would then generate text from nothing.
    if not article.text or not article.text.strip():
        raise ValueError("No article text could be extracted from the given URL.")

    return article.text

In [8]:
# Function for summarizing a news article

def summarize_article(url):
    """Fetch the article at ``url`` and return a Pegasus-generated summary.

    Args:
        url: Address of the news article; forwarded to ``get_article_text``.

    Returns:
        The decoded summary string (special tokens skipped).

    Raises:
        ValueError: If no article text was obtained for ``url``.
        Exception: Anything raised while fetching the article propagates.
    """
    article_text = get_article_text(url)

    # The previous `"Error" in article_text` check returned any article that
    # merely contained the word "Error" verbatim and unsummarized, while real
    # fetch failures surface as exceptions. Guard on empty text instead.
    if not article_text or not article_text.strip():
        raise ValueError("No article text could be extracted from the given URL.")

    # Never feed the encoder more tokens than the loaded checkpoint declares
    # positions for; 1600 is kept as the upper bound requested originally.
    position_limit = getattr(model.config, "max_position_embeddings", None) or 1600
    max_input_tokens = min(1600, position_limit)

    # Tokenize (truncating long articles) and move the tensors to the model's device.
    inputs = tokenizer(
        article_text,
        max_length=max_input_tokens,
        return_tensors='pt',
        truncation=True,
    ).to(device)

    # Pass the whole encoding so the attention mask travels with the input ids.
    summary_ids = model.generate(
        **inputs,
        max_length=600,
        min_length=400,
        num_beams=8,
        no_repeat_ngram_size=3,
        early_stopping=True,
        num_return_sequences=1,
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# Function for cleaning the generated summary

def clean_text(text):
    """Return ``text`` with every ``<n>`` marker swapped for a single space."""
    pieces = text.split('<n>')
    return ' '.join(pieces)

In [29]:
# Function for converting English text to Urdu text
def translate_to_urdu(text):
    """Translate English ``text`` into Urdu with the M2M100 model.

    Args:
        text: English text to translate.

    Returns:
        The Urdu translation as a string, or ``""`` when ``text`` is empty or
        only whitespace.
    """
    # Nothing to translate: without this guard the generation settings below
    # would still force the model to emit hundreds of tokens from no input.
    if not text or not text.strip():
        return ""

    # Source language: English.
    ur_tokenizer.src_lang = "en"
    # Keep the input tensors on whatever device the translation model lives on.
    encoded_input = ur_tokenizer(text, return_tensors="pt").to(ur_model.device)

    # A fixed minimum of 350 tokens forces padding/hallucination for short
    # inputs; never demand more output tokens than the source contains.
    # For sources of 350+ tokens this equals the previous fixed value.
    source_length = encoded_input["input_ids"].shape[-1]

    generated_tokens = ur_model.generate(
        **encoded_input,
        forced_bos_token_id=ur_tokenizer.get_lang_id("ur"),  # Target language: Urdu
        max_length=800,  # Upper bound on generated length
        min_length=min(350, source_length),
        num_beams=8,
        length_penalty=1.2,  # Above the neutral 1.0, so longer beams are favoured
        no_repeat_ngram_size=7,  # No 7-gram may appear twice in the output
        early_stopping=False,
    )

    # Decode the translated text (single input, so take the first sequence).
    translated_text = ur_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
    return translated_text

In [30]:


async def urdu_text_to_speech(text):
    """Synthesize ``text`` with the "ur-PK-AsadNeural" Edge TTS voice.

    The audio is written to a freshly created ``.mp3`` temporary file that is
    not deleted on close, so the returned path stays usable by the caller.

    Returns:
        Path of the temporary ``.mp3`` file.
    """
    audio_file = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
    with audio_file:
        communicator = edge_tts.Communicate(text, "ur-PK-AsadNeural")
        # Write the synthesized speech to the temporary file's path.
        await communicator.save(audio_file.name)
    return audio_file.name

In [31]:
def process_article(url):
    """Run the full pipeline for one URL: summarize, clean, translate, synthesize.

    Args:
        url: Article URL entered by the user.

    Returns:
        A 2-tuple ``(audio_file_path, summary_text)``, one value for each of
        the two Gradio outputs this function feeds (audio, then text). On
        failure the audio slot is ``None`` and the text slot carries the error
        message, so the number of returned values never changes.
    """
    # Blank input: report it without attempting a download.
    if not url or not url.strip():
        return None, "Error: please enter an article URL."

    try:
        summary = summarize_article(url.strip())
        clean_summary = clean_text(summary)
        translated_summary = translate_to_urdu(clean_summary)
        # urdu_text_to_speech is a coroutine; drive it to completion here.
        audio_file_path = asyncio.run(urdu_text_to_speech(translated_summary))
        return audio_file_path, clean_summary
    except Exception as exc:
        # Returning a bare string here would give the interface one value
        # where it expects two; keep the (audio, text) shape on errors too.
        return None, f"Error: {exc}"

# Gradio front end: one URL text box in; Urdu audio plus the English summary text out.
interface = gr.Interface(
    fn=process_article,
    inputs=gr.Textbox(label="Enter Article URL"),
    outputs=[
        gr.Audio(label="Urdu Translated Summary"),
        gr.Textbox(label="Article Summary"),
    ],
    title="Article Summarizer & Urdu TTS",
)

# Start serving the app.
interface.launch()

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://725b593009fd6916e2.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:
# prompt: generate me a markdown  report of this whole code with bit more professional detail for github

# News Article Summarizer and Urdu Text-to-Speech

This project provides a user-friendly interface that summarizes a news article, translates the summary into Urdu, and generates an audio rendition of the Urdu translation.

## Technologies Used

* **Python:** The core programming language for building the application.
* **Gradio:** Used to create a web-based user interface for interaction.
* **Transformers:** A library from Hugging Face that provides pre-trained models for tasks like text summarization and translation.
* **Pegasus:** A transformer-based model used for abstractive text summarization.
* **M2M100:** A multilingual machine translation model for converting English to Urdu.
* **Newspaper3k:** A library for extracting articles from URLs.
* **Edge TTS:** Used for generating speech from text in Urdu.
* **SentencePiece:** A library for subword tokenization, essential for working with multilingual models.
* **Sacremoses:** A library for tokenization and detokenization of text.

## Functionality

1. **Article Extraction:** The user provides a URL to a news article, and the article text is downloaded and parsed from that page using Newspaper3k.
2. **Summarization:** The article is processed, and a concise summary is generated using the Pegasus model.
3. **Translation:** The English summary is translated into Urdu using the M2M100 model.
4. **Text-to-Speech:** The Urdu translated text is converted into an audio file using Edge TTS.
5. **Output:** The audio file of the Urdu summary and the English summary are presented to the user.


## Code Structure

The code is organized into several functions:

* **`get_article_text(url)`:** Retrieves the text content of the article from the given URL.
* **`summarize_article(url)`:** Fetches the article text for the given URL and summarizes it using the Pegasus model.
* **`clean_text(text)`:** Replaces the `<n>` markers in the generated summary with spaces.
* **`translate_to_urdu(text)`:** Translates the English text to Urdu using the M2M100 model.
* **`urdu_text_to_speech(text)`:** Converts the Urdu text into an audio file using Edge TTS.
* **`process_article(url)`:** Orchestrates the entire process by calling the above functions.

## Interface

The Gradio interface provides a user-friendly input field for entering a news article's URL. The output consists of:

* Audio output: An audio file of the translated Urdu summary.
* Text output: The English summary of the news article.

## Installation

Before running the code, you need to install the required libraries:
