In [None]:
# Import necessary libraries
import os
import requests
from bs4 import BeautifulSoup
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from transformers import pipeline

# Step 1: Data Collection
def get_article_text(url, timeout=30):
    """Download a web page and return the text of all of its ``<p>`` elements.

    Args:
        url: Address of the article page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The text of every ``<p>`` tag on the page, joined with single spaces.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """
    # Without a timeout, requests.get can block forever on a stalled server.
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of scraping the paragraphs of an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    paragraphs = soup.find_all('p')  # Adjust according to the HTML structure
    return ' '.join(p.get_text() for p in paragraphs)

# Example URLs: the pages scraped to build the corpus (order is preserved downstream)
article_urls = [
    # The Hindu
    "https://www.thehindu.com/business/indiachina-trade-record-31-bn-deficit-in-2013/article5562569.ece",
    # The Economic Times (mobile site)
    "https://m.economictimes.com/news/economy/indicators/indias-october-trade-deficit-widens-to-26-91-billion/articleshow/95529500.cms",
    # The Times of India
    "https://timesofindia.indiatimes.com/business/india-business/indias-trade-deficit-with-china-hits-100bn-for-first-time/articleshow/96979850.cms",
    # The Hindu
    "https://www.thehindu.com/news/international/indias-imports-from-china-reach-record-high-in-2022-trade-deficit-surges-beyond-100-billion/article66372861.ece",
]

In [None]:
# Download every URL in order; articles[k] is the scraped text of article_urls[k]
articles = list(map(get_article_text, article_urls))

In [None]:
# Persisting article text to disk
def save_to_txt(article, filename):
    """Write ``article`` to ``filename`` as UTF-8 text and echo the filename.

    The file is opened in ``'w'`` mode, so an existing file is overwritten.
    """
    with open(filename, mode='w', encoding='utf-8') as handle:
        # Echo the path so the cell output lists each file as it is written.
        print(filename)
        handle.write(article)

In [None]:
# Write each article to its own file: article_1.txt, article_2.txt, ...
for number, text in enumerate(articles, start=1):
    save_to_txt(text, f'article_{number}.txt')

article_1.txt
article_2.txt
article_3.txt
article_4.txt


In [None]:
# Inspect the scraped text; this displays the full raw text of every article.
articles

["To enjoy additional benefits CONNECT WITH US  January 10, 2014 07:19 pm | Updated November 16, 2021 09:26 pm IST - BEIJING  \nCOMMents\n  SHARE\n \nREAD LATER\n \nA file picture of entry gate at Nathu La which allow the Indian traders' vehicles enter China.\n India’s trade deficit with China reached a record $ 31.4 billion in 2013, with two-way trade declining last year by 1.5 per cent on account of a sharp decline in Indian exports, new trade figures released in Beijing on Friday showed. Indian exports to China last year totalled $ 17.03 billion - a 9.4 per cent fall from last year - out of $ 65.47 total bilateral trade, according to figures released by the Chinese General Administration of Customs (GAC). Chinese exports to India, in recent years largely comprised of machinery, were up 1.6 per cent. Friday’s annual figures marked the second straight year of declines, highlighting the unexpected slowdown in rapidly growing trade ties that came to be seen as one of the key drivers of 

In [None]:
import re
# Step 1: Data Preprocessing
def preprocess_text(text):
    """Normalize text to lowercase words separated by single spaces.

    Every non-word character (punctuation, symbols, whitespace) becomes a
    space, runs of whitespace collapse to one space, and the result is
    trimmed and lowercased. Digits and underscores are kept.
    """
    # Replace anything that is not a word character with a space
    without_punctuation = re.sub(r'\W', ' ', text)
    # Collapse the resulting runs of whitespace into single spaces
    single_spaced = re.sub(r'\s+', ' ', without_punctuation)
    return single_spaced.strip().lower()

# Apply the cleaner to every scraped article, preserving order
cleaned_articles = list(map(preprocess_text, articles))

In [None]:
# Inspect the cleaned text; this displays the full cleaned text of every article.
cleaned_articles

['to enjoy additional benefits connect with us january 10 2014 07 19 pm updated november 16 2021 09 26 pm ist beijing comments share read later a file picture of entry gate at nathu la which allow the indian traders vehicles enter china india s trade deficit with china reached a record 31 4 billion in 2013 with two way trade declining last year by 1 5 per cent on account of a sharp decline in indian exports new trade figures released in beijing on friday showed indian exports to china last year totalled 17 03 billion a 9 4 per cent fall from last year out of 65 47 total bilateral trade according to figures released by the chinese general administration of customs gac chinese exports to india in recent years largely comprised of machinery were up 1 6 per cent friday s annual figures marked the second straight year of declines highlighting the unexpected slowdown in rapidly growing trade ties that came to be seen as one of the key drivers of a relationship amid political uncertainties su

## Non-parametric Test
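For the non-parametric test, we use the Mann-Whitney U test, which compares two independent samples. Here we compare the lengths of the articles before and after cleaning, using the scipy library. Note that both sets of lengths come from the same four articles, so the samples are paired rather than independent, and with so few observations the test has very little power; the result below should be read with that caveat.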

In [None]:
from scipy.stats import mannwhitneyu

# Non-parametric test: character counts of each article before vs. after cleaning
# NOTE(review): both lists are measured on the same articles, i.e. the
# observations are paired, while the test is described as one for two
# independent samples. Confirm this is the intended test.
article_lengths_before = list(map(len, articles))
article_lengths_after = list(map(len, cleaned_articles))

stat, p_value = mannwhitneyu(article_lengths_before, article_lengths_after)
print(f"\nMann-Whitney U Test p-value: {p_value}")


Mann-Whitney U Test p-value: 0.6857142857142857


The p-value obtained from the Mann-Whitney U test is 0.6857. The p-value is a probability value that helps us make a decision about the null hypothesis. Here's how to interpret the p-value:

Null Hypothesis (H0): There is no significant difference between the lengths of the articles before and after cleaning.

Alternative Hypothesis (H1): There is a significant difference between the lengths of the articles before and after cleaning.

Interpretation:

If the p-value is less than the significance level (commonly set at 0.05), we reject the null hypothesis, suggesting that there is a significant difference.
If the p-value is greater than the significance level, we fail to reject the null hypothesis, indicating that there is not enough evidence to suggest a significant difference.

Conclusion:

In this case, the p-value is 0.6857, which is greater than 0.05. Therefore, we fail to reject the null hypothesis. This suggests that there is not enough evidence to conclude that there is a significant difference in the lengths of the articles before and after cleaning.

In [None]:
# Step 2: Corpus Creation
# Concatenate all cleaned articles into one space-separated string; it is
# later passed as a single input to get_sentiment_llm.
corpus = ' '.join(cleaned_articles)

# Step 3: Topic Modeling
def perform_topic_modeling(articles, n_topics=4, random_state=42):
    """Fit a bag-of-words LDA topic model on a collection of documents.

    Args:
        articles: Iterable of document strings, one entry per document.
        n_topics: Number of LDA topics to fit. Defaults to 4, the value that
            was previously hard-coded.
        random_state: Seed passed to LatentDirichletAllocation so repeated
            runs give the same topics. Defaults to 42.

    Returns:
        A ``(lda, vectorizer)`` tuple: the fitted LatentDirichletAllocation
        model and the fitted CountVectorizer whose vocabulary indexes the
        columns of the model's ``components_``.
    """
    # English stop words are dropped so they do not dominate the topics.
    vectorizer = CountVectorizer(stop_words='english')
    doc_term_counts = vectorizer.fit_transform(articles)
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=random_state)
    lda.fit(doc_term_counts)
    return lda, vectorizer

In [None]:
# Fit the topic model on the cleaned articles; the fitted vectorizer is kept
# so feature indices can be mapped back to words when displaying topics.
lda_model, vectorizer = perform_topic_modeling(cleaned_articles)

In [None]:
# Display Topics: the five highest-weighted words of each LDA component
feature_names = vectorizer.get_feature_names_out()
topics = []
for component in lda_model.components_:
    # argsort is ascending, so reverse it and keep the first five indices
    top_indices = component.argsort()[::-1][:5]
    topics.append([feature_names[idx] for idx in top_indices])

print("Topics:")
for number, words in enumerate(topics, start=1):
    print(f"Topic {number}: {', '.join(words)}")

Topics:
Topic 1: global, 10, national, affordable, strategies
Topic 2: trade, china, billion, india, comments
Topic 3: prime, year, offer, et, exclusively
Topic 4: global, 10, national, affordable, strategies


In [None]:
# Step 4: Seq2Seq Summarization
# Load the tokenizer and the model from the same checkpoint so they match
summarizer_checkpoint = 't5-base'
tokenizer = AutoTokenizer.from_pretrained(summarizer_checkpoint)
seq2seq_model = AutoModelForSeq2SeqLM.from_pretrained(summarizer_checkpoint)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [None]:
# Abstractive Summarization
abstractive_summaries = []

for article in cleaned_articles:
    # T5 is a multi-task text-to-text model that selects its task from a text
    # prefix. Without "summarize: " it is not asked to summarize at all.
    # Inputs longer than 512 tokens are truncated, so only the beginning of a
    # long article is summarized.
    tokenized_text = tokenizer(
        "summarize: " + article,
        return_tensors='pt',
        max_length=512,
        truncation=True,
    )

    # Generate the summary with beam search; the attention mask is passed
    # explicitly so generate() does not have to infer it.
    summary_ids = seq2seq_model.generate(
        input_ids=tokenized_text['input_ids'],
        attention_mask=tokenized_text['attention_mask'],
        max_length=150,
        min_length=50,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True
    )

    # Decode the best beam and append it to the list
    summary_text = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    abstractive_summaries.append(summary_text)

In [None]:
# Display the results
# Maximum number of characters printed per output line
max_chars_per_line = 80

for i, summary in enumerate(abstractive_summaries, start=1):
    print(f"\nSummary for Article {i}:\n")

    # Print the summary as consecutive fixed-width slices; the cut is purely
    # positional, so a word may be split across two lines.
    for offset in range(0, len(summary), max_chars_per_line):
        print(summary[offset:offset + max_chars_per_line])


Summary for Article 1:

the gac said comments share read later a file picture of entry gate at nathu la 
which allow the indian traders vehicles enter china new trade figures released i
n beijing on friday showed indian exports to china last year totalled 17 03 bill
ion a 9 4 per cent fall from last year chinese exports to india grew 7 9 per cen
t to 2 21 trillion the gac said comments share read later a file picture of entr
y gate at nathu la which allow the indian traders

Summary for Article 2:

a client requested an arrangement of locally grown nargis flowers daffodils and 
tulips csam minister of state for electronics and it rajeev chandrasekhar said f
riday wedding planner mukta kapoor recently had a client request an arrangement 
of locally grown nargis flowers daffodils and tulips csam minister of state for 
electronics and it rajeev chandrasekhar said friday 

Summary for Article 3:

10 most affordable cities in india to buy a house most visited monuments in indi
a the pros a

In [None]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import sent_tokenize
import numpy as np
import pandas as pd

import nltk
nltk.download('punkt')

# Function to tokenize and vectorize sentences
def vectorize_sentences(sentences, vectorizer):
    """Convert sentences to a dense array using an already-fitted vectorizer.

    Calls ``vectorizer.transform(sentences)`` and returns the result of its
    ``toarray()`` method, one row per sentence.
    """
    transformed = vectorizer.transform(sentences)
    return transformed.toarray()

# Function to get extractive summaries using LDA
def get_lda_extractive_summaries(text, vectorizer, lda_model, num_topics=3, num_sentences=3):
    """Build an extractive summary from the sentences LDA is most certain about.

    Each sentence is scored by the largest value in its row of
    ``lda_model.transform`` output. The ``num_sentences`` highest-scoring
    sentences are returned, ordered from highest to lowest score.

    Args:
        text: Document to summarize. It must still contain sentence
            punctuation, otherwise ``sent_tokenize`` cannot split it.
        vectorizer: Fitted vectorizer, used via ``vectorize_sentences``.
        lda_model: Fitted model exposing ``transform``.
        num_topics: Unused; kept so existing callers keep working. The number
            of topics is determined by ``lda_model`` itself.
        num_sentences: Maximum number of sentences in the summary.

    Returns:
        The selected sentences joined by single spaces, or ``''`` when
        ``text`` contains no sentences.
    """
    # Tokenize the text into sentences
    sentences = sent_tokenize(text)

    # Nothing to rank for empty text; also avoids handing a zero-row matrix
    # to lda_model.transform.
    if not sentences:
        return ''

    # Vectorize sentences using the pre-fitted vectorizer
    sentence_vectors = vectorize_sentences(sentences, vectorizer)

    # One row per sentence; presumably one column per topic (verify against
    # the model in use).
    topics = lda_model.transform(sentence_vectors)

    # Score each sentence by the weight of its dominant topic
    sentence_scores = np.max(topics, axis=1)

    # Stable descending sort: tied sentences keep their original order, so
    # the summary is deterministic.
    ranked_indices = np.argsort(-sentence_scores, kind='stable')[:num_sentences]

    # Combine the selected sentences to form the extractive summary
    return ' '.join(sentences[idx] for idx in ranked_indices)

# Fit the CountVectorizer on all articles
# NOTE: this rebinds `vectorizer` and `lda_model`, replacing the objects
# returned earlier by perform_topic_modeling.
vectorizer = CountVectorizer(stop_words='english')
vectorizer.fit(cleaned_articles)

# Fit a 3-topic LDA model on the word counts of the cleaned articles
lda_model = LatentDirichletAllocation(n_components=3, random_state=42)
lda_model.fit(vectorizer.transform(cleaned_articles))

# Perform extractive summarization using LDA.
# Sentences are taken from the raw `articles`: preprocess_text strips all
# punctuation, so sent_tokenize finds no sentence boundaries in
# cleaned_articles and every "summary" would be the entire article.
extractive_summaries_lda = [get_lda_extractive_summaries(article, vectorizer, lda_model) for article in articles]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Display the results
# Maximum number of characters printed per output line
max_chars_per_line = 80

for i, summary_extractive_lda in enumerate(extractive_summaries_lda, start=1):
    print(f"\nArticle {i}:\n")

    # Print the extractive summary as consecutive fixed-width slices; the cut
    # is purely positional, so a word may be split across two lines.
    for offset in range(0, len(summary_extractive_lda), max_chars_per_line):
        print(summary_extractive_lda[offset:offset + max_chars_per_line])
    print()  # Add an extra line for better readability


Article 1:

to enjoy additional benefits connect with us january 10 2014 07 19 pm updated no
vember 16 2021 09 26 pm ist beijing comments share read later a file picture of 
entry gate at nathu la which allow the indian traders vehicles enter china india
 s trade deficit with china reached a record 31 4 billion in 2013 with two way t
rade declining last year by 1 5 per cent on account of a sharp decline in indian
 exports new trade figures released in beijing on friday showed indian exports t
o china last year totalled 17 03 billion a 9 4 per cent fall from last year out 
of 65 47 total bilateral trade according to figures released by the chinese gene
ral administration of customs gac chinese exports to india in recent years large
ly comprised of machinery were up 1 6 per cent friday s annual figures marked th
e second straight year of declines highlighting the unexpected slowdown in rapid
ly growing trade ties that came to be seen as one of the key drivers of a relati
onship amid pol

The code below handles potential token-length issues by breaking each article's text into manageable chunks for sentiment analysis. It then reports a sentiment label and a confidence score for each chunk of each article.







## Emotion Analysis

In [None]:
# Emotion Analysis
# Pin the checkpoint and revision that the pipeline otherwise picks by
# default, so results stay reproducible if the library default changes.
sentiment_analyzer = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
    revision="af0f99b",
)

# Chunk length in characters (not tokens); adjust as needed
chunk_size = 512

# Sentiment Analysis for Each Chunk
for i, article in enumerate(cleaned_articles):
    # Slice the article into consecutive fixed-size character windows
    chunks = [article[j:j+chunk_size] for j in range(0, len(article), chunk_size)]

    print(f"\nEmotions for Article {i + 1}:")
    for chunk_num, chunk in enumerate(chunks):
        # The pipeline returns a list with one {'label', 'score'} dict per input
        emotions_chunk = sentiment_analyzer(chunk)
        print(f"Chunk {chunk_num + 1}: {emotions_chunk[0]['label']} with confidence {emotions_chunk[0]['score']}")

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.



Emotions for Article 1:
Chunk 1: NEGATIVE with confidence 0.9872323274612427
Chunk 2: NEGATIVE with confidence 0.9961349964141846
Chunk 3: POSITIVE with confidence 0.9075860381126404
Chunk 4: NEGATIVE with confidence 0.9919589757919312
Chunk 5: NEGATIVE with confidence 0.9746127724647522

Emotions for Article 2:
Chunk 1: NEGATIVE with confidence 0.9666159152984619
Chunk 2: POSITIVE with confidence 0.9543963074684143
Chunk 3: NEGATIVE with confidence 0.9927135109901428
Chunk 4: POSITIVE with confidence 0.8304776549339294
Chunk 5: NEGATIVE with confidence 0.9847776889801025
Chunk 6: NEGATIVE with confidence 0.9987768530845642
Chunk 7: NEGATIVE with confidence 0.9984546899795532
Chunk 8: NEGATIVE with confidence 0.9913115501403809
Chunk 9: NEGATIVE with confidence 0.9810521602630615
Chunk 10: NEGATIVE with confidence 0.9853445887565613
Chunk 11: POSITIVE with confidence 0.8221780061721802

Emotions for Article 3:
Chunk 1: POSITIVE with confidence 0.9988841414451599
Chunk 2: POSITIVE with

The output above shows the sentiment analysis results for each article, broken down into chunks. Each chunk is labeled as either positive or negative (the SST-2 model used here has no neutral class), accompanied by a confidence score indicating the model's certainty in its prediction.

For Article 1, the sentiment fluctuates across chunks, with the initial segments expressing a strong negative sentiment, followed by a positive tone in Chunk 3. However, subsequent chunks revert to a negative sentiment. The varying emotions suggest a mixed perspective or narrative in Article 1, with moments of negativity interspersed with a positive section.

Article 2 exhibits a similar pattern, with an alternating sentiment between negative and positive across its chunks. The sentiment changes are notable, indicating potential shifts in the subject matter or the author's stance. Interestingly, Chunk 11 introduces a positive sentiment, providing a nuanced emotional context to the article.

Article 3 maintains a consistently positive sentiment throughout its chunks, suggesting a generally optimistic or affirmative tone in the content. This stability in positive emotions indicates a cohesive and positive narrative or discussion in Article 3.

In contrast, Article 4 presents a more complex emotional trajectory. The initial chunks convey a strong negative sentiment, which gradually shifts as the article progresses. The latter chunks introduce positive sentiments, indicating a potential evolution or change in the narrative tone within Article 4.

In summary, the sentiment analysis highlights the emotional dynamics within each article, showcasing the variation and nuances in the expression of sentiments across different sections or themes. These emotional insights can provide a deeper understanding of the subjective tones present in the articles, aiding in the interpretation of the overall sentiment conveyed by each piece.

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

### Sentiment analysis for the entire text using the Longformer sequence-classification model (`LongformerForSequenceClassification`) from the transformers library.

In [None]:
# Sentiment Analysis for Entire Text
from transformers import LongformerTokenizer, LongformerForSequenceClassification, pipeline

# Sentiment Analysis for Entire Text
# NOTE(review): the load log recorded for this cell reports the classifier
# weights as newly initialized rather than loaded from the checkpoint, so
# predictions from this model should be confirmed before being relied on.
longformer_checkpoint = "allenai/longformer-base-4096"
llm_tokenizer = LongformerTokenizer.from_pretrained(longformer_checkpoint)
llm_model = LongformerForSequenceClassification.from_pretrained(longformer_checkpoint)

def get_sentiment_llm(text):
    """Classify ``text`` with the module-level Longformer model.

    The input is truncated to 4096 tokens. The index of the largest logit is
    mapped to a label: 0 -> "negative", 2 -> "positive", anything else ->
    "neutral".

    Args:
        text: The string to classify.

    Returns:
        One of "positive", "negative" or "neutral".

    Warns:
        UserWarning: If the model's configuration does not declare exactly
            three labels, since the mapping above then does not match the
            classification head.
    """
    import warnings

    import torch

    # The 0/1/2 mapping only makes sense for a three-class head; flag any
    # other head size instead of silently returning a misleading label.
    num_labels = getattr(llm_model.config, "num_labels", None)
    if num_labels != 3:
        warnings.warn(
            f"get_sentiment_llm assumes a 3-class model but llm_model reports "
            f"num_labels={num_labels}; the returned label may be meaningless."
        )

    inputs = llm_tokenizer(text, return_tensors="pt", max_length=4096, truncation=True)
    # Inference only: skip gradient tracking to save memory on long inputs.
    with torch.no_grad():
        logits = llm_model(**inputs).logits

    # Compute the winning class once and reuse it for the mapping.
    predicted_class = int(logits.argmax())
    return {0: "negative", 2: "positive"}.get(predicted_class, "neutral")

Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Classify the concatenation of all cleaned articles in one pass. Note that
# get_sentiment_llm truncates its input at max_length=4096 tokens.
entire_text_sentiment = get_sentiment_llm(corpus)

print("\nSentiment for Entire Text:", entire_text_sentiment)


Sentiment for Entire Text: neutral



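The output "Sentiment for Entire Text: neutral" indicates that, according to the Longformer model's sentiment analysis, the overall sentiment of the entire text (corpus) is considered neutral. However, the loading message above reports that the model's classifier weights were newly initialized and not trained, so this label does not reflect a learned sentiment prediction and should not be interpreted as a finding.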