In [None]:
import pandas as pd
import numpy as np
import re
from collections import Counter

#load the dataset
df=pd.read_csv("/content/news_summary_more.csv")
df

Unnamed: 0,headlines,text
0,upGrad learner switches to career in ML & Al w...,"Saurav Kant, an alumnus of upGrad and IIIT-B's..."
1,Delhi techie wins free food from Swiggy for on...,Kunal Shah's credit card bill payment platform...
2,New Zealand end Rohit Sharma-led India's 12-ma...,New Zealand defeated India by 8 wickets in the...
3,Aegon life iTerm insurance plan helps customer...,"With Aegon Life iTerm Insurance plan, customer..."
4,"Have known Hirani for yrs, what if MeToo claim...",Speaking about the sexual harassment allegatio...
...,...,...
98396,CRPF jawan axed to death by Maoists in Chhatti...,A CRPF jawan was on Tuesday axed to death with...
98397,First song from Sonakshi Sinha's 'Noor' titled...,"'Uff Yeh', the first song from the Sonakshi Si..."
98398,'The Matrix' film to get a reboot: Reports,"According to reports, a new version of the 199..."
98399,Snoop Dogg aims gun at clown dressed as Trump ...,A new music video shows rapper Snoop Dogg aimi...


In [None]:
print(f"Dataset Initial shape: {df.shape}")
print(f"Initial columns: {list(df.columns)}")
print(f"Null values before cleaning:")
print(f"  text: {df['text'].isnull().sum()}")
print(f"  headline: {df['headlines'].isnull().sum()}")

Dataset Initial shape: (98401, 2)
Initial columns: ['headlines', 'text']
Null values before cleaning:
  text: 0
  headline: 0


In [None]:
#function to clean the dataset
def pre_processing(df,lowercase="false",verbose="true"):
  # Remove duplicate rows where both 'text' and 'headline' are the same
    before_dedup = len(df)
    df = df.drop_duplicates(subset=['text', 'headlines'])
    if verbose:
        print(f"Removed {before_dedup - len(df)} duplicate rows")

    # Rename columns
    df = df.rename(columns={'text': 'article', 'headlines': 'summary'})
    if verbose:
        print("Columns renamed: 'text' → 'article', 'headlines' → 'summary'")

    # Clean text minimally
    df['article'] = df['article'].str.strip()
    df['summary'] = df['summary'].str.strip()

    if lowercase:
        df['article'] = df['article'].str.lower()
        df['summary'] = df['summary'].str.lower()
        if verbose:
            print("Text converted to lowercase")

    # Reset index
    df = df.reset_index(drop=True)
    if verbose:
        print("Index reset")

    return df

clean_df=pre_processing(df)

Removed 22 duplicate rows
Columns renamed: 'text' → 'article', 'headlines' → 'summary'
Text converted to lowercase
Index reset


In [None]:
#Inspect the quality of cleaned data
def inspect_text_quality(df, sample_size=5):
    """
    Inspect the quality of cleaned text data.

    Parameters:
    df (pd.DataFrame): Cleaned dataset
    sample_size (int): Number of samples to display

    Returns:
    dict: Analysis results
    """
    print(f"Final dataset shape: {df.shape}")
    print(f"Final columns: {list(df.columns)}")
    print()

    # Display first few rows
    print(f"First {sample_size} rows:")
    print("="*80)
    for i in range(min(sample_size, len(df))):
        print(f"Row {i+1}:")
        print(f"Summary: {df.iloc[i]['summary']}")
        print(f"Article: {df.iloc[i]['article'][:200]}...")  # Show first 200 chars
        print("-"*80)

    # Basic statistics
    article_lengths = df['article'].str.len()
    summary_lengths = df['summary'].str.len()

    print("\nText Length Statistics:")
    print(f"Article lengths - Mean: {article_lengths.mean():.1f}, Median: {article_lengths.median():.1f}")
    print(f"Summary lengths - Mean: {summary_lengths.mean():.1f}, Median: {summary_lengths.median():.1f}")

    # Check for common issues
    analysis = {}

    # Check for very short articles/summaries
    short_articles = (article_lengths < 100).sum()
    short_summaries = (summary_lengths < 10).sum()

    print(f"\nPotential Quality Issues:")
    print(f"Articles shorter than 100 chars: {short_articles} ({short_articles/len(df)*100:.1f}%)")
    print(f"Summaries shorter than 10 chars: {short_summaries} ({short_summaries/len(df)*100:.1f}%)")

    # Check for HTML tags
    html_in_articles = df['article'].str.contains('<[^>]+>', regex=True).sum()
    html_in_summaries = df['summary'].str.contains('<[^>]+>', regex=True).sum()

    print(f"Articles with HTML tags: {html_in_articles}")
    print(f"Summaries with HTML tags: {html_in_summaries}")

    # Check for special characters
    special_chars_articles = df['article'].str.contains('[^\w\s\.\,\!\?\;\:\'\"\-\(\)]', regex=True).sum()
    special_chars_summaries = df['summary'].str.contains('[^\w\s\.\,\!\?\;\:\'\"\-\(\)]', regex=True).sum()

    print(f"Articles with special characters: {special_chars_articles}")
    print(f"Summaries with special characters: {special_chars_summaries}")

    analysis['short_articles'] = short_articles
    analysis['short_summaries'] = short_summaries
    analysis['html_in_articles'] = html_in_articles
    analysis['html_in_summaries'] = html_in_summaries
    analysis['special_chars_articles'] = special_chars_articles
    analysis['special_chars_summaries'] = special_chars_summaries

    return analysis

analysis=inspect_text_quality(clean_df,10)

Final dataset shape: (98379, 2)
Final columns: ['summary', 'article']

First 10 rows:
Row 1:
Summary: upgrad learner switches to career in ml & al with 90% salary hike
Article: saurav kant, an alumnus of upgrad and iiit-b's pg program in machine learning and artificial intelligence, was a sr systems engineer at infosys with almost 5 years of work experience. the program and ...
--------------------------------------------------------------------------------
Row 2:
Summary: delhi techie wins free food from swiggy for one year on cred
Article: kunal shah's credit card bill payment platform, cred, gave users a chance to win free food from swiggy for one year. pranav kaushik, a delhi techie, bagged this reward after spending 2000 cred coins. ...
--------------------------------------------------------------------------------
Row 3:
Summary: new zealand end rohit sharma-led india's 12-match winning streak
Article: new zealand defeated india by 8 wickets in the fourth odi at hamilton on thur

In [None]:
def assess_modeling_readiness(analysis, df):
    """
    Assess whether the data is ready for modeling or needs further cleaning.

    Parameters:
    analysis (dict): Results from inspect_text_quality
    df (pd.DataFrame): Cleaned dataset

    Returns:
    str: Assessment and recommendations
    """
    print("\n" + "="*60)
    print("MODELING READINESS ASSESSMENT")
    print("="*60)

    issues = []
    recommendations = []

    # Check various quality metrics
    if analysis['html_in_articles'] > 0 or analysis['html_in_summaries'] > 0:
        issues.append("HTML tags present in text")
        recommendations.append("Remove HTML tags using BeautifulSoup or regex")

    if analysis['short_articles'] > len(df) * 0.05:  # More than 5% are very short
        issues.append("Significant number of very short articles")
        recommendations.append("Consider filtering out articles shorter than 100 characters")

    if analysis['special_chars_articles'] > len(df) * 0.1:  # More than 10% have special chars
        issues.append("Many articles contain special characters")
        recommendations.append("Consider removing or normalizing special characters")

    # Overall assessment
    if len(issues) == 0:
        print(" DATA APPEARS CLEAN AND READY FOR MODELING")
        print("\nThe dataset looks good for:")
        print("- Fine-tuning summarization models (T5/BART)")
        print("- Emotion detection on summaries")
        print("\nNo additional cleaning required.")

    elif len(issues) <= 2:
        print("DATA IS MOSTLY CLEAN WITH MINOR ISSUES")
        print("\nMinor issues found:")
        for issue in issues:
            print(f"- {issue}")
        print("\nRecommendations:")
        for rec in recommendations:
            print(f"- {rec}")
        print("\nThe data can be used for modeling, but cleaning these issues may improve performance.")

    else:
        print("DATA NEEDS SIGNIFICANT CLEANING")
        print("\nIssues found:")
        for issue in issues:
            print(f"- {issue}")
        print("\nRecommendations:")
        for rec in recommendations:
            print(f"- {rec}")
        print("\nConsider additional cleaning before modeling.")

    return issues, recommendations

issue_recommendaton=assess_modeling_readiness(analysis,clean_df)
print(issue_recommendaton)


MODELING READINESS ASSESSMENT
DATA IS MOSTLY CLEAN WITH MINOR ISSUES

Minor issues found:
- Many articles contain special characters

Recommendations:
- Consider removing or normalizing special characters

The data can be used for modeling, but cleaning these issues may improve performance.
(['Many articles contain special characters'], ['Consider removing or normalizing special characters'])


In [None]:
#Function to Remove non-alphanumeric characters except common punctuation
def clean_special_chars(text):
    text = re.sub(r"[^\w\s.,!?\"']+", '', text)
    return text

# Apply to dataset
clean_df['article'] = clean_df['article'].apply(clean_special_chars)
clean_df['summary'] = clean_df['summary'].apply(clean_special_chars)

In [None]:
print(clean_df)
clean_df.to_csv('news_summary_clean.csv', index=False)  #saving the dataset

                                                 summary  \
0      upgrad learner switches to career in ml  al wi...   
1      delhi techie wins free food from swiggy for on...   
2      new zealand end rohit sharmaled india's 12matc...   
3      aegon life iterm insurance plan helps customer...   
4      have known hirani for yrs, what if metoo claim...   
...                                                  ...   
98374  crpf jawan axed to death by maoists in chhatti...   
98375  first song from sonakshi sinha's 'noor' titled...   
98376          'the matrix' film to get a reboot reports   
98377  snoop dogg aims gun at clown dressed as trump ...   
98378  madhesi morcha withdraws support to nepalese g...   

                                                 article  
0      saurav kant, an alumnus of upgrad and iiitb's ...  
1      kunal shah's credit card bill payment platform...  
2      new zealand defeated india by 8 wickets in the...  
3      with aegon life iterm insurance plan

In [None]:
#Emotion Detection using RoBERTa - Chunked Processing
from transformers import pipeline
import torch
from tqdm import tqdm
import warnings
import pandas as pd
import os
warnings.filterwarnings('ignore')

def load_emotion_model():
    """
    Load the pre-trained emotion detection model.

    Returns:
    pipeline: Hugging Face emotion classification pipeline
    """
    print("Loading emotion detection model...")
    print("Model: j-hartmann/emotion-english-distilroberta-base")

    # Check if CUDA is available
    device = 0 if torch.cuda.is_available() else -1
    if device == 0:
        print("Using GPU acceleration")
    else:
        print("Using CPU (this may be slower)")

    # Load the emotion classification pipeline
    emotion_classifier = pipeline(
        "text-classification",
        model="j-hartmann/emotion-english-distilroberta-base",
        device=device,
        return_all_scores=False  # Only return the top prediction
    )

    print("Model loaded successfully")
    return emotion_classifier

def classify_emotions_batch(summaries, emotion_classifier, batch_size=32):
    """
    Classify emotions for summaries in batches for efficiency.

    Parameters:
    summaries (list): List of summary texts
    emotion_classifier (pipeline): Hugging Face emotion classifier
    batch_size (int): Number of texts to process at once

    Returns:
    list: List of emotion labels
    """
    print(f"Classifying emotions for {len(summaries)} summaries...")
    print(f"Processing in batches of {batch_size}")

    emotion_labels = []

    # Process in batches with progress bar
    for i in tqdm(range(0, len(summaries), batch_size), desc="Processing batches"):
        batch = summaries[i:i+batch_size]

        # Get predictions for the batch
        try:
            predictions = emotion_classifier(batch)

            # Extract labels from predictions
            if isinstance(predictions[0], list):
                # If multiple predictions per text, take the first one
                batch_labels = [pred[0]['label'] for pred in predictions]
            else:
                # Single prediction per text
                batch_labels = [pred['label'] for pred in predictions]

            emotion_labels.extend(batch_labels)

        except Exception as e:
            print(f"Error processing batch {i//batch_size + 1}: {str(e)}")
            # Add placeholder labels for failed batch
            emotion_labels.extend(['unknown'] * len(batch))

    return emotion_labels

def classify_emotions_individual(summaries, emotion_classifier):
    """
    Classify emotions one by one (fallback method if batch processing fails).

    Parameters:
    summaries (list): List of summary texts
    emotion_classifier (pipeline): Hugging Face emotion classifier

    Returns:
    list: List of emotion labels
    """
    print(f"🔍 Classifying emotions individually for {len(summaries)} summaries...")

    emotion_labels = []

    for i, summary in enumerate(tqdm(summaries, desc="Processing summaries")):
        try:
            # Get prediction for single text
            prediction = emotion_classifier(summary)

            # Extract label
            if isinstance(prediction, list) and len(prediction) > 0:
                emotion_labels.append(prediction[0]['label'])
            else:
                emotion_labels.append(prediction['label'])

        except Exception as e:
            if i < 5:  # Only print first 5 errors to avoid spam
                print(f" Error processing summary {i+1}: {str(e)}")
            emotion_labels.append('unknown')

    return emotion_labels

def analyze_emotion_distribution(df):
    """
    Analyze and display the distribution of emotions.

    Parameters:
    df (pd.DataFrame): DataFrame with emotion_label column
    """
    print("\nEMOTION DISTRIBUTION ANALYSIS")
    print("="*50)

    # Count emotions
    emotion_counts = df['emotion_label'].value_counts()
    emotion_percentages = df['emotion_label'].value_counts(normalize=True) * 100

    print(f"Total summaries analyzed: {len(df)}")
    print(f"Number of unique emotions: {len(emotion_counts)}")
    print()

    # Display detailed breakdown
    print("Emotion Distribution:")
    print("-" * 30)
    for emotion, count in emotion_counts.items():
        percentage = emotion_percentages[emotion]
        print(f"{emotion:15} | {count:6,} ({percentage:5.1f}%)")

    # Show examples for each emotion
    print("\nSample summaries for each emotion:")
    print("="*50)
    for emotion in emotion_counts.index[:5]:  # Show top 5 emotions
        print(f"\n{emotion.upper()}:")
        samples = df[df['emotion_label'] == emotion]['summary'].head(3)
        for i, sample in enumerate(samples, 1):
            print(f"  {i}. {sample[:100]}...")

def save_results(df, output_path):
    """
    Save the DataFrame with emotion labels.

    Parameters:
    df (pd.DataFrame): DataFrame to save
    output_path (str): Path to save the file
    """
    print(f"\nSaving results to {output_path}...")
    df.to_csv(output_path, index=False)
    print(f"Results saved successfully!")
    print(f"File: {output_path}")
    print(f"Shape: {df.shape}")

def process_chunk(chunk_df, chunk_number, emotion_classifier, batch_size=32):
    """
    Process a single chunk of data.

    Parameters:
    chunk_df (pd.DataFrame): DataFrame chunk to process
    chunk_number (int): Chunk number for naming
    emotion_classifier (pipeline): Hugging Face emotion classifier
    batch_size (int): Batch size for processing

    Returns:
    pd.DataFrame: Processed chunk with emotion labels
    """
    print(f"\n{'='*60}")
    print(f"PROCESSING CHUNK {chunk_number}")
    print(f"{'='*60}")
    print(f"Chunk size: {len(chunk_df)} entries")

    # Convert summaries to list
    summaries = chunk_df['summary'].tolist()

    # Classify emotions
    try:
        # Try batch processing first (more efficient)
        emotion_labels = classify_emotions_batch(summaries, emotion_classifier, batch_size)
    except Exception as e:
        print(f"Batch processing failed: {str(e)}")
        print("Falling back to individual processing...")
        emotion_labels = classify_emotions_individual(summaries, emotion_classifier)

    # Add emotion labels to DataFrame
    chunk_df['emotion_label'] = emotion_labels

    # Analyze results for this chunk
    analyze_emotion_distribution(chunk_df)

    # Save chunk results
    output_path = f'news_emotion_chunk_{chunk_number}.csv'
    save_results(chunk_df, output_path)

    return chunk_df

def main_emotion_classification_chunked(csv_path, chunk_size=10000, batch_size=32):
    """
    Main function to perform emotion classification on news summaries in chunks.

    Parameters:
    csv_path (str): Path to the input CSV file
    chunk_size (int): Number of entries per chunk
    batch_size (int): Batch size for processing

    Returns:
    None
    """
    print("CHUNKED EMOTION DETECTION")
    print("="*60)

    # Read the dataset
    print(f"Loading dataset from {csv_path}...")
    clean_df = pd.read_csv(csv_path)
    print(f"Dataset loaded. Total entries: {len(clean_df)}")

    # Calculate number of chunks
    total_chunks = (len(clean_df) + chunk_size - 1) // chunk_size
    print(f"Will process {total_chunks} chunks of {chunk_size} entries each")

    # Load emotion model once
    emotion_classifier = load_emotion_model()

    # Process each chunk
    for chunk_num in range(total_chunks):
        start_idx = chunk_num * chunk_size
        end_idx = min((chunk_num + 1) * chunk_size, len(clean_df))

        # Get chunk
        chunk_df = clean_df.iloc[start_idx:end_idx].copy()

        # Process chunk
        process_chunk(chunk_df, chunk_num + 1, emotion_classifier, batch_size)

        # Clear memory
        del chunk_df
        torch.cuda.empty_cache() if torch.cuda.is_available() else None

        print(f"Chunk {chunk_num + 1}/{total_chunks} completed")

    print(f"\n{'='*60}")
    print("ALL CHUNKS PROCESSED SUCCESSFULLY!")
    print(f"{'='*60}")
    print(f"Output files: news_emotion_chunk_1.csv to news_emotion_chunk_{total_chunks}.csv")
    print("You can now merge these files to create the final dataset.")

# Usage
CSV_PATH_CLEAN_DATA = "/content/news_summary_clean.csv"
main_emotion_classification_chunked(CSV_PATH_CLEAN_DATA)

CHUNKED EMOTION DETECTION
Loading dataset from /content/news_summary_clean.csv...
Dataset loaded. Total entries: 98379
Will process 10 chunks of 10000 entries each
Loading emotion detection model...
Model: j-hartmann/emotion-english-distilroberta-base
Using CPU (this may be slower)


Device set to use cpu


Model loaded successfully

PROCESSING CHUNK 1
Chunk size: 10000 entries
Classifying emotions for 10000 summaries...
Processing in batches of 32



Processing batches:   0%|          | 0/313 [00:00<?, ?it/s][A
Processing batches:   0%|          | 1/313 [00:02<12:33,  2.42s/it][A
Processing batches:   1%|          | 2/313 [00:04<10:54,  2.11s/it][A
Processing batches:   1%|          | 3/313 [00:06<10:42,  2.07s/it][A
Processing batches:   1%|▏         | 4/313 [00:08<11:40,  2.27s/it][A
Processing batches:   2%|▏         | 5/313 [00:10<10:56,  2.13s/it][A
Processing batches:   2%|▏         | 6/313 [00:12<10:22,  2.03s/it][A
Processing batches:   2%|▏         | 7/313 [00:14<09:55,  1.94s/it][A
Processing batches:   3%|▎         | 8/313 [00:16<09:39,  1.90s/it][A
Processing batches:   3%|▎         | 9/313 [00:19<12:31,  2.47s/it][A
Processing batches:   3%|▎         | 10/313 [00:21<11:36,  2.30s/it][A
Processing batches:   4%|▎         | 11/313 [00:23<11:07,  2.21s/it][A
Processing batches:   4%|▍         | 12/313 [00:25<10:39,  2.12s/it][A
Processing batches:   4%|▍         | 13/313 [00:27<10:25,  2.09s/it][A
Processin


EMOTION DISTRIBUTION ANALYSIS
Total summaries analyzed: 10000
Number of unique emotions: 7

Emotion Distribution:
------------------------------
neutral         |  3,655 ( 36.5%)
sadness         |  2,020 ( 20.2%)
anger           |  1,542 ( 15.4%)
joy             |  1,414 ( 14.1%)
fear            |    593 (  5.9%)
surprise        |    543 (  5.4%)
disgust         |    233 (  2.3%)

Sample summaries for each emotion:

NEUTRAL:
  1. have known hirani for yrs, what if metoo claims are not true sonam...
  2. india get all out for 92, their lowest odi total in new zealand...
  3. govt directs alok verma to join work 1 day before his retirement...

SADNESS:
  1. rahat fateh ali khan denies getting notice for smuggling currency...
  2. those on bail will go to jail pm modi takes jibe at rahul...
  3. 12 killed, 170 injured in saudi arabia floods...

ANGER:
  1. how long can i tolerate congress leaders' potshots k'taka cm...
  2. odisha cm patnaik controls mining mafia union minister...
  3. u


Processing batches:   0%|          | 0/313 [00:00<?, ?it/s][A
Processing batches:   0%|          | 1/313 [00:01<07:18,  1.41s/it][A
Processing batches:   1%|          | 2/313 [00:02<07:10,  1.39s/it][A
Processing batches:   1%|          | 3/313 [00:04<07:10,  1.39s/it][A
Processing batches:   1%|▏         | 4/313 [00:05<07:05,  1.38s/it][A
Processing batches:   2%|▏         | 5/313 [00:06<07:05,  1.38s/it][A
Processing batches:   2%|▏         | 6/313 [00:08<07:31,  1.47s/it][A
Processing batches:   2%|▏         | 7/313 [00:09<07:22,  1.45s/it][A
Processing batches:   3%|▎         | 8/313 [00:11<07:20,  1.44s/it][A
Processing batches:   3%|▎         | 9/313 [00:12<07:20,  1.45s/it][A
Processing batches:   3%|▎         | 10/313 [00:14<07:14,  1.43s/it][A
Processing batches:   4%|▎         | 11/313 [00:15<07:07,  1.41s/it][A
Processing batches:   4%|▍         | 12/313 [00:16<06:54,  1.38s/it][A
Processing batches:   4%|▍         | 13/313 [00:18<06:49,  1.37s/it][A
Processin


EMOTION DISTRIBUTION ANALYSIS
Total summaries analyzed: 10000
Number of unique emotions: 7

Emotion Distribution:
------------------------------
neutral         |  3,476 ( 34.8%)
sadness         |  1,979 ( 19.8%)
anger           |  1,724 ( 17.2%)
joy             |  1,324 ( 13.2%)
fear            |    643 (  6.4%)
surprise        |    556 (  5.6%)
disgust         |    298 (  3.0%)

Sample summaries for each emotion:

NEUTRAL:
  1. will build ram mandir but rahul will decide date up dy cm...
  2. badal rahe hain bas naam akhilesh's poem on cities' name change...
  3. musk using old tech russian firm shows reusable mars rocket...

SADNESS:
  1. yes bank chairman who was named in aircelmaxis chargesheet quits...
  2. vodafone idea posts â¹4,970 cr loss in first ever quarterly result...
  3. 14yrold gaming addict commits suicide after mom takes away phone...

ANGER:
  1. google pixel 3, 3 xl deleting texts without consent reports...
  2. punjab cm condemns attempts to politicise armed forc


Processing batches:   0%|          | 0/313 [00:00<?, ?it/s][A
Processing batches:   0%|          | 1/313 [00:01<08:26,  1.62s/it][A
Processing batches:   1%|          | 2/313 [00:03<07:50,  1.51s/it][A
Processing batches:   1%|          | 3/313 [00:04<07:27,  1.44s/it][A
Processing batches:   1%|▏         | 4/313 [00:05<07:11,  1.40s/it][A
Processing batches:   2%|▏         | 5/313 [00:07<07:06,  1.39s/it][A
Processing batches:   2%|▏         | 6/313 [00:08<07:06,  1.39s/it][A
Processing batches:   2%|▏         | 7/313 [00:09<07:00,  1.37s/it][A
Processing batches:   3%|▎         | 8/313 [00:11<06:53,  1.36s/it][A
Processing batches:   3%|▎         | 9/313 [00:12<07:00,  1.38s/it][A
Processing batches:   3%|▎         | 10/313 [00:14<07:13,  1.43s/it][A
Processing batches:   4%|▎         | 11/313 [00:15<07:07,  1.42s/it][A
Processing batches:   4%|▍         | 12/313 [00:16<07:01,  1.40s/it][A
Processing batches:   4%|▍         | 13/313 [00:18<06:58,  1.39s/it][A
Processin


EMOTION DISTRIBUTION ANALYSIS
Total summaries analyzed: 10000
Number of unique emotions: 7

Emotion Distribution:
------------------------------
neutral         |  3,523 ( 35.2%)
sadness         |  1,921 ( 19.2%)
anger           |  1,650 ( 16.5%)
joy             |  1,381 ( 13.8%)
fear            |    629 (  6.3%)
surprise        |    587 (  5.9%)
disgust         |    309 (  3.1%)

Sample summaries for each emotion:

NEUTRAL:
  1. anas bags india's 1st men's 400m silver at asiad in 36 years...
  2. audi's new electric car prototype can go 0100kmph in 2 secs...
  3. pm modi follows 55 women on twitter on rakshabandhan...

SADNESS:
  1. over 60 people injured during u'khand stonepelting festival...
  2. congo rolls out trial ebola treatment as death toll rises...
  3. floodhit kerala may face 45 decline in tourists official...

ANGER:
  1. us hatching criminal plot against us north korea...
  2. us is waging 'psychological war' against us iran...
  3. second attack on akali dal leader in


Processing batches:   0%|          | 0/313 [00:00<?, ?it/s][A
Processing batches:   0%|          | 1/313 [00:01<07:31,  1.45s/it][A
Processing batches:   1%|          | 2/313 [00:02<07:36,  1.47s/it][A
Processing batches:   1%|          | 3/313 [00:04<07:16,  1.41s/it][A
Processing batches:   1%|▏         | 4/313 [00:05<07:01,  1.36s/it][A
Processing batches:   2%|▏         | 5/313 [00:06<07:03,  1.38s/it][A
Processing batches:   2%|▏         | 6/313 [00:08<06:57,  1.36s/it][A
Processing batches:   2%|▏         | 7/313 [00:09<06:51,  1.34s/it][A
Processing batches:   3%|▎         | 8/313 [00:10<06:49,  1.34s/it][A
Processing batches:   3%|▎         | 9/313 [00:12<06:53,  1.36s/it][A
Processing batches:   3%|▎         | 10/313 [00:13<07:08,  1.41s/it][A
Processing batches:   4%|▎         | 11/313 [00:15<07:01,  1.39s/it][A
Processing batches:   4%|▍         | 12/313 [00:16<06:53,  1.37s/it][A
Processing batches:   4%|▍         | 13/313 [00:17<06:46,  1.35s/it][A
Processin


EMOTION DISTRIBUTION ANALYSIS
Total summaries analyzed: 10000
Number of unique emotions: 7

Emotion Distribution:
------------------------------
neutral         |  3,647 ( 36.5%)
sadness         |  1,743 ( 17.4%)
anger           |  1,720 ( 17.2%)
joy             |  1,330 ( 13.3%)
surprise        |    631 (  6.3%)
fear            |    615 (  6.2%)
disgust         |    314 (  3.1%)

Sample summaries for each emotion:

NEUTRAL:
  1. the world needs to care priyanka on rohingya refugees...
  2. ziva accompanies dhoni for 'last walk' to pune dressing room...
  3. dd troll kxip over 'see you in ipl 2019' tweet...

SADNESS:
  1. pakistan heatwave kills 65 people in 3 days report...
  2. pant's brilliant form hurt glenn maxwell dd coach ponting...
  3. accused took 3hrs to cut teen's body into 12 pieces in delhi...

ANGER:
  1. zlatan slaps player for stepping on his shoe, gets sent off...
  2. youth pelt stones at army during its iftar party in jk...
  3. police constable arrested for assaul


Processing batches:   0%|          | 0/313 [00:00<?, ?it/s][A
Processing batches:   0%|          | 1/313 [00:01<06:56,  1.33s/it][A
Processing batches:   1%|          | 2/313 [00:02<06:50,  1.32s/it][A
Processing batches:   1%|          | 3/313 [00:03<06:52,  1.33s/it][A
Processing batches:   1%|▏         | 4/313 [00:05<06:55,  1.34s/it][A
Processing batches:   2%|▏         | 5/313 [00:06<07:22,  1.44s/it][A
Processing batches:   2%|▏         | 6/313 [00:08<07:10,  1.40s/it][A
Processing batches:   2%|▏         | 7/313 [00:09<07:02,  1.38s/it][A
Processing batches:   3%|▎         | 8/313 [00:10<06:56,  1.37s/it][A
Processing batches:   3%|▎         | 9/313 [00:12<06:50,  1.35s/it][A
Processing batches:   3%|▎         | 10/313 [00:13<06:48,  1.35s/it][A
Processing batches:   4%|▎         | 11/313 [00:14<06:46,  1.34s/it][A
Processing batches:   4%|▍         | 12/313 [00:16<06:46,  1.35s/it][A
Processing batches:   4%|▍         | 13/313 [00:17<07:08,  1.43s/it][A
Processin


EMOTION DISTRIBUTION ANALYSIS
Total summaries analyzed: 10000
Number of unique emotions: 7

Emotion Distribution:
------------------------------
neutral         |  3,776 ( 37.8%)
anger           |  1,799 ( 18.0%)
sadness         |  1,707 ( 17.1%)
joy             |  1,210 ( 12.1%)
fear            |    657 (  6.6%)
surprise        |    566 (  5.7%)
disgust         |    285 (  2.9%)

Sample summaries for each emotion:

NEUTRAL:
  1. maybe people get money writing about linkup with disha tiger...
  2. raina shares chhetri's video, urges fans to support them...
  3. minorities are safer in india, says mukhtar abbas naqvi...

ANGER:
  1. tamil actress sangeetha arrested for running prostitution racket...
  2. tata sons rejects us site's claims on 50 mn gift to harvard...
  3. 4yrold raped, stabbed in haryana body found in container...

SADNESS:
  1. parties which can't handle defeat blame evms ec...
  2. un experts condemn tuticorin police firing that killed 13...
  3. ongc incurs â¹4,000cr


Processing batches:   0%|          | 0/313 [00:00<?, ?it/s][A
Processing batches:   0%|          | 1/313 [00:01<06:54,  1.33s/it][A
Processing batches:   1%|          | 2/313 [00:02<07:24,  1.43s/it][A
Processing batches:   1%|          | 3/313 [00:04<07:24,  1.44s/it][A
Processing batches:   1%|▏         | 4/313 [00:05<07:08,  1.39s/it][A
Processing batches:   2%|▏         | 5/313 [00:06<07:02,  1.37s/it][A
Processing batches:   2%|▏         | 6/313 [00:08<06:54,  1.35s/it][A
Processing batches:   2%|▏         | 7/313 [00:09<06:50,  1.34s/it][A
Processing batches:   3%|▎         | 8/313 [00:10<06:46,  1.33s/it][A
Processing batches:   3%|▎         | 9/313 [00:12<06:45,  1.33s/it][A
Processing batches:   3%|▎         | 10/313 [00:13<06:44,  1.34s/it][A
Processing batches:   4%|▎         | 11/313 [00:15<07:08,  1.42s/it][A
Processing batches:   4%|▍         | 12/313 [00:16<06:59,  1.39s/it][A
Processing batches:   4%|▍         | 13/313 [00:17<06:53,  1.38s/it][A
Processin


EMOTION DISTRIBUTION ANALYSIS
Total summaries analyzed: 10000
Number of unique emotions: 7

Emotion Distribution:
------------------------------
neutral         |  3,847 ( 38.5%)
anger           |  1,653 ( 16.5%)
sadness         |  1,580 ( 15.8%)
joy             |  1,328 ( 13.3%)
surprise        |    681 (  6.8%)
fear            |    648 (  6.5%)
disgust         |    263 (  2.6%)

Sample summaries for each emotion:

NEUTRAL:
  1. sisters gigi hadid and bella hadid pose nude for vogue uk...
  2. varun youngest b'wood actor to get statue at madame tussauds...
  3. we are not endorsing jauhar deepika padukone on padmaavat...

ANGER:
  1. court grants bail to accused in govind pansare murder case...
  2. man killed, five injured during violent clashes in amethi...
  3. drug peddler arrested after filing it returns for â¹40 lakh...

SADNESS:
  1. government to withdraw its plea to take over unitech...
  2. tripura cm sarkar poorest indian cm with â¹2,410 in bank...
  3. apple to cut iphone


Processing batches:   0%|          | 0/313 [00:00<?, ?it/s][A
Processing batches:   0%|          | 1/313 [00:01<07:33,  1.45s/it][A
Processing batches:   1%|          | 2/313 [00:02<07:42,  1.49s/it][A
Processing batches:   1%|          | 3/313 [00:04<07:14,  1.40s/it][A
Processing batches:   1%|▏         | 4/313 [00:05<07:10,  1.39s/it][A
Processing batches:   2%|▏         | 5/313 [00:06<07:00,  1.37s/it][A
Processing batches:   2%|▏         | 6/313 [00:08<06:56,  1.36s/it][A
Processing batches:   2%|▏         | 7/313 [00:09<06:54,  1.36s/it][A
Processing batches:   3%|▎         | 8/313 [00:10<06:45,  1.33s/it][A
Processing batches:   3%|▎         | 9/313 [00:12<06:44,  1.33s/it][A
Processing batches:   3%|▎         | 10/313 [00:13<07:06,  1.41s/it][A
Processing batches:   4%|▎         | 11/313 [00:15<06:58,  1.39s/it][A
Processing batches:   4%|▍         | 12/313 [00:16<06:54,  1.38s/it][A
Processing batches:   4%|▍         | 13/313 [00:17<06:52,  1.38s/it][A
Processin


EMOTION DISTRIBUTION ANALYSIS
Total summaries analyzed: 10000
Number of unique emotions: 7

Emotion Distribution:
------------------------------
neutral         |  3,751 ( 37.5%)
anger           |  1,860 ( 18.6%)
sadness         |  1,583 ( 15.8%)
joy             |  1,223 ( 12.2%)
fear            |    679 (  6.8%)
surprise        |    623 (  6.2%)
disgust         |    281 (  2.8%)

Sample summaries for each emotion:

NEUTRAL:
  1. kfc sells 10,000 internet blocking tent...
  2. hong kong flat sells at record price of 17,000 per sq ft...
  3. 'padmavati' is not my film boney kapoor on 'padmavati' row...

ANGER:
  1. bhansali has also hurt sentiments up cm on padmavati 'row'...
  2. rishi kapoor trolled for mocking beyoncã on twitter...
  3. proisis group releases poster of beheaded pope francis...

SADNESS:
  1. ndtv group ceo kvl narayan rao dies at 63...
  2. cbi summons kin of conductor in ryan i'ntl murder case...
  3. german chancellor hints new polls after coalition talks fail...



Processing batches:   0%|          | 0/313 [00:00<?, ?it/s][A
Processing batches:   0%|          | 1/313 [00:01<07:44,  1.49s/it][A
Processing batches:   1%|          | 2/313 [00:02<07:38,  1.47s/it][A
Processing batches:   1%|          | 3/313 [00:04<07:22,  1.43s/it][A
Processing batches:   1%|▏         | 4/313 [00:05<07:07,  1.38s/it][A
Processing batches:   2%|▏         | 5/313 [00:06<06:53,  1.34s/it][A
Processing batches:   2%|▏         | 6/313 [00:08<06:50,  1.34s/it][A
Processing batches:   2%|▏         | 7/313 [00:09<06:43,  1.32s/it][A
Processing batches:   3%|▎         | 8/313 [00:10<06:44,  1.33s/it][A
Processing batches:   3%|▎         | 9/313 [00:12<06:43,  1.33s/it][A
Processing batches:   3%|▎         | 10/313 [00:13<07:07,  1.41s/it][A
Processing batches:   4%|▎         | 11/313 [00:15<06:55,  1.38s/it][A
Processing batches:   4%|▍         | 12/313 [00:16<06:51,  1.37s/it][A
Processing batches:   4%|▍         | 13/313 [00:17<06:45,  1.35s/it][A
Processin


EMOTION DISTRIBUTION ANALYSIS
Total summaries analyzed: 10000
Number of unique emotions: 7

Emotion Distribution:
------------------------------
neutral         |  3,628 ( 36.3%)
anger           |  1,833 ( 18.3%)
sadness         |  1,785 ( 17.8%)
joy             |  1,189 ( 11.9%)
fear            |    746 (  7.5%)
surprise        |    564 (  5.6%)
disgust         |    255 (  2.5%)

Sample summaries for each emotion:

NEUTRAL:
  1. centre has not filed affidavit on rohingyas in sc rijiju...
  2. germany should be proud of its wwii soldiers politician...
  3. â¹6,400 invested during 2008 crisis would be â¹16,000 now...

ANGER:
  1. chal hatt sona to troll who called her a flop singer...
  2. six navy officers held over brawl at goa bar...
  3. i belong to category of donkeys asaram on 'fake babas' list...

SADNESS:
  1. didn't get coach job since i didn't have any setting sehwag...
  2. gambhir reveals he sledged warne over wrong text to exwife...
  3. 14 out of 16 lossmaking itdc hotels


Processing batches:   0%|          | 0/313 [00:00<?, ?it/s][A
Processing batches:   0%|          | 1/313 [00:01<07:05,  1.36s/it][A
Processing batches:   1%|          | 2/313 [00:02<07:16,  1.40s/it][A
Processing batches:   1%|          | 3/313 [00:04<07:19,  1.42s/it][A
Processing batches:   1%|▏         | 4/313 [00:05<07:06,  1.38s/it][A
Processing batches:   2%|▏         | 5/313 [00:06<07:01,  1.37s/it][A
Processing batches:   2%|▏         | 6/313 [00:08<06:56,  1.36s/it][A
Processing batches:   2%|▏         | 7/313 [00:09<06:50,  1.34s/it][A
Processing batches:   3%|▎         | 8/313 [00:10<06:43,  1.32s/it][A
Processing batches:   3%|▎         | 9/313 [00:12<06:39,  1.31s/it][A
Processing batches:   3%|▎         | 10/313 [00:13<06:38,  1.32s/it][A
Processing batches:   4%|▎         | 11/313 [00:15<07:04,  1.41s/it][A
Processing batches:   4%|▍         | 12/313 [00:16<06:52,  1.37s/it][A
Processing batches:   4%|▍         | 13/313 [00:17<06:51,  1.37s/it][A
Processin


EMOTION DISTRIBUTION ANALYSIS
Total summaries analyzed: 10000
Number of unique emotions: 7

Emotion Distribution:
------------------------------
neutral         |  3,794 ( 37.9%)
anger           |  1,657 ( 16.6%)
sadness         |  1,606 ( 16.1%)
joy             |  1,370 ( 13.7%)
fear            |    674 (  6.7%)
surprise        |    637 (  6.4%)
disgust         |    262 (  2.6%)

Sample summaries for each emotion:

NEUTRAL:
  1. lg unveils device that disinfects handrails of escalators...
  2. gulf of mexico tube worm among longest living animals study...
  3. infosys executive vice president ritika suri quits...

ANGER:
  1. govt asks states to file firs over violence in cow's name...
  2. kerala priest arrested for sexually abusing minor boys...
  3. itc loses â¹50,000 crore in value on hike in cigarette cess...

SADNESS:
  1. man falls into gorge, spends night clinging to tree branch...
  2. gorkhaland supporter killed in 'police firing' in darjeeling...
  3. rajasthan university 


Processing batches:   0%|          | 0/262 [00:00<?, ?it/s][A
Processing batches:   0%|          | 1/262 [00:01<06:22,  1.47s/it][A
Processing batches:   1%|          | 2/262 [00:02<05:54,  1.36s/it][A
Processing batches:   1%|          | 3/262 [00:04<05:41,  1.32s/it][A
Processing batches:   2%|▏         | 4/262 [00:05<05:34,  1.30s/it][A
Processing batches:   2%|▏         | 5/262 [00:06<05:33,  1.30s/it][A
Processing batches:   2%|▏         | 6/262 [00:07<05:31,  1.29s/it][A
Processing batches:   3%|▎         | 7/262 [00:09<05:28,  1.29s/it][A
Processing batches:   3%|▎         | 8/262 [00:10<05:28,  1.29s/it][A
Processing batches:   3%|▎         | 9/262 [00:11<05:36,  1.33s/it][A
Processing batches:   4%|▍         | 10/262 [00:13<05:40,  1.35s/it][A
Processing batches:   4%|▍         | 11/262 [00:14<05:34,  1.33s/it][A
Processing batches:   5%|▍         | 12/262 [00:15<05:27,  1.31s/it][A
Processing batches:   5%|▍         | 13/262 [00:17<05:23,  1.30s/it][A
Processin


EMOTION DISTRIBUTION ANALYSIS
Total summaries analyzed: 8379
Number of unique emotions: 7

Emotion Distribution:
------------------------------
neutral         |  3,340 ( 39.9%)
anger           |  1,536 ( 18.3%)
sadness         |  1,278 ( 15.3%)
joy             |  1,004 ( 12.0%)
fear            |    523 (  6.2%)
surprise        |    473 (  5.6%)
disgust         |    225 (  2.7%)

Sample summaries for each emotion:

NEUTRAL:
  1. onduty emergency vehicles allowed to use multicolour beacon...
  2. can't tell who sought pm's pictures for advertisements pmo...
  3. man dressed as spiderman performs stunts on highway bridge...

ANGER:
  1. people donate over â¹2 crore to martyred jawans' families...
  2. tamannaah upset as scenes were cut from baahubali 2 reports...
  3. kejriwal's worst enemy won't believe in allegations vishwas...

SADNESS:
  1. lynn, narine score 50s as kkr hand rcb their 10th defeat...
  2. my fault to have endorsed akhilesh as up cm mulayam...
  3. 30 candidates rejec




In [None]:
#Mergeing all chunks files
# Read all chunk files
chunks = []
for i in range(1, 11):  # Adjust range based on number of chunks
    chunk_df = pd.read_csv(f'news_emotion_chunk_{i}.csv')
    chunks.append(chunk_df)

# Merge all chunks
final_merge_dataset = pd.concat(chunks, ignore_index=True)
final_merge_dataset.to_csv('news_summary_with_emotion_final.csv', index=False)