In [1]:
import json
import os

import kagglehub
import pandas as pd

# Download the latest version of the dataset; dataset_download returns the
# local directory that holds the dataset files.
path = kagglehub.dataset_download("rmisra/news-headlines-dataset-for-sarcasm-detection")

print("Path to dataset files:", path)

# The file is parsed one line at a time, each line being a standalone JSON
# object. Blank lines are skipped because json.loads would raise on them.
with open(os.path.join(path, 'Sarcasm_Headlines_Dataset.json'), 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]
df = pd.DataFrame(data)

print(df.head())
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would only add a stray "None" line to the output.
df.info()

  from .autonotebook import tqdm as notebook_tqdm


Downloading from https://www.kaggle.com/api/v1/datasets/download/rmisra/news-headlines-dataset-for-sarcasm-detection?dataset_version_number=2...


100%|██████████| 3.30M/3.30M [00:02<00:00, 1.17MB/s]

Extracting files...





Path to dataset files: C:\Users\Vaishnav M\.cache\kagglehub\datasets\rmisra\news-headlines-dataset-for-sarcasm-detection\versions\2
                                        article_link  \
0  https://www.huffingtonpost.com/entry/versace-b...   
1  https://www.huffingtonpost.com/entry/roseanne-...   
2  https://local.theonion.com/mom-starting-to-fea...   
3  https://politics.theonion.com/boehner-just-wan...   
4  https://www.huffingtonpost.com/entry/jk-rowlin...   

                                            headline  is_sarcastic  
0  former versace store clerk sues over secret 'b...             0  
1  the 'roseanne' revival catches up to our thorn...             0  
2  mom starting to fear son's web series closest ...             1  
3  boehner just wants wife to listen, not come up...             1  
4  j.k. rowling wishes snape happy birthday in th...             0  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26709 entries, 0 to 26708
Data columns (total 3 columns):
 #   Colu

## Define the Cleaning Function

We create a function `clean_headline` that applies the following transformations to each headline, in order, and returns the cleaned text together with the number of steps that actually changed it:
- Expands contractions
- Removes URLs
- Removes non-ASCII characters
- Removes special characters (using a regex pattern)
- Converts the text to lowercase
- Removes punctuation
- Trims extra spaces

In [2]:
import re
import string
import contractions

# Patterns and tables used by clean_headline. They are built once here
# instead of on every call, since the function runs once per headline.
_URL_RE = re.compile(r'http\S+|www\.\S+')
_WHITESPACE_RE = re.compile(r'\s+')
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)
_SPECIAL_CHARS_RE = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2-\U0001F251"  # wide range U+24C2..U+1F251
    "\ufe0f"                 # variation selector-16
    "]+",
    flags=re.UNICODE,
)


def clean_headline(text):
    """
    Clean a news headline and count how many cleaning steps changed it.

    The steps run in this order:
    1. Expand contractions (e.g., "can't" -> "cannot").
    2. Remove URLs (anything starting with "http" or "www.").
    3. Remove non-ASCII characters (they are dropped, not transliterated).
    4. Remove special characters (emoji, symbols) using a regex pattern.
    5. Convert text to lowercase.
    6. Remove punctuation (every character in string.punctuation).
    7. Collapse runs of whitespace to one space and trim both ends.

    Args:
        text: The headline to clean. Non-string values (e.g. NaN or None)
            are passed through untouched.

    Returns:
        A tuple ``(cleaned_text, change_count)``. ``change_count`` is the
        number of steps above whose output differed from their input, so it
        ranges from 0 to 7. For non-string input the tuple is ``(text, 0)``.
    """
    if not isinstance(text, str):
        return text, 0

    steps = (
        contractions.fix,
        lambda s: _URL_RE.sub('', s),
        # Encoding with errors='ignore' drops every non-ASCII character.
        lambda s: s.encode('ascii', 'ignore').decode('ascii'),
        # Every character in this pattern is non-ASCII, so after the previous
        # step it can no longer match. The step is kept in place so the
        # documented pipeline and the change counts stay exactly as before.
        lambda s: _SPECIAL_CHARS_RE.sub('', s),
        str.lower,
        lambda s: s.translate(_PUNCT_TABLE),
        lambda s: _WHITESPACE_RE.sub(' ', s).strip(),
    )

    change_count = 0
    for step in steps:
        updated = step(text)
        if updated != text:
            change_count += 1
        text = updated

    return text, change_count


In [3]:
# Apply the cleaning function to the 'headline' column.
# clean_headline returns a (cleaned_text, change_count) pair; collecting the
# pairs in a plain list avoids constructing a pandas Series for every row.
cleaned_pairs = [clean_headline(headline) for headline in df['headline']]
df['headline_cleaned'] = [cleaned for cleaned, _ in cleaned_pairs]
df['num_changes'] = [count for _, count in cleaned_pairs]

# Display the original and cleaned headlines for comparison
print(df[['headline', 'headline_cleaned', 'num_changes']].head())


                                            headline  \
0  former versace store clerk sues over secret 'b...   
1  the 'roseanne' revival catches up to our thorn...   
2  mom starting to fear son's web series closest ...   
3  boehner just wants wife to listen, not come up...   
4  j.k. rowling wishes snape happy birthday in th...   

                                    headline_cleaned  num_changes  
0  former versace store clerk sues over secret bl...            1  
1  the roseanne revival catches up to our thorny ...            1  
2  mom starting to fear sons web series closest t...            1  
3  boehner just wants wife to listen not come up ...            1  
4  jk rowling wishes snape happy birthday in the ...            1  


In [None]:
# How many headlines were modified by 0, 1, 2, ... cleaning steps
change_counts = df['num_changes'].value_counts()
change_counts

num_changes
1    13403
0    11042
2     2155
3      101
4        8
Name: count, dtype: int64

In [5]:
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing the cleaned dataset.
output_path = 'intermediate_files/sarcasm_headlines_cleaned.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)