In [2]:
import pandas as pd
from transformers import pipeline, AutoModelForSequenceClassification
from tqdm import tqdm
import os


In [3]:
import kagglehub

headlines_path = kagglehub.dataset_download("rmisra/news-category-dataset")

print("Path to dataset files:", headlines_path)

for item in os.listdir(headlines_path):
  print(item)

Path to dataset files: /kaggle/input/news-category-dataset
News_Category_Dataset_v3.json


In [4]:
sites = pd.read_json(os.path.join(headlines_path, "News_Category_Dataset_v3.json"), lines=True)

sites.head()

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


In [5]:
sites['category'].unique()

array(['U.S. NEWS', 'COMEDY', 'PARENTING', 'WORLD NEWS', 'CULTURE & ARTS',
       'TECH', 'SPORTS', 'ENTERTAINMENT', 'POLITICS', 'WEIRD NEWS',
       'ENVIRONMENT', 'EDUCATION', 'CRIME', 'SCIENCE', 'WELLNESS',
       'BUSINESS', 'STYLE & BEAUTY', 'FOOD & DRINK', 'MEDIA',
       'QUEER VOICES', 'HOME & LIVING', 'WOMEN', 'BLACK VOICES', 'TRAVEL',
       'MONEY', 'RELIGION', 'LATINO VOICES', 'IMPACT', 'WEDDINGS',
       'COLLEGE', 'PARENTS', 'ARTS & CULTURE', 'STYLE', 'GREEN', 'TASTE',
       'HEALTHY LIVING', 'THE WORLDPOST', 'GOOD NEWS', 'WORLDPOST',
       'FIFTY', 'ARTS', 'DIVORCE'], dtype=object)

In [6]:
sites['date'] = sites['date'].astype('string').str[:4]

In [7]:
sites_dataset = sites[['category', 'date']]
sites_dataset.head()

Unnamed: 0,category,date
0,U.S. NEWS,2022
1,U.S. NEWS,2022
2,COMEDY,2022
3,PARENTING,2022
4,U.S. NEWS,2022


In [8]:
# json_output = sites_dataset.to_json(orient='records', indent=2)

In [9]:
sites_dataset.to_json("sites_dataset.json", orient='records', indent=2)
from google.colab import files
files.download("sites_dataset.json")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [10]:
new_df=sites[['headline', 'date']]
new_df.head()

Unnamed: 0,headline,date
0,Over 4 Million Americans Roll Up Sleeves For O...,2022
1,"American Airlines Flyer Charged, Banned For Li...",2022
2,23 Of The Funniest Tweets About Cats And Dogs ...,2022
3,The Funniest Tweets From Parents This Week (Se...,2022
4,Woman Who Called Cops On Black Bird-Watcher Lo...,2022


In [11]:
tqdm.pandas()

model_name = "Swoodplays/emotion-classification"
print("Loading model...")
classifier = pipeline("text-classification", model=model_name, return_all_scores=False)

model = AutoModelForSequenceClassification.from_pretrained(model_name)
id2label = model.config.id2label

def classify_headline(text):
    result = classifier(text, truncation=True, max_length=512)[0]
    label_id = int(result["label"].replace("LABEL_", ""))
    return id2label[label_id]

print("Headlines classification...")
new_df["emotion"] = new_df["headline"].progress_apply(classify_headline)

print("Finished:")
print(new_df.head())


Loading model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/883 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Device set to use cuda:0


Headlines classification...


  0%|          | 6/209527 [00:00<6:53:42,  8.44it/s] You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 209527/209527 [17:27<00:00, 199.94it/s]

Finished:
                                            headline  date  emotion
0  Over 4 Million Americans Roll Up Sleeves For O...  2022  LABEL_3
1  American Airlines Flyer Charged, Banned For Li...  2022  LABEL_3
2  23 Of The Funniest Tweets About Cats And Dogs ...  2022  LABEL_3
3  The Funniest Tweets From Parents This Week (Se...  2022  LABEL_1
4  Woman Who Called Cops On Black Bird-Watcher Lo...  2022  LABEL_0



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df["emotion"] = new_df["headline"].progress_apply(classify_headline)


In [12]:
label_mapping = {
    "LABEL_0": "sadness",
    "LABEL_1": "joy",
    "LABEL_2": "love",
    "LABEL_3": "anger",
    "LABEL_4": "fear",
    "LABEL_5": "surprise"
}

new_df["emotion"] = new_df["emotion"].replace(label_mapping)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df["emotion"] = new_df["emotion"].replace(label_mapping)


In [13]:
new_df.head()

Unnamed: 0,headline,date,emotion
0,Over 4 Million Americans Roll Up Sleeves For O...,2022,anger
1,"American Airlines Flyer Charged, Banned For Li...",2022,anger
2,23 Of The Funniest Tweets About Cats And Dogs ...,2022,anger
3,The Funniest Tweets From Parents This Week (Se...,2022,joy
4,Woman Who Called Cops On Black Bird-Watcher Lo...,2022,sadness


In [14]:
new_df.to_csv("classified_headlines.csv", index=True)

from google.colab import files
files.download("classified_headlines.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>