In [1]:
!pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)
Using cached pybind11-2.13.6-py3-none-any.whl (243 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (pyproject.toml) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.3-cp310-cp310-linux_x86_64.whl size=4296185 sha256=00b528a5235b9a3bd5d60dd0bde895c1a77d7f4d1e940fbc3a99d602c28c321a
  Stored in directory: /root/.cache/pip/wheels/0d/a2/00/81db54d3e6a8199b829d58

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import fasttext
import re

In [3]:
df = pd.read_json("/content/News_Category_Dataset_v3.json", lines=True)
df.head()


Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209527 entries, 0 to 209526
Data columns (total 6 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   link               209527 non-null  object        
 1   headline           209527 non-null  object        
 2   category           209527 non-null  object        
 3   short_description  209527 non-null  object        
 4   authors            209527 non-null  object        
 5   date               209527 non-null  datetime64[ns]
dtypes: datetime64[ns](1), object(5)
memory usage: 9.6+ MB


In [5]:
print("Number of Missing values:")
df.isnull().sum()

Number of Missing values:


Unnamed: 0,0
link,0
headline,0
category,0
short_description,0
authors,0
date,0


In [6]:
print(f"Number of Duplicates: {df.duplicated().sum()}")

df.drop_duplicates(inplace=True)


Number of Duplicates: 13


In [7]:
df["category"].value_counts()

Unnamed: 0_level_0,count
category,Unnamed: 1_level_1
POLITICS,35601
WELLNESS,17942
ENTERTAINMENT,17362
TRAVEL,9900
STYLE & BEAUTY,9811
PARENTING,8791
HEALTHY LIVING,6694
QUEER VOICES,6347
FOOD & DRINK,6340
BUSINESS,5992


In [8]:
# Step 1: Filter relevant columns (category and headline)
df = df[['category', 'headline']]

In [9]:
# Step 2 (Preprocessing): Filter out non-alphabetical characters and trim extra spaces
def clean_text(text):
    # Remove special characters and digits, retain only letters and spaces
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [10]:
df['headline'] = df['headline'].apply(clean_text)
df['category'] = df['category'].apply(clean_text)

In [11]:

# Step 3: Create a fastText-compatible format
# fastText expects labels in the format __label__<category>
# df['fasttext_format'] = '__label__' + df['category'].astype(str) + ' ' + df['headline']
df['fasttext_format'] = df.apply(lambda x: f"__label__{x['category']} {x['headline']}", axis=1)

# Remove leading and trailing quotes, if any, from each row
df['fasttext_format'] = df['fasttext_format'].apply(lambda x: x.strip('"'))
# df['fasttext_format'] = df['fasttext_format'].str.strip('\"')

In [12]:
# Step 4: Split the data into training and validation sets
train_df, val_df = train_test_split(df['fasttext_format'], test_size=0.2, random_state=42)
df['fasttext_format'].tail()

Unnamed: 0,fasttext_format
209522,__label__TECH RIM CEO Thorsten Heins Significa...
209523,__label__SPORTS Maria Sharapova Stunned By Vic...
209524,__label__SPORTS Giants Over Patriots Jets Over...
209525,__label__SPORTS Aldon Smith Arrested ers Lineb...
209526,__label__SPORTS Dwight Howard Rips Teammates A...


In [13]:
# Save the data to text files (required for fastText)
train_df.to_csv('news_train.txt', index=False, header=False)
val_df.to_csv('news_val.txt', index=False, header=False)

In [14]:
# Train the model on the training data
model = fasttext.train_supervised(
    input='news_train.txt',
    epoch=30,
    lr=1.0,
    wordNgrams=2,
    verbose=2,
    minCount=1,
    loss="softmax",
    bucket=2000000)

In [15]:
# Save the model for future use
model.save_model('news_category_model.bin')

# Evaluate the model on the validation set
results = model.test('news_val.txt')

In [16]:

# Display results
print(f"Number of samples: {results[0]}")
print(f"Precision@1: {results[1]}")
print(f"Recall@1: {results[2]}")

Number of samples: 41903
Precision@1: 0.7081354556952962
Recall@1: 0.7081354556952962


In [21]:
test_headlines = [
    "Apple releases new iPhone with exciting features",
    "Government discusses new healthcare reforms",
    "Football team wins the championship",
    "Stocks hit record highs amid tech rally"
]

# Predict the category of each headline
for headline in test_headlines:
    label, confidence = model.predict(headline)
    print(f'Headline: "{headline}"\nPredicted Category: {label[0].replace("__label__", "")}, Confidence: {confidence[0]:.4f}\n')



Headline: "Apple releases new iPhone with exciting features"
Predicted Category: TECH, Confidence: 1.0000

Headline: "Government discusses new healthcare reforms"
Predicted Category: POLITICS, Confidence: 0.9847

Headline: "Football team wins the championship"
Predicted Category: SPORTS, Confidence: 0.9864

Headline: "Stocks hit record highs amid tech rally"
Predicted Category: BUSINESS, Confidence: 0.9158

