In [1]:
# 1.Import Required Libraries
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
# 2. Download NLTK resources (only needed the first time on a machine)
for resource in ("stopwords", "wordnet"):
    nltk.download(resource)
# Kept as a bare last expression so the cell still displays its return value.
nltk.download("omw-1.4")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tirth\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\tirth\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\tirth\AppData\Roaming\nltk_data...


True

In [3]:
# 3. Load the combined dataset from the processed-data folder
combined_csv = "../data/processed/news_combined.csv"
df = pd.read_csv(combined_csv)
df.head()

Unnamed: 0,title,text,label
0,Ben Stein Calls Out 9th Circuit Court: Committ...,"21st Century Wire says Ben Stein, reputable pr...",0
1,Trump drops Steve Bannon from National Securit...,WASHINGTON (Reuters) - U.S. President Donald T...,1
2,Puerto Rico expects U.S. to lift Jones Act shi...,(Reuters) - Puerto Rico Governor Ricardo Rosse...,1
3,OOPS: Trump Just Accidentally Confirmed He Le...,"On Monday, Donald Trump once again embarrassed...",0
4,Donald Trump heads for Scotland to reopen a go...,"GLASGOW, Scotland (Reuters) - Most U.S. presid...",1


In [4]:
# 4. Initialize NLP tools
# Both objects are module-level so clean_text() can reuse them for every row.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))  # set gives fast membership tests

### 5. Text Cleaning Function (IMPORTANT)

This function performs the following steps, in order:

1. Lowercasing
2. Remove URLs
3. Remove punctuation
4. Remove numbers
5. Remove stopwords
6. Lemmatization

In [5]:
def clean_text(text):
    """Normalize one raw article body into a space-separated string of lemmas.

    Steps, in order: lowercase; remove URL-like substrings (anything from
    ``http`` or ``www`` up to the next whitespace); delete every character
    that is not ``a``-``z`` or whitespace; split on whitespace; discard
    tokens found in ``stop_words``; lemmatize the remaining tokens with
    ``lemmatizer``.

    Args:
        text: Raw text. Any value that is not a ``str`` (for example ``None``
            or a float ``NaN`` standing in for a missing value) is treated as
            having no content.

    Returns:
        The cleaned tokens joined by single spaces, or ``""`` when ``text``
        is not a string or nothing survives cleaning.
    """
    # Without this guard a single non-string cell makes .lower() raise
    # AttributeError and aborts the whole Series.apply over the column.
    if not isinstance(text, str):
        return ""

    text = text.lower()
    text = re.sub(r"http\S+|www\S+", "", text)      # remove URLs
    # Non-letters are deleted, not replaced by a space, so "well-known"
    # becomes "wellknown" and "21st" becomes "st".
    text = re.sub(r"[^a-z\s]", "", text)            # remove punctuation & numbers
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return " ".join(words)

In [6]:
# 6. Apply cleaning to every article body (takes 1–2 minutes)
raw_bodies = df["text"]
df["clean_text"] = raw_bodies.map(clean_text)
df.head()

Unnamed: 0,title,text,label,clean_text
0,Ben Stein Calls Out 9th Circuit Court: Committ...,"21st Century Wire says Ben Stein, reputable pr...",0,st century wire say ben stein reputable profes...
1,Trump drops Steve Bannon from National Securit...,WASHINGTON (Reuters) - U.S. President Donald T...,1,washington reuters u president donald trump re...
2,Puerto Rico expects U.S. to lift Jones Act shi...,(Reuters) - Puerto Rico Governor Ricardo Rosse...,1,reuters puerto rico governor ricardo rossello ...
3,OOPS: Trump Just Accidentally Confirmed He Le...,"On Monday, Donald Trump once again embarrassed...",0,monday donald trump embarrassed country accide...
4,Donald Trump heads for Scotland to reopen a go...,"GLASGOW, Scotland (Reuters) - Most U.S. presid...",1,glasgow scotland reuters u presidential candid...


In [7]:
# 7. Compare the first article before and after cleaning
preview_chars = 500
for heading, column in (("ORIGINAL TEXT:\n", "text"), ("\nCLEANED TEXT:\n", "clean_text")):
    print(heading)
    # Label-based lookup of the row labelled 0, truncated for readability.
    print(df.loc[0, column][:preview_chars])

ORIGINAL TEXT:

21st Century Wire says Ben Stein, reputable professor from, Pepperdine University (also of some Hollywood fame appearing in TV shows and films such as Ferris Bueller s Day Off) made some provocative statements on Judge Jeanine Pirro s show recently. While discussing the halt that was imposed on President Trump s Executive Order on travel. Stein referred to the judgement by the 9th Circuit Court in Washington state as a  Coup d tat against the executive branch and against the constitution.  Stein

CLEANED TEXT:

st century wire say ben stein reputable professor pepperdine university also hollywood fame appearing tv show film ferris bueller day made provocative statement judge jeanine pirro show recently discussing halt imposed president trump executive order travel stein referred judgement th circuit court washington state coup tat executive branch constitution stein went call judge seattle political puppet judiciary political pawn watch interview complete statement note

In [8]:
# 8. Remove rows whose cleaned text is empty (safety step)
has_content = df["clean_text"].str.strip().ne("")
df = df.loc[has_content]
df.shape

(44182, 4)

In [9]:
# 9. Save the cleaned dataset (index omitted from the file)
cleaned_csv = "../data/processed/news_cleaned.csv"
df.to_csv(cleaned_csv, index=False)
print("✅ Cleaned dataset saved to data/processed/news_cleaned.csv")

✅ Cleaned dataset saved to data/processed/news_cleaned.csv


### ✅ STEP 2 OUTPUT (What You Achieved)

- ✔ Cleaned raw text properly
- ✔ Applied standard NLP preprocessing (lowercasing, URL/punctuation/number removal, stopword removal, lemmatization)
- ✔ Created clean_text column
- ✔ Dataset ready for ML models