In [None]:
import pandas as pd
# Read the news dataset from the working directory and preview the first rows.
csv_path = "fake_or_real_news.csv"
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Cell 2: Inspect the frame, drop the leftover index column, build the NLP field
df.info()
print(df['label'].value_counts())

# Discard the 'Unnamed: 0' column when the loaded frame has one
index_col = 'Unnamed: 0'
if index_col in df.columns:
    df = df.drop(columns=[index_col])

# Title and body are joined with a single space; missing values become ''
title_part = df['title'].fillna('')
body_part = df['text'].fillna('')
df['content'] = title_part + " " + body_part

df[['title', 'text', 'content', 'label']].head()

In [None]:
# Cell 3: Text preprocessing
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources needed for stopword removal and lemmatization
for resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

# Module-level helpers read by clean_text
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalize a raw document into a space-joined string of lemmatized tokens.

    Processing steps, in order: lowercase; replace URLs (``http...`` or
    ``www....``) with a space; replace every character that is not ``a-z``
    or whitespace with a space; split on whitespace; drop tokens found in
    ``stop_words``; lemmatize the remaining tokens with ``lemmatizer``.

    Parameters
    ----------
    text : str
        Raw document text. Any non-string value (e.g. ``None`` or a float
        NaN coming from a pandas column with missing entries) is treated as
        an empty document.

    Returns
    -------
    str
        Cleaned tokens joined by single spaces; ``""`` when nothing survives.
    """
    # Missing values arrive as None / float NaN when this is applied to a
    # column that was not fillna'd; return an empty document instead of
    # failing on .lower().
    if not isinstance(text, str):
        return ""

    text = text.lower()
    # remove urls
    text = re.sub(r'http\S+|www\.\S+', ' ', text)
    # keep only letters
    text = re.sub(r'[^a-z\s]', ' ', text)
    # split() with no arguments already collapses whitespace runs and ignores
    # leading/trailing whitespace, so no separate normalization pass is needed.
    words = text.split()
    # remove stopwords & lemmatize
    return " ".join(lemmatizer.lemmatize(w) for w in words if w not in stop_words)

# Clean every document element-wise, then preview raw vs. cleaned text
df['clean_text'] = df['content'].map(clean_text)
preview_cols = ['content', 'clean_text']
df[preview_cols].head()