In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import shuffle
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK resources backing the imported `stopwords` corpus and `WordNetLemmatizer`
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

# Seed NumPy's global random state so NumPy-driven randomness is repeatable
np.random.seed(42)

ModuleNotFoundError: No module named 'pandas'

In [6]:
import sys

# Report the exact interpreter build backing this kernel (handy when diagnosing environment issues)
interpreter_build = sys.version
print(interpreter_build)

3.10.11 | packaged by Anaconda, Inc. | (main, May 16 2023, 00:55:32) [MSC v.1916 64 bit (AMD64)]


In [None]:
# Read both source files and tag every row with its class: 0 = real (true), 1 = fake
true_df = pd.read_csv('dataset/True.csv').assign(label=0)
fake_df = pd.read_csv('dataset/Fake.csv').assign(label=1)

# Stack the two frames row-wise, then shuffle with a fixed seed so real and fake
# rows are interleaved reproducibly; the index is rebuilt afterwards
stacked_df = pd.concat([true_df, fake_df], axis=0)
df = shuffle(stacked_df, random_state=42).reset_index(drop=True)

In [None]:
# Boolean masks for the two classes, shared by the counts and the per-class histograms
is_real = df['label'] == 0
is_fake = df['label'] == 1

# Headline sample counts
print(f"Total number of samples: {len(df)}")
print(f"Number of real news: {len(df[is_real])}")
print(f"Number of fake news: {len(df[is_fake])}\n")

# Null count for every column
print("Missing values per column:")
print(df.isnull().sum())

# Subject counts, split by label
plt.figure(figsize=(10,6))
sns.countplot(data=df, x='subject', hue='label')
plt.title('Distribution of Subjects by Label')
plt.xticks(rotation=45)
plt.show()

# Per-class histogram of publication year, only when a date column is present
if 'date' in df.columns:
    # errors='coerce' turns unparseable dates into NaT instead of raising
    df['date'] = pd.to_datetime(df['date'], errors='coerce')
    plt.figure(figsize=(10,6))
    for class_mask, class_name in ((is_real, 'Real'), (is_fake, 'Fake')):
        df.loc[class_mask, 'date'].dt.year.hist(alpha=0.5, bins=30, label=class_name)
    plt.title('Distribution of News by Year')
    plt.legend()
    plt.show()

In [None]:
# Initialize lemmatizer and stopwords
# Both objects are built once here and read as globals by preprocess_text.
lemmatizer = WordNetLemmatizer()
# Held as a set; preprocess_text tests each token with `word not in stop_words`.
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """
    Clean and preprocess text data.

    Steps: lower-case the input, delete every character that is neither an
    ASCII letter nor whitespace, split on whitespace, drop tokens found in the
    module-level ``stop_words`` collection, lemmatize the remaining tokens with
    the module-level ``lemmatizer``, and join them with single spaces.

    Args:
        text: The raw text. Any non-``str`` value (e.g. NaN from pandas) is
            treated as missing.

    Returns:
        str: The cleaned text, or ``""`` when ``text`` is not a string or
        nothing survives the cleaning.
    """
    if not isinstance(text, str):
        return ""

    # Convert to lowercase
    text = text.lower()

    # Remove special characters and numbers.
    # No extra arguments on purpose: the 4th positional parameter of re.sub is
    # `count`, so the former `re.I|re.A` (== 258) was never applied as flags and
    # instead capped the cleaning at the first 258 matches. The text is already
    # lower-cased, so case-insensitivity is not needed.
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenize on whitespace
    tokens = text.split()

    # Remove stopwords and lemmatize what is left
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    # Join tokens back into a single string
    return ' '.join(tokens)

# Combine title and text for better context (optional).
# Missing values are replaced with '' first: NaN + str evaluates to NaN, for which
# preprocess_text returns "", so a row with only a title or only a body would
# otherwise lose the content it does have.
df['combined_text'] = df['title'].fillna('') + ' ' + df['text'].fillna('')

# Apply preprocessing to the combined text
df['clean_text'] = df['combined_text'].apply(preprocess_text)

# Check the results. The "original" sample is read from combined_text so both
# previews describe the same input and the slice always operates on a string.
print("Original text example:\n", df['combined_text'].iloc[0][:200], "...")
print("\nCleaned text example:\n", df['clean_text'].iloc[0][:200], "...")

In [None]:
# Length of each cleaned text, computed once and used for both the report and the filter
clean_lengths = df['clean_text'].str.len()

# Report how many rows ended up with no text after cleaning
empty_text = df[clean_lengths == 0]
print(f"Number of empty texts after cleaning: {len(empty_text)}")

# Keep only rows that still have some text
df = df[clean_lengths > 0]

# Flag repeated cleaned texts (the first occurrence of each is not flagged)
duplicates = df.duplicated(subset=['clean_text'])
print(f"Number of duplicate texts: {duplicates.sum()}")

# Drop the flagged rows and rebuild a contiguous index
df = df[~duplicates].reset_index(drop=True)

In [None]:
# Keep just the cleaned text and its label, as an independent copy of those columns
final_df = df.loc[:, ['clean_text', 'label']].copy()

# Relative frequency of each class after cleaning
print("\nFinal class distribution:")
class_shares = final_df['label'].value_counts(normalize=True)
print(class_shares)

# Write the cleaned dataset to disk without the index column (optional)
final_df.to_csv('dataset/cleaned_news.csv', index=False)

print("\nData preparation complete!")