In [None]:
# Install required packages
!pip install nltk pandas scikit-learn

import os
import re

import nltk
import pandas as pd
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer, PorterStemmer, LancasterStemmer, SnowballStemmer
from nltk import word_tokenize, pos_tag
from IPython.display import FileLink



In [None]:
# Download NLTK Data

# Resources requested from the NLTK downloader before any processing runs.
# NOTE(review): both tagger names are requested, presumably so the notebook
# works across NLTK versions that ship the tagger under different ids - confirm.
required_downloads = [
    'punkt_tab', 'stopwords', 'wordnet',
    'averaged_perceptron_tagger', 'averaged_perceptron_tagger_eng'
]

for item in required_downloads:
    print(f"  Downloading {item}...")
    try:
        # nltk.download() signals most failures (unknown id, network error)
        # by returning False rather than raising, so keep the result.
        succeeded = nltk.download(item, quiet=True)
    except Exception as e:
        print(f"  Warning: Could not download {item}: {e}")
        continue

    if succeeded:
        print(f"  Successfully downloaded {item}")
    else:
        # Previously this case was reported as a success.
        print(f"  Warning: Could not download {item}: downloader reported failure")

  Downloading punkt_tab...
  Successfully downloaded punkt_tab
  Downloading stopwords...
  Successfully downloaded stopwords
  Downloading wordnet...
  Successfully downloaded wordnet
  Downloading averaged_perceptron_tagger...
  Successfully downloaded averaged_perceptron_tagger
  Downloading averaged_perceptron_tagger_eng...
  Successfully downloaded averaged_perceptron_tagger_eng


In [None]:
# Define Data Loading Function

def load_data(filename='news.csv'):
    """Load a CSV file into a DataFrame, reporting problems instead of raising.

    Args:
        filename: Path of the CSV file to read. Defaults to 'news.csv'.

    Returns:
        The loaded DataFrame, or None when the file does not exist or
        could not be read (a message describing the problem is printed).
    """
    try:
        df = pd.read_csv(filename)
    except FileNotFoundError:
        print(f"  Error: '{filename}' file not found")
        # Show what is in the working directory to help spot a misnamed file.
        # Requires the top-level `import os`; without it this handler itself
        # raised NameError and the function never returned None.
        print(f"  Available files: {os.listdir('.')}")
        return None
    except Exception as e:
        # Deliberate best effort: any other read/parse failure is reported
        # and signalled to the caller through the None return value.
        print(f"  Error loading data: {e}")
        return None

    print(f"  Successfully loaded {filename}")
    print(f"  Dataset shape: {df.shape[0]} rows, {df.shape[1]} columns")
    print(f"  Columns: {list(df.columns)}")
    return df

In [None]:
# Define Text Cleaning Function

def clean_text(text):
    """Normalize raw text to lowercase ASCII letters separated by single spaces.

    Non-string or missing input yields an empty string. Otherwise the text is
    lowercased, URLs and e-mail addresses are dropped, every character that is
    not a letter or whitespace is deleted, and whitespace runs are collapsed.
    """
    is_usable = isinstance(text, str) and not pd.isna(text)
    if not is_usable:
        return ""

    normalized = text.lower()

    # URLs go first so that their fragments are not left behind as words
    normalized = re.sub(r'http\S+|www\S+|https\S+', '', normalized, flags=re.MULTILINE)

    # Remaining passes, in order: e-mail addresses, non-letter characters
    # (deleted, not replaced by a space), then whitespace runs.
    substitutions = (
        (r'\S+@\S+', ''),
        (r'[^a-zA-Z\s]', ''),
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        normalized = re.sub(pattern, replacement, normalized)

    return normalized.strip()

In [None]:
# Define Stopwords Function

def get_stopwords():
    """Return the set of English stopwords.

    The NLTK stopword corpus is used when it can be read; if reading it fails
    for any reason a warning is printed and a small built-in set is returned.
    """
    try:
        english_words = stopwords.words('english')
    except Exception as e:
        print(f"  Warning: Could not load stopwords: {e}")
        # Fallback stopwords
        return {
            'the', 'a', 'an', 'and', 'or', 'but', 'in',
            'on', 'at', 'to', 'for', 'of', 'with', 'by',
        }
    return set(english_words)


# Module-level set, built once and read by tokenize_and_remove_stopwords().
stop_words = get_stopwords()
print(f"  Loaded {len(stop_words)} stopwords")

  Loaded 198 stopwords


In [None]:
# Define Tokenization Function

def tokenize_and_remove_stopwords(text):
    """Split text into tokens and drop stopwords and single-character tokens.

    Args:
        text: The string to tokenize. The stopword comparison is
            case-sensitive against the module-level `stop_words` set.

    Returns:
        A list of the remaining tokens; an empty list for empty or
        non-string input.
    """
    if not text or not isinstance(text, str):
        return []

    try:
        tokens = word_tokenize(text)
    except Exception as e:
        # Best effort: fall back to whitespace splitting when the NLTK
        # tokenizer is unavailable or fails.
        print(f"  Warning: Tokenization error: {e}")
        tokens = text.split()

    # The same filter runs on both paths; previously the fallback returned
    # the raw split, so stopwords and single characters leaked through.
    return [word for word in tokens if word not in stop_words and len(word) > 1]


In [None]:
# Define Stemming Functions

def apply_stemming(words, algorithm='snowball'):
    """Stem every word in `words` with the chosen NLTK stemmer.

    `algorithm` may be 'porter' or 'lancaster'; any other value selects the
    English Snowball stemmer. Empty or missing input yields an empty list.
    """
    if not words:
        return []

    # Pick the bound stem() method once, then map it over the input.
    if algorithm == 'lancaster':
        stem = LancasterStemmer().stem
    elif algorithm == 'porter':
        stem = PorterStemmer().stem
    else:
        # default to snowball
        stem = SnowballStemmer('english').stem

    return list(map(stem, words))


In [None]:
# Define POS Tagging Helper

def get_wordnet_pos(tag):
    """Map a POS tag string to a WordNet part-of-speech constant.

    Tags starting with 'J', 'V' or 'R' map to adjective, verb and adverb
    respectively; every other tag (including those starting with 'N') maps
    to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if tag.startswith(prefix):
            return pos
    # Nouns and anything unrecognised share the same result.
    return wordnet.NOUN

In [None]:
# Define Lemmatization Function

def lemmatize_text(text):
    """Lemmatize each token of `text`, using its POS tag to pick the lemma.

    Returns the lemmas joined by single spaces. Empty or non-string input
    yields an empty string; if any step fails a warning is printed and the
    input text is returned unchanged.
    """
    if not (text and isinstance(text, str)):
        return ""

    try:
        lemmatizer = WordNetLemmatizer()
        tagged_tokens = pos_tag(word_tokenize(text))
        return ' '.join(
            lemmatizer.lemmatize(token, get_wordnet_pos(tag))
            for token, tag in tagged_tokens
        )
    except Exception as e:
        print(f"  Warning: Lemmatization error: {e}")
        return text

In [None]:
# Apply preprocessing

def full_preprocess(text):
    """Run one document through cleaning, stopword removal and lemmatization.

    NOTE(review): this helper was called below but never defined in the
    notebook. The step order is reconstructed from the recorded sample output
    (lowercased, stopwords removed, lemmatized, not stemmed) - confirm it
    matches the intended pipeline.

    Args:
        text: Raw document text; non-string values produce an empty string.

    Returns:
        The processed text as a single space-separated string.
    """
    tokens = tokenize_and_remove_stopwords(clean_text(text))
    return lemmatize_text(' '.join(tokens))


# Load the dataset here so the notebook runs top to bottom on a fresh kernel;
# `df` was previously used without ever being assigned.
df = load_data()
if df is None:
    raise RuntimeError("Dataset could not be loaded; see the message above.")

if 'text' in df.columns:
    print("\nProcessing text...")
    df['processed_text'] = df['text'].apply(full_preprocess)
    print("\nPreprocessing complete. Sample results:")
    print(df[['text', 'processed_text']].head())
else:
    # Raise instead of calling exit(): in a notebook exit() shuts the kernel
    # down, while an exception stops the run and leaves the state inspectable.
    raise KeyError("Error: 'text' column not found in the dataframe.")


Processing text...

Preprocessing complete. Sample results:
                                                text  \
0  Daniel Greenfield, a Shillman Journalism Fello...   
1  Google Pinterest Digg Linkedin Reddit Stumbleu...   
2  U.S. Secretary of State John F. Kerry said Mon...   
3  — Kaydee King (@KaydeeKing) November 9, 2016 T...   
4  It's primary day in New York and front-runners...   

                                      processed_text  
0  daniel greenfield shillman journalism fellow f...  
1  google pinterest digg linkedin reddit stumbleu...  
2  u secretary state john kerry say monday stop p...  
3  kaydee king november lesson tonight dem loss t...  
4  primary day new york frontrunners hillary clin...  


In [None]:
# Save processed data to CSV.
# The whole dataframe is written, i.e. every column currently in `df`
# (including 'processed_text' once the preprocessing cell has run);
# index=False keeps the row index out of the file.
processed_filename = 'processed_news.csv'
df.to_csv(processed_filename, index=False)
print(f"\nProcessed data saved to {processed_filename}")


Processed data saved to processed_news.csv


In [None]:
# Machine learning pipeline
# Train and evaluate a TF-IDF + logistic-regression classifier on the
# processed text. Skipped with a warning when the dataframe has no labels.
if 'label' not in df.columns:
    print("\nWarning: 'label' column not found. Skipping model training.")
else:
    from sklearn.model_selection import train_test_split
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import classification_report

    try:
        print("\nTraining machine learning model...")
        X = df['processed_text']
        y = df['label']

        # Split the raw documents BEFORE vectorizing. Fitting TF-IDF on the
        # full corpus lets the test documents shape the vocabulary and IDF
        # weights, which leaks test information into the features.
        X_train_text, X_test_text, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        # Learn vocabulary/IDF from the training portion only, then apply
        # the fitted transform to the held-out documents.
        vectorizer = TfidfVectorizer()
        X_train = vectorizer.fit_transform(X_train_text)
        X_test = vectorizer.transform(X_test_text)

        # Train classifier
        clf = LogisticRegression(max_iter=1000)
        clf.fit(X_train, y_train)

        # Evaluate on the held-out 20%
        y_pred = clf.predict(X_test)
        print("\nClassification Report:")
        print(classification_report(y_test, y_pred))

    except ValueError as e:
        # Raised by scikit-learn for unusable input; the specific causes
        # are not enumerated here.
        print(f"Error during model training: {e}")


Training machine learning model...

Classification Report:
              precision    recall  f1-score   support

        FAKE       0.90      0.94      0.92       628
        REAL       0.94      0.90      0.92       639

    accuracy                           0.92      1267
   macro avg       0.92      0.92      0.92      1267
weighted avg       0.92      0.92      0.92      1267

