In [None]:
# Step 1: Install Required Packages

!pip install nltk pandas scikit-learn



In [None]:
# Step 2: Import Libraries

import nltk
import pandas as pd
import re
import os
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer, PorterStemmer, LancasterStemmer, SnowballStemmer
from nltk import word_tokenize, pos_tag

In [None]:
# Step 3: Download NLTK Data

# NLTK resource identifiers fetched before any preprocessing runs.
required_downloads = [
    'punkt_tab', 'stopwords', 'wordnet',
    'averaged_perceptron_tagger', 'averaged_perceptron_tagger_eng'
]

for item in required_downloads:
    print(f"  Downloading {item}...")
    try:
        # nltk.download() reports most failures (unknown id, network error)
        # by returning False instead of raising, so the result must be checked.
        succeeded = nltk.download(item, quiet=True)
    except Exception as e:
        print(f"  Warning: Could not download {item}: {e}")
        continue

    if succeeded:
        print(f"  Successfully downloaded {item}")
    else:
        print(f"  Warning: Could not download {item}")

  Downloading punkt_tab...
  Successfully downloaded punkt_tab
  Downloading stopwords...
  Successfully downloaded stopwords
  Downloading wordnet...
  Successfully downloaded wordnet
  Downloading averaged_perceptron_tagger...
  Successfully downloaded averaged_perceptron_tagger
  Downloading averaged_perceptron_tagger_eng...
  Successfully downloaded averaged_perceptron_tagger_eng


In [None]:
# Step 4: Define Data Loading Function

def load_data(filename='news.csv'):
    """Read a CSV file into a DataFrame, reporting problems instead of raising.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        The loaded DataFrame, or None when the file is missing or cannot
        be read (a message describing the problem is printed).
    """
    try:
        frame = pd.read_csv(filename)
        n_rows, n_cols = frame.shape[0], frame.shape[1]
        column_names = list(frame.columns)
        print(f"  Successfully loaded {filename}")
        print(f"  Dataset shape: {n_rows} rows, {n_cols} columns")
        print(f"  Columns: {column_names}")
    except FileNotFoundError:
        # Show the working directory contents to help spot a misnamed file.
        print(f"  Error: '{filename}' file not found")
        print(f"  Available files: {os.listdir('.')}")
        frame = None
    except Exception as e:
        print(f"  Error loading data: {e}")
        frame = None
    return frame

In [None]:
# Step 5: Define Text Cleaning Function

def clean_text(text):
    """Normalize raw text to lowercase letters separated by single spaces.

    Non-string input yields an empty string. For strings, the text is
    lowercased, then URLs, e-mail-like tokens and every character that is
    not a letter or whitespace are deleted, and whitespace is collapsed.

    Args:
        text: Value to clean; anything that is not a str is treated as empty.

    Returns:
        The cleaned string (possibly empty).
    """
    if not isinstance(text, str) or pd.isna(text):
        return ""

    # (pattern, replacement, flags), applied in this order.
    substitutions = (
        (r'http\S+|www\S+|https\S+', '', re.MULTILINE),  # URLs
        (r'\S+@\S+', '', 0),                             # e-mail addresses
        (r'[^a-zA-Z\s]', '', 0),                         # digits, punctuation, symbols
        (r'\s+', ' ', 0),                                # runs of whitespace
    )

    result = text.lower()
    for pattern, replacement, flags in substitutions:
        result = re.sub(pattern, replacement, result, flags=flags)

    return result.strip()

In [None]:
# Step 6: Define Stopwords Function

def get_stopwords():
    """Return the English stopword set, or a small built-in set on failure.

    Returns:
        A set of stopwords from the NLTK corpus; if loading raises, a
        warning is printed and a short hard-coded set is returned instead.
    """
    fallback_words = ('the', 'a', 'an', 'and', 'or', 'but', 'in', 'on',
                      'at', 'to', 'for', 'of', 'with', 'by')
    try:
        loaded = set(stopwords.words('english'))
    except Exception as e:
        print(f"  Warning: Could not load stopwords: {e}")
        # Fallback stopwords
        loaded = set(fallback_words)
    return loaded


# Module-level stopword set used by the tokenization step.
stop_words = get_stopwords()
print(f"  Loaded {len(stop_words)} stopwords")

  Loaded 198 stopwords


In [None]:
# Step 7: Define Tokenization Function

def tokenize_and_remove_stopwords(text):
    """Tokenize text and drop stopwords and single-character tokens.

    If the NLTK tokenizer fails, a warning is printed and the text is split
    on whitespace instead. The same filtering is applied on both paths, so
    the result is stopword-free either way.

    Args:
        text: String to tokenize. Empty or non-string input yields [].

    Returns:
        List of tokens that are not in the module-level `stop_words` set
        and are longer than one character.
    """
    if not text or not isinstance(text, str):
        return []

    try:
        tokens = word_tokenize(text)
    except Exception as e:
        print(f"  Warning: Tokenization error: {e}")
        # Degrade to whitespace splitting, but still filter below.
        tokens = text.split()

    # Filter out stopwords and single characters
    return [word for word in tokens if word not in stop_words and len(word) > 1]


In [None]:
# Step 8: Define Stemming Functions

def apply_stemming(words, algorithm='snowball'):
    """Stem every word in a sequence with the chosen NLTK stemmer.

    Args:
        words: Sequence of word strings; a falsy value yields [].
        algorithm: 'porter' or 'lancaster'; any other value selects the
            English Snowball stemmer.

    Returns:
        List of stemmed words in the original order.
    """
    if words:
        stemmer = (
            PorterStemmer() if algorithm == 'porter'
            else LancasterStemmer() if algorithm == 'lancaster'
            else SnowballStemmer('english')  # default to snowball
        )
        return list(map(stemmer.stem, words))

    return []


In [None]:
# Step 9: Define POS Tagging Helper

def get_wordnet_pos(tag):
    """Map a POS tag to the matching WordNet part-of-speech constant.

    Args:
        tag: POS tag string; only its first letter is inspected.

    Returns:
        wordnet.ADJ for tags starting with 'J', wordnet.VERB for 'V',
        wordnet.ADV for 'R', and wordnet.NOUN for everything else
        (including tags starting with 'N').
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if tag.startswith(prefix):
            return pos

    # 'N...' tags and any unrecognised tag are both treated as nouns.
    return wordnet.NOUN

In [None]:
# Step 10: Define Lemmatization Function

def lemmatize_text(text):
    """Lemmatize each token of a string using its POS tag.

    Args:
        text: String to lemmatize. Empty or non-string input yields "".

    Returns:
        The lemmatized tokens joined by single spaces. If tokenizing,
        tagging or lemmatizing raises, a warning is printed and the input
        text is returned unchanged.
    """
    if not (text and isinstance(text, str)):
        return ""

    try:
        wnl = WordNetLemmatizer()
        tagged_tokens = pos_tag(word_tokenize(text))
        return ' '.join(
            wnl.lemmatize(token, get_wordnet_pos(pos))
            for token, pos in tagged_tokens
        )
    except Exception as e:
        print(f"  Warning: Lemmatization error: {e}")
        return text

In [None]:
# Step 11: Define Complete Preprocessing Pipeline

def full_preprocess(text, method='lemmatize'):
    """Run cleaning, stopword removal and normalization on one text.

    Args:
        text: Raw text; non-string input yields "".
        method: 'lemmatize' to lemmatize, 'stem' to stem with the default
            stemmer; any other value skips the normalization step.

    Returns:
        The processed text as a single space-separated string.
    """
    if not isinstance(text, str) or pd.isna(text):
        return ""

    # Step 11a/11b: clean, then tokenize and drop stopwords
    cleaned = clean_text(text)
    filtered_text = ' '.join(tokenize_and_remove_stopwords(cleaned))

    # Step 11c: normalize with the requested method
    if method == 'lemmatize':
        return lemmatize_text(filtered_text)
    if method == 'stem':
        stems = apply_stemming(word_tokenize(filtered_text))
        return ' '.join(stems)

    return filtered_text

In [None]:
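# Step 11 (repeated cell): Define Complete Preprocessing Pipeline
# NOTE(review): this cell repeats the previous Step 11 cell and the notebook has no Step 12; confirm whether a different cell was intended here.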

def full_preprocess(text, method='lemmatize', stem_algorithm='snowball'):
    """Complete text preprocessing pipeline.

    Cleans the text, tokenizes it and removes stopwords, then normalizes the
    remaining tokens by lemmatization or stemming.

    NOTE(review): this definition replaces the identically named function
    from the preceding cell once both cells have run.

    Args:
        text: Raw text; non-string input (None, NaN, numbers) yields "".
        method: 'lemmatize' to lemmatize, 'stem' to stem; any other value
            returns the cleaned, stopword-free text without normalization.
        stem_algorithm: Stemmer name forwarded to apply_stemming when
            method is 'stem' ('porter', 'lancaster' or 'snowball').

    Returns:
        The processed text as a single space-separated string.
    """
    # A str is never NaN, so the isinstance check alone rejects missing values.
    if not isinstance(text, str):
        return ""

    # Step 11a: Clean text
    cleaned = clean_text(text)

    # Step 11b: Tokenize and remove stopwords
    tokens = tokenize_and_remove_stopwords(cleaned)

    # Step 11c: Apply stemming or lemmatization
    if method == 'stem':
        # Stem the already-filtered tokens directly; joining and tokenizing
        # them a second time would only repeat work.
        return ' '.join(apply_stemming(tokens, algorithm=stem_algorithm))

    filtered_text = ' '.join(tokens)
    if method == 'lemmatize':
        return lemmatize_text(filtered_text)

    return filtered_text

In [None]:
# Step 13: Define Machine Learning Function

def train_model(df, text_col, label_col):
    """Train and evaluate a TF-IDF + logistic regression text classifier.

    The data is split into train and test sets first, and the TF-IDF
    vectorizer is fitted on the training texts only. The held-out texts
    therefore do not influence the vocabulary or the IDF weights, and the
    reported accuracy is not inflated by leakage.

    Args:
        df: DataFrame holding the texts and labels.
        text_col: Name of the column with the (preprocessed) text.
        label_col: Name of the column with the class labels.

    Returns:
        (classifier, vectorizer) on success, or (None, None) if scikit-learn
        is missing or any step fails (a message is printed in both cases).
    """
    try:
        # Imported here so a missing scikit-learn install is reported
        # by the ImportError handler below instead of breaking the notebook.
        from sklearn.model_selection import train_test_split
        from sklearn.feature_extraction.text import TfidfVectorizer
        from sklearn.linear_model import LogisticRegression
        from sklearn.metrics import classification_report, accuracy_score

        # Rows without a label cannot be used for supervised training, and
        # missing texts are treated as empty documents.
        data = df[[text_col, label_col]].dropna(subset=[label_col])
        texts = data[text_col].fillna("").astype(str)
        y = data[label_col]

        print(f"  Usable rows: {len(data)}")
        print(f"  Unique labels: {y.nunique()}")

        print("  Step 13a: Splitting data...")
        texts_train, texts_test, y_train, y_test = train_test_split(
            texts, y, test_size=0.2, random_state=42, stratify=y
        )

        print("  Step 13b: Vectorizing text...")
        vectorizer = TfidfVectorizer(max_features=5000, min_df=2, max_df=0.8)
        X_train = vectorizer.fit_transform(texts_train)  # fit on training data only
        X_test = vectorizer.transform(texts_test)

        print(f"  Feature matrix shape: {X_train.shape}")

        print("  Step 13c: Training model...")
        clf = LogisticRegression(max_iter=1000, random_state=42)
        clf.fit(X_train, y_train)

        print("  Step 13d: Evaluating model...")
        y_pred = clf.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)

        print(f"  Model Accuracy: {accuracy:.3f}")
        print("  Classification Report:")
        print(classification_report(y_test, y_pred))

        return clf, vectorizer

    except ImportError:
        print("  Error: Scikit-learn not available. Install with: pip install scikit-learn")
        return None, None
    except Exception as e:
        print(f"  Error during model training: {e}")
        return None, None

In [None]:
# Step 14: Main Execution Function

def _find_column(df, candidates):
    """Return the first name in `candidates` that is a column of `df`, else None."""
    return next((name for name in candidates if name in df.columns), None)


def _print_columns(df):
    """Print the columns of `df` as a numbered list."""
    for i, col in enumerate(df.columns, 1):
        print(f"    {i}. {col}")


# NOTE(review): main() calls save_processed_data, but no definition of it is
# visible in the cells before this one. It is provided here so the pipeline
# works on a fresh "Run All"; a definition in a later cell would override it.
def save_processed_data(df, filename='processed_news.csv'):
    """Write `df` to a CSV file.

    Args:
        df: DataFrame to save.
        filename: Destination path.

    Returns:
        True on success, False if writing failed (the error is printed).
    """
    try:
        df.to_csv(filename, index=False)
    except Exception as e:
        print(f"  Error saving data: {e}")
        return False
    print(f"  Saved {len(df):,} rows to '{filename}'")
    return True


def main():
    """Run the pipeline: load, preprocess, report statistics, save, train.

    Every stage prints its progress. The function returns early (None) when
    the data cannot be loaded, is empty, has no recognizable text column, or
    cannot be saved. Model training runs only if a label column is found.
    """
    print("Starting News Text Preprocessing Pipeline")
    print("=" * 50)

    # Step 14a: Load data
    print("\nStep 14a: Loading data...")
    df = load_data()
    if df is None:
        print("  Cannot proceed without data. Exiting.")
        return
    if df.empty:
        # Without rows there is no sample to show and nothing to process.
        print("  Dataset contains no rows. Exiting.")
        return

    # Step 14b: Find text column
    print("\nStep 14b: Identifying text column...")
    possible_text_cols = ['text', 'content', 'article', 'news_text', 'body']
    text_column = _find_column(df, possible_text_cols)

    if text_column is None:
        print("  No text column found. Available columns:")
        _print_columns(df)
        return
    print(f"  Found text column: {text_column}")

    # Step 14c: Show sample data
    print(f"\nStep 14c: Sample original text from '{text_column}':")
    sample_text = str(df[text_column].iloc[0])
    print(f"  Length: {len(sample_text)} characters")
    print(f"  Preview: {sample_text[:200]}...")

    # Step 14d: Apply preprocessing
    print(f"\nStep 14d: Processing {len(df):,} texts...")
    print("  This may take a few minutes...")

    df['processed_text'] = df[text_column].apply(lambda x: full_preprocess(x, method='lemmatize'))

    # Step 14e: Show results
    print("\nStep 14e: Sample processed text:")
    sample_processed = df['processed_text'].iloc[0]
    print(f"  Length: {len(sample_processed)} characters")
    print(f"  Preview: {sample_processed[:200]}...")

    # Step 14f: Calculate statistics
    print("\nStep 14f: Preprocessing statistics:")
    avg_original = df[text_column].astype(str).str.len().mean()
    avg_processed = df['processed_text'].str.len().mean()

    print(f"  Average original length: {avg_original:.1f} characters")
    print(f"  Average processed length: {avg_processed:.1f} characters")
    if avg_original > 0:
        print(f"  Reduction: {(1 - avg_processed / avg_original) * 100:.1f}%")
    else:
        # Avoid dividing by zero when every original text is empty.
        print("  Reduction: n/a (original texts are empty)")

    # Step 14g: Save processed data
    print("\nStep 14g: Saving processed data...")
    if not save_processed_data(df):
        print("  Failed to save processed data.")
        return

    # Step 14h: Check for label column and train model
    print("\nStep 14h: Checking for label column...")
    possible_label_cols = ['label', 'category', 'class', 'target', 'sentiment']
    label_column = _find_column(df, possible_label_cols)

    if label_column:
        print(f"  Found label column: {label_column}")
        print("\nStep 14i: Training machine learning model...")
        model, vectorizer = train_model(df, 'processed_text', label_column)
        if model is not None:
            print("  Model training completed successfully.")
    else:
        print("  No label column found. Skipping ML training.")
        print("  Available columns:")
        _print_columns(df)

    print("\nPipeline completed successfully!")
    print("Your processed data is now available in 'processed_news.csv'")
