In [1]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting click (from nltk)
  Downloading click-8.1.8-py3-none-any.whl.metadata (2.3 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2024.11.6-cp311-cp311-macosx_11_0_arm64.whl.metadata (40 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading regex-2024.11.6-cp311-cp311-macosx_11_0_arm64.whl (284 kB)
Downloading click-8.1.8-py3-none-any.whl (98 kB)
Installing collected packages: regex, click, nltk
Successfully installed click-8.1.8 nltk-3.9.1 regex-2024.11.6

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m


In [18]:
import logging
import cloudpickle
import string
import numpy as np
import pandas as pd
import sys
from pathlib import Path
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter
from typing import Tuple, Set, Optional

# PLEASE SET THIS TO THE ABSOLUTE PATH OF THE PROJECT ROOT, OTHERWISE THE
# PICKLE WILL NOT LOAD (presumably because the pickled pipeline refers to
# modules that live under this directory -- confirm).
project_root = "<ABSOLUTE PATH TO PROJECT ROOT>/comp34812"

# Register the project root on the import path once; re-running the cell
# does not add a duplicate entry.
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

In [13]:
class BaseConfig:
    """Static configuration for this notebook: file locations and augmentation settings.

    The path attributes are placeholder strings and have to be replaced with
    real locations before any cell that touches the filesystem is run; the
    trailing comment on each line gives the expected value.

    ``AUGMENTATION_CONFIG`` is a nested dict with two top-level entries,
    ``"0"`` and ``"1"``. Nothing in the visible cells reads it.
    """
    # --- Filesystem locations (placeholders; edit before running) ---
    DATA_DIR = Path("Path to data directory") # comp34812/data
    TRAIN_FILE = Path("Path to train file") # train.csv
    DEV_FILE = Path("Path to dev file") # dev.csv
    TEST_FILE = Path("Path to test file") # test.csv
    AUG_TRAIN_FILE = Path("Path to augmented train file") # train_augmented.csv
    SAVE_DIR = Path("Path to save directory") # comp34812/data/results
    CACHE_DIR = Path("Path to cache directory") # comp34812/cache

    # Augmentation config
    # NOTE(review): the keys "0" and "1" are presumably the class labels the
    # settings apply to -- confirm against the augmentation code.
    # The two entries differ only in "add" (0.1 vs 1.0),
    # translate "percentage" (1.0 vs 0.8) and x_or_y "percentage" (0.08 vs 0.02).
    AUGMENTATION_CONFIG = {
        "0": {
            "replace": 0.0,
            "add": 0.1, # 10%
            # Translation settings; "src" and the "intermediates" keys look like
            # language codes (presumably back-translation -- TODO confirm).
            "translate":{
                "percentage": 1.0,
                # Weights over which field(s) are affected; values sum to 1.0.
                "split": {
                    "Claim": 0.15,
                    "Evidence": 0.7,
                    "Both": 0.15
                },
                "src": "en",
                # Weights per intermediate language; values sum to 1.0.
                "intermediates": {
                    "fr": 0.5,
                    "de": 0.4,
                    "ja": 0.1
                }
            },
            # Synonym-based augmentation settings (consumer not visible here).
            "synonym_replacement": {
                "percentage": 0.7,
                "replacement_fraction": 0.3,
                "min_similarity": 0.85,
                "min_word_length": 4,
                "word_frequency_threshold": 3,
                "synonym_selection_strategy": "random",
                "enable_random_synonym_insertion": True,
                "synonym_insertion_probability": 0.03,
                "enable_random_word_insertion": True,
                "word_insertion_probability": 0.01,
                "enable_random_deletion": True,
                "deletion_probability": 0.01,
            },
            # "x_or_y" augmentation settings (semantics defined elsewhere).
            "x_or_y": {
                "percentage": 0.08,
                "max_choices": 4,
                "num_words_to_augment": {
                    "Claim": 1,
                    "Evidence": 2
                },
                # Weights over which field(s) are affected; values sum to 1.0.
                "split": {
                    "Claim": 0.90,
                    "Evidence": 0.05,
                    "Both": 0.05
                }
            }
        },
        "1": {
            "replace": 0.0,
            "add": 1.0,
            # Same structure as the "0" entry above.
            "translate":{
                "percentage": 0.8,
                "split": {
                    "Claim": 0.15,
                    "Evidence": 0.7,
                    "Both": 0.15
                },
                "src": "en",
                "intermediates": {
                    "fr": 0.5,
                    "de": 0.4,
                    "ja": 0.1
                }
            },
            "synonym_replacement": {
                "percentage": 0.7,
                "replacement_fraction": 0.3,
                "min_similarity": 0.85,
                "min_word_length": 4,
                "word_frequency_threshold": 3,
                "synonym_selection_strategy": "random",
                "enable_random_synonym_insertion": True,
                "synonym_insertion_probability": 0.03,
                "enable_random_word_insertion": True,
                "word_insertion_probability": 0.01,
                "enable_random_deletion": True,
                "deletion_probability": 0.01,
            },
            "x_or_y": {
                "percentage": 0.02,
                "max_choices": 4,
                "num_words_to_augment": {
                    "Claim": 1,
                    "Evidence": 2
                },
                "split": {
                    "Claim": 0.90,
                    "Evidence": 0.05,
                    "Both": 0.05
                }
            }
        }
    }
    
def get_config() -> BaseConfig:
    """Create and return a fresh ``BaseConfig`` instance."""
    cfg = BaseConfig()
    return cfg


# Notebook-wide configuration object read by the cells below.
config = get_config()

In [14]:
# Logger for this notebook/module. Only the logger object is obtained here;
# no handler or level is configured in this cell.
logger = logging.getLogger(__name__)

# Hyper-parameters of the SVM pipeline. In the visible cells only
# "vocab_size" is read (by predict_with_saved_model).
params = dict(
    vocab_size=12000,
    n_gram_range=(1, 2),
    embedding_dim=300,
    pca_components=540,
    C=1.96,
    tfidf_weighting=True,
    min_df=1,
    max_df=0.95,
    kernel='rbf',
    gamma='scale',
)

In [21]:
def prepare_svm_data(data: pd.DataFrame, 
                    remove_stopwords: bool = True, 
                    lemmatize: bool = True, 
                    min_freq: int = 2, 
                    vocab_size: Optional[int] = None,
                    vocab: Optional[Set[str]] = None) -> Tuple[pd.DataFrame, np.ndarray, Set[str]]:
    """
    Prepare text data for SVM training by cleaning, normalizing and vocabulary management.

    Each 'Claim' and 'Evidence' cell is lowercased, stripped of ASCII
    punctuation and whitespace-normalized; stopwords are optionally removed
    and tokens optionally lemmatized. Tokens outside the vocabulary are
    replaced with ``<UNK>`` and the two fields are joined as
    ``"Claim: ... [SEP] Evidence: ..."``.

    Args:
        data: DataFrame containing 'Claim' and 'Evidence' columns. It is not
            modified; a copy carrying the extra 'text' column is returned.
            Missing cells (None/NaN) are treated as empty strings.
        remove_stopwords: Whether to remove common stopwords
        lemmatize: Whether to apply lemmatization
        min_freq: Minimum frequency for words to be included in vocabulary
        vocab_size: Maximum vocabulary size (most frequent words kept)
        vocab: Optional pre-built vocabulary. When given it is used as-is
            (``min_freq`` and ``vocab_size`` are ignored) instead of deriving
            one from ``data``; this allows a vocabulary built on one data set
            to be reused on another.

    Returns:
        Tuple containing:
            - Copy of ``data`` with added 'text' column
            - NumPy array of labels (the 'label' column, or an object array
              of ``None`` when that column is absent)
            - Set of vocabulary words

    Raises:
        KeyError: if ``data`` lacks a 'Claim' or 'Evidence' column.
    """
    log = logging.getLogger(__name__)
    translator = str.maketrans('', '', string.punctuation)

    # Build the stopword set once instead of once per text. If the NLTK
    # resource cannot be loaded the step is skipped (best effort, as before),
    # but a warning is emitted so the degradation is visible.
    custom_stopwords: Set[str] = set()
    if remove_stopwords:
        try:
            # Keep important discourse markers and modal verbs
            keep_words = {
                'because', 'since', 'therefore', 'hence', 'thus', 'although',
                'however', 'but', 'not', 'should', 'must', 'might', 'may',
                'could', 'would', 'against', 'between', 'before', 'after'
            }
            custom_stopwords = set(stopwords.words("english")) - keep_words
        except Exception as exc:
            log.warning(
                "Stopword removal skipped: could not load NLTK stopwords (%s). "
                "If the corpus is missing, run nltk.download('stopwords').", exc
            )

    # Same treatment for the lemmatizer. One token is lemmatized up front so
    # that a missing WordNet corpus is detected here rather than per text.
    lemmatizer = None
    if lemmatize:
        try:
            candidate = WordNetLemmatizer()
            candidate.lemmatize("cats")
            lemmatizer = candidate
        except Exception as exc:
            log.warning(
                "Lemmatization skipped: WordNet lemmatizer unavailable (%s). "
                "If the corpus is missing, run nltk.download('wordnet').", exc
            )

    def clean_text(text) -> str:
        """
        Clean and normalize text by lowercasing, removing punctuation,
        and optionally removing stopwords and lemmatizing.
        """
        # Empty CSV cells arrive as NaN/None; treat them as empty text.
        if not isinstance(text, str):
            text = "" if pd.isna(text) else str(text)

        # Lowercase, strip punctuation, normalize whitespace
        text = " ".join(text.lower().translate(translator).split())

        if custom_stopwords:
            text = " ".join(word for word in text.split()
                            if word not in custom_stopwords)

        if lemmatizer is not None:
            text = " ".join(lemmatizer.lemmatize(word) for word in text.split())
        return text

    # Clean each column once; the result feeds both the vocabulary and the output.
    claims_clean = data['Claim'].apply(clean_text)
    evidence_clean = data['Evidence'].apply(clean_text)

    if vocab is None:
        # Build vocabulary from the supplied data
        word_counts = Counter(
            word
            for text in pd.concat([claims_clean, evidence_clean])
            for word in text.split()
        )

        # Filter by minimum frequency; sort by descending count, then
        # alphabetically so the truncation below is deterministic.
        ranked_words = sorted(
            ((word, count) for word, count in word_counts.items() if count >= min_freq),
            key=lambda item: (-item[1], item[0]),
        )

        # Apply vocabulary size limit if specified
        if vocab_size is not None:
            ranked_words = ranked_words[:vocab_size]

        vocab = {word for word, _ in ranked_words}
    else:
        # Copy so the returned set is independent of the caller's object.
        vocab = set(vocab)

    def replace_rare_words(text: str) -> str:
        """Replace words not in vocabulary with <UNK> token."""
        return ' '.join([word if word in vocab else '<UNK>' for word in text.split()])

    # Work on a copy so the caller's DataFrame is left untouched.
    processed = data.copy()
    processed['text'] = ("Claim: " + claims_clean.apply(replace_rare_words) +
                         " [SEP] " + "Evidence: " + evidence_clean.apply(replace_rare_words))

    # Extract labels; always an ndarray, matching the declared return type.
    if 'label' in processed.columns:
        labels = processed['label'].values
    else:
        labels = np.full(len(processed), None, dtype=object)

    return processed, labels, vocab

In [16]:
def predict_with_saved_model(
    pipeline_path: Path, 
    input_csv_path: Path, 
    output_csv_path: Path
) -> None:
    """
    Loads a saved SVM pipeline, makes predictions on data from an input CSV, 
    and saves the predictions to an output CSV.

    The function is best-effort: a missing file, a missing column or any
    exception raised while loading, preprocessing, predicting or writing is
    logged through the module logger and the function returns ``None``
    without raising. Creating the output directory happens outside that
    guard, so an error there propagates to the caller.

    Args:
        pipeline_path: Path (or path string) to the saved .pkl pipeline file.
            The file is unpickled, so it must come from a trusted source.
        input_csv_path: Path (or path string) to the input CSV file
            (must contain both 'Claim' and 'Evidence' columns).
        output_csv_path: Path (or path string) where the predictions CSV
            will be saved; it has a single 'prediction' column.
    """
    # Accept plain strings as well as Path objects.
    pipeline_path = Path(pipeline_path)
    input_csv_path = Path(input_csv_path)
    output_csv_path = Path(output_csv_path)

    logger.info("\n" + "="*70)
    logger.info(f"MAKING PREDICTIONS FROM {input_csv_path}")
    logger.info("="*70)

    # --- Input Validation ---
    if not pipeline_path.exists():
        logger.error(f"Pipeline file not found at {pipeline_path}. Cannot make predictions.")
        return
    if not input_csv_path.exists():
        logger.error(f"Input CSV file not found at {input_csv_path}. Cannot make predictions.")
        return
    
    # Ensure output directory exists
    output_csv_path.parent.mkdir(parents=True, exist_ok=True)

    try:
        # --- Load Pipeline --- 
        # SECURITY: unpickling executes arbitrary code embedded in the file.
        # Only load pipeline files that you created or otherwise trust.
        with open(pipeline_path, "rb") as f:
            loaded_pipeline = cloudpickle.load(f)
        logger.info(f"Pipeline loaded successfully from {pipeline_path}")

        # --- Load and Prepare Input Data ---
        input_df = pd.read_csv(input_csv_path)
        logger.info(f"Loaded {len(input_df)} rows from {input_csv_path}")

        missing_columns = [col for col in ('Claim', 'Evidence') if col not in input_df.columns]
        if missing_columns:
            logger.error(f"Input CSV {input_csv_path} must contain 'Evidence' and 'Claim' columns.")
            logger.error(f"Missing column(s): {missing_columns}")
            return

        # Determine training parameters needed for preprocessing
        training_vocab_size = params.get('vocab_size', 12000) 
        logger.info(f"Using parameters for preprocessing: vocab_size={training_vocab_size}")

        # Apply the same cleaning settings as used during training.
        # NOTE(review): prepare_svm_data derives its vocabulary from the
        # DataFrame it is given, so the <UNK> replacement here is driven by
        # word counts in the input CSV, not in the training set -- confirm
        # this matches how the pickled pipeline was trained.
        processed_data_df, _, _ = prepare_svm_data(
            input_df, 
            remove_stopwords=True,
            lemmatize=True,        
            min_freq=2, 
            vocab_size=training_vocab_size
        )
        processed_texts = processed_data_df['text'].tolist()
        logger.info(f"Preprocessing complete for {len(processed_texts)} texts.")

        # --- Make Predictions --- 
        predictions = loaded_pipeline.predict(processed_texts)
        logger.info(f"Generated {len(predictions)} predictions.")

        # --- Save Predictions --- 
        predictions_df = pd.DataFrame({'prediction': predictions})
        predictions_df.to_csv(output_csv_path, index=False)
        logger.info(f"Predictions saved successfully to {output_csv_path}")

    except ModuleNotFoundError as e:
        # Raised when the pickle references a module that cannot be imported
        # in this environment.
        logger.error(f"Error loading pickle: A module required by the pickled object was not found: {e}")
        logger.error("Ensure all necessary libraries and custom classes (GloveVectorizer, etc.) are importable.")
    except FileNotFoundError as e:
        logger.error(f"Error: A required file was not found: {e}")
    except KeyError as e:
        logger.error(f"Error: Missing expected column in input data: {e}")
    except Exception as e:
        # Catch-all keeps the notebook running; the traceback is logged.
        logger.error(f"An error occurred during prediction: {e}", exc_info=True)


In [22]:
# Location of the pickled pipeline, derived from the configured results
# directory (config.SAVE_DIR) instead of a machine-specific absolute path,
# so the notebook runs on any checkout once the config is filled in.
pipeline_pickle_path = config.SAVE_DIR / "svm" / "svm_pipeline.pkl"

try:
    # Predict on the configured test file and write the result next to the data.
    prediction_input_file = config.TEST_FILE
    prediction_output_file = config.DATA_DIR / "svm_predictions.csv"
    
    # Ensure the predictions directory exists
    prediction_output_file.parent.mkdir(parents=True, exist_ok=True)
    
    predict_with_saved_model(
        pipeline_path=pipeline_pickle_path,
        input_csv_path=prediction_input_file, 
        output_csv_path=prediction_output_file
    )
except Exception as e:
    # predict_with_saved_model logs most of its own failures; this guard
    # covers anything raised outside it (e.g. while creating the directory).
    logger.error(f"Error predicting with saved model: {e}", exc_info=True)