# Prototype end to end process
1. Split the raw training data into train/val/test sets and save them (all three sets are derived from the original train set).
2. Then have a simple process to train on the train set, optimize on the val set, and then test on the holdout test set. I'll then test the outputs on the actual submission test set.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import polars as pl
from omegaconf import OmegaConf
from pathlib import Path
import os
from typing import Tuple

import torch

In [None]:
from trav_nlp.misc import polars_train_val_test_split
from trav_lib.evaluate import evaluate_model

In [4]:
# Notebook-wide configuration: where the raw Kaggle files live, where the
# derived train/val/test splits are cached, and how the split is made.
cfg = OmegaConf.create(
    {
        # Files as downloaded from the competition
        "raw_data": {
            "train_path": "../data/train.csv",
            "test_path": "../data/test.csv",
            "sample_submission_path": "../data/sample_submission.csv",
        },
        # The raw train dataset is split into train/val/test and cached here
        "training_data": {
            "train_path": "../data/splits/train.parquet",
            "val_path": "../data/splits/val.parquet",
            "test_path": "../data/splits/test.parquet",
        },
        # Split fractions and the seed passed to the shuffling split
        "params": {
            "train_frac": 0.8,
            "val_frac": 0.1,
            "test_frac": 0.1,
            "train_val_test_seed": 42,
        },
    }
)

## Create the train/val/test splits if they don't already exist

In [5]:
def load_or_create_data(cfg):
    """Load the cached train/val/test splits, creating them when needed.

    The splits are read from the three parquet files named in
    ``cfg.training_data``. If any of those files is missing, the raw training
    CSV (``cfg.raw_data.train_path``) is split again with the fractions and
    seed from ``cfg.params`` and all three parquet files are (re)written.

    Args:
        cfg: Configuration object exposing ``training_data.{train,val,test}_path``,
            ``raw_data.train_path`` and
            ``params.{train_frac,val_frac,test_frac,train_val_test_seed}``.

    Returns:
        A ``(df_train, df_val, df_test)`` tuple of polars DataFrames.
    """
    train_path = Path(cfg.training_data.train_path)
    val_path = Path(cfg.training_data.val_path)
    test_path = Path(cfg.training_data.test_path)
    split_paths = (train_path, val_path, test_path)

    # Only trust the cache when it is complete: checking the train file alone
    # would crash in read_parquet if the val or test file is missing.
    if all(path.exists() for path in split_paths):
        df_train = pl.read_parquet(train_path)
        df_val = pl.read_parquet(val_path)
        df_test = pl.read_parquet(test_path)
        return df_train, df_val, df_test

    df = pl.read_csv(cfg.raw_data.train_path)

    df_train, df_val, df_test = polars_train_val_test_split(
        df,
        train_frac=cfg.params.train_frac,
        val_frac=cfg.params.val_frac,
        test_frac=cfg.params.test_frac,
        shuffle=True,
        seed=cfg.params.train_val_test_seed,
    )

    # The three outputs may be configured to live in different directories,
    # so make sure each destination exists before writing.
    for path in split_paths:
        path.parent.mkdir(parents=True, exist_ok=True)

    df_train.write_parquet(train_path)
    df_val.write_parquet(val_path)
    df_test.write_parquet(test_path)

    return df_train, df_val, df_test

In [55]:
df_train, df_val, df_test = load_or_create_data(cfg)

In [56]:
df_train.head()

id,keyword,location,text,target
i64,str,str,str,i64
9853,"""trauma""",,"""Today was trauma on top of tra…",0
798,"""battle""",,"""Dragon Ball Z: Battle Of Gods …",0
9822,"""trauma""",,"""Hiroshima: They told me to pai…",1
1817,"""buildings%20on%20fire""","""New Hampshire""","""17 people displaced after 3-al…",1
6148,"""hijack""","""Nigeria""","""Criminals Who Hijack Lorries A…",1


## Now, I guess I'll have a large wrapper function which runs a single experiment
1. I suppose the larger wrapper will be run_experiment() or something similar. Then within that run_experiment wrapper I can have various different types of pipelines to train and evaluate, etc.
2. I'll start with the most simple pipeline I can do. An sklearn pipeline
3. The general idea is to run an experiment, collect the model's results (at the very least on the hold-out test set), and then also submit the predictions to Kaggle and record the score of that submission.
    - So, it'll be train, val, and hold-out test set performance in a chart. Then I'll also submit to Kaggle and get that performance.
4. So first I'll code up the various parts of the loop. 
5. Then I'll integrate MLFlow so that I can include all those results into a single chart.

In [72]:
from trav_nlp.misc import submit_to_kaggle

In [None]:
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
from sklearn.metrics import roc_auc_score
import lightgbm as lgb

import logging

from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import FunctionTransformer

In [34]:
import warnings

# Hide the UserWarning whose message matches the text below (X has no valid
# feature names although LGBMClassifier was fitted with feature names), so it
# does not clutter the cell output.
# NOTE(review): presumably triggered when the pipeline predicts on a plain
# array; confirm before relying on this filter.
warnings.filterwarnings(
    "ignore",
    message="X does not have valid feature names, but LGBMClassifier was fitted with feature names",
    category=UserWarning,
)

In [21]:
def setup_logging():
    """Configure the root logger via ``logging.basicConfig``.

    Sets the level to INFO and formats records as
    ``<timestamp> <LEVEL>: <message>`` with a second-resolution timestamp.
    """
    basic_config_options = {
        'datefmt': '%Y-%m-%d %H:%M:%S',
        'format': '%(asctime)s %(levelname)s: %(message)s',
        'level': logging.INFO,
    }
    logging.basicConfig(**basic_config_options)


# Configure logging for this notebook and emit one record as a smoke check.
setup_logging()
logging.info("Logging is configured.")

2025-02-22 10:06:12 INFO: Logging is configured.


In [78]:
def train(df_train, df_val = None):
    """Fit a bag-of-words LightGBM pipeline and log its ROC AUC.

    The pipeline selects the ``'text'`` column, vectorizes it with
    ``CountVectorizer``, converts the sparse counts to a dense array and feeds
    them to ``LGBMClassifier(random_state=42)``. It is fitted on ``df_train``
    using ``df_train['target']`` as labels.

    Args:
        df_train: Frame with ``'text'`` and ``'target'`` columns used for fitting.
        df_val: Optional frame with the same columns; when provided, its
            ROC AUC is logged as well.

    Returns:
        The fitted sklearn pipeline.
    """

    def extract_text(df):
        # Only the raw text column is fed to the vectorizer
        return df['text']

    def convert_to_numpy(scipy_csr_matrix):
        # Densify the vectorizer output before it reaches the classifier
        return scipy_csr_matrix.toarray()

    # Steps: text selector -> token counts -> dense array -> classifier
    model = make_pipeline(
        FunctionTransformer(extract_text),
        CountVectorizer(),
        FunctionTransformer(convert_to_numpy),
        lgb.LGBMClassifier(random_state=42),
    )

    model.fit(df_train, df_train['target'])

    # Score with the second predict_proba column
    train_scores = model.predict_proba(df_train)[:,1]
    train_roc_auc = roc_auc_score(df_train['target'], train_scores)
    logging.info(f"Train ROC: {train_roc_auc}")

    if df_val is None:
        return model

    val_scores = model.predict_proba(df_val)[:, 1]
    val_roc_auc = roc_auc_score(df_val['target'], val_scores)
    logging.info(f"Val ROC: {val_roc_auc}")

    return model

In [57]:
pipeline = train(df_train, df_val)

[LightGBM] [Info] Number of positive: 2614, number of negative: 3476
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004977 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1843
[LightGBM] [Info] Number of data points in the train set: 6090, number of used features: 699
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.429228 -> initscore=-0.285001
[LightGBM] [Info] Start training from score -0.285001


2025-02-22 13:03:15 INFO: Train ROC: 0.9263938401965869
2025-02-22 13:03:15 INFO: Val ROC: 0.8571826280623607


In [None]:
def eval_df_test(pipeline, df_test):
    """Score a fitted pipeline on the hold-out test split.

    Args:
        pipeline: Fitted estimator exposing ``predict_proba``.
        df_test: Frame with the features the pipeline expects and a
            ``'target'`` column holding the true labels.

    Returns:
        The ROC AUC of the second ``predict_proba`` column against
        ``df_test['target']``. The value is also logged at INFO level.
    """
    test_preds = pipeline.predict_proba(df_test)[:, 1]
    test_roc_auc = roc_auc_score(df_test['target'], test_preds)
    logging.info(f"Test ROC: {test_roc_auc}")

    # Hand the metric back so callers can record it instead of parsing logs
    return test_roc_auc

2025-02-22 13:06:53 INFO: Test ROC: 0.8419177701317206


In [84]:
def generate_and_submit_to_kaggle(pipeline, kaggle_test_path, kaggle_sample_submission_path):
    """Predict on the Kaggle test file, write a submission CSV and submit it.

    The sample submission is used as the template: its ``target`` column is
    replaced with ``pipeline.predict`` output for the test file, the result is
    written to ``../data/submissions/submission_<timestamp>.csv`` and that
    file is passed to ``submit_to_kaggle`` for the ``nlp-getting-started``
    competition.

    Args:
        pipeline: Fitted estimator exposing ``predict``.
        kaggle_test_path: Path to the competition's test CSV.
        kaggle_sample_submission_path: Path to the sample submission CSV.
    """
    # Imported here because no visible cell of this notebook imports
    # datetime; without it the timestamp line raises NameError.
    import datetime

    df_kaggle_test = pl.read_csv(kaggle_test_path)
    kaggle_sample_submission = pl.read_csv(kaggle_sample_submission_path)

    # NOTE(review): assumes the sample submission rows are in the same order
    # as the test file rows - confirm against the competition data.
    kaggle_test_preds = pipeline.predict(df_kaggle_test)
    kaggle_sample_submission = kaggle_sample_submission.with_columns(
        pl.Series("target", kaggle_test_preds)
    )

    submissions_dir = Path('../data/submissions')
    # write_csv does not create missing directories, so make sure the
    # destination exists on a fresh checkout.
    submissions_dir.mkdir(parents=True, exist_ok=True)

    timestamp = datetime.datetime.now().strftime('%Y_%m_%d__%H_%M_%S')
    filename = f"submission_{timestamp}.csv"
    submission_path = submissions_dir / filename

    kaggle_sample_submission.write_csv(submission_path)

    submit_to_kaggle('nlp-getting-started', submission_path)

In [86]:
def run_experiment(cfg, run_submit_to_kaggle = False):
    """Run one end-to-end experiment.

    Loads (or creates) the train/val/test splits, trains a model on the train
    split while reporting validation performance, and evaluates it on the
    hold-out test split.

    When ``run_submit_to_kaggle`` is true, a second model is trained on the
    concatenation of all three splits and its predictions for the file at
    ``cfg.raw_data.test_path`` are submitted to Kaggle.

    Args:
        cfg: Experiment configuration (data paths and split parameters).
        run_submit_to_kaggle: Whether to retrain on all data and submit.
    """
    df_train, df_val, df_test = load_or_create_data(cfg)

    fitted = train(df_train, df_val)
    eval_df_test(fitted, df_test)

    # Nothing more to do unless a Kaggle submission was requested
    if not run_submit_to_kaggle:
        return

    # Retrain on every labelled row before predicting the competition file
    all_labelled = pl.concat([df_train, df_val, df_test])
    submission_model = train(all_labelled)
    generate_and_submit_to_kaggle(
        submission_model,
        cfg.raw_data.test_path,
        cfg.raw_data.sample_submission_path,
    )
