# Prototype end to end process
1. Save the train/val/test sets (which were generated from the train set)
2. Then have a simple process to train on the train set, optimize on the val set, and then test on the holdout test set. I'll then test the outputs on the actual submission test set.

In [7]:
%load_ext autoreload
%autoreload 2

In [1]:
import polars as pl
from omegaconf import OmegaConf
from pathlib import Path
import os
from typing import Tuple

import torch
import subprocess

In [2]:
from trav_nlp.misc import polars_train_val_test_split

In [20]:
import hydra

In [None]:
# cfg = OmegaConf.create({

#     'mlflow': {
#         'host': '127.0.0.1',
#         'port': '8080',
#         'uri': 'http://127.0.0.1:8080' # TODO: Make this interpolated
#     },

#     'raw_data': {
#         'train_path': '../data/train.csv',
#         'test_path': '../data/test.csv',
#         'sample_submission_path': '../data/sample_submission.csv',
#     },
#     # Split the train dataset into a train/val/test split
#     'training_data': {
#         'train_path': '../data/splits/train.parquet',
#         'val_path': '../data/splits/val.parquet',
#         'test_path': '../data/splits/test.parquet'
#     },

#     'params': {
#         'train_frac': 0.8,
#         'val_frac': 0.1,
#         'test_frac': 0.1,
#         'train_val_test_seed': 42,   
#     }
# })

In [22]:
from hydra import compose, initialize

In [34]:
with initialize(config_path='../conf'):
    cfg = compose(config_name='config')

The version_base parameter is not specified.
Please specify a compatability version level, or None.
Will assume defaults for version 1.1
  with initialize(config_path='../conf'):


In [36]:
cfg.experiment.submit_to_kaggle

False

## Create the train/val/test splits if they don't already exist

## Now, I guess I'll have a large wrapper function which runs a single experiment
1. I suppose the larger wrapper will be run_experiment() or something similar. Then within that run_experiment wrapper I can have various different types of pipelines to train and evaluate, etc.
2. I'll start with the most simple pipeline I can do. An sklearn pipeline
3. The general idea of this will be to run an experiment, get the results of the model experiment, at the very least on the hold out test set, and then also submit the results to kaggle and get the results of that submission as well.
    - So, it'll be train, val, and hold-out test set performance in a chart. Then also I'll submit the kaggle and get that performance.
4. So first I'll code up the various parts of the loop. 
5. Then I'll integrate MLFlow so that I can include all those results into a single chart.

In [25]:
import logging
from trav_nlp.misc import submit_to_kaggle, setup_logging
from trav_nlp.pipeline import train, eval_df_test, generate_and_submit_to_kaggle

from trav_nlp.pipeline import load_or_create_data

In [27]:
setup_logging()
logging.info("Logging is configured.")

2025-02-23 23:38:59 INFO: Logging is configured.


In [28]:
df_train, df_val, df_test = load_or_create_data(cfg)

In [29]:
df_train.head()

id,keyword,location,text,target
i64,str,str,str,i64
9853,"""trauma""",,"""Today was trauma on top of tra…",0
798,"""battle""",,"""Dragon Ball Z: Battle Of Gods …",0
9822,"""trauma""",,"""Hiroshima: They told me to pai…",1
1817,"""buildings%20on%20fire""","""New Hampshire""","""17 people displaced after 3-al…",1
6148,"""hijack""","""Nigeria""","""Criminals Who Hijack Lorries A…",1


In [30]:
# import socket
# import subprocess
# import time

# def is_port_in_use(port, host='localhost'):
#     """
#     Check if a given port on the host is currently in use.
#     Returns True if the port is open (i.e. something is listening).
#     """
#     with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
#         # connect_ex returns 0 if the connection is successful
#         return sock.connect_ex((host, port)) == 0

# def start_mlflow_server(port=5000):
#     """
#     Starts the MLflow server on the given port using a subprocess.
#     This function assumes that MLflow is installed and available in your PATH.
#     """
#     # Check if MLflow server is already running
#     if is_port_in_use(port):
#         print(f"MLflow server already running on port {port}. Using the existing server.")
#         return

#     command = ['mlflow', 'server', '--port', str(port)]
#     print(f"Starting MLflow server on port {port}...")
    
#     # Start the server as a background process.
#     process = subprocess.Popen(command)
    
#     # Optionally wait a short time to allow the server to initialize.
#     time.sleep(5)
    
#     if is_port_in_use(port):
#         print("MLflow server started successfully.")
#     else:
#         print("Failed to start the MLflow server.")
    

In [31]:
def run_experiment(cfg, run_submit_to_kaggle = False):
    """Train/optimize a model, and then report the results of the model training run. 
    Also save/return the scores on the test.csv file for submission to kaggle if the model
    appears to perform well.

    So I'll have a train_model function
    """


    df_train, df_val, df_test = load_or_create_data(cfg)

    pipeline = train(df_train, df_val)

    eval_df_test(pipeline, df_test)

    if run_submit_to_kaggle:
        df_full_train = pl.concat([df_train, df_val, df_test])
        full_pipeline = train(df_full_train)
        generate_and_submit_to_kaggle(full_pipeline, cfg.raw_data.test_path, cfg.raw_data.sample_submission_path)


In [32]:
run_experiment(cfg)

[LightGBM] [Info] Number of positive: 2614, number of negative: 3476
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005007 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1843
[LightGBM] [Info] Number of data points in the train set: 6090, number of used features: 699
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.429228 -> initscore=-0.285001
[LightGBM] [Info] Start training from score -0.285001


2025-02-23 23:38:59 INFO: Train ROC: 0.9263938401965869
2025-02-23 23:38:59 INFO: Val ROC: 0.8571826280623607
2025-02-23 23:38:59 INFO: Test ROC: 0.8419177701317206


In [1]:
from trav_nlp.misc import flatten_dict

In [2]:
mydict = {
    'level1': {
        'param1': 12,
        'param2': 13,
        'param3': {
            'param4': 14,
            'param5': 15
        }
    }
}

In [3]:
flatten_dict(mydict)

{'level1.param1': 12,
 'level1.param2': 13,
 'level1.param3.param4': 14,
 'level1.param3.param5': 15}

In [4]:
import mlflow

In [5]:
mlflow.start_run?

[0;31mSignature:[0m
[0mmlflow[0m[0;34m.[0m[0mstart_run[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mrun_id[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mstr[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mexperiment_id[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mstr[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mrun_name[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mstr[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mnested[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mparent_run_id[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mstr[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtags[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mdict[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mAny[0m[0;34m][0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m