# Prototype end to end process
1. Save the train/val/test sets (which were generated from the train set)
2. Then have a simple process to train on the train set, optimize on the val set, and then test on the holdout test set. I'll then test the outputs on the actual submission test set.

In [1]:
%load_ext autoreload
%autoreload 2

In [1]:
import polars as pl
from omegaconf import OmegaConf
from pathlib import Path
import os
from typing import Tuple

import torch
import subprocess

In [2]:
from trav_nlp.misc import polars_train_val_test_split

In [20]:
import hydra

In [None]:
# cfg = OmegaConf.create({

#     'mlflow': {
#         'host': '127.0.0.1',
#         'port': '8080',
#         'uri': 'http://127.0.0.1:8080' # TODO: Make this interpolated
#     },

#     'raw_data': {
#         'train_path': '../data/train.csv',
#         'test_path': '../data/test.csv',
#         'sample_submission_path': '../data/sample_submission.csv',
#     },
#     # Split the train dataset into a train/val/test split
#     'training_data': {
#         'train_path': '../data/splits/train.parquet',
#         'val_path': '../data/splits/val.parquet',
#         'test_path': '../data/splits/test.parquet'
#     },

#     'params': {
#         'train_frac': 0.8,
#         'val_frac': 0.1,
#         'test_frac': 0.1,
#         'train_val_test_seed': 42,   
#     }
# })

In [22]:
from hydra import compose, initialize

In [34]:
with initialize(config_path='../conf'):
    cfg = compose(config_name='config')

The version_base parameter is not specified.
Please specify a compatability version level, or None.
Will assume defaults for version 1.1
  with initialize(config_path='../conf'):


In [36]:
cfg.experiment.submit_to_kaggle

False

## Create the train/val/test splits if they don't already exist

## Now, I guess I'll have a large wrapper function which runs a single experiment
1. I suppose the larger wrapper will be run_experiment() or something similar. Then within that run_experiment wrapper I can have various different types of pipelines to train and evaluate, etc.
2. I'll start with the most simple pipeline I can do. An sklearn pipeline
3. The general idea of this will be to run an experiment, get the results of the model experiment, at the very least on the hold out test set, and then also submit the results to kaggle and get the results of that submission as well.
    - So, it'll be train, val, and hold-out test set performance in a chart. Then also I'll submit the kaggle and get that performance.
4. So first I'll code up the various parts of the loop. 
5. Then I'll integrate MLFlow so that I can include all those results into a single chart.

In [25]:
import logging
from trav_nlp.misc import submit_to_kaggle, setup_logging
from trav_nlp.pipeline import train, eval_df_test, generate_and_submit_to_kaggle

from trav_nlp.pipeline import load_or_create_data

In [27]:
setup_logging()
logging.info("Logging is configured.")

2025-02-23 23:38:59 INFO: Logging is configured.


In [28]:
df_train, df_val, df_test = load_or_create_data(cfg)

In [29]:
df_train.head()

id,keyword,location,text,target
i64,str,str,str,i64
9853,"""trauma""",,"""Today was trauma on top of tra…",0
798,"""battle""",,"""Dragon Ball Z: Battle Of Gods …",0
9822,"""trauma""",,"""Hiroshima: They told me to pai…",1
1817,"""buildings%20on%20fire""","""New Hampshire""","""17 people displaced after 3-al…",1
6148,"""hijack""","""Nigeria""","""Criminals Who Hijack Lorries A…",1


In [30]:
# import socket
# import subprocess
# import time

# def is_port_in_use(port, host='localhost'):
#     """
#     Check if a given port on the host is currently in use.
#     Returns True if the port is open (i.e. something is listening).
#     """
#     with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
#         # connect_ex returns 0 if the connection is successful
#         return sock.connect_ex((host, port)) == 0

# def start_mlflow_server(port=5000):
#     """
#     Starts the MLflow server on the given port using a subprocess.
#     This function assumes that MLflow is installed and available in your PATH.
#     """
#     # Check if MLflow server is already running
#     if is_port_in_use(port):
#         print(f"MLflow server already running on port {port}. Using the existing server.")
#         return

#     command = ['mlflow', 'server', '--port', str(port)]
#     print(f"Starting MLflow server on port {port}...")
    
#     # Start the server as a background process.
#     process = subprocess.Popen(command)
    
#     # Optionally wait a short time to allow the server to initialize.
#     time.sleep(5)
    
#     if is_port_in_use(port):
#         print("MLflow server started successfully.")
#     else:
#         print("Failed to start the MLflow server.")
    

In [31]:
def run_experiment(cfg, run_submit_to_kaggle = False):
    """Train/optimize a model, and then report the results of the model training run. 
    Also save/return the scores on the test.csv file for submission to kaggle if the model
    appears to perform well.

    So I'll have a train_model function
    """


    df_train, df_val, df_test = load_or_create_data(cfg)

    pipeline = train(df_train, df_val)

    eval_df_test(pipeline, df_test)

    if run_submit_to_kaggle:
        df_full_train = pl.concat([df_train, df_val, df_test])
        full_pipeline = train(df_full_train)
        generate_and_submit_to_kaggle(full_pipeline, cfg.raw_data.test_path, cfg.raw_data.sample_submission_path)


In [32]:
run_experiment(cfg)

[LightGBM] [Info] Number of positive: 2614, number of negative: 3476
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005007 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1843
[LightGBM] [Info] Number of data points in the train set: 6090, number of used features: 699
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.429228 -> initscore=-0.285001
[LightGBM] [Info] Start training from score -0.285001


2025-02-23 23:38:59 INFO: Train ROC: 0.9263938401965869
2025-02-23 23:38:59 INFO: Val ROC: 0.8571826280623607
2025-02-23 23:38:59 INFO: Test ROC: 0.8419177701317206


In [1]:
from trav_nlp.misc import flatten_dict

In [2]:
mydict = {
    'level1': {
        'param1': 12,
        'param2': 13,
        'param3': {
            'param4': 14,
            'param5': 15
        }
    }
}

In [3]:
flatten_dict(mydict)

{'level1.param1': 12,
 'level1.param2': 13,
 'level1.param3.param4': 14,
 'level1.param3.param5': 15}

In [4]:
import mlflow

In [5]:
mlflow.start_run?

[0;31mSignature:[0m
[0mmlflow[0m[0;34m.[0m[0mstart_run[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mrun_id[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mstr[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mexperiment_id[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mstr[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mrun_name[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mstr[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mnested[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mparent_run_id[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mstr[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtags[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mdict[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mAny[0m[0;34m][0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m

In [13]:
from trav_nlp.misc import verify_git_commit
from hydra import initialize, compose
from omegaconf import OmegaConf

In [7]:
folders = [
    '../trav_nlp',
    '../conf']
verify_git_commit(*folders)

RuntimeError: There are uncommitted changes in '/Users/traviswhitfield/Documents/github/kaggle_nlp_getting_started/conf'.

In [9]:
with initialize(config_path="../conf", job_name="run_pipeline", version_base=None):
    cfg = compose(config_name="config", overrides=[])

In [14]:
print(OmegaConf.to_yaml(cfg.model_params))

num_leaves: 31
n_estimators: 200
learning_rate: 0.15
max_depth: -1
random_state: 42
boosting_type: gbdt
verbose: -1



In [15]:
best_params = {'num_leaves': 12, 'n_estimators': 55}

In [16]:
cfg.model_params.update(best_params)

In [17]:
print(OmegaConf.to_yaml(cfg.model_params))

num_leaves: 12
n_estimators: 55
learning_rate: 0.15
max_depth: -1
random_state: 42
boosting_type: gbdt
verbose: -1



# Try using gensim and downloading GloVe word embeddings
1. Then for the features of my model I'm going to take the average word embedding of the words in the tweet, and use that as the feature.
2. 

In [51]:
from gensim import downloader
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

from sklearn.metrics import roc_auc_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer

import datetime
import logging
import os
import sys
import warnings
import zipfile
from pathlib import Path

import kaggle
import lightgbm as lgb
import mlflow
import optuna
import polars as pl
from hydra import compose, initialize
from omegaconf import DictConfig, OmegaConf
from prefect import flow, task
from prefect.cache_policies import DEFAULT, INPUTS
from prefect.context import get_run_context  # New import
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
from trav_nlp.misc import (
    flatten_dict,
    polars_train_test_split,
    polars_train_val_test_split,
    submit_to_kaggle,
    verify_git_commit,
)

from sklearn.base import BaseEstimator, TransformerMixin

In [57]:
from trav_nlp.pipeline import train_val_test_split

In [56]:
with initialize(config_path="../conf", job_name="run_pipeline", version_base=None):
    cfg = compose(config_name="config", overrides=[])

In [32]:
GENSIM_EMBEDDING_NAME = 'glove-twitter-25'

In [34]:
%%time
glove_vectors = downloader.load(GENSIM_EMBEDDING_NAME)

CPU times: user 11.5 s, sys: 1.51 s, total: 13.1 s
Wall time: 47.1 s


In [68]:
info = downloader.info()

In [95]:
for key, val in info['models'].items():
    try:
        print(key, f"{val['file_size']/1e6}mb")
    except:
        print(key)

fasttext-wiki-news-subwords-300 1005.007116mb
conceptnet-numberbatch-17-06-300 1225.497562mb
word2vec-ruscorpora-300 208.427381mb
word2vec-google-news-300 1743.56384mb
glove-wiki-gigaword-50 69.182535mb
glove-wiki-gigaword-100 134.300434mb
glove-wiki-gigaword-200 264.336934mb
glove-wiki-gigaword-300 394.362229mb
glove-twitter-25 109.885004mb
glove-twitter-50 209.216938mb
glove-twitter-100 405.932991mb
glove-twitter-200 795.3731mb
__testing_word2vec-matrix-synopsis


In [103]:
for key in info['models']:
    print(f"python trav_nlp/pipeline.py embeddings=glove1 embeddings.name={key}")

Current Prefect logging level: INFO
python trav_nlp/pipeline.py embeddings=glove1 embeddings.name=fasttext-wiki-news-subwords-300
python trav_nlp/pipeline.py embeddings=glove1 embeddings.name=conceptnet-numberbatch-17-06-300
python trav_nlp/pipeline.py embeddings=glove1 embeddings.name=word2vec-ruscorpora-300
python trav_nlp/pipeline.py embeddings=glove1 embeddings.name=word2vec-google-news-300
python trav_nlp/pipeline.py embeddings=glove1 embeddings.name=glove-wiki-gigaword-50
python trav_nlp/pipeline.py embeddings=glove1 embeddings.name=glove-wiki-gigaword-100
python trav_nlp/pipeline.py embeddings=glove1 embeddings.name=glove-wiki-gigaword-200
python trav_nlp/pipeline.py embeddings=glove1 embeddings.name=glove-wiki-gigaword-300
python trav_nlp/pipeline.py embeddings=glove1 embeddings.name=glove-twitter-25
python trav_nlp/pipeline.py embeddings=glove1 embeddings.name=glove-twitter-50
python trav_nlp/pipeline.py embeddings=glove1 embeddings.name=glove-twitter-100
python trav_nlp/pipel

In [98]:
import mlflow

In [99]:
mlflow.log_metric?

[0;31mSignature:[0m
[0mmlflow[0m[0;34m.[0m[0mlog_metric[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mkey[0m[0;34m:[0m [0mstr[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mvalue[0m[0;34m:[0m [0mfloat[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mstep[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mint[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msynchronous[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mbool[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtimestamp[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mint[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mrun_id[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mstr[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m [0;34m->[0m [0mOptional[0m[0;34m[[0m[0mmlflow[0m[0;34m.[0m[0mutils[0m[0;34m.[0m[0masync_logging[0m[0;34m.[0m[0mrun_operation

In [93]:
info['models']

{'fasttext-wiki-news-subwords-300': {'num_records': 999999,
  'file_size': 1005007116,
  'base_dataset': 'Wikipedia 2017, UMBC webbase corpus and statmt.org news dataset (16B tokens)',
  'reader_code': 'https://github.com/RaRe-Technologies/gensim-data/releases/download/fasttext-wiki-news-subwords-300/__init__.py',
  'license': 'https://creativecommons.org/licenses/by-sa/3.0/',
  'parameters': {'dimension': 300},
  'description': '1 million word vectors trained on Wikipedia 2017, UMBC webbase corpus and statmt.org news dataset (16B tokens).',
  'read_more': ['https://fasttext.cc/docs/en/english-vectors.html',
   'https://arxiv.org/abs/1712.09405',
   'https://arxiv.org/abs/1607.01759'],
  'checksum': 'de2bb3a20c46ce65c9c131e1ad9a77af',
  'file_name': 'fasttext-wiki-news-subwords-300.gz',
  'parts': 1},
 'conceptnet-numberbatch-17-06-300': {'num_records': 1917247,
  'file_size': 1225497562,
  'base_dataset': 'ConceptNet, word2vec, GloVe, and OpenSubtitles 2016',
  'reader_code': 'https:/

In [100]:
from prefect.logging import configure_logging


[autoreload of trav_nlp.pipeline failed: Traceback (most recent call last):
  File "/Users/traviswhitfield/miniconda3/envs/kaggle_nlp_getting_started/lib/python3.12/site-packages/IPython/extensions/autoreload.py", line 276, in check
    superreload(m, reload, self.old_objects)
  File "/Users/traviswhitfield/miniconda3/envs/kaggle_nlp_getting_started/lib/python3.12/site-packages/IPython/extensions/autoreload.py", line 475, in superreload
    module = reload(module)
             ^^^^^^^^^^^^^^
  File "/Users/traviswhitfield/miniconda3/envs/kaggle_nlp_getting_started/lib/python3.12/importlib/__init__.py", line 131, in reload
    _bootstrap._exec(spec, module)
  File "<frozen importlib._bootstrap>", line 866, in _exec
  File "<frozen importlib._bootstrap_external>", line 999, in exec_module
  File "<frozen importlib._bootstrap>", line 488, in _call_with_frames_removed
  File "/Users/traviswhitfield/Documents/github/kaggle_nlp_getting_started/trav_nlp/pipeline.py", line 22, in <module>
    

ImportError: cannot import name 'configure_logging' from 'prefect.logging' (/Users/traviswhitfield/miniconda3/envs/kaggle_nlp_getting_started/lib/python3.12/site-packages/prefect/logging/__init__.py)

In [101]:
import prefect.logging

In [102]:
prefect.logging?

[0;31mType:[0m        module
[0;31mString form:[0m <module 'prefect.logging' from '/Users/traviswhitfield/miniconda3/envs/kaggle_nlp_getting_started/lib/python3.12/site-packages/prefect/logging/__init__.py'>
[0;31mFile:[0m        ~/miniconda3/envs/kaggle_nlp_getting_started/lib/python3.12/site-packages/prefect/logging/__init__.py
[0;31mDocstring:[0m   <no docstring>

In [None]:
# Define a custom log format
log_format = "%(asctime)s | %(levelname)s | %(name)s | %(message)s"
date_format = "%Y-%m-%d %H:%M:%S"

# Configure Prefect logging
logging.basicConfig(format=log_format, datefmt=date_format, level=logging.INFO)

# Reconfigure Prefect to use the new format
configure_logging()

In [104]:
print('hello')

hello


In [106]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score

# Create an sklearn pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression())
])

# Fit the pipeline
pipeline.fit(X_train, y_train)

# Get probabilities
y_probs = pipeline.predict_proba(X_val)[:, 1]

# Set a custom threshold (e.g., found via optimization)
custom_threshold = 0.4  # Example threshold
y_pred = (y_probs >= custom_threshold).astype(int)

# Compute F1-score
f1 = f1_score(y_val, y_pred)
print(f"F1-score: {f1}")

NameError: name 'X_train' is not defined

In [107]:
print('hello')

hello


In [None]:
from trav_nlp.pipeline import (
    
)