In [1]:
from pathlib import Path
import datetime

import pandas as pd

from datasets import load_dataset, Features, Value
from sentence_transformers import SentenceTransformer
from sentence_transformers.losses import CosineSimilarityLoss
from setfit import SetFitModel, SetFitTrainer

# Data locations, expressed relative to the notebook's working directory.
RAW_DATA_DIR = Path("data") / "raw"
PROCESSED_DATA_DIR = Path("data") / "processed"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# TODO: Use cleaned data + new labelled data for training
# Collect the processed parquet exports as plain strings.
# Only `*.parquet` entries are kept so that stray files or sub-directories in
# the folder cannot end up in the list, and the result is sorted because the
# order in which a directory is listed is not guaranteed.
train_files = [str(p) for p in sorted(PROCESSED_DATA_DIR.glob("*.parquet"))]
train_files

['data\\processed\\daily_scan_website_(sg)-web_articles-09_12_22-07_39.parquet',
 'data\\processed\\daily_scan_website_(sg)-web_articles-09_16_22-09_06.parquet',
 'data\\processed\\daily_scan_website_(sg)-web_articles-09_21_22-08_05.parquet',
 'data\\processed\\daily_scan_website_(sg)-web_articles-09_23_22-08_00.parquet']

In [3]:
# pd.read_csv("all_tagged_articles - combined.csv")

In [4]:
# Load the tagged articles, normalise the column names to snake_case and
# derive one combined "theme > new_index" label per article.
t_df = (
    pd.read_csv(
        "all_tagged_articles - combined.csv",
        usecols=["Published", "Headline", "Summary", "Theme", "New Index"],
        na_values="-",  # cells containing "-" are read as missing
        parse_dates=["Published"],
    )
    .rename(columns=lambda col_name: col_name.lower().replace(" ", "_"))
    .assign(
        # Vectorised concatenation instead of a row-wise Python join.
        # Missing parts are replaced by "" first, so the label is never NaN.
        label=lambda df: df["theme"]
        .fillna("")
        .str.cat(df["new_index"].fillna(""), sep=" > ")
    )
)

# Persisted so the training cell can read it back via load_dataset("parquet", ...).
t_df.to_parquet("test.parquet")

In [5]:
# Print the column dtypes and non-null counts of the tagged-articles frame.
t_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10639 entries, 0 to 10638
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   published  10303 non-null  datetime64[ns]
 1   headline   10639 non-null  object        
 2   summary    7894 non-null   object        
 3   theme      10639 non-null  object        
 4   new_index  10639 non-null  object        
 5   label      10639 non-null  object        
dtypes: datetime64[ns](1), object(5)
memory usage: 498.8+ KB


In [6]:
# TODO: Replace with taxonomy

# --- Training configuration: tune these values here, not inside the calls ---
MIN_EXAMPLES_PER_LABEL = 2  # labels with fewer tagged articles are dropped
BASE_MODEL_NAME = "sentence-transformers/multi-qa-MiniLM-L6-cos-v1"
BATCH_SIZE = 16
NUM_ITERATIONS = 20  # The number of text pairs to generate for contrastive learning
NUM_EPOCHS = 1  # The number of epochs to use for contrastive learning
# A previous run ended with scikit-learn reporting that the logistic-regression
# head hit its iteration limit, so give the solver more room than the default.
HEAD_MAX_ITER = 1000

# Scaffold for trial: keep only labels that occur often enough in the tagged data.
min_labels_list = (
    t_df["label"]
    .value_counts()[lambda s: s >= MIN_EXAMPLES_PER_LABEL]
    .index.to_list()
)

# TODO: Replace with duckdb schema
# Explicit schema for the parquet file written from `t_df`.
features = Features({
    'published': Value('timestamp[ns]'),
    'headline': Value('string'),
    'summary': Value('string'),
    'theme': Value('string'),
    'new_index': Value('string'),
    'label': Value('string'),
})

# TODO: Load real data when ready
# dataset = load_dataset("parquet", data_files={'train': train_files}, features=features)
dataset = load_dataset(
    "parquet",
    data_files={'train': "test.parquet"},
    features=features,
).filter(lambda row: row['label'] in min_labels_list)

# # Fast train for testing
# train_dataset = sample_dataset(dataset["train"], label_column="label", num_samples=4)
train_dataset = dataset["train"]

# Load a SetFit model
model = SetFitModel.from_pretrained(
    BASE_MODEL_NAME,
    cache_dir="cached_models",
)

# Raise the iteration limit of the classification head before it is fitted.
# NOTE(review): assumes the head is the default scikit-learn estimator (no
# differentiable head is requested above) - confirm if the head type changes.
model.model_head.set_params(max_iter=HEAD_MAX_ITER)

# Create trainer
trainer = SetFitTrainer(
    model=model,
    train_dataset=train_dataset,
    loss_class=CosineSimilarityLoss,
    metric="accuracy",
    batch_size=BATCH_SIZE,
    num_iterations=NUM_ITERATIONS,
    num_epochs=NUM_EPOCHS,
    column_mapping={"headline": "text", "label": "label"}  # Map dataset columns to text/label expected by trainer
)

trainer.train()

# Save under a folder named after today's date (ISO format).
trainer.model.save_pretrained(f"trained_models/{datetime.date.today().isoformat()}")

Downloading and preparing dataset parquet/default to C:/Users/edmun/.cache/huggingface/datasets/parquet/default-f88b40d0815b276c/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files: 100%|██████████| 1/1 [00:00<00:00, 989.92it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 285.13it/s]
                                                        

Dataset parquet downloaded and prepared to C:/Users/edmun/.cache/huggingface/datasets/parquet/default-f88b40d0815b276c/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


100%|██████████| 1/1 [00:00<00:00, 55.24it/s]
model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
Applying column mapping to training dataset
Generating Training Pairs: 100%|██████████| 20/20 [00:15<00:00,  1.26it/s]
***** Running training *****
  Num examples = 425320
  Num epochs = 1
  Total optimization steps = 26583
  Total train batch size = 16
Iteration: 100%|██████████| 26583/26583 [51:45<00:00,  8.56it/s]
Epoch: 100%|██████████| 1/1 [51:45<00:00, 3105.61s/it]
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
