In [1]:
import streamlit as st
import pandas as pd
import time
from sentence_transformers import SentenceTransformer
from datasets import Features, Value, load_dataset
from sentence_transformers.losses import CosineSimilarityLoss
from setfit import SetFitModel, SetFitTrainer
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
import datetime
import os
from pathlib import Path
import duckdb

In [2]:
# df = pd.read_csv("traction-analytics/data/combined_data_may_june.csv")
# df = df.drop_duplicates(subset="Link", keep="first")
# df.to_csv("traction-analytics/data/combined_data_may_june_unique.csv", index=False)

In [3]:
def preprocess_daily_scan(file, source: str = "") -> pd.DataFrame:
    """Read a daily-scan CSV and return it with normalised column names.

    Args:
        file: Path or file-like object accepted by ``pandas.read_csv``. It must
            contain the columns ``Published``, ``Headline``, ``Summary``,
            ``Link``, ``Domain``, ``Facebook Interactions`` and
            ``date_extracted``; any other columns are ignored.
        source: Constant value stored in the added ``source`` column.

    Returns:
        A DataFrame with the selected columns renamed to lower snake_case,
        plus ``timestamp`` (``Published`` as whole seconds since the Unix
        epoch) and ``source``.

    Raises:
        ValueError: Propagated from ``pandas.read_csv`` when a required column
            is missing or ``Facebook Interactions`` cannot be read as ``int``
            (for example when it contains blanks).
    """

    def _epoch_seconds(published: pd.Series) -> pd.Series:
        # Subtracting the epoch and floor-dividing by one second does not
        # depend on the datetime resolution of the parsed column, unlike
        # ``astype("int64") // 10**9`` which is only right for nanoseconds.
        epoch = pd.Timestamp(0, unit="s", tz=published.dt.tz)
        return (published - epoch) // pd.Timedelta(seconds=1)

    df = (
        pd.read_csv(
            file,
            usecols=[
                "Published",
                "Headline",
                "Summary",
                "Link",
                "Domain",
                "Facebook Interactions",
                "date_extracted",
            ],
            dtype={
                "Headline": "string",
                "Summary": "string",
                "Link": "string",
                "Domain": "string",
                "Facebook Interactions": "int",
            },
            parse_dates=["Published", "date_extracted"],
        )
        .assign(
            # Computed before the rename, so the original header is used here.
            timestamp=lambda frame: _epoch_seconds(frame["Published"]),
            source=source,
        )
        .rename(columns=lambda col_name: col_name.lower().replace(" ", "_"))
    )
    return df


def load_classification_model(model_path=None) -> SetFitModel:
    """Load the SetFit model used for headline classification.

    Args:
        model_path: Model identifier or directory handed to
            ``SetFitModel.from_pretrained``. When ``None`` (the default) the
            base ``sentence-transformers/multi-qa-MiniLM-L6-cos-v1`` checkpoint
            is loaded. Previously this argument was accepted but ignored.

    Returns:
        The loaded ``SetFitModel``. For the default base checkpoint the
        library reports that the classification head is randomly initialised,
        so the model has to be trained before its predictions are meaningful.
    """
    if model_path is None:
        model_path = "sentence-transformers/multi-qa-MiniLM-L6-cos-v1"

    # Downloads are cached in a folder relative to the working directory.
    return SetFitModel.from_pretrained(str(model_path), cache_dir="cached_models")

def label_df(
    df: pd.DataFrame, trainer: "SetFitTrainer", column: str, train: bool = True
) -> pd.DataFrame:
    """Rank every candidate label for each row of ``df`` by predicted probability.

    Args:
        df: Frame holding the text to classify. It is not modified.
        trainer: Object exposing ``train()`` and a ``model`` attribute that
            provides ``predict_proba`` and ``model_head.classes_`` (the
            notebook passes a ``SetFitTrainer``).
        column: Name of the text column in ``df`` fed to the classifier.
        train: When true (the default, matching the previous behaviour),
            ``trainer.train()`` is called before predicting. Pass ``False``
            to reuse a trainer that has already been trained.

    Returns:
        A copy of ``df`` with two added columns: ``suggested_labels`` (class
        labels ordered from most to least probable) and
        ``suggested_labels_score`` (the probabilities in that same order).
    """
    if train:
        trainer.train()

    # Nothing to score; also avoids sending an empty batch to the classifier.
    if len(df) == 0:
        return df.assign(suggested_labels=[], suggested_labels_score=[])

    # Use the trainer's own model rather than a notebook-global ``model``.
    classifier = trainer.model
    scores = classifier.predict_proba(df[column])
    # The earlier implementation called ``.numpy()`` on the sorted result, so
    # the scores may arrive as a torch tensor; normalise to a NumPy array.
    if hasattr(scores, "detach"):
        scores = scores.detach().cpu().numpy()
    scores = np.asarray(scores)

    # Stable ascending sort reversed: highest probability first, and ties end
    # up in reverse class order exactly as before.
    order = np.argsort(scores, axis=1, kind="stable")[:, ::-1]
    classes = np.asarray(classifier.model_head.classes_)

    return df.assign(
        suggested_labels=classes[order].tolist(),
        suggested_labels_score=np.take_along_axis(scores, order, axis=1).tolist(),
    )

def prepare_base_training_dataset():
    """Convert ``all_tagged_articles_new.csv`` into the base training parquet.

    Reads the ``Published``, ``Headline``, ``Theme`` and ``New Index`` columns
    (treating ``-`` as missing), lower-snake-cases the column names, joins
    theme and index into one ``label`` string, and writes the result to
    ``data/train/base_training_data.parquet``. The target directory is
    created when it does not exist.
    """
    DATA_DIR = Path("data")
    train_dir = DATA_DIR / "train"
    train_dir.mkdir(parents=True, exist_ok=True)

    tagged = pd.read_csv(
        "all_tagged_articles_new.csv",
        usecols=["Published", "Headline", "Theme", "New Index"],
        na_values="-",
        parse_dates=["Published"],
    )
    tagged = tagged.rename(columns=lambda name: name.lower().replace(" ", "_"))

    # "theme > index"; a missing part is replaced by an empty string first.
    combined_label = (
        tagged[["theme", "new_index"]].fillna("").agg(" > ".join, axis="columns")
    )
    tagged = tagged.assign(label=combined_label).drop(columns=["theme", "new_index"])

    tagged.to_parquet(train_dir / "base_training_data.parquet")


def create_dataset(min_labels=2):
    """Assemble the training dataset from the parquet files in ``data/train``.

    Steps:
      1. Build the base parquet file if it is not there yet.
      2. Export the labelled rows of the ``daily_news`` table in
         ``data/news.db`` to ``data/train/generated_training_data.parquet``.
      3. Load every parquet file in ``data/train`` and keep only rows whose
         label occurs at least ``min_labels`` times across those files.

    Args:
        min_labels: Minimum number of occurrences a label needs to be kept.

    Returns:
        The filtered object returned by ``datasets.load_dataset``.
    """
    DATA_DIR = Path("data")
    train_dir = DATA_DIR / "train"

    base_file = train_dir / "base_training_data.parquet"
    if not base_file.exists():
        prepare_base_training_dataset()

    with duckdb.connect(str(DATA_DIR / "news.db")) as con:
        con.sql(
            f"""
            COPY 
            (SELECT published, headline, label FROM daily_news WHERE label IS NOT NULL) 
            TO '{DATA_DIR / "train"}/generated_training_data.parquet'
            (FORMAT PARQUET);
            """
        )

    # Count how often each label appears over all training files.
    label_frames = [
        pd.read_parquet(parquet_file, columns=["label"])
        for parquet_file in train_dir.glob("*.parquet")
    ]
    label_counts = pd.concat(label_frames)["label"].value_counts()
    min_labels_list = label_counts[label_counts >= min_labels].index.to_list()

    schema = Features(
        {
            "published": Value("timestamp[ns]"),
            "headline": Value("string"),
            "label": Value("string"),
        }
    )
    loaded = load_dataset("parquet", data_dir=str(train_dir), features=schema)

    return loaded.filter(lambda row: row["label"] in min_labels_list)



In [4]:
# Load the SetFit model; `model` is handed to the SetFitTrainer built in a later cell.
model = load_classification_model()

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


In [5]:
# Load the combined May-September scan; the `source` column added by the
# preprocessing step is not needed here, so it is dropped straight away.
df_processed = preprocess_daily_scan(
    "traction-analytics/may_jun_jul_aug_sep_combined.csv"
).drop(columns=["source"])
df_processed

Unnamed: 0,published,headline,summary,link,domain,facebook_interactions,date_extracted,timestamp
0,2023-04-25 10:03:11,Zoo otter picks up visitor's dropped iPhone & ...,Otterocious behaviour.,https://www.facebook.com/592308557475467_65627...,mothership.sg,14739,2023-05-01,1682416991
1,2023-04-25 09:20:00,K-pop girl group Twice to hold Singapore conce...,Twice will be returning to Singapore after 4 y...,https://www.facebook.com/129011692114_10159170...,straitstimes.com,10030,2023-05-01,1682414400
2,2023-04-25 05:18:26,Hong Kong superstar Jacky Cheung to hold block...,#JackyCheung is taking steps to ensure EVERYON...,https://www.facebook.com/345185573000_61517080...,sg.style.yahoo.com,922,2023-05-01,1682399906
3,2023-04-25 03:30:00,K-pop girl group TWICE is coming to Singapore,Find out when TWICE will be taking over the Si...,https://www.facebook.com/345185573000_61504389...,sg.news.yahoo.com,867,2023-05-01,1682393400
4,2023-04-27 07:00:09,K-pop girl group Twice to hold Singapore conce...,A musical group so good they named them Twice.,https://www.facebook.com/39533052294_101602403...,tnp.straitstimes.com,443,2023-05-01,1682578809
...,...,...,...,...,...,...,...,...
87632,2023-09-25 12:24:46,We'd love to know what you think!,We want to hear what you think. Help us do a s...,https://www.facebook.com/592308557475467_72021...,,47,2023-09-30,1695644686
87633,2023-09-24 00:00:04,Pro-EU supporters march for Britain to rejoin ...,Britain left the EU after voting in favour of ...,https://www.facebook.com/129011692114_62182373...,straitstimes.com,47,2023-09-30,1695513604
87634,2023-09-23 02:59:02,Ginger cat seeks treatment at a Malaysian hosp...,"An ginger cat, with an injured paw, caught Tik...",https://www.facebook.com/14440041382_707327821...,gutzy.asia,47,2023-09-30,1695437942
87635,2023-09-23 02:56:13,Is #AngePostecoglou still being underrated aft...,Is #AngePostecoglou still being underrated aft...,https://www.facebook.com/345185573000_70189982...,,47,2023-09-30,1695437773


In [6]:
# Check which columns remain after preprocessing and dropping `source`.
df_processed.columns

Index(['published', 'headline', 'summary', 'link', 'domain',
       'facebook_interactions', 'date_extracted', 'timestamp'],
      dtype='object')

In [7]:
# Train on the labelled headlines, then rank the labels for every scanned article.
dataset = create_dataset()
train_dataset = dataset["train"]

trainer = SetFitTrainer(
    model=model,
    train_dataset=train_dataset,
    loss_class=CosineSimilarityLoss,
    metric="accuracy",
    batch_size=16,
    # Number of text pairs to generate for contrastive learning.
    num_iterations=20,
    # Number of epochs to use for contrastive learning.
    num_epochs=1,
    # Map the dataset columns onto the text/label names the trainer expects.
    column_mapping={"headline": "text", "label": "label"},
)

df_predicted = label_df(df_processed, trainer, "headline").assign(
    published=lambda frame: pd.to_datetime(frame["published"])
)
df_predicted

Downloading and preparing dataset parquet/default to /Users/Salman/.cache/huggingface/datasets/parquet/default-b70e39d7e5c222b7/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /Users/Salman/.cache/huggingface/datasets/parquet/default-b70e39d7e5c222b7/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Filter:   0%|          | 0/11790 [00:00<?, ? examples/s]

Applying column mapping to training dataset


Generating Training Pairs:   0%|          | 0/20 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 471520
  Num epochs = 1
  Total optimization steps = 29470
  Total train batch size = 16


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/29470 [00:00<?, ?it/s]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0,published,headline,summary,link,domain,facebook_interactions,date_extracted,timestamp,suggested_labels,suggested_labels_score
0,2023-04-25 10:03:11,Zoo otter picks up visitor's dropped iPhone & ...,Otterocious behaviour.,https://www.facebook.com/592308557475467_65627...,mothership.sg,14739,2023-05-01,1682416991,"[general > others, national defence > nsf, nat...","[0.8104666009341985, 0.05570217591864461, 0.02..."
1,2023-04-25 09:20:00,K-pop girl group Twice to hold Singapore conce...,Twice will be returning to Singapore after 4 y...,https://www.facebook.com/129011692114_10159170...,straitstimes.com,10030,2023-05-01,1682414400,"[general > others, national defence > nsf, key...","[0.6474872364872272, 0.10861081829774259, 0.06..."
2,2023-04-25 05:18:26,Hong Kong superstar Jacky Cheung to hold block...,#JackyCheung is taking steps to ensure EVERYON...,https://www.facebook.com/345185573000_61517080...,sg.style.yahoo.com,922,2023-05-01,1682399906,"[general > others, national defence > nsf, key...","[0.8520231845131985, 0.04746251036905978, 0.01..."
3,2023-04-25 03:30:00,K-pop girl group TWICE is coming to Singapore,Find out when TWICE will be taking over the Si...,https://www.facebook.com/345185573000_61504389...,sg.news.yahoo.com,867,2023-05-01,1682393400,"[general > others, national defence > nsf, key...","[0.40799102347488986, 0.13312941070346068, 0.0..."
4,2023-04-27 07:00:09,K-pop girl group Twice to hold Singapore conce...,A musical group so good they named them Twice.,https://www.facebook.com/39533052294_101602403...,tnp.straitstimes.com,443,2023-05-01,1682578809,"[general > others, national defence > nsf, key...","[0.6474872364872272, 0.10861081829774259, 0.06..."
...,...,...,...,...,...,...,...,...,...,...
87632,2023-09-25 12:24:46,We'd love to know what you think!,We want to hear what you think. Help us do a s...,https://www.facebook.com/592308557475467_72021...,,47,2023-09-30,1695644686,"[key events > ndp, national defence > others, ...","[0.5150852272210386, 0.09587506619243362, 0.09..."
87633,2023-09-24 00:00:04,Pro-EU supporters march for Britain to rejoin ...,Britain left the EU after voting in favour of ...,https://www.facebook.com/129011692114_62182373...,straitstimes.com,47,2023-09-30,1695513604,"[foreign affairs > others, russia ukraine war ...","[0.25393659916513595, 0.15339728818224457, 0.1..."
87634,2023-09-23 02:59:02,Ginger cat seeks treatment at a Malaysian hosp...,"An ginger cat, with an injured paw, caught Tik...",https://www.facebook.com/14440041382_707327821...,gutzy.asia,47,2023-09-30,1695437942,"[health > others, covid sg/mtf press con > oth...","[0.3276378400662976, 0.19564441447761732, 0.09..."
87635,2023-09-23 02:56:13,Is #AngePostecoglou still being underrated aft...,Is #AngePostecoglou still being underrated aft...,https://www.facebook.com/345185573000_70189982...,,47,2023-09-30,1695437773,"[covid sg/mtf press con > others, health > oth...","[0.631289355811437, 0.16777474070371048, 0.038..."


In [8]:
# Order the rows by headline, descending, and renumber the index from zero.
df_predicted = (
    df_predicted
    .sort_values(by=["headline"], ascending=False)
    .reset_index(drop=True)
)

In [9]:
# Write the labelled articles to CSV without the index column.
df_predicted.to_csv(
    "traction-analytics/may_june_jul_aug_sep_data_merged.csv",
    index=False,
)