In [1]:
import streamlit as st
import pandas as pd
import time
from sentence_transformers import SentenceTransformer
from setfit import SetFitModel
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
import datetime
import os
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
df = pd.read_csv("traction-analytics/data/combined_data_may_june.csv")
df = df.drop_duplicates(subset="Link", keep="first")
df.to_csv("traction-analytics/data/combined_data_may_june_unique.csv", index=False)

In [3]:
def preprocess_daily_scan(file, source: str = "") -> pd.DataFrame:
    df = (
        pd.read_csv(
            file,
            usecols=[
                "Published",
                "Headline",
                "Summary",
                "Link",
                "Domain",
                "Facebook Interactions",
                "date_extracted"
            ],
            dtype={
                "Headline": "string",
                "Summary": "string",
                "Link": "string",
                "Domain": "string",
                "Facebook Interactions": "int",
            },
            parse_dates=["Published","date_extracted"],
        )
        .assign(
            timestamp=lambda df: df["Published"].astype("int64") // 10**9,
            source=source,
        )
        .rename(lambda col_name: col_name.lower().replace(" ", "_"), axis="columns")
    )
    return df


def load_classification_model(model_path=None) -> SetFitModel:
    data_folder = Path("trained_models")
    data_folder_date_sorted = sorted(
        data_folder.iterdir(), key=os.path.getmtime)
    latest_model_path = str(data_folder_date_sorted[-1])

    if model_path is None:
        model_path = latest_model_path

    model = SetFitModel.from_pretrained(model_path)
    return model

def label_df(df: pd.DataFrame, model: SetFitModel, column: str) -> pd.DataFrame:
    y_score = model.predict_proba(df[column])

    label_order = np.argsort(
        y_score, axis=1, kind="stable").numpy()[:, ::-1]
    label_scores_df = pd.DataFrame(
        y_score, columns=model.model_head.classes_)

    sorted_label_list = []
    sorted_scores_list = []
    for (idx, row) in label_scores_df.iterrows():
        sorted_label = row.iloc[label_order[idx]]
        sorted_label_list.append(sorted_label.index.to_list())
        sorted_scores_list.append(sorted_label.to_list())

    labelled_df = df.assign(
        predicted_indexes=sorted_label_list, prediction_prob=sorted_scores_list
    )

    labelled_df = df.assign(
        suggested_labels=sorted_label_list, suggested_labels_score=sorted_scores_list)
    return labelled_df

In [105]:
model = load_classification_model()

https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


In [4]:
# load data

# df = pd.read_csv("traction-analytics/data/combined_data_may_june.csv")
df_processed = preprocess_daily_scan("traction-analytics/data/combined_data_may_june.csv")
df_processed = df_processed.drop(columns=['source'])
df_processed

  timestamp=lambda df: df["Published"].astype("int64") // 10**9,


Unnamed: 0,published,headline,summary,link,domain,facebook_interactions,date_extracted,timestamp
0,2023-04-25 10:03:11,Zoo otter picks up visitor's dropped iPhone & ...,Otterocious behaviour.,https://www.facebook.com/592308557475467_65627...,mothership.sg,14739,2023-05-01,1682416991
1,2023-04-25 09:20:00,K-pop girl group Twice to hold Singapore conce...,Twice will be returning to Singapore after 4 y...,https://www.facebook.com/129011692114_10159170...,straitstimes.com,10030,2023-05-01,1682414400
2,2023-04-25 05:18:26,Hong Kong superstar Jacky Cheung to hold block...,#JackyCheung is taking steps to ensure EVERYON...,https://www.facebook.com/345185573000_61517080...,sg.style.yahoo.com,922,2023-05-01,1682399906
3,2023-04-25 03:30:00,K-pop girl group TWICE is coming to Singapore,Find out when TWICE will be taking over the Si...,https://www.facebook.com/345185573000_61504389...,sg.news.yahoo.com,867,2023-05-01,1682393400
4,2023-04-27 07:00:09,K-pop girl group Twice to hold Singapore conce...,A musical group so good they named them Twice.,https://www.facebook.com/39533052294_101602403...,tnp.straitstimes.com,443,2023-05-01,1682578809
...,...,...,...,...,...,...,...,...
15285,2023-06-20 05:21:34,Housefly Spotted In Luckin Coffee Tampines 1 G...,The wrong kind of buzz.,https://www.facebook.com/292279440975148_58293...,mustsharenews.com,225,2023-06-22,1687238494
15286,2023-06-16 04:40:48,S'pore property agent's flyer made to look lik...,The vehicle owner said the flyer will stick to...,https://www.facebook.com/592308557475467_65442...,mothership.sg,220,2023-06-22,1686890448
15287,2023-06-15 02:57:57,"South Korean 'Snowdrop' actress Park Soo Ryun,...",She was sent to hospital for emergency treatme...,https://www.facebook.com/592308557475467_65371...,mothership.sg,220,2023-06-22,1686797877
15288,2023-06-19 11:05:03,Fish supplier boss who gave employee green lig...,The NTUC FairPrice team leader started buying ...,https://www.facebook.com/93889432933_642642154...,channelnewsasia.com,218,2023-06-22,1687172703


In [107]:
df_processed.columns

Index(['published', 'headline', 'summary', 'link', 'domain',
       'facebook_interactions', 'timestamp'],
      dtype='object')

In [116]:
# predict using model

df_predicted = label_df(df_processed, model, "headline")
df_predicted['published'] = pd.to_datetime(df_predicted['published'])
df_predicted

Unnamed: 0,published,headline,summary,link,domain,facebook_interactions,date_extracted,timestamp,suggested_labels,suggested_labels_score
0,2023-04-25 10:03:11,Zoo otter picks up visitor's dropped iPhone & ...,Otterocious behaviour.,https://www.facebook.com/592308557475467_65627...,mothership.sg,14739,2023-05-01,1682416991,"[general > others, national defence > nsf, gen...","[tensor(0.7606, dtype=torch.float64), tensor(0..."
1,2023-04-25 09:20:00,K-pop girl group Twice to hold Singapore conce...,Twice will be returning to Singapore after 4 y...,https://www.facebook.com/129011692114_10159170...,straitstimes.com,10030,2023-05-01,1682414400,"[general > ndp, general > others, national def...","[tensor(0.3017, dtype=torch.float64), tensor(0..."
2,2023-04-25 05:18:26,Hong Kong superstar Jacky Cheung to hold block...,#JackyCheung is taking steps to ensure EVERYON...,https://www.facebook.com/345185573000_61517080...,sg.style.yahoo.com,922,2023-05-01,1682399906,"[general > ndp, general > others, national def...","[tensor(0.2560, dtype=torch.float64), tensor(0..."
3,2023-04-25 03:30:00,K-pop girl group TWICE is coming to Singapore,Find out when TWICE will be taking over the Si...,https://www.facebook.com/345185573000_61504389...,sg.news.yahoo.com,867,2023-05-01,1682393400,"[general > others, national defence > nsf, gen...","[tensor(0.2009, dtype=torch.float64), tensor(0..."
4,2023-04-27 07:00:09,K-pop girl group Twice to hold Singapore conce...,A musical group so good they named them Twice.,https://www.facebook.com/39533052294_101602403...,tnp.straitstimes.com,443,2023-05-01,1682578809,"[general > ndp, general > others, national def...","[tensor(0.3017, dtype=torch.float64), tensor(0..."
...,...,...,...,...,...,...,...,...,...,...
15285,2023-06-20 05:21:34,Housefly Spotted In Luckin Coffee Tampines 1 G...,The wrong kind of buzz.,https://www.facebook.com/292279440975148_58293...,mustsharenews.com,225,2023-06-22,1687238494,"[general > others, national defence > nsf, hea...","[tensor(0.8466, dtype=torch.float64), tensor(0..."
15286,2023-06-16 04:40:48,S'pore property agent's flyer made to look lik...,The vehicle owner said the flyer will stick to...,https://www.facebook.com/592308557475467_65442...,mothership.sg,220,2023-06-22,1686890448,"[housing > others, housing > bto launches, sus...","[tensor(0.3413, dtype=torch.float64), tensor(0..."
15287,2023-06-15 02:57:57,"South Korean 'Snowdrop' actress Park Soo Ryun,...",She was sent to hospital for emergency treatme...,https://www.facebook.com/592308557475467_65371...,mothership.sg,220,2023-06-22,1686797877,"[death penalty/death row > others, manpower > ...","[tensor(0.2900, dtype=torch.float64), tensor(0..."
15288,2023-06-19 11:05:03,Fish supplier boss who gave employee green lig...,The NTUC FairPrice team leader started buying ...,https://www.facebook.com/93889432933_642642154...,channelnewsasia.com,218,2023-06-22,1687172703,"[general > others, health > healthcare, jobs a...","[tensor(0.5853, dtype=torch.float64), tensor(0..."


In [118]:
df_predicted = df_predicted.sort_values(by=['headline'], ascending=False).reset_index(drop=True)

In [119]:
df_predicted.to_csv("traction-analytics/data/may_june_data_merged.csv", index=False)

In [104]:
may_june = pd.read_excel('traction-analytics/data/combined_data_may_june.xlsx')
may_june.to_csv('traction-analytics/data/combined_data_may_june.csv', index=False)

In [90]:
may_june = pd.read_excel('traction-analytics/data/combined_data_may_june.xlsx')
may_june['Published'] = pd.to_datetime(may_june['Published'])
may_june.rename(columns={'Published': 'published', 'Link':'link', 'Facebook Interactions':"facebook_interactions"}, inplace=True)
may_june = may_june[['published', 'link', 'facebook_interactions', 'date_extracted']]
may_june.sort_values(by=['published'], inplace=True)
may_june


Unnamed: 0,published,link,facebook_interactions,date_extracted
219,2023-04-24 00:00:03,https://www.facebook.com/93889432933_101595887...,1170,2023-05-01
293,2023-04-24 00:00:06,https://www.facebook.com/129011692114_10159167...,769,2023-05-01
419,2023-04-24 00:30:01,https://www.facebook.com/129011692114_10159167...,375,2023-05-01
359,2023-04-24 00:40:01,https://www.facebook.com/93889432933_101595888...,516,2023-05-01
186,2023-04-24 00:48:05,https://www.facebook.com/121790674546188_63473...,342,2023-05-01
...,...,...,...,...
15077,2023-06-21 14:00:41,https://www.facebook.com/129011692114_57390329...,792,2023-06-22
15236,2023-06-21 15:15:09,https://www.facebook.com/129011692114_57393301...,284,2023-06-22
15237,2023-06-21 17:10:01,https://www.facebook.com/592308557475467_65783...,233,2023-06-22
15215,2023-06-21 23:30:03,https://www.facebook.com/129011692114_57408861...,353,2023-06-22


In [103]:
# merge may_june and predicted on published, link and facebook interactions

may_june.merge(df_predicted, on=['published', 'link', 'facebook_interactions'], how='left')


Unnamed: 0,published,link,facebook_interactions,date_extracted,headline,summary,domain,timestamp,suggested_labels,suggested_labels_score
0,2023-04-24 00:00:03,https://www.facebook.com/93889432933_101595887...,1170,2023-05-01,Efficiency of Causeway checkpoints at Johor Ba...,â€œWe're only working at 80% at the moment. So...,channelnewsasia.com,1.682294e+09,"[covid sg/mtf press con > border reopening, co...","[tensor(0.5301, dtype=torch.float64), tensor(0..."
1,2023-04-24 00:00:06,https://www.facebook.com/129011692114_10159167...,769,2023-05-01,Man Utd beat Brighton on penalties to set up F...,Victor Lindelof scored the decisive spot-kick ...,straitstimes.com,1.682294e+09,"[sports > football, sports > world cup, sports...","[tensor(0.6309, dtype=torch.float64), tensor(0..."
2,2023-04-24 00:30:01,https://www.facebook.com/129011692114_10159167...,375,2023-05-01,Wanted: Medical escorts to help the elderly ge...,Medical escorts pick up the elderly at their d...,straitstimes.com,1.682296e+09,"[covid sg/mtf press con > others, national def...","[tensor(0.1677, dtype=torch.float64), tensor(0..."
3,2023-04-24 00:40:01,https://www.facebook.com/93889432933_101595888...,516,2023-05-01,'I could be the last person to speak with her'...,"â€œWhen you see such raw emotions, then you re...",channelnewsasia.com,1.682297e+09,"[death penalty/death row > others, general > o...","[tensor(0.3943, dtype=torch.float64), tensor(0..."
4,2023-04-24 00:48:05,https://www.facebook.com/121790674546188_63473...,342,2023-05-01,Man killed in traffic accident along Dunearn R...,A driver who drove past the site of the accide...,asiaone.com,1.682297e+09,"[death penalty/death row > others, manpower > ...","[tensor(0.2485, dtype=torch.float64), tensor(0..."
...,...,...,...,...,...,...,...,...,...,...
15285,2023-06-21 14:00:41,https://www.facebook.com/129011692114_57390329...,792,2023-06-22,Taylor Swift fans rush to get UOB cards as pre...,"Cardholders in Singapore, Malaysia, Thailand, ...",straitstimes.com,1.687356e+09,"[covid sg/mtf press con > others, technology >...","[tensor(0.3219, dtype=torch.float64), tensor(0..."
15286,2023-06-21 15:15:09,https://www.facebook.com/129011692114_57393301...,284,2023-06-22,Pedestrian killed after accident with Traffic ...,"A photo circulating online showed a man, in wh...",straitstimes.com,1.687361e+09,"[death penalty/death row > others, general > o...","[tensor(0.3462, dtype=torch.float64), tensor(0..."
15287,2023-06-21 17:10:01,https://www.facebook.com/592308557475467_65783...,233,2023-06-22,"Pedestrian, 58, dies in accident with Traffic ...",A 31-year-old male police officer was consciou...,mothership.sg,1.687367e+09,"[death penalty/death row > others, national de...","[tensor(0.3297, dtype=torch.float64), tensor(0..."
15288,2023-06-21 23:30:03,https://www.facebook.com/129011692114_57408861...,353,2023-06-22,Singaporean crew member reported missing in US...,Mr Muhammad Furqan Mohamed Rashid was attendin...,straitstimes.com,1.687390e+09,"[sustainability > others, general > others, tr...","[tensor(0.2057, dtype=torch.float64), tensor(0..."
