In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

import os
os.chdir("..")
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
os.environ["PYTHONHASHSEED"] = "0"
import importlib
import collections

from pathlib import Path
from tqdm import tqdm

import numpy as np
import pandas as pd
pd.set_option("display.max_colwidth", 0)

from snorkel.labeling import LabelingFunction, PandasLFApplier, LFAnalysis, filter_unlabeled_dataframe
from snorkel.labeling.model import LabelModel
from snorkel.utils import probs_to_preds

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression

from src.data.utils import load_youtube_spam_dataset

In [2]:
SEED = 123
np.random.seed(SEED)

In [3]:
ABSTAIN = -1
HAM = 0
SPAM = 1

In [4]:
ROOT_PATH = Path("/home/s2210421/dataset/youtube_spam_cmt")
df_train, df_val, df_test = load_youtube_spam_dataset(ROOT_PATH / "data", True)
Y_test = df_test.label.values

len(df_train), len(df_val), len(df_test)

(1566, 195, 195)

In [5]:
df_train_spam = df_train[df_train.label == 1]
df_train_ham = df_train[df_train.label == 0]

len(df_train_spam), len(df_train_ham)

(805, 761)

In [6]:
VOCAB_DIR = ROOT_PATH / "vocab"
unigram_path = VOCAB_DIR / "unigrams.csv"

unigrams = collections.defaultdict(list)
for i, row in tqdm(df_train.iterrows()):
    words = list(set(row["text"].split()))
    for w in words:
        unigrams[w].append(row["label"])

unigrams_freq = {}
for w, l in unigrams.items():
    unigrams_freq[w] = sum(l) / len(l)

1566it [00:00, 13154.45it/s]


In [16]:
def keyword_lookup(x, keywords, label):
    if any(word in x.text.lower() for word in keywords):
        return label
    return ABSTAIN


def make_keyword_lf(keywords, label=SPAM):
    return LabelingFunction(
        name=f"keyword_{keywords[0]}",
        f=keyword_lookup,
        resources=dict(keywords=keywords, label=label))


def select_dev_example(df):
    while True:
        example = df.sample(n=1).iloc[0]
        # if len(lfs) != 0:
        #     has_label = False
        #     for lf in lfs:
        #         if lf(example) != ABSTAIN:
        #             has_label = True
        #             break
        #     if has_label:
        #         continue
        return example


def select_primitive(example, list_primitives, primitive_freq, primitive_t, use_weight_probs=False):
    words = example.text.lower().split()
    candidates, candidates_w = [], []
    for w in words:
        if w not in primitive_freq or (w, example.label) in list_primitives:
            continue

        if (example.label == 0 and primitive_freq[w] < 1 - primitive_t) or \
           (example.label == 1 and primitive_freq[w] > primitive_t):
            candidates.append(w)
            if example.label == 1:
                candidates_w.append(primitive_freq[w])
            else:
                candidates_w.append(1 - primitive_freq[w])
    if len(candidates) == 0:
        return None
    candidate_probs = np.exp(candidates_w) / np.sum(np.exp(candidates_w), axis=0)
    if use_weight_probs:
        primitive = np.random.choice(candidates, size=1, p=candidate_probs)[0]
    else:
        primitive = np.random.choice(candidates, size=1)[0]
    return primitive


def build_primitive_lf(df, list_primitives, primitive_freq, primitive_t, use_weight_probs=False):
    primitive = None
    while primitive is None:
        example   = select_dev_example(df)
        primitive = select_primitive(example, list_primitives, primitive_freq, primitive_t, use_weight_probs)
    return (primitive, example.label), make_keyword_lf(keywords=[primitive], label=example.label)

In [17]:
NUM_ITERS = 50
EVAL_EVERY = 5
use_weight_probs = False

df = None
segment_flag = False

primitive_t = 0.7
list_primitives = []
lfs = []
for iter in range(1, NUM_ITERS + 1):
    if iter == 1:
        for _ in range(3):
            if segment_flag:
                df = df_train_spam
            else:
                df = df_train_ham
            segment_flag = not segment_flag
            primitive, lf = build_primitive_lf(df, list_primitives, unigrams_freq, primitive_t, use_weight_probs)
            list_primitives.append(primitive)
            lfs.append(lf)
    else:
        if segment_flag:
            df = df_train_spam
        else:
            df = df_train_ham
        segment_flag = not segment_flag
        primitive, lf = build_primitive_lf(df, list_primitives, unigrams_freq, primitive_t, use_weight_probs)
        list_primitives.append(primitive)
        lfs.append(lf)
    applier = PandasLFApplier(lfs=lfs)
    L_train = applier.apply(df=df_train, progress_bar=False)
    L_test = applier.apply(df=df_test, progress_bar=False)

    if iter % EVAL_EVERY == 0:
        print(f"- Iter {iter}:")
        label_model = LabelModel(cardinality=2, verbose=False)
        label_model.fit(L_train=L_train, n_epochs=500, seed=SEED, progress_bar=False)
        label_model_acc = label_model.score(
            L=L_test, Y=df_test.label.values, tie_break_policy="random")["accuracy"]
        print(f"LM accuracy: {label_model_acc * 100:.1f}%")

        prob_labels = label_model.predict_proba(L=L_train)
        df_train_filtered, prob_labels_filtered = filter_unlabeled_dataframe(
            X=df_train, y=prob_labels, L=L_train)

        vectorizer = TfidfVectorizer()
        X_train = vectorizer.fit_transform(df_train_filtered.text.tolist())
        X_test = vectorizer.transform(df_test.text.tolist())
        
        pred_labels_filtered = probs_to_preds(probs=prob_labels_filtered)
        sklearn_model = LogisticRegression(C=1e3, solver="liblinear")
        sklearn_model.fit(X=X_train, y=pred_labels_filtered)
        print(f"LR accuracy: {sklearn_model.score(X=X_test, y=Y_test) * 100:.1f}%")

- Iter 5:
LM accuracy: 53.3%
LR accuracy: 43.6%
- Iter 10:
LM accuracy: 54.4%
LR accuracy: 44.6%
- Iter 15:
LM accuracy: 54.9%
LR accuracy: 46.2%
- Iter 20:
LM accuracy: 55.4%
LR accuracy: 48.2%
- Iter 25:
LM accuracy: 53.3%
LR accuracy: 48.7%
- Iter 30:
LM accuracy: 52.3%
LR accuracy: 45.1%
- Iter 35:
LM accuracy: 55.9%
LR accuracy: 49.2%
- Iter 40:
LM accuracy: 55.4%
LR accuracy: 49.2%
- Iter 45:
LM accuracy: 53.3%
LR accuracy: 48.7%
- Iter 50:
LM accuracy: 54.9%
LR accuracy: 51.8%


In [None]:
LFAnalysis(L=L_train, lfs=lfs).lf_summary()