In [1]:
import os
import json
import random
from datetime import datetime as dt

import fasttext
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm.auto import tqdm
import matplotlib.pyplot as plt

from sklearn.utils import resample
from sklearn.metrics import accuracy_score
from sklearn.model_selection import ParameterGrid, StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report

import cleanlab
from cleanlab.pruning import get_noise_indices
from cleanlab.models.fasttext import FastTextClassifier, data_loader

In [2]:
RND_SEED = 2021
np.random.seed(RND_SEED)
random.seed(RND_SEED)

In [3]:
df = pd.read_csv("severity_tags.csv")
df.drop(columns=['Unnamed: 0'], inplace=True)
df["excerpt"] = df["excerpt"].str.replace("\s+", " ", regex=True)
df["tag_value"] = df["tag_value"].str.replace(" ", "_").str.lower()
df["tag_value"].unique()

array(['no_problem/minor_problem', 'of_concern', 'critical'], dtype=object)

In [4]:
df["severity_level"] = df["tag_value"].map({'no_problem/minor_problem':0, 'of_concern':1, 'critical':2})

In [5]:
df["severity_level"].value_counts()

2    27129
1    10915
0     3008
Name: severity_level, dtype: int64

In [6]:
# Separate majority and minority classes
df_0 = df[df["severity_level"] == 0]
df_1 = df[df["severity_level"] == 1]
df_2 = df[df["severity_level"] == 2]

df_0_upsampled = resample(
    df_0,
    replace=True,  # sample without replacement
    n_samples=len(df_2),  # to match minority class
)
df_1_upsampled = resample(
    df_1,
    replace=True,  # sample without replacement
    n_samples=len(df_2),  # to match minority class
)
df = pd.concat([df_0_upsampled, df_1_upsampled, df_2])

In [7]:
x_train, x_test, y_train, y_test = train_test_split(
    df["excerpt"].values,
    df["severity_level"].values,
    stratify=df["severity_level"].values,
    test_size=0.1)
#
x_train, x_val, y_train, y_val = train_test_split(x_train,
                                                  y_train,
                                                  stratify=y_train,
                                                  test_size=0.1)

In [8]:
def write_data(X, y, path):
    with open(path, "w") as f:
        for xi, yi in zip(X, y):
            f.write(f"__label__{yi} {xi}\n")


##
def ft_pred(model, X):
    preds = model.predict(X)
    return [int(p[0][9:]) for p in preds[0]]


##
def ft_pred_proba(model, X):
    preds = model.predict(X)
    return [p[0] for p in preds[1]]


##
def ft_pred_proba_all(model, X):
    num_classes = 3
    _, preds_proba = model.predict(X, k=num_classes)
    return np.array(preds_proba)


##
def train_and_pred_proba(x_train_k, x_test_k, y_train_k, kfold):
    path_train = f"ft_data_severity/train_{kfold}_fold.txt"
    write_data(x_train_k, y_train_k, path_train)
    model = fasttext.train_supervised(input=path_train)
    preds_y_test_k = ft_pred_proba_all(model, x_test_k)

    return preds_y_test_k

In [9]:
# path_train = "ft_data_severity/train.txt"
# write_data(x_train, y_train, path_train)
# model = fasttext.train_supervised(input=path_train)

In [10]:
N_FOLDS = 2
n = len(x_train)
m = len(set(y_train))
psx = np.zeros((n, m))
skf = StratifiedKFold(n_splits=N_FOLDS)
for k, (train_index, test_index) in enumerate(skf.split(x_train, y_train), start=1):
    x_train_k, x_test_k = x_train[train_index], x_train[test_index]
    y_train_k, y_test_k = y_train[train_index], y_train[test_index]
    preds_y_test_k = train_and_pred_proba(x_train_k.tolist(), x_test_k.tolist(), y_train_k, k)
    psx[test_index] = preds_y_test_k

In [11]:
ordered_label_errors = get_noise_indices(
    s=y_train,
    psx=psx,
    sorted_index_method='both', # Orders label errors
 )

In [12]:
y_train[ordered_label_errors[:10]], psx[ordered_label_errors[:10]]

(array([2, 2, 2, 1, 2, 2, 2, 2, 2, 2]),
 array([[1.00000179e+00, 1.67876624e-05, 1.14268223e-05],
        [9.99991775e-01, 2.41959588e-05, 1.41088567e-05],
        [9.99956846e-01, 5.65370246e-05, 1.66348109e-05],
        [9.99971986e-01, 4.78059956e-05, 1.02377226e-05],
        [9.99933958e-01, 8.42809823e-05, 1.17571562e-05],
        [9.99928832e-01, 7.88064353e-05, 2.23837305e-05],
        [9.99944568e-01, 4.59538860e-05, 3.93841910e-05],
        [9.99894857e-01, 9.89133114e-05, 3.62666760e-05],
        [9.99836624e-01, 1.79731534e-04, 1.36497429e-05],
        [9.99815762e-01, 2.04237847e-04, 1.00572533e-05]]))

In [None]:
x_train[ordered_label_errors[:10]]