In [70]:
import os
import pickle
import string
from collections import Counter
from sys import path

import numpy as np
import pandas as pd
from joblib import load, dump
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

path.append("../utils/")

from utils import *

In [2]:
# Dataset identifiers; each one is used to build file paths and dict keys below.
DATASETS = [
    "20ng",
    "acm",
    "webkb",
    "reut",
]

# NOTE(review): not referenced anywhere else in the visible notebook.
DTESTS = [
    "webkb",
]

# Classifier names; their order defines the ids in CLF_ID.
ALGORITHMS = [
    "centroide",
    "gbm",
    "knn",
    "lr",
    "rf",
    "svm",
    "bert",
]

# NOTE(review): not referenced anywhere else in the visible notebook.
BERT_ALGS = [
    "gbm",
    "knn",
    "lr",
    "rf",
    "svm",
]

In [3]:
# Map every classifier name to its position in ALGORITHMS.
CLF_ID = {name: position for position, name in enumerate(ALGORITHMS)}
CLF_ID

{'centroide': 0, 'gbm': 1, 'knn': 2, 'lr': 3, 'rf': 4, 'svm': 5, 'bert': 6}

In [9]:
# Read one CSV per dataset into a dict keyed by dataset name.
pd_datasets = {}
for dset in DATASETS:
    csv_path = f"../../../stacking/output/datasets/{dset}.csv"
    pd_datasets[dset] = pd.read_csv(csv_path)

In [19]:
#hits_count = np.sum(df.values == pd_datasets["webkb"].classes.values[:, None], axis=1)

In [82]:
# NOTE(review): absolute, machine-specific path; the notebook will not run
# elsewhere unless this file exists at the same location.
with open("/home/welton/data/stop_words_english.txt") as fd:
    # One entry per line of the file, stored as dict keys (value True) so
    # clean_text can test membership with `in`.
    stop_words = dict.fromkeys(fd.read().split('\n'), True)

In [80]:
def is_a_number(text):
    """Return True when ``float(text)`` succeeds, False otherwise.

    Anything ``float`` accepts counts as a number, so strings such as
    "1e5", "inf" and "nan" also return True.

    Only the failures ``float`` itself signals are treated as "not a
    number": ValueError (unparsable string), TypeError (unsupported type,
    e.g. None) and OverflowError (integer too large to convert). Any other
    exception, such as KeyboardInterrupt, propagates instead of being
    silently swallowed.
    """
    try:
        float(text)
    except (TypeError, ValueError, OverflowError):
        return False
    return True

def clean_text(text, stop_words):
    """Lower-case ``text``, strip punctuation, and drop stop words and numbers.

    Parameters
    ----------
    text : str
        Raw document text.
    stop_words : container of str
        Anything supporting ``in`` (the notebook passes a dict keyed by word).

    Returns
    -------
    str
        The remaining tokens joined by single spaces.

    Notes
    -----
    Punctuation is removed *before* the stop-word lookup, so a stop-word
    entry containing punctuation (e.g. "don't") will not match its stripped
    token ("dont").
    """
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    # split() with no argument splits on any whitespace run, so tokens next
    # to newlines/tabs are filtered too and repeated spaces do not produce
    # empty tokens (split(' ') only handled single spaces).
    kept = [
        word
        for word in text.split()
        if word not in stop_words and not is_a_number(word)
    ]
    return ' '.join(kept)

In [25]:
# Quick check of clean_text on a sentence containing punctuation, a stop word
# candidate ("the") and a purely numeric token ("000").
clean_text("Don't hate the player, hate the game. 000. 000f", stop_words)

'dont hate player hate game 000f'

In [86]:
def get_feature_importance(dset, df, X, fold):
    """Return the random forest for ``dset``/``fold``, training it on first use.

    Despite the name, the fitted ``RandomForestClassifier`` itself is
    returned; callers read ``feature_importances_`` from it.

    Parameters
    ----------
    dset : str
        Dataset name, used only to build the cache directory.
    df : pandas.DataFrame
        Must provide ``folds_id`` and ``classes`` columns, row-aligned with ``X``.
    X : array-like
        Feature matrix with one row per row of ``df``.
    fold : int
        Fold held out; the forest is trained on every other fold.

    Notes
    -----
    The cache key is only ``(dset, fold)``: if a model file already exists
    it is loaded and returned without looking at ``df`` or ``X``.
    """
    models_dir = f"data/feature_importance/rf_models/{dset}/"
    os.makedirs(models_dir, exist_ok=True)
    model_path = f"{models_dir}/{fold}.joblib"
    if os.path.exists(model_path):
        return load(model_path)

    # Select training rows by position with a boolean mask. Indexing X with
    # df's index *labels* is only correct when df has a default RangeIndex.
    train_mask = (df.folds_id != fold).values
    X_train = X[train_mask]
    y_train = df.classes.values[train_mask]

    rf = RandomForestClassifier(random_state=42, n_jobs=4)
    rf.fit(X_train, y_train)
    dump(rf, model_path)
    return rf

def word_importance(doc, tf, rf):
    """Return the feature importance of every token of ``doc``, in order.

    Parameters
    ----------
    doc : str
        Text whose tokens are separated by single spaces.
    tf : object with a ``vocabulary_`` mapping (token -> feature index).
    rf : object with a ``feature_importances_`` sequence indexed by feature.

    Returns
    -------
    list
        One entry per token: its importance, or 0 when the token is not in
        the vocabulary.
    """
    # Read both attributes once. scikit-learn forests compute
    # feature_importances_ on every access, so looking it up per token
    # repeats that work for each word of the document.
    importances = rf.feature_importances_
    vocabulary = tf.vocabulary_
    return [
        importances[vocabulary[word]] if word in vocabulary else 0
        for word in doc.split(' ')
    ]

def fast_word_importance(doc, tf, feats_name, rf, top_k=15):
    """Return the most important vocabulary words that occur in ``doc``.

    Parameters
    ----------
    doc : str
        Document text, vectorised with ``tf.transform``.
    tf : fitted vectoriser exposing ``transform``.
    feats_name : sequence of str
        Feature names, indexed by feature position.
    rf : object with a ``feature_importances_`` array indexed by feature.
    top_k : int, default 15
        Maximum number of words to return.

    Returns
    -------
    list of str
        At most ``top_k`` feature names, ordered by decreasing importance.
        Only words actually present in ``doc`` are considered, so the list
        is shorter than ``top_k`` (possibly empty) for short documents.
        Ranking over the whole vocabulary would pad such lists with words
        the document does not contain.
    """
    row = tf.transform([doc])
    # Column indices of the features with a non-zero weight in this document;
    # works on the sparse result directly, without densifying it.
    present = np.sort(row.nonzero()[1])
    if present.size == 0:
        return []
    scores = np.asarray(rf.feature_importances_)[present]
    # Stable sort keeps a deterministic (feature-index) order among ties.
    ranking = np.argsort(-scores, kind="stable")[:top_k]
    return [feats_name[idx] for idx in present[ranking]]


def docs_word_importance(df, tf, rf):
    """Compute the top-word list of every document in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a ``clean_docs`` column.
    tf : fitted vectoriser exposing ``get_feature_names_out`` and ``transform``.
    rf : object with a ``feature_importances_`` attribute.

    Returns
    -------
    dict
        Maps each row's index label to the list returned by
        ``fast_word_importance`` for that row's ``clean_docs``.
    """
    from types import SimpleNamespace

    feats_name = tf.get_feature_names_out()
    # fast_word_importance only reads rf.feature_importances_. scikit-learn
    # forests recompute that attribute on every access, so read it once here
    # and pass a lightweight stand-in that holds the precomputed array.
    importances_only = SimpleNamespace(feature_importances_=rf.feature_importances_)

    docs_wi = {}
    # total= lets tqdm show a real progress bar (itertuples has no length).
    for doc in tqdm(df.itertuples(), total=len(df)):
        docs_wi[doc.Index] = fast_word_importance(
            doc.clean_docs, tf, feats_name, importances_only
        )
    return docs_wi

In [87]:
def get_docs_word_importance(pd_datasets, DATASETS, stop_words):
    """Compute top-word lists for the ``hits_count == 0`` rows of each dataset.

    For every dataset the documents are cleaned and TF-IDF vectorised; for
    every fold a random forest is obtained via ``get_feature_importance``
    and the top words of the fold's rows with ``hits_count == 0`` are
    extracted with ``docs_word_importance``.

    Parameters
    ----------
    pd_datasets : dict of str -> pandas.DataFrame
        Each frame must provide ``docs``, ``folds_id``, ``classes`` and
        ``hits_count`` columns. The frames are not modified.
    DATASETS : iterable of str
        Keys of ``pd_datasets`` to process.
    stop_words : container of str
        Passed through to ``clean_text``.

    Returns
    -------
    dict
        ``result[dset][fold][row index label] -> list of words``.
    """
    docs_wi = {}
    for dset in DATASETS:

        print(f"{dset.upper()}")
        source = pd_datasets[dset]
        # assign() returns a new frame, so the caller's DataFrame does not
        # silently gain a clean_docs column.
        df = source.assign(
            clean_docs=source.docs.apply(lambda t: clean_text(t, stop_words))
        )

        tf = TfidfVectorizer(min_df=2)
        # NOTE(review): toarray() densifies the whole TF-IDF matrix, which is
        # memory-hungry on large corpora; kept so the forests see the same
        # input as before.
        X = tf.fit_transform(df.clean_docs).toarray()

        docs_wi[dset] = {}
        # Take the fold ids from the data instead of assuming exactly 0..9.
        for fold in np.sort(df.folds_id.unique()):
            print(f"FOLD: {fold}", end="\r")
            rf = get_feature_importance(dset, df, X, fold)
            missed_docs = df[(df.folds_id == fold) & (df.hits_count == 0)].copy(deep=True)
            docs_wi[dset][fold] = docs_word_importance(missed_docs, tf, rf)
    return docs_wi

In [88]:
# Build docs_wi[dataset][fold][row index] -> top-word list for the rows whose
# hits_count is 0 (slow: see the per-fold progress output below).
docs_wi = get_docs_word_importance(pd_datasets, DATASETS, stop_words)

20NG
FOLD: 0

32it [00:07,  4.25it/s]


FOLD: 1

46it [00:10,  4.34it/s]


FOLD: 2

35it [00:08,  4.04it/s]


FOLD: 3

40it [00:08,  4.46it/s]


FOLD: 4

34it [00:07,  4.52it/s]


FOLD: 5

44it [00:10,  4.31it/s]


FOLD: 6

39it [00:08,  4.36it/s]


FOLD: 7

37it [00:08,  4.24it/s]


FOLD: 8

36it [00:08,  4.36it/s]


FOLD: 9

41it [00:09,  4.51it/s]


ACM
FOLD: 0

144it [00:24,  5.89it/s]


FOLD: 1

142it [00:26,  5.29it/s]


FOLD: 2

154it [00:28,  5.48it/s]


FOLD: 3

156it [00:26,  5.95it/s]


FOLD: 4

128it [00:22,  5.62it/s]


FOLD: 5

148it [00:26,  5.52it/s]


FOLD: 6

156it [00:27,  5.62it/s]


FOLD: 7

147it [00:27,  5.43it/s]


FOLD: 8

136it [00:24,  5.54it/s]


FOLD: 9

130it [00:25,  5.19it/s]


WEBKB
FOLD: 0

17it [00:02,  6.05it/s]


FOLD: 1

23it [00:03,  6.21it/s]


FOLD: 2

25it [00:03,  6.65it/s]


FOLD: 3

19it [00:02,  6.46it/s]


FOLD: 4

36it [00:06,  5.99it/s]


FOLD: 5

13it [00:02,  6.27it/s]


FOLD: 6

14it [00:02,  6.95it/s]


FOLD: 7

16it [00:02,  6.77it/s]


FOLD: 8

17it [00:02,  6.51it/s]


FOLD: 9

16it [00:02,  6.33it/s]


REUT
FOLD: 0

109it [00:17,  6.22it/s]


FOLD: 1

118it [00:18,  6.38it/s]


FOLD: 2

104it [00:16,  6.19it/s]


FOLD: 3

98it [00:16,  6.11it/s]


FOLD: 4

93it [00:15,  6.17it/s]


FOLD: 5

90it [00:14,  6.21it/s]


FOLD: 6

70it [00:11,  6.25it/s]


FOLD: 7

75it [00:12,  6.17it/s]


FOLD: 8

89it [00:14,  6.05it/s]


FOLD: 9

79it [00:12,  6.39it/s]


In [108]:
# Persist the computed word-importance lists so they can be reloaded without
# repeating the slow computation. `pickle` is imported with the other imports
# in the first cell rather than in the middle of the notebook.
with open("data/words_importance.pickle", "wb") as fd:
    pickle.dump(docs_wi, fd)

In [4]:
# Reload the word-importance lists from data/words_importance.pickle,
# replacing any docs_wi already in memory.
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# were produced by this notebook.
with open("data/words_importance.pickle", "rb") as fd:
    docs_wi = pickle.load(fd)

In [26]:
# labels_desc[dataset][i] is line i (0-based) of data/class_desc/<dataset>.txt.
labels_desc = {}
for dset in DATASETS:
    with open(f"data/class_desc/{dset}.txt", 'r') as fd:
        labels_desc[dset] = dict(enumerate(fd.read().split('\n')))


In [106]:
def topword_class(pd_datasets, dset, docs_wi, prefix="topword_class", labels=None):
    """Write ``data/{prefix}/{dset}.xlsx`` with one row per analysed document.

    Each row holds the fold, the document id, its comma-joined top words,
    and the true and ``conc_pred`` classes, both rendered as
    ``"<id> -- <description>"``.

    Parameters
    ----------
    pd_datasets : dict of str -> pandas.DataFrame
        Frames providing ``classes`` and ``conc_pred`` columns.
    dset : str
        Dataset to export.
    docs_wi : dict
        ``docs_wi[dset][fold][doc id] -> list of words``.
    prefix : str, default "topword_class"
        Sub-directory of ``data/`` that receives the spreadsheet.
    labels : dict, optional
        ``labels[dset][class id] -> description``. Defaults to the
        notebook-level ``labels_desc``.
    """
    if labels is None:
        labels = labels_desc
    class_names = labels[dset]

    df = pd_datasets[dset]
    doc_word_list = []
    for fold, docs in docs_wi[dset].items():
        for doc, top_words in docs.items():
            # Document ids are index *labels* (they come from itertuples'
            # Index), so look them up with .loc rather than by position.
            doc_info = df.loc[doc]
            y, so_pred = doc_info["classes"], doc_info["conc_pred"]
            doc_word_list.append([
                fold,
                doc,
                ', '.join(top_words),
                f"{y} -- {class_names[y]}",
                f"{so_pred} -- {class_names[so_pred]}"
            ])

    # to_excel does not create missing directories.
    os.makedirs(f"data/{prefix}", exist_ok=True)
    pd.DataFrame(doc_word_list, columns=["Fold", "DocID", "TOP15", "Label", "Stacking"]).to_excel(f"data/{prefix}/{dset}.xlsx")

In [103]:
# Export one spreadsheet per dataset using the default prefix ("topword_class").
for dset in DATASETS:
    topword_class(pd_datasets, dset, docs_wi)

In [34]:
# NOTE(review): load_preds is not defined in this notebook; presumably it is
# provided by `from utils import *` -- verify. It is called with every entry
# of ALGORITHMS, not only "bert".
bert_preds = load_preds(DATASETS, ALGORITHMS)

LOADING PREDS FROM SCRATCH


280it [00:00, 382.88it/s]


In [110]:
def get_bert_datasets(DATASETS, bert_preds, pd_datasets):
    """Build, per dataset, a frame of vote results from per-classifier predictions.

    Parameters
    ----------
    DATASETS : iterable of str
        Dataset names to process.
    bert_preds : dict
        ``bert_preds[dset]`` must be convertible with ``pd.DataFrame`` into
        one column per classifier and one row per document.
    pd_datasets : dict of str -> pandas.DataFrame
        Frames providing ``classes``, ``folds_id`` and ``docs`` columns,
        row-aligned with the predictions.

    Returns
    -------
    dict of str -> pandas.DataFrame
        Frames with columns ``folds_id``, ``conc_pred`` (most frequent
        prediction in the row; ties go to the first one encountered),
        ``hits_count`` (number of predictions equal to the true class),
        ``docs`` and ``classes``.
    """
    bert_datasets = {}
    for dset in DATASETS:
        base = pd_datasets[dset]
        preds = pd.DataFrame(bert_preds[dset]).values
        truth = base.classes.values

        hits_count = np.sum(preds == truth[:, None], axis=1)
        conc_pred = np.array([Counter(row).most_common(1)[0][0] for row in preds])

        # Build the frame column by column so each column keeps its own
        # dtype. Stacking integer and string arrays into a single array
        # forces every column to `object`.
        bert_datasets[dset] = pd.DataFrame({
            "folds_id": base.folds_id.values,
            "conc_pred": conc_pred,
            "hits_count": hits_count,
            "docs": base.docs.values,
            "classes": truth,
        })
    return bert_datasets

In [111]:
# Recompute conc_pred / hits_count from bert_preds, reusing docs, classes and
# folds_id from pd_datasets.
bert_datasets = get_bert_datasets(DATASETS, bert_preds, pd_datasets)

In [104]:
# Same word-importance pipeline as above, run on bert_datasets.
# NOTE: get_feature_importance caches forests by dataset name and fold only,
# so model files saved by the earlier run are loaded again here.
bert_docs_wi = get_docs_word_importance(bert_datasets, DATASETS, stop_words)

20NG
FOLD: 0

170it [00:27,  6.13it/s]


FOLD: 1

219it [00:38,  5.76it/s]


FOLD: 2

177it [00:29,  6.02it/s]


FOLD: 3

184it [00:26,  6.91it/s]


FOLD: 4

199it [00:28,  7.09it/s]


FOLD: 5

172it [00:26,  6.41it/s]


FOLD: 6

205it [00:32,  6.33it/s]


FOLD: 7

182it [00:28,  6.36it/s]


FOLD: 8

199it [00:27,  7.28it/s]


FOLD: 9

207it [00:29,  7.00it/s]


ACM
FOLD: 0

436it [00:36, 12.00it/s]


FOLD: 1

439it [01:13,  5.95it/s]


FOLD: 2

439it [01:08,  6.43it/s]


FOLD: 3

434it [01:09,  6.23it/s]


FOLD: 4

417it [01:05,  6.38it/s]


FOLD: 5

458it [01:12,  6.32it/s]


FOLD: 6

379it [00:57,  6.54it/s]


FOLD: 7

402it [01:00,  6.59it/s]


FOLD: 8

417it [00:58,  7.09it/s]


FOLD: 9

442it [00:58,  7.58it/s]


WEBKB
FOLD: 0

68it [00:07,  9.04it/s]


FOLD: 1

78it [00:08,  9.74it/s]


FOLD: 2

82it [00:10,  8.07it/s]


FOLD: 3

80it [00:08,  9.25it/s]


FOLD: 4

91it [00:10,  8.52it/s]


FOLD: 5

84it [00:09,  8.54it/s]


FOLD: 6

60it [00:08,  7.49it/s]


FOLD: 7

69it [00:09,  7.34it/s]


FOLD: 8

71it [00:09,  7.76it/s]


FOLD: 9

84it [00:11,  7.16it/s]


REUT
FOLD: 0

1330it [02:54,  7.63it/s]


FOLD: 1

274it [00:35,  7.65it/s]


FOLD: 2

228it [00:35,  6.47it/s]


FOLD: 3

228it [00:34,  6.52it/s]


FOLD: 4

214it [00:32,  6.68it/s]


FOLD: 5

220it [00:33,  6.59it/s]


FOLD: 6

1279it [03:08,  6.78it/s]


FOLD: 7

173it [00:25,  6.71it/s]


FOLD: 8

184it [00:26,  6.85it/s]


FOLD: 9

185it [00:26,  6.91it/s]


In [112]:
# Export one spreadsheet per dataset under data/bert_topword_class/.
for dset in DATASETS:
    topword_class(bert_datasets, dset, bert_docs_wi, prefix="bert_topword_class")