In [1]:
import pickle
import numpy as np
import pandas as pd
import jsonlines
import os
import io

In [2]:
from optuna.distributions import FloatDistribution


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from src.models.models import get_clf

In [5]:
# Build the "gbm" classifier through the project's factory; the second value
# returned by get_clf is discarded here.
est, _ = get_clf("gbm")

In [6]:
# Hyperparameters applied to the "gbm" estimator via set_params.
hyp = dict(
    n_estimators=100,
    num_leaves=31,
    max_depth=13,
)

In [8]:
# Apply the hyperparameters collected in `hyp` to the estimator.
est.set_params(**hyp)

In [9]:
# Rebind `est` to the "centroid" classifier from the project's factory; the
# second value returned by get_clf is discarded here.
est, _ = get_clf("centroid")

In [12]:
# Hyperparameters applied to the "centroid" estimator via set_params.
hyp = dict(
    metric="cosine",
    shrink_threshold=0.5,
)

In [13]:
# Apply the hyperparameters collected in `hyp` to the estimator.
est.set_params(**hyp)

In [55]:
from itertools import product

DATASETS = ['20ng', 'acm', 'webkb', 'reut']
PARTITIONS = ('train', 'val', 'test')


def _load_split(dset):
    """Load the pickled train/val/test split table of one dataset."""
    # NOTE(review): unpickling can execute arbitrary code; only load split
    # files that come from a trusted source.
    with open(f'../data/datasets/{dset}/splits/split_10_with_val.pkl', 'rb') as split_file:
        return pickle.load(split_file)


def _load_bert_docs(json_path):
    """Read one JSON-lines file into a frame with columns id, bert and label."""
    # The context manager closes the reader even if a line is malformed.
    with jsonlines.open(json_path) as reader:
        rows = [[line['id'], line['bert'], line['label']] for line in reader]
    return pd.DataFrame(rows, columns=['id', 'bert', 'label'])


# The split file only depends on the dataset, so load it once per dataset
# instead of once per (dataset, fold) pair.
splits = {dset: _load_split(dset) for dset in DATASETS}

for dset, fold in product(DATASETS, range(10)):
    ids = splits[dset]
    X = _load_bert_docs(f"/home/claudiovaliense/projetos/kaggle/{dset}_bert{fold}.json")

    dest_dir = f"../data/reps/split_10_with_val/bert_base/fine_tuning/{dset}/{fold}"
    label_dir = f"../data/labels/split_10_with_val/{dset}/{fold}"
    os.makedirs(dest_dir, exist_ok=True)
    os.makedirs(label_dir, exist_ok=True)

    for part in PARTITIONS:
        # Membership filter: keeps the rows of X (in X's order) whose id is in
        # this fold's partition. Equivalent to the `id == [...]` query form,
        # without rendering the whole id list into a query string.
        part_rows = X[X['id'].isin(ids[f'{part}_idxs'][fold])]

        # np.save appends the ".npy" suffix to both targets.
        np.save(f"{dest_dir}/{part}", np.array(part_rows['bert'].values.tolist()))
        np.save(f"{label_dir}/{part}", np.array(part_rows['label'].values.tolist()))

In [7]:
def read_texts(txt_file: str):
    """Return every line of ``txt_file`` as a list of strings.

    Undecodable bytes are dropped (``errors='ignore'``) and only ``"\\n"``
    terminates a line; line endings are kept on the returned strings.
    """
    with io.open(txt_file, errors='ignore', newline="\n") as source:
        return list(source)

def read_dataset(documents_file: str, classes_file: str):
    """Load a corpus and its labels from two line-aligned text files.

    Returns a ``(documents, classes)`` pair: the raw lines of
    ``documents_file`` and the lines of ``classes_file`` parsed as ints.
    """
    documents = read_texts(txt_file=documents_file)
    classes = [int(raw_label) for raw_label in read_texts(txt_file=classes_file)]
    return documents, classes

def fix_classes(classes):
    """Shift labels down by one when the smallest label is greater than zero.

    Parameters
    ----------
    classes : array-like of numbers
        Class labels. Plain Python sequences are accepted as well as arrays.

    Returns
    -------
    ``classes - 1`` as a numpy array when the minimum label is positive;
    otherwise the input object is returned unchanged. Empty input is also
    returned unchanged instead of failing inside ``np.min``.
    """
    # Convert up front so that list input supports the arithmetic below.
    labels = np.asarray(classes)

    if labels.size == 0 or np.min(labels) <= 0:
        return classes
    return labels - 1


In [9]:
datasets = ["20ng", "acm", "webkb", "reut"]
data_root = "/home/welton/data/datasets"

for dset in datasets:
    split_path = f"{data_root}/data/{dset}/splits/split_10.pkl"
    labels_path = f"{data_root}/data/{dset}/score.txt"
    split_df = pd.read_pickle(split_path)

    # Only the labels are needed, so the documents file is not read here.
    # Parsing matches read_dataset: one int per line.
    labels = np.array([int(raw_label) for raw_label in read_texts(labels_path)])
    labels = fix_classes(labels)

    for fold in range(10):
        fold_row = split_df.iloc[fold]
        y_train = labels[fold_row["train_idxs"]]
        y_test = labels[fold_row["test_idxs"]]

        base_path = f"{data_root}/labels/split_10/{dset}/{fold}"
        os.makedirs(base_path, exist_ok=True)
        # np.save appends the ".npy" suffix.
        np.save(f"{base_path}/train", y_train)
        np.save(f"{base_path}/test", y_test)



In [6]:
# Display the split table; `split_df` is assigned inside the label-export loop,
# so this shows whichever dataset that loop loaded last.
split_df

Unnamed: 0,fold_id,train_idxs,test_idxs
0,0,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[19, 21, 24, 34, 42, 64, 66, 86, 88, 105, 107,..."
1,1,"[0, 2, 3, 4, 5, 6, 7, 8, 11, 12, 13, 14, 15, 1...","[1, 9, 10, 22, 28, 36, 46, 50, 55, 68, 71, 97,..."
2,2,"[0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[5, 15, 18, 33, 35, 39, 57, 63, 73, 89, 92, 94..."
3,3,"[1, 2, 3, 4, 5, 6, 7, 9, 10, 13, 14, 15, 16, 1...","[0, 8, 11, 12, 29, 40, 52, 54, 59, 69, 93, 96,..."
4,4,"[0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14...","[7, 23, 31, 37, 43, 48, 49, 51, 58, 77, 79, 10..."
5,5,"[0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15...","[2, 14, 27, 30, 32, 41, 60, 62, 80, 100, 102, ..."
6,6,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[17, 67, 70, 74, 78, 85, 90, 91, 101, 118, 120..."
7,7,"[0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[4, 20, 56, 65, 72, 81, 82, 83, 87, 108, 115, ..."
8,8,"[0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15...","[3, 13, 25, 26, 45, 47, 53, 61, 76, 112, 114, ..."
9,9,"[0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14...","[6, 16, 38, 44, 75, 84, 95, 98, 106, 110, 116,..."


In [15]:
# Load the saved split_10 test labels of 20ng, fold 0, and show their shape.
y_test = np.load("../../data/datasets/labels/split_10/20ng/0/test.npy")
y_test.shape

(1892,)

In [11]:
# Load the saved split_10 train labels of 20ng, fold 0, and show their shape.
y_train = np.load("../../data/datasets/labels/split_10/20ng/0/train.npy")
y_train.shape

(16954,)

In [12]:
# Rebind `y_train` to the split_10_with_val train labels of 20ng, fold 0, and
# show their shape.
y_train = np.load("../../data/datasets/labels/split_10_with_val/20ng/0/train.npy")
y_train.shape

(15258,)

In [14]:
# Load the split_10_with_val test labels of 20ng, fold 0, and show the shape of
# the array that was just loaded (not the earlier `y_test`).
y_test_val = np.load("../../data/datasets/labels/split_10_with_val/20ng/0/test.npy")
y_test_val.shape

(1892,)

In [18]:
# Element-wise comparison of the two test-label arrays, reduced to the distinct
# outcomes; a result of array([True]) means every position matched.
np.unique(y_test == y_test_val)

array([ True])

In [None]:
# Load the webkb split table that includes validation indices.
spv = pd.read_pickle("../input/webkb/split_10_with_val.pkl")

In [None]:
# Load the webkb split_10 split table.
sp = pd.read_pickle("/home/welton/data/datasets/data/webkb/splits/split_10.pkl")

In [None]:
# Sanity check: for every dataset and fold, the train + val indices of the
# split with validation must equal the train indices of the plain split.
for dataset in ['webkb', '20ng', 'reut', 'acm']:
    # Build the paths from the loop variable so each dataset is actually checked.
    splits_dir = f"/home/welton/data/datasets/data/{dataset}/splits"
    sp = pd.read_pickle(f"{splits_dir}/split_10.pkl")
    spv = pd.read_pickle(f"{splits_dir}/split_10_with_val.pkl")
    for fold in sp.fold_id.values:
        v1 = np.sort(np.hstack([spv.iloc[fold]["train_idxs"], spv.iloc[fold]["val_idxs"]]))
        v2 = np.asarray(sp.iloc[fold]["train_idxs"])
        # array_equal also yields False (rather than failing) on a length mismatch.
        print(dataset, fold, np.array_equal(v1, v2))