In [None]:
# April 2025
# Active learning for text classification of English data using Small-Text 
# Violeta Berdejo-Espinola

In [None]:
# %pip install small_text, datasets 
# %pip install small-text[transformers]==2.0.0dev2, 


import torch
import numpy as np

seed = 42
torch.manual_seed(seed)
np.random.seed(seed)

%env TOKENIZERS_PARALLELISM=false

from matplotlib import rcParams
rcParams.update({'xtick.labelsize': 14, 'ytick.labelsize': 14, 'axes.labelsize': 16})

# read data

In [39]:
import polars as pl

pos = pl.read_csv('../data/outputs_pre-processing/pos_metadta.csv')
pos_matched = pl.read_parquet('../data/outputs_similarity_matches/pos_matches.parquet')
pos_unmatched = pl.read_parquet('../data/outputs_similarity_matches/pos_unmatched.parquet')
repo = pl.read_csv('../data/outputs_pre-processing/repo_metadata.csv')

len(pos_matched), len(pos_unmatched), len(repo), len(pos)

(4582, 829, 378165, 5392)

In [None]:
pos_matched

In [None]:
pos.with_columns(
    pl.col(['journal'] == "Biodiversity and Conservation")

journal
bool
false
false
false
false
false
…
false
false
false
false


In [35]:
4582 + 829 # --> positives

5411

In [28]:
# add label to pos_match

pos_matched = pos_matched.with_columns(
    pl.lit(1).alias('label')
)
print(len(pos_matched))
# subset pos matches tiltes

pos_matched_title =  pos_matched.select(
    pl.col(['name_in_repo','label'])
)
print(len(pos_matched_title))
print(len(repo))
# merge pos_matched and repo

repo_full = repo.join(pos_matched_title, how='left', left_on='title', right_on='name_in_repo')
print(len(repo_full))

4582
4582
378227
378240


In [34]:
repo_full = repo_full.with_columns(
    pl.when(pl.col('label').is_null()).then(0).otherwise(pl.col('label')).alias('label')
)

repo_full

index,title,abstract,journal,year,authors,language,label,literal
i64,str,str,str,i64,str,str,i32,i32
250,"""Mammalian mesopredators on isl…","""Medium-sized mammalian predato…","""Oecologia""",2014,"""Suraci, JP; Clinchy, M; Zanett…","""en""",0,0
251,"""Sex biases in kin shoaling and…","""Animal dispersal is associated…","""Oecologia""",2014,"""van Dongen, WFD; Wagner, RH; M…","""en""",0,0
252,"""Decreased emergence of emerald…","""The emerald ash borer (EAB; Ag…","""Oecologia""",2014,"""Whitehill, JGA; Rigsby, C; Cip…","""en""",0,0
253,"""Effects of experimentally-enha…","""Resistance, recovery and resil…","""Oecologia""",2014,"""Xu, ZW; Ren, HY; Cai, JP; Wang…","""en""",0,0
254,"""Gopher mounds decrease nutrien…","""Fossorial mammals may affect n…","""Oecologia""",2014,"""Yurkewycz, RP; Bishop, JG; Cri…","""en""",0,0
…,…,…,…,…,…,…,…,…
524428,"""Marine conservation in oceania…","""The people of Oceania have lon…","""Marine pollution bulletin""",2018,"""Friedlander, AM""","""en""",0,0
524429,"""Macrobenthic community charact…","""Development of substrate organ…","""Marine pollution bulletin""",2018,"""Liu, ZQ; Yu, P; Chen, MH; Cai,…","""en""",0,0
524430,"""Mercury contents in relation t…","""Total liver and muscle mercury…","""Marine pollution bulletin""",2018,"""Sánchez-Muros, MJ; Morote, E; …","""en""",0,0
524431,"""Effects of temperature and sal…","""As typical submerged aquatic v…","""Marine pollution bulletin""",2018,"""Gu, RT; Zhou, Y; Song, XY; Xu,…","""en""",0,0


In [29]:
repo_full.unique(subset="title")


index,title,abstract,journal,year,authors,language,label
i64,str,str,str,i64,str,str,i32
13445,"""Genetic structure in periphera…","""Decreased fitness due to loss …","""Conservation genetics""",2010,"""Rogell, B; Thörngren, H; Palm,…","""en""",
189256,"""Predator-prey coupling: intera…","""In this paper we explore varia…","""Oikos""",2009,"""Holmengen, N; Seip, KL; Boyce,…","""en""",
372873,"""The jumping spiders (araneae, …","""The paper presents data on the…","""Annales zoologici fennici""",1992,"""LOGUNOV, DV; WESOLOWSKA, W""","""en""",
57152,"""Characteristics of eastern can…","""Sphagnum cultivation on harves…","""Mires and peat""",2015,"""Aubé, M; Quenum, M; Ranasinghe…","""en""",
123750,"""Low initial refueling rate at …","""For various reasons, migrating…","""Auk""",2001,"""Schwilch, R; Jenni, L""","""en""",
…,…,…,…,…,…,…,…
1894,"""Predicting cyanobacteria domin…","""A controversial precept of aqu…","""Canadian journal of fisheries …",2001,"""Downing, JA; Watson, SB; McCau…","""en""",
485915,"""Testing of male sockeye-salmon…","""Infectious hematopoietic necro…","""Canadian journal of fisheries …",1987,"""MULCAHY, D; PASCHO, RJ; BATTS,…","""en""",
24572,"""Fire effects on belowground su…","""The overall effects of fire on…","""Forest ecology and management""",1999,"""Neary, DG; Klopatek, CC; DeBan…","""en""",
498017,"""Sub-alpine tree growth, climat…","""LaMarche et al. (1984) hypothe…","""Ecology""",1991,"""GRAUMLICH, LJ""","""en""",


In [24]:
repo_full 

index,title,abstract,journal,year,authors,language,label
i64,str,str,str,i64,str,str,i32
250,"""Mammalian mesopredators on isl…","""Medium-sized mammalian predato…","""Oecologia""",2014,"""Suraci, JP; Clinchy, M; Zanett…","""en""",
251,"""Sex biases in kin shoaling and…","""Animal dispersal is associated…","""Oecologia""",2014,"""van Dongen, WFD; Wagner, RH; M…","""en""",
252,"""Decreased emergence of emerald…","""The emerald ash borer (EAB; Ag…","""Oecologia""",2014,"""Whitehill, JGA; Rigsby, C; Cip…","""en""",
253,"""Effects of experimentally-enha…","""Resistance, recovery and resil…","""Oecologia""",2014,"""Xu, ZW; Ren, HY; Cai, JP; Wang…","""en""",
254,"""Gopher mounds decrease nutrien…","""Fossorial mammals may affect n…","""Oecologia""",2014,"""Yurkewycz, RP; Bishop, JG; Cri…","""en""",
…,…,…,…,…,…,…,…
524428,"""Marine conservation in oceania…","""The people of Oceania have lon…","""Marine pollution bulletin""",2018,"""Friedlander, AM""","""en""",
524429,"""Macrobenthic community charact…","""Development of substrate organ…","""Marine pollution bulletin""",2018,"""Liu, ZQ; Yu, P; Chen, MH; Cai,…","""en""",
524430,"""Mercury contents in relation t…","""Total liver and muscle mercury…","""Marine pollution bulletin""",2018,"""Sánchez-Muros, MJ; Morote, E; …","""en""",
524431,"""Effects of temperature and sal…","""As typical submerged aquatic v…","""Marine pollution bulletin""",2018,"""Gu, RT; Zhou, Y; Song, XY; Xu,…","""en""",


In [20]:
pos_matched_title


name_in_repo,label
str,i32
"""Experimental ivermectin treatm…",1
"""Impact of grazing management o…",1
"""Mammals and agri-environment s…",1
"""Conservation potential of pres…",1
"""The role of forest harvesting …",1
…,…
"""Effects of habitat alteration …",1
"""Effectiveness of a deep-sea co…",1
"""Early growth adaptability of f…",1
"""Hay strewing, brush harvesting…",1


# create datasets

In [None]:
import numpy as np
from small_text.data import SklearnDataset

# create exemplary features and labels randomly
x = np.random.rand(100, 30)
x = 
y = np.random.randint(0, 2, size=100)

dataset = SklearnDataset(x, y, target_labels=np.arange(2))

In [12]:
y

array([0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0,
       1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1])

In [None]:
import datasets
datasets.logging.set_verbosity_error()

import logging

In [None]:
# raw_dataset= datasets.load_dataset('../data/outputs_similarity_matches/pos_matches.parquet')

# num_classes = raw_dataset['train'].features['label'].num_classes

In [None]:
live_dataset = pl.DataFrame(data, columns=['label', 'text'])

# active learner 

# query strategy

# classifier factory