In [1]:
from src.config import SNIPS_1, SNIPS_2, SST2_1, SST2_2, YELP_1, YELP_2, YELP_test, SNIPS2_TRAIN, SNIPS2_TEST, SNIPS2_DEV
from src.config import SST5_1, SST5_2, SST5_val_1, SST5_val_2, SST5_test, SNIPS2_1, SNIPS2_2, SNIPS2_val_1, SNIPS2_val_2, SNIPS2_test
from src.config import IMDB_1, IMDB_2, IMDB_test
from src.config import AGNEWS_1, AGNEWS_2, AGNEWS_test
from src.config import NEWS20_1, NEWS20_2, NEWS20_test
from src.config import SST2_val_1, SST2_val_2, SST2_test
from src.config import create_tree

In [2]:
import torch
from datasets import load_dataset
from sklearn import preprocessing as prep
import numpy as np
import argparse
import re
import pandas as pd
import requests
from sklearn.utils import shuffle
from functools import partial

create_tree()
torch.cuda.empty_cache()

In [3]:
import requests
# Monkey patch the requests functions
from functools import partial

# Monkey patch the requests functions
requests.request = partial(requests.request, verify=False)
requests.get = partial(requests.get, verify=False)
requests.head = partial(requests.head, verify=False)
requests.post = partial(requests.post, verify=False)
requests.put = partial(requests.put, verify=False)
requests.patch = partial(requests.patch, verify=False)
requests.delete = partial(requests.delete, verify=False)
# Remove warning
requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)


In [12]:
test_mode=False


def huggingface_to_ours(path="rungalileo/20_Newsgroups_Fixed", test_mode=False, hierarchycal=1, join_train_val=False):
    raw_datasets = load_dataset(path)

    CLEANR = re.compile('<.{0,6}>')

    def cleanhtml(raw_html):
        return re.sub(CLEANR, '', raw_html)

    def cleantext(raw_text):
        return raw_text.replace("\t")

    # Dataframe preparation
    datasets = []
    print("KEYS: ", list(raw_datasets.keys()))
    for k in raw_datasets.keys():
        ds = raw_datasets[k].to_pandas()
        if join_train_val:
            ds["split"] = "test" if k == "test" else "train"
        else:
            ds["split"] = k
        datasets.append(ds)
    df = pd.concat(datasets).reset_index(drop=True)
    print("Label", list(df['label'].unique()))
    if hierarchycal > 1:
        df[['cat1', 'cat2']] = df['label'].str.split('.', 1, expand=True)
        del df['label']
    df = df.rename(columns={"label": "cat1", "text": "query", "sentence": "query"})

    # Encoding
    categories = [x for x in df.columns if 'cat' in x]
    for cat in categories:
        tag_encoded = prep.LabelEncoder()
        if not isinstance(df[cat][0], str):
            df_sub = df[df[cat]!=-1]
            df_sub[cat] = tag_encoded.fit_transform(df_sub[cat])
            df[df[cat]!=-1][cat] = df_sub[cat]
        else:
            df[cat] = tag_encoded.fit_transform(df[cat])
    df["query"] = df["query"].apply(lambda x: x.replace("\n", "") if isinstance(x, str) else np.nan)
    df.dropna(inplace=True)
    df['query'] = df['query'].map(cleanhtml)

    # Recovering
    outputs = dict()
    for k in raw_datasets.keys():
        df_sub = df[df["split"] == k]
        df_sub = shuffle(df_sub)
        if test_mode:
            df_sub = df_sub[:200]
        df_sub2 = df_sub[["query"] + categories]
        df_sub1 = df_sub["query"]
        outputs[k + "1"] = df_sub1
        outputs[k + "2"] = df_sub2
    return outputs

## DATA PREPROCESS

### SNIPS2

In [13]:
snips2 = dict()
snips2['train'] = pd.read_csv(SNIPS2_TRAIN)
snips2['dev'] = pd.read_csv(SNIPS2_DEV)
snips2['test'] = pd.read_csv(SNIPS2_TEST)
tag_encoded = prep.LabelEncoder()
snips2['train']['cat1'] = tag_encoded.fit_transform(snips2['train']['intent'])
snips2['dev']['cat1'] = tag_encoded.transform(snips2['dev']['intent'])
snips2['test']['cat1'] = tag_encoded.transform(snips2['test']['intent'])
for k in snips2.keys():
    del snips2[k]['slot'], snips2[k]['intent']

In [14]:
snips2["train"]["text"].to_csv(SNIPS_1, index=False)
snips2["train"].rename(columns={'text':'query'}).to_csv(SNIPS_2, index=False)
snips2["dev"]["text"].to_csv(SNIPS2_val_1, index=False)
snips2["dev"].rename(columns={'text':'query'}).to_csv(SNIPS2_val_2, index=False)
snips2["test"].rename(columns={'text':'query'}).to_csv(SNIPS2_test, index=False)

In [19]:
print("train", snips2["train"].shape[0])
print("validation", snips2["dev"].shape[0])
print("test", snips2["test"].shape[0])
snips2["train"]["len"] = snips2["train"]['text'].apply(lambda x: len(x.split()))
snips2["dev"]["len"] = snips2["dev"]['text'].apply(lambda x: len(x.split()))
snips2["test"]["len"] = snips2["test"]['text'].apply(lambda x: len(x.split()))
total = pd.concat([snips2["train"]["len"], snips2["test"]["len"], snips2["dev"]["len"]])
print("max number of words in test2: {}".format(total.max()))
print("mean number of words: {}".format(total.min()))
print("mean number of words: {}".format(total.mean()))

train 13084
validation 700
test 700
max number of words in test2: 35
mean number of words: 2
mean number of words: 9.005661419497377


### SST2

In [41]:
sst2 = huggingface_to_ours(path="sst2", test_mode=test_mode, join_train_val=False)

Using custom data configuration default
Reusing dataset sst2 (/home/alejo/.cache/huggingface/datasets/sst2/default/2.0.0/9896208a8d85db057ac50c72282bcb8fe755accc671a57dd8059d4e130961ed5)


  0%|          | 0/3 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


KEYS:  ['train', 'validation', 'test']
Label [0, 1, -1]


In [5]:
sst2 = huggingface_to_ours(path="sst2", test_mode=test_mode, join_train_val=False)

sst2['train1'] = pd.concat([sst2["test1"], sst2["train1"]]).reset_index(drop=True)
# sst2['train1'].to_csv(SST2_1, index=False)
# sst2['train2'].to_csv(SST2_2, index=False)
# sst2['validation2'].to_csv(SST2_test, index=False)

Using custom data configuration default
Reusing dataset sst2 (/home/alejo/.cache/huggingface/datasets/sst2/default/2.0.0/9896208a8d85db057ac50c72282bcb8fe755accc671a57dd8059d4e130961ed5)


  0%|          | 0/3 [00:00<?, ?it/s]

KEYS:  ['train', 'validation', 'test']
Label [0, 1, -1]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [7]:
sst2.keys()

dict_keys(['train1', 'train2', 'validation1', 'validation2', 'test1', 'test2'])

In [12]:
sst2['train2']

Unnamed: 0,query,cat1
33310,watch and -- especially -- to listen to,1
35678,skin of man gets a few cheap shocks from its k...,0
37335,whole lot scarier,1
57084,like acid,0
16418,city by the sea would slip under the waves .,0
...,...,...
50635,entertaining,1
57254,liking showgirls,1
65859,there are so few films about the plight of ame...,1
46358,also happens to be good,1


In [12]:
print("train 1", sst2['train1'].shape[0])
print("train 2", sst2['train2'].shape[0])
print("validation", sst2['validation2'].shape[0])
sst2['train2']["len"] = sst2['train2']['query'].apply(lambda x: len(x.split()))
sst2['validation2']["len"] = sst2['validation2']['query'].apply(lambda x: len(x.split()))
print("mean number of words in validation2: {}".format(int(sst2['validation2']["len"].mean())))
print("max number of words in validation2: {}".format(sst2['validation2']["len"].max()))
print("mean number of words: {}".format(int(sst2['train2']["len"].mean())))
print("max number of words: {}".format(sst2['train2']["len"].max()))

train 1 69170
train 2 67349
validation 872
mean number of words in validation2: 19
max number of words in validation2: 47
mean number of words: 9
max number of words: 52


### SNIPS

In [60]:
# ############################################ snips
snips = huggingface_to_ours(path="snips_built_in_intents", test_mode=test_mode)
snips['train1'].to_csv(SNIPS_1, index=False)
snips['train2'].to_csv(SNIPS_2, index=False)

Found cached dataset snips_built_in_intents (/home/alejo/.cache/huggingface/datasets/snips_built_in_intents/default/0.0.0/f7f10213b60cb830d41cb190315da7151437d62acb6f21f0159f7afb4cb7c784)


  0%|          | 0/1 [00:00<?, ?it/s]

KEYS:  ['train']
Label [5, 0, 4, 3, 7, 1, 8, 9, 6, 2]


In [61]:
snips.keys()

dict_keys(['train1', 'train2'])

In [62]:
print("train 1", snips['train1'].shape[0])
print("train 2", snips['train2'].shape[0])
snips['train2']["len"] = snips['train2']['query'].apply(lambda x: len(x.split()))
print("mean number of words: {}".format(int(snips['train2']["len"].mean())))
print("max number of words: {}".format(snips['train2']["len"].max()))

train 1 328
train 2 328
mean number of words: 9
max number of words: 20


### NEWS 20

In [46]:
news220 = huggingface_to_ours(path="rungalileo/20_Newsgroups_Fixed", test_mode=test_mode, hierarchycal=2)
news220['train1'] = news220['train1'].apply(lambda x: " " if len(x) == 0 else x)
news220['train2']['query'] = news220['train2']['query'].apply(lambda x: " " if len(x) == 0 else x)
news220['test2']['query'] = news220['test2']['query'].apply(lambda x: " " if len(x) == 0 else x)
# news220['train1'].to_csv(NEWS20_1, index=False)
# news220['train2'].to_csv(NEWS20_2, index=False)
# news220['test2'].to_csv(NEWS20_test, index=False)

Downloading readme:   0%|          | 0.00/5.42k [00:00<?, ?B/s]

Using custom data configuration rungalileo--20_Newsgroups_Fixed-edf414ecc72dd622


Downloading and preparing dataset csv/rungalileo--20_Newsgroups_Fixed to /home/alejo/.cache/huggingface/datasets/rungalileo___csv/rungalileo--20_Newsgroups_Fixed-edf414ecc72dd622/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/14.1M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/8.46M [00:00<?, ?B/s]

  

Extracting data files #0:   0%|          | 0/1 [00:00<?, ?obj/s]

Extracting data files #1:   0%|          | 0/1 [00:00<?, ?obj/s]

Generating train split: 0 examples [00:00, ? examples/s]

  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)


Generating test split: 0 examples [00:00, ? examples/s]

  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)


Dataset csv downloaded and prepared to /home/alejo/.cache/huggingface/datasets/rungalileo___csv/rungalileo--20_Newsgroups_Fixed-edf414ecc72dd622/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

  df[['cat1', 'cat2']] = df['label'].str.split('.', 1, expand=True)


KEYS:  ['train', 'test']
Label ['rec.autos', 'comp.sys.mac.hardware', 'comp.graphics', 'sci.space', 'talk.politics.guns', 'sci.med', 'comp.sys.ibm.pc.hardware', 'comp.os.ms-windows.misc', 'rec.motorcycles', 'talk.religion.misc', 'None', 'misc.forsale', 'alt.atheism', 'sci.electronics', 'comp.windows.x', 'rec.sport.hockey', 'rec.sport.baseball', 'soc.religion.christian', 'talk.politics.mideast', 'talk.politics.misc', 'sci.crypt']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  news220['test2']['query'] = news220['test2']['query'].apply(lambda x: " " if len(x) == 0 else x)


In [49]:
print("train 1", news220['train1'].shape[0])
print("train 2", news220['train2'].shape[0])
print("test", news220['test2'].shape[0])
news220['train2']["len"] = news220['train2']['query'].apply(lambda x: len(x.split()))
news220['test2']["len"] = news220['test2']['query'].apply(lambda x: len(x.split()))
print("mean number of words in test2: {}".format(int(news220['test2']["len"].mean())))
print("max number of words in test2: {}".format(news220['test2']["len"].max()))
print("mean number of words: {}".format(int(news220['train2']["len"].mean())))
print("max number of words: {}".format(news220['train2']["len"].max()))

train 1 11096
train 2 11096
test 7370
mean number of words in test2: 166
max number of words in test2: 9187
mean number of words: 175
max number of words: 11694


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  news220['test2']["len"] = news220['test2']['query'].apply(lambda x: len(x.split()))


### YELP

In [50]:
# Yelp ratings
yelp = huggingface_to_ours(path="yelp_review_full", test_mode=test_mode)
# yelp['train1'].to_csv(YELP_1, index=False)
# yelp['train2'].to_csv(YELP_2, index=False)
# yelp['test2'].to_csv(YELP_test, index=False)

Downloading builder script:   0%|          | 0.00/4.41k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.04k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/6.57k [00:00<?, ?B/s]

Downloading and preparing dataset yelp_review_full/yelp_review_full to /home/alejo/.cache/huggingface/datasets/yelp_review_full/yelp_review_full/1.0.0/e8e18e19d7be9e75642fc66b198abadb116f73599ec89a69ba5dd8d1e57ba0bf...


Downloading data:   0%|          | 0.00/196M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/650000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset yelp_review_full downloaded and prepared to /home/alejo/.cache/huggingface/datasets/yelp_review_full/yelp_review_full/1.0.0/e8e18e19d7be9e75642fc66b198abadb116f73599ec89a69ba5dd8d1e57ba0bf. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

KEYS:  ['train', 'test']
Label [4, 1, 3, 0, 2]


In [51]:
print("train 1", yelp['train1'].shape[0])
print("train 2", yelp['train2'].shape[0])
print("test", yelp['test2'].shape[0])
yelp['train2']["len"] = yelp['train2']['query'].apply(lambda x: len(x.split()))
yelp['test2']["len"] = yelp['test2']['query'].apply(lambda x: len(x.split()))
print("mean number of words in test2: {}".format(int(yelp['test2']["len"].mean())))
print("max number of words in test2: {}".format(yelp['test2']["len"].max()))
print("mean number of words: {}".format(int(yelp['train2']["len"].mean())))
print("max number of words: {}".format(yelp['train2']["len"].max()))

train 1 650000
train 2 650000
test 50000
mean number of words in test2: 134
max number of words in test2: 1009
mean number of words: 134
max number of words: 1052


### SST5

In [63]:
# Rating
sst5 = huggingface_to_ours(path="SetFit/sst5", test_mode=test_mode)
sst5['train1'].to_csv(SST5_1, index=False)
sst5['train2'].to_csv(SST5_2, index=False)
sst5['validation1'].to_csv(SST5_val_1, index=False)
sst5['validation2'].to_csv(SST5_val_2, index=False)
sst5['test2'].to_csv(SST5_test, index=False)

Using custom data configuration SetFit--sst5-4c07b9d5881ae209
Found cached dataset json (/home/alejo/.cache/huggingface/datasets/SetFit___json/SetFit--sst5-4c07b9d5881ae209/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


  0%|          | 0/3 [00:00<?, ?it/s]

KEYS:  ['train', 'test', 'validation']
Label [4, 1, 2, 3, 0]


In [65]:
print("train 2", sst5['train2'].shape[0])
print("train 1", sst5['train1'].shape[0])
print("test", sst5['test2'].shape[0])
sst5['train2']["len"] = sst5['train2']['query'].apply(lambda x: len(x.split()))
sst5['test2']["len"] = sst5['test2']['query'].apply(lambda x: len(x.split()))
print("mean number of words in test2: {}".format(int(sst5['test2']["len"].mean())))
print("max number of words in test2: {}".format(sst5['test2']["len"].max()))
print("mean number of words: {}".format(int(sst5['train2']["len"].mean())))
print("max number of words: {}".format(sst5['train2']["len"].max()))

train 2 8544
train 1 8544
test 2210
mean number of words in test2: 19
max number of words in test2: 56
mean number of words: 19
max number of words: 52


### AGnews

In [55]:
ag_news = huggingface_to_ours(path="ag_news", test_mode=test_mode)
ag_news['train1'].to_csv(AGNEWS_1, index=False)
ag_news['train2'].to_csv(AGNEWS_2, index=False)
ag_news['test2'].to_csv(AGNEWS_test, index=False)
# this version of the data has the title and the description connected with a space

Downloading builder script:   0%|          | 0.00/4.06k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.65k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.94k [00:00<?, ?B/s]

Downloading and preparing dataset ag_news/default to /home/alejo/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548...


Downloading data:   0%|          | 0.00/11.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/751k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

Dataset ag_news downloaded and prepared to /home/alejo/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

KEYS:  ['train', 'test']
Label [2, 3, 1, 0]


In [56]:
print("train 1", ag_news['train1'].shape[0])
print("train 2", ag_news['train2'].shape[0])
print("test", ag_news['test2'].shape[0])
ag_news['train2']["len"] = ag_news['train2']['query'].apply(lambda x: len(x.split()))
ag_news['test2']["len"] = ag_news['test2']['query'].apply(lambda x: len(x.split()))
print("mean number of words in test2: {}".format(int(ag_news['test2']["len"].mean())))
print("max number of words in test2: {}".format(ag_news['test2']["len"].max()))
print("mean number of words: {}".format(int(ag_news['train2']["len"].mean())))
print("max number of words: {}".format(ag_news['train2']["len"].max()))

train 1 120000
train 2 120000
test 7600
mean number of words in test2: 37
max number of words in test2: 137
mean number of words: 37
max number of words: 177


### IMDB

In [72]:
imdb = huggingface_to_ours(path="imdb", test_mode=test_mode)
imdb_1 = pd.concat([imdb["train1"], imdb["unsupervised1"]]).reset_index(drop=True)

imdb_1.to_csv(IMDB_1, index=False)
imdb["train2"].to_csv(IMDB_2, index=False)
imdb['test2'].to_csv(IMDB_test, index=False)

Found cached dataset imdb (/home/alejo/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)


  0%|          | 0/3 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub[cat] = tag_encoded.fit_transform(df_sub[cat])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[df[cat]!=-1][cat] = df_sub[cat]


KEYS:  ['train', 'test', 'unsupervised']
Label [0, 1, -1]


In [73]:
print("train 2", imdb['train2'].shape[0])
print("test", imdb['test2'].shape[0])
imdb['train2']["len"] = imdb['train2']['query'].apply(lambda x: len(x.split()))
imdb['test2']["len"] = imdb['test2']['query'].apply(lambda x: len(x.split()))
print("mean number of words in test2: {}".format(int(imdb['test2']["len"].mean())))
print("max number of words in test2: {}".format(imdb['test2']["len"].max()))
print("mean number of words: {}".format(int(imdb['train2']["len"].mean())))
print("max number of words: {}".format(imdb['train2']["len"].max()))

train 2 25000
test 25000
mean number of words in test2: 224
max number of words in test2: 2192
mean number of words: 229
max number of words: 2450
