In [20]:
import pandas as pd
import numpy as np
import os
from transformers import AutoTokenizer
import datasets
from datasets import Dataset, DatasetDict
from datasets import load_dataset

# Custom util functions
import sys; sys.path.append("./libraries/")
from utils import *

ANNOTATIONS_DATA = "../annotations/"

### PREPROCESSING

Loading annotated data

In [21]:
# Container for individual annotation responses datasets
dfs = []

# Look at the JSON files, parse and join
for file in os.listdir(ANNOTATIONS_DATA + "annotators_results"):
    if file.endswith(".json"):
        # Use our custom function to parse the response file
        df = parse_label_studio_file(ANNOTATIONS_DATA + "annotators_results/" + file)
        # Append to the container
        dfs.append(df)

# Join all files
annotations = pd.concat(dfs).reset_index(drop=True)

print(f"A total of {annotations.shape[0]} are now joined.")

A total of 1008 are now joined.


Encoding the labels as part of the text.

In [22]:
text_array = []

for ix, row in annotations.iterrows():

    masks = []
    
    for col in annotations.columns[2:]:

        if row[col] == 1.0:
            sentiment = "Positive"
        elif row[col] == 0.0:
            sentiment = "Neutral"
        elif row[col] == -1.0:
            sentiment = "Negative"
        else:
            sentiment = "None"
        masks.append(f"{col}:[{sentiment}]")        

    text_array.append(row["text"] + "||" + "||".join(masks))

In [23]:
text_array[:2]

['Sweet nice young people working, they have a smile on their lips and are always helpful say hi and goodbye..||Not Determined:[Positive]||Staff:[Positive]||Equipment:[None]||Hygiene:[None]||Location:[None]',
 'Has trained for many years here and Bornholmsvej, has just come from the Factory and good staff but has always been to you here as machines are the coolest and straight to, and everything is as I left it •20 figures •||Not Determined:[Positive]||Staff:[Positive]||Equipment:[Positive]||Hygiene:[None]||Location:[None]']

In [24]:
df = pd.DataFrame(text_array, columns=["text"])
df.head(2)

Unnamed: 0,text
0,"Sweet nice young people working, they have a s..."
1,Has trained for many years here and Bornholmsv...


Creating the HuggingFace dataset instance.

In [25]:
tds = Dataset.from_pandas(df[:800])
vds = Dataset.from_pandas(df[800:])

datasets = DatasetDict()

datasets['train'] = tds
datasets['validation'] = vds

In [26]:
# Assuming you have a pandas DataFrame
data = {
    'text': ['Text 1', 'Text 2', 'Text 3', 'Text 4'],
    'labels': [[1, 0, 1], [0, 1, 0], [1, 1, 0], [0, 0, 1]]
}

df = pd.DataFrame(data)

# Create a Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Split the dataset into train and test sets
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size

train_dataset, test_dataset = dataset.train_test_split(test_size=test_size)

# You can access the data using train_dataset['text'], train_dataset['labels'], test_dataset['text'], test_dataset['labels']


In [27]:
train_dataset

'train'

Tokenizer

Masked language modeling
For masked language modeling (MLM) we are going to use the same preprocessing as before for our dataset with one additional step: we will randomly mask some tokens (by replacing them by [MASK]) and the labels will be adjusted to only include the masked tokens (we don't have to predict the non-masked tokens).

We will use the distilroberta-base model for this example. You can pick any of the checkpoints listed here instead:

In [28]:
model_checkpoint = "distilroberta-base"
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

def tokenize_function(examples):
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained("distilroberta-base", use_fast=True)
    return tokenizer(examples["text"])


tokenized_datasets = datasets.map(tokenize_function, batched=True, num_proc=4, remove_columns=["text"])

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/distilroberta-base/resolve/main/config.json from cache at C:\Users\ginof/.cache\huggingface\transformers\42d6b7c87cbac84fcdf35aa69504a5ccfca878fcee2a1a9b9ff7a3d1297f9094.aa95727ac70adfa1aaf5c88bea30a4f5e50869c68e68bce96ef1ec41b5facf46
Model config RobertaConfig {
  "_name_or_path": "distilroberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.21.2",
  "type_vocab_s

Map (num_proc=4):   0%|          | 0/800 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/208 [00:00<?, ? examples/s]

In [29]:
def group_texts(examples):
    block_size = 128
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
        # customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=4,
)

Map (num_proc=4):   0%|          | 0/800 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/208 [00:00<?, ? examples/s]

In [30]:
from transformers import AutoModelForMaskedLM
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

loading configuration file https://huggingface.co/distilroberta-base/resolve/main/config.json from cache at C:\Users\ginof/.cache\huggingface\transformers\42d6b7c87cbac84fcdf35aa69504a5ccfca878fcee2a1a9b9ff7a3d1297f9094.aa95727ac70adfa1aaf5c88bea30a4f5e50869c68e68bce96ef1ec41b5facf46
Model config RobertaConfig {
  "_name_or_path": "distilroberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.21.2",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading weights file https://huggingfa

In [31]:
from transformers import Trainer, TrainingArguments

model_name = model_checkpoint.split("/")[-1]
training_args = TrainingArguments(
    f"{model_name}-finetuned-reviews",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    push_to_hub=True,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [32]:
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [33]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [34]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["validation"],
    data_collator=data_collator,
)



TypeError: 'int' object is not callable

In [None]:
from datasets import load_dataset

dataset = load_dataset("yelp_review_full")
dataset["train"][100]
     

Downloading builder script:   0%|          | 0.00/4.41k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.04k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/6.55k [00:00<?, ?B/s]

Downloading and preparing dataset yelp_review_full/yelp_review_full to C:/Users/ginof/.cache/huggingface/datasets/yelp_review_full/yelp_review_full/1.0.0/e8e18e19d7be9e75642fc66b198abadb116f73599ec89a69ba5dd8d1e57ba0bf...


Downloading data:   0%|          | 0.00/196M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/650000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset yelp_review_full downloaded and prepared to C:/Users/ginof/.cache/huggingface/datasets/yelp_review_full/yelp_review_full/1.0.0/e8e18e19d7be9e75642fc66b198abadb116f73599ec89a69ba5dd8d1e57ba0bf. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

{'label': 0,
 'text': 'My expectations for McDonalds are t rarely high. But for one to still fail so spectacularly...that takes something special!\\nThe cashier took my friends\'s order, then promptly ignored me. I had to force myself in front of a cashier who opened his register to wait on the person BEHIND me. I waited over five minutes for a gigantic order that included precisely one kid\'s meal. After watching two people who ordered after me be handed their food, I asked where mine was. The manager started yelling at the cashiers for \\"serving off their orders\\" when they didn\'t have their food. But neither cashier was anywhere near those controls, and the manager was the one serving food to customers and clearing the boards.\\nThe manager was rude when giving me my order. She didn\'t make sure that I had everything ON MY RECEIPT, and never even had the decency to apologize that I felt I was getting poor service.\\nI\'ve eaten at various McDonalds restaurants for over 30 years. 

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)


tokenized_datasets = dataset.map(tokenize_function, batched=True)

https://huggingface.co/bert-base-cased/resolve/main/tokenizer_config.json not found in cache or force_download set to True, downloading to C:\Users\ginof\.cache\huggingface\transformers\tmpxs52lnzm


Downloading tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

storing https://huggingface.co/bert-base-cased/resolve/main/tokenizer_config.json in cache at C:\Users\ginof/.cache\huggingface\transformers\ec84e86ee39bfe112543192cf981deebf7e6cbe8c91b8f7f8f63c9be44366158.ec5c189f89475aac7d8cbd243960a0655cfadc3d0474da8ff2ed0bf1699c2a5f
creating metadata file for C:\Users\ginof/.cache\huggingface\transformers\ec84e86ee39bfe112543192cf981deebf7e6cbe8c91b8f7f8f63c9be44366158.ec5c189f89475aac7d8cbd243960a0655cfadc3d0474da8ff2ed0bf1699c2a5f
https://huggingface.co/bert-base-cased/resolve/main/config.json not found in cache or force_download set to True, downloading to C:\Users\ginof\.cache\huggingface\transformers\tmpvoapuu1i


Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

storing https://huggingface.co/bert-base-cased/resolve/main/config.json in cache at C:\Users\ginof/.cache\huggingface\transformers\a803e0468a8fe090683bdc453f4fac622804f49de86d7cecaee92365d4a0f829.a64a22196690e0e82ead56f388a3ef3a50de93335926ccfa20610217db589307
creating metadata file for C:\Users\ginof/.cache\huggingface\transformers\a803e0468a8fe090683bdc453f4fac622804f49de86d7cecaee92365d4a0f829.a64a22196690e0e82ead56f388a3ef3a50de93335926ccfa20610217db589307
loading configuration file https://huggingface.co/bert-base-cased/resolve/main/config.json from cache at C:\Users\ginof/.cache\huggingface\transformers\a803e0468a8fe090683bdc453f4fac622804f49de86d7cecaee92365d4a0f829.a64a22196690e0e82ead56f388a3ef3a50de93335926ccfa20610217db589307
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dro

Downloading vocab.txt:   0%|          | 0.00/208k [00:00<?, ?B/s]

storing https://huggingface.co/bert-base-cased/resolve/main/vocab.txt in cache at C:\Users\ginof/.cache\huggingface\transformers\6508e60ab3c1200bffa26c95f4b58ac6b6d95fba4db1f195f632fa3cd7bc64cc.437aa611e89f6fc6675a049d2b5545390adbc617e7d655286421c191d2be2791
creating metadata file for C:\Users\ginof/.cache\huggingface\transformers\6508e60ab3c1200bffa26c95f4b58ac6b6d95fba4db1f195f632fa3cd7bc64cc.437aa611e89f6fc6675a049d2b5545390adbc617e7d655286421c191d2be2791
https://huggingface.co/bert-base-cased/resolve/main/tokenizer.json not found in cache or force_download set to True, downloading to C:\Users\ginof\.cache\huggingface\transformers\tmpttj9mn51


Downloading tokenizer.json:   0%|          | 0.00/426k [00:00<?, ?B/s]

storing https://huggingface.co/bert-base-cased/resolve/main/tokenizer.json in cache at C:\Users\ginof/.cache\huggingface\transformers\226a307193a9f4344264cdc76a12988448a25345ba172f2c7421f3b6810fddad.3dab63143af66769bbb35e3811f75f7e16b2320e12b7935e216bd6159ce6d9a6
creating metadata file for C:\Users\ginof/.cache\huggingface\transformers\226a307193a9f4344264cdc76a12988448a25345ba172f2c7421f3b6810fddad.3dab63143af66769bbb35e3811f75f7e16b2320e12b7935e216bd6159ce6d9a6
loading file https://huggingface.co/bert-base-cased/resolve/main/vocab.txt from cache at C:\Users\ginof/.cache\huggingface\transformers\6508e60ab3c1200bffa26c95f4b58ac6b6d95fba4db1f195f632fa3cd7bc64cc.437aa611e89f6fc6675a049d2b5545390adbc617e7d655286421c191d2be2791
loading file https://huggingface.co/bert-base-cased/resolve/main/tokenizer.json from cache at C:\Users\ginof/.cache\huggingface\transformers\226a307193a9f4344264cdc76a12988448a25345ba172f2c7421f3b6810fddad.3dab63143af66769bbb35e3811f75f7e16b2320e12b7935e216bd6159ce6

Map:   0%|          | 0/650000 [00:00<?, ? examples/s]

In [None]:
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

In [None]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)

from transformers import TrainingArguments

training_args = TrainingArguments(output_dir="test_trainer")

In [35]:
def num_to_sent(x):
    if x == 1.0:
        return "Positive"
    elif x == 0.0:
        return "Neutral"
    elif x == -1.0:
        return "Negative"
    else:
        return "None"

df = annotations.copy()
df.iloc[:,2:] = X.iloc[:,2:].applymap(lambda x: num_to_sent(x))

df.head()

Unnamed: 0,ID,text,Not Determined,Staff,Equipment,Hygiene,Location
0,3,"Sweet nice young people working, they have a s...",,,,,
1,4,Has trained for many years here and Bornholmsv...,,,,,
2,6,I exercise early in the morning and start befo...,,,,,
3,7,there is a nice atmosphere in the morning ther...,,,,,
4,8,"I trained 4-6 times a week for several years, ...",,,,,


In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score
np.random.seed = 7

for cat in X.columns[2:]:
    print(f"Category: {cat}")
    # Assuming 'X' is a list of text samples and 'y' is a list of corresponding categories
    X_train, X_test, y_train, y_test = train_test_split(X.text, X[cat], test_size=0.2, random_state=42)

    vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)
    
    classifier = MultinomialNB()
    classifier.fit(X_train_tfidf, y_train)

    scores = cross_val_score(classifier, X.text, X[cat], cv=5)
    print(scores)
    #y_pred = classifier.predict(X_test_tfidf)

    #accuracy = accuracy_score(y_test, y_pred)
    #print(f"Accuracy: {accuracy:.2f}")

    #print("\nClassification Report:")
    #print(classification_report(y_test, y_pred))

Category: Not Determined


ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\ginof\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\ginof\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\naive_bayes.py", line 747, in fit
    X, y = self._check_X_y(X, y)
  File "c:\Users\ginof\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\naive_bayes.py", line 581, in _check_X_y
    return self._validate_data(X, y, accept_sparse="csr", reset=reset)
  File "c:\Users\ginof\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 554, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "c:\Users\ginof\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\validation.py", line 1104, in check_X_y
    X = check_array(
  File "c:\Users\ginof\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\validation.py", line 877, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
  File "c:\Users\ginof\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\_array_api.py", line 185, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
  File "c:\Users\ginof\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\series.py", line 857, in __array__
    return np.asarray(self._values, dtype)
ValueError: could not convert string to float: 'Super cool training, skilled physiotherapist and good atmosphere 💪'

--------------------------------------------------------------------------------
4 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\ginof\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\ginof\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\naive_bayes.py", line 747, in fit
    X, y = self._check_X_y(X, y)
  File "c:\Users\ginof\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\naive_bayes.py", line 581, in _check_X_y
    return self._validate_data(X, y, accept_sparse="csr", reset=reset)
  File "c:\Users\ginof\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 554, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "c:\Users\ginof\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\validation.py", line 1104, in check_X_y
    X = check_array(
  File "c:\Users\ginof\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\validation.py", line 877, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
  File "c:\Users\ginof\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\_array_api.py", line 185, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
  File "c:\Users\ginof\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\series.py", line 857, in __array__
    return np.asarray(self._values, dtype)
ValueError: could not convert string to float: 'Sweet nice young people working, they have a smile on their lips and are always helpful say hi and goodbye..'


In [37]:
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X = vectorizer.fit_transform(df.text)

for cat in df.columns[2:]:
    print(f"Category: {cat}")
    # Assuming 'X' is a list of text samples and 'y' is a list of corresponding categories
    
    classifier = MultinomialNB()

    scores = cross_val_score(classifier, X, df[cat], cv=5)
    print(scores)
    #y_pred = classifier.predict(X_test_tfidf)

    #accuracy = accuracy_score(y_test, y_pred)
    #print(f"Accuracy: {accuracy:.2f}")

    #print("\nClassification Report:")
    #print(classification_report(y_test, y_pred))

Category: Not Determined
[1. 1. 1. 1. 1.]
Category: Staff
[1. 1. 1. 1. 1.]
Category: Equipment
[1. 1. 1. 1. 1.]
Category: Hygiene
[1. 1. 1. 1. 1.]
Category: Location
[1. 1. 1. 1. 1.]
