In [7]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
import evaluate

# Prepare and tokenize dataset
dataset = load_dataset("yelp_review_full")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", cache_dir='model/.cache')

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(200))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(200))

# Setup evaluation 
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# Load pretrained model and evaluate model after each epoch
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)
training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 650000/650000 [04:25<00:00, 2445.45 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50000/50000 [00:20<00:00, 2487.40 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
/home/conda/feedstock_root/build_artifacts/libtorch_1724898583682/work/aten/src/ATen/native/cuda/Indexing.cu:1284: indexSelectLargeIndex: block: [214,0,0], thread: [32,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/home/conda/feedstock_root/build_artifacts/libtorch_1724898583682/work/aten/src/ATen/native/cuda/Indexing.cu:1284: indexSelectLargeIndex: block: [214,0,0], thread: [33,0,0] Assertion `srcIndex < srcSelectDimSize` 

RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/mnt/data_disk/chu123/anaconda3/envs/jyuzh/lib/python3.12/site-packages/torch/nn/parallel/parallel_apply.py", line 83, in _worker
    output = module(*input, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/mnt/data_disk/chu123/anaconda3/envs/jyuzh/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/mnt/data_disk/chu123/anaconda3/envs/jyuzh/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/mnt/data_disk/chu123/anaconda3/envs/jyuzh/lib/python3.12/site-packages/transformers/models/bert/modeling_bert.py", line 1695, in forward
    outputs = self.bert(
              ^^^^^^^^^^
  File "/mnt/data_disk/chu123/anaconda3/envs/jyuzh/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/mnt/data_disk/chu123/anaconda3/envs/jyuzh/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/mnt/data_disk/chu123/anaconda3/envs/jyuzh/lib/python3.12/site-packages/transformers/models/bert/modeling_bert.py", line 1077, in forward
    embedding_output = self.embeddings(
                       ^^^^^^^^^^^^^^^^
  File "/mnt/data_disk/chu123/anaconda3/envs/jyuzh/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/mnt/data_disk/chu123/anaconda3/envs/jyuzh/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/mnt/data_disk/chu123/anaconda3/envs/jyuzh/lib/python3.12/site-packages/transformers/models/bert/modeling_bert.py", line 217, in forward
    embeddings = self.LayerNorm(embeddings)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/mnt/data_disk/chu123/anaconda3/envs/jyuzh/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/mnt/data_disk/chu123/anaconda3/envs/jyuzh/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/mnt/data_disk/chu123/anaconda3/envs/jyuzh/lib/python3.12/site-packages/torch/nn/modules/normalization.py", line 202, in forward
    return F.layer_norm(
           ^^^^^^^^^^^^^
  File "/mnt/data_disk/chu123/anaconda3/envs/jyuzh/lib/python3.12/site-packages/torch/nn/functional.py", line 2576, in layer_norm
    return torch.layer_norm(input, normalized_shape, weight, bias, eps, torch.backends.cudnn.enabled)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.



] Assertion `srcIndex < srcSelectDimSize` failed.
/home/conda/feedstock_root/build_artifacts/libtorch_1724898583682/work/aten/src/ATen/native/cuda/Indexing.cu:1284: indexSelectLargeIndex: block: [830,0,0], thread: [90,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/home/conda/feedstock_root/build_artifacts/libtorch_1724898583682/work/aten/src/ATen/native/cuda/Indexing.cu:1284: indexSelectLargeIndex: block: [830,0,0], thread: [91,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/home/conda/feedstock_root/build_artifacts/libtorch_1724898583682/work/aten/src/ATen/native/cuda/Indexing.cu:1284: indexSelectLargeIndex: block: [830,0,0], thread: [92,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/home/conda/feedstock_root/build_artifacts/libtorch_1724898583682/work/aten/src/ATen/native/cuda/Indexing.cu:1284: indexSelectLargeIndex: block: [830,0,0], thread: [93,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/home/conda/feedstock_root/build_artifacts/libtorch_1724898583682/

k: [812] Assertion `srcIndex < srcSelectDimSize,0` failed.
` failed.
/home/conda/feedstock_root/build_artifacts/libtorch_1724898583682/work/aten/src/ATen/native/cuda/Indexing.cu/home/conda/feedstock_root/build_artifacts/libtorch_1724898583682/work/aten/src/ATen/native/cuda/Indexing.cu:1284,0: indexSelectLargeIndex], thread: [106:1284: indexSelectLargeIndex: block: [832: block: [672,0,0,0,0], thread: [96,0,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
,0,0], thread: [41] Assertion `srcIndex < srcSelectDimSize,0], thread: [86` failed.
,0/home/conda/feedstock_root/build_artifacts/libtorch_1724898583682/work/aten/src/ATen/native/cuda/Indexing.cu] Assertion `srcIndex < srcSelectDimSize,0,0/home/conda/feedstock_root/build_artifacts/libtorch_1724898583682/work/aten/src/ATen/native/cuda/Indexing.cu] Assertion `srcIndex < srcSelectDimSize:1284` failed.
: indexSelectLargeIndex:1284: block: [832: indexSelectLargeIndex` failed.
/home/conda/feedstock_root/build_artifacts/libtorch_17248985836

Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5427/5427 [00:01<00:00, 2754.68 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5427/5427 [00:00<00:00, 77087.29 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


RuntimeError: CUDA error: peer mapping resources exhausted
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [1]:
import os
import torch
import torch.nn as nn
from torch.nn import BCEWithLogitsLoss
from transformers import BertModel, BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
from peft import get_peft_model, LoraConfig, TaskType
import evaluate
import numpy as np
from transformers import DataCollatorWithPadding
# Convert labels to multi-label format
def convert_labels_to_multilabel(examples):
    labels = examples['labels']
    multi_labels = np.zeros(28)  # Assuming 28 emotion labels
    for i, label_list in enumerate(labels):
        for label in label_list:
            multi_labels[i][label] = 1
    examples['labels'] = multi_labels.tolist()
    return examples

# Êï∞ÊçÆÈ¢ÑÂ§ÑÁêÜ
def preprocess_function(examples):
    return tokenizer(examples['text'], truncation=True, padding='max_length')

# ÂÆö‰πâËá™ÂÆö‰πâÁöÑPyTorchÊ®°ÂûãÁ±ª
class CustomBERTModel(nn.Module):
    def __init__(self, bert_model_name='bert-base-uncased', num_labels=28):
        super(CustomBERTModel, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)
    
    def forward(self, input_ids, attention_mask, token_type_ids):
        outputs = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        pooled_output = outputs[1]  # Ëé∑ÂèñÊ±†ÂåñÁöÑËæìÂá∫
        logits = self.classifier(pooled_output)
        return logits

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
dataset = load_dataset("google-research-datasets/go_emotions", "simplified")


In [1]:
import torch
import transformers
torch.__version__, transformers.__version__

  from .autonotebook import tqdm as notebook_tqdm


('2.4.0.post301', '4.44.1')

In [2]:
torch.cuda.is_available()

True

In [1]:
import pandas as pd
from datasets import Dataset

# ÂÅáËÆæ‰Ω†Â∑≤ÁªèÊúâ‰∏Ä‰∏™Pandas DataFrame
data = {
    'text': ["I love programming!", "I feel sad today."],
    'label': [1, 0]
}
df = pd.DataFrame(data)

# Â∞ÜPandas DataFrameËΩ¨Êç¢‰∏∫Hugging FaceÁöÑDatasetÂØπË±°
dataset = Dataset.from_pandas(df)

# ÊâìÂç∞DatasetÂØπË±°‰ª•È™åËØÅËΩ¨Êç¢
print(dataset)

  from .autonotebook import tqdm as notebook_tqdm


Dataset({
    features: ['text', 'label'],
    num_rows: 2
})


In [4]:
import torch
import transformers
from transformers import BertTokenizer, BertPreTrainedModel
from bert_finetune import CustomBERTModel
# ‰ΩøÁî®ÂæÆË∞ÉÂêéÁöÑÊ®°ÂûãËøõË°åÈ¢ÑÊµã
texts = [
    "I love programming!",
    "I feel sad today."
]
tokenizer = BertTokenizer.from_pretrained("model/fine_tuned_bert")
model = CustomBERTModel.from_pretrained("model/fine_tuned_bert")
# inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
# outputs = model(**inputs)
# predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)

# # ÊâìÂç∞È¢ÑÊµãÁªìÊûú
# for i, text in enumerate(texts):
#     print(f"Text: {text}")
#     print(f"Predicted probabilities: {predictions[i].detach().numpy()}")

  from .autonotebook import tqdm as notebook_tqdm


BertConfig {
  "_name_or_path": "model/fine_tuned_bert",
  "architectures": [
    "CustomBERTModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "cls_num_labels": 28,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_name": "bert-base-uncased",
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.44.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

BertConfig {
  "_name_or_path": "model/fine_tuned_bert",
  "architectures": [
    "CustomBERTModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "cls_num_labels": 28,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  

In [32]:
model.cuda(1)
inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
inputs = {k: v.cuda(1) for k, v in inputs.items()}
outputs = model(**inputs)

RuntimeError: Invalid device string: '1'

In [31]:
inputs

{'input_ids': tensor([[ 101, 1045, 2293, 4730,  999,  102,    0],
         [ 101, 1045, 2514, 6517, 2651, 1012,  102]], device='cuda:1'),
 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0]], device='cuda:1'),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 0],
         [1, 1, 1, 1, 1, 1, 1]], device='cuda:1')}

In [23]:
inputs['input_ids'].shape

torch.Size([2, 7])

In [15]:
lst = []
lst.extend(outputs['logits'].softmax(dim=-1)[:,4].tolist())

In [19]:
outputs['logits'].softmax(dim=-1)[:,4].tolist()

[0.002086709486320615, 0.002517945133149624]

In [19]:
model.classifier(outputs['pooler_output'])

AttributeError: 'BertModel' object has no attribute 'classifier'

In [28]:
from datasets import load_dataset
datasets = load_dataset("go_emotions", "simplified")
viewdata = datasets['train'].select(range(5))

In [32]:
viewdata[0]['labels']

AttributeError: 'list' object has no attribute 'id2str'

In [33]:
datasets.features

AttributeError: 'DatasetDict' object has no attribute 'features'

In [2]:
label_map = {
    0: 'admiration',
    1: 'amusement',
    2: 'anger',
    3: 'annoyance',
    4: 'approval',
    5: 'caring',
    6: 'confusion',
    7: 'curiosity',
    8: 'desire',
    9: 'disappointment',
    10: 'disapproval',
    11: 'disgust',
    12: 'embarrassment',
    13: 'excitement',
    14: 'fear',
    15: 'gratitude',
    16: 'grief',
    17: 'joy',
    18: 'love',
    19: 'nervousness',
    20: 'optimism',
    21: 'pride',
    22: 'realization',
    23: 'relief',
    24: 'remorse',
    25: 'sadness',
    26: 'surprise',
    27: 'neutral'
}

print(label_map.values())

dict_values(['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral'])


In [1]:
import pandas as pd
import os
import itertools

# ËØªÂèñÊñá‰ª∂Â§π‰∏≠ÁöÑÊâÄÊúâÊñá‰ª∂
file_list = os.listdir('data')

# Á≠õÈÄâÂá∫ÂâçÁºÄ‰∏∫ 'Appliance_disappointment' Âíå 'Appliance_regression' ÁöÑÊñá‰ª∂
disappointment_files = [f for f in file_list if f.__contains__('disappointment')]
regression_files = [f for f in file_list if f.__contains__('regression')]
pair = [(i,j) for i,j in itertools.product(disappointment_files, regression_files) if i.split('_')[0] == j.split('_')[0]]
# pair = = [(i, [(i, j) j) for for i, i, j j in in itertools.product(disappointment_files, itertools.product(disappointment_files, regression_files) regression_files) if if i.split('_')[0] i.split('_')[0] == == j.split('_')[0]] j.split('_')[0]]
# disappointment_files, regression_files
# ËØªÂèñÂπ∂ÂêàÂπ∂Êñá‰ª∂



In [3]:
for file1,file2 in pair:
    df1 = pd.read_csv(os.path.join('data', file1))
    print(df1.head())
    df2 = pd.read_csv(os.path.join('data', file2))
    print(df2.head())
    merged_df = pd.merge(df1,df2, on=['asin','parent_asin','user_id'])
    merged_df.to_csv(os.path.join('data', file1.split('_')[0] + '_merged.csv'), index=False)
    # break
# merged_df.head()

         asin parent_asin                       user_id  disappointment
0  B09BGPFTDB  B09BGPFTDB  AFKZENTNBQ7A7V7UXW5JJI6UGRYQ        0.021089
1  0593235657  0593235657  AFKZENTNBQ7A7V7UXW5JJI6UGRYQ        0.003102
2  1782490671  1782490671  AFKZENTNBQ7A7V7UXW5JJI6UGRYQ        0.001153
3  0593138228  0593138228  AFKZENTNBQ7A7V7UXW5JJI6UGRYQ        0.001506
4  0823098079  0823098079  AFKZENTNBQ7A7V7UXW5JJI6UGRYQ        0.001676
   rating                                              title  \
0       1      Not a watercolor book! Seems like copies imo.   
1       5  Updated: after 1st arrived damaged this one is...   
2       5                              Excellent! I love it!   
3       5       Updated after 1st arrived damaged. Excellent   
4       5                                Beautiful patterns!   

                                                text  \
0  It is definitely not a watercolor book.  The p...   
1  Updated: after first book arrived very damaged...   
2  I bought it 

In [4]:
df1

Unnamed: 0,asin,parent_asin,user_id
0,B09BGPFTDB,B09BGPFTDB,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ
1,0593235657,0593235657,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ
2,1782490671,1782490671,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ
3,0593138228,0593138228,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ
4,0823098079,0823098079,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ
...,...,...,...
999995,B008SLDX20,B008SLDX20,AEMUH2AACMXFE7JPIJZ373GL2HIQ
999996,0440207622,0440207622,AEMUH2AACMXFE7JPIJZ373GL2HIQ
999997,044021422X,044021422X,AEMUH2AACMXFE7JPIJZ373GL2HIQ
999998,0440200563,0440200563,AEMUH2AACMXFE7JPIJZ373GL2HIQ
