In [15]:
%pwd

'/home/yukikongju/Projects/tidytuesday/financials_news_sentimentanalysis/notebooks'

In [21]:
import pandas as pd
import torch

from torch import nn, optim

from transformers import pipeline

from datasets import load_dataset, Dataset, DatasetDict
from transformers import DataCollatorWithPadding
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

### Get Data

In [3]:
df = pd.read_csv('../data/data.csv')
# df = df[:400]

In [4]:
sentiment_dict = {'positive': 2, 'negative': 0, 'neutral': 1}
df['Sentiment'] = df['Sentiment'].apply(lambda x: sentiment_dict.get(str(x))).tolist()
df = df.rename(columns = {'Sentence': 'text', 'Sentiment': 'label'})

In [5]:
num_classes = len(df['label'].unique())

In [6]:
len(df)

5842

In [7]:
# dataset = load_dataset('csv', data_files='../data/data.csv')
# dataset = load_dataset('csv', data_files = {'train': ['../data/data.csv'], 'test': })
dataset = Dataset.from_pandas(df)
dataset = DatasetDict({
    "train": Dataset.from_pandas(df[:4000]),
    "eval": Dataset.from_pandas(df[4001:])
})

In [8]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 4000
    })
    eval: Dataset({
        features: ['text', 'label'],
        num_rows: 1841
    })
})

### Preprocessing the data with Tokenizer

In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pretrained_model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)
model = AutoModelForSequenceClassification.from_pretrained(pretrained_model_name, 
            num_labels = num_classes).to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [27]:
def tokenize_function(examples):
#     return tokenizer(examples['text'], padding = True, truncation = True)
    return tokenizer(examples['text'], padding = "max_length", truncation = True)



tokenized_datasets = dataset.map(tokenize_function, batched=True)

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [28]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4000
    })
    eval: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1841
    })
})

### Train Model with dataset

In [29]:
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
)

# data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model = model,
    args = training_args,
#     train_dataset=tokenized_datasets,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['eval'],
#     tokenizer=tokenizer,
#     data_collator = data_collator,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 4000
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1250


tensor([[-0.4475,  0.3415],
        [ 0.8887,  0.3479]], grad_fn=<AddmmBackward0>)

In [22]:
pipeline = pipeline('sentiment-analysis')

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/255M [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

In [23]:
pipeline('i am sad:(')

[{'label': 'NEGATIVE', 'score': 0.9992462396621704}]