In [9]:
%pip install transformers ml_things

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [10]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:
import io
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from ml_things import plot_dict, plot_confusion_matrix, fix_text
from sklearn.metrics import classification_report, accuracy_score
from transformers import (
    set_seed,
    TrainingArguments,
    Trainer,
    AutoConfig,
    AutoTokenizer,
    AdamW,
    get_linear_schedule_with_warmup,
    AutoModelForSequenceClassification
)

In [12]:
directory = '/content/drive/MyDrive/Colab Notebooks' # './'
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Hyperparameters #1
model_name_or_path = 'gpt2'
random_seed = 42
epochs = 4
batch_size = 32
learning_rate = 2e-5

set_seed(random_seed)

labels_ids = {'neg': 0, 'pos': 1}
n_labels = len(labels_ids)

In [13]:
def make_id_file(task, tokenizer):
    def make_data_strings(file_name):
        data_strings = []
        # 구글 드라이브 Colab Notebooks 폴더에 데이터셋(train, dev, csv) 올려두기
        with open(os.path.join(directory, file_name), 'r', encoding='utf-8') as f:
            id_file_data = [line.lower().strip() for line in tqdm(f.readlines())]
        for item in id_file_data:
            data_strings.append(''.join([k for k in item]))

        return data_strings
    
    print('it will take some times...')
    train_pos = make_data_strings('sentiment.train.1')
    train_neg = make_data_strings('sentiment.train.0')
    dev_pos = make_data_strings('sentiment.dev.1')
    dev_neg = make_data_strings('sentiment.dev.0')

    print('make id file finished!')
    return train_pos, train_neg, dev_pos, dev_neg

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
train_pos, train_neg, dev_pos, dev_neg = make_id_file('yelp', tokenizer)

train_pos

it will take some times...


100%|██████████| 266041/266041 [00:00<00:00, 1882165.43it/s]
100%|██████████| 177218/177218 [00:00<00:00, 1804977.95it/s]
100%|██████████| 2000/2000 [00:00<00:00, 1576213.45it/s]
100%|██████████| 2000/2000 [00:00<00:00, 1408666.33it/s]

make id file finished!





['excellent food .',
 'superb customer service .',
 'they also have daily specials and ice cream which is really good .',
 "it 's a good toasted hoagie .",
 'the staff is friendly .',
 'good bar food .',
 'good service .',
 'soup of day is homemade and lots of specials .',
 'great place for lunch or bar snacks and beer .',
 'the new range looks amazing .',
 'this place was very good .',
 'but the people are friendly & the food is good .',
 "traditional `` mom 'n pop '' quality and perfection .",
 "the best fish and chips you 'll ever enjoy and equally superb fried shrimp .",
 'you will love it .',
 'wonderful reuben .',
 'good fish sandwich .',
 'this is a hidden gem , no really .',
 'it took us forever to find but well worth it .',
 'huge sandwich !',
 'i added mushrooms , it was very flavorful .',
 'my boyfriend got the fish sandwich , he enjoyed it as well .',
 'fast and friendly service .',
 'will definitely be back .',
 "my dad 's favorite , as he knows the original owners .",
 'h

In [14]:
class SentimentDataset(object):
    def __init__(self, tokenizer, pos, neg):
        self.tokenizer = tokenizer
        self.texts = []
        self.label = []

        for pos_sent in pos:
            self.texts += [pos_sent]
            self.label += [[1]]
        for neg_sent in neg:
            self.texts += [pos_sent]
            self.label += [[0]]

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        sample = self.texts[index]
        return {'text':sample, 'label':torch.tensor(self.label[index])}

In [15]:
class Gpt2ClassificationCollator(object):
    def __init__(self, used_tokenizer, labels=True):
        self.used_tokenizer = used_tokenizer
        self.is_labels = labels

    def __call__(self, sequences):
        texts = [sequence['text'] for sequence in sequences]
        inputs = self.used_tokenizer(text=texts, return_tensors='pt', padding=True)
        
        if self.is_labels:
            labels = [sequence['label'] for sequence in sequences]
            # encode labels => not needed
            inputs.update({'labels': torch.tensor(labels)})
        
        return inputs    

In [16]:
def train(dataloader, optimizer_, scheduler_, device_):
    global model

    predictions_labels = []
    true_labels = []

    total_loss = 0

    model.train()

    for batch in tqdm(dataloader, total=len(dataloader)):
        true_labels += batch['labels'].numpy().flatten().tolist()

        batch = {k:v.type(torch.long).to(device_) for k, v in batch.items()}

        model.zero_grad()
        outputs = model(**batch)
        loss, logits = outputs[:2]
        total_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer_.step()
        scheduler_.step()

        logits = logits.detach().cpu().numpy()

        predictions_labels += logits.argmax(axis=-1).flatten().tolist()

    avg_epoch_loss = total_loss / len(dataloader)

    return true_labels, predictions_labels, avg_epoch_loss

In [17]:
def validation(dataloader, device_):
    global model

    predictions_labels = []
    true_labels = []
    total_loss = 0

    model.eval()
    for batch in tqdm(dataloader, total=len(dataloader)):
        true_labels += batch['labels'].numpy().flatten().tolist()

        batch = {k:v.type(torch.long).to(device_) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

            loss, logits = outputs[:2]
            logits = logits.detach().cpu().numpy()

            total_loss += loss.item()
            predict_content = logits.argmax(axis=-1).flatten().tolist()
            predictions_labels += predict_content

    avg_epoch_loss = total_loss / len(dataloader)
    return true_labels, predictions_labels, avg_epoch_loss

In [18]:
print('Loading configuration...')
model_config = AutoConfig.from_pretrained(pretrained_model_name_or_path=model_name_or_path, num_labels=n_labels)

print('Loading tokenizer...')
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
tokenizer.padding_side = 'left'
tokenizer.pad_token = tokenizer.eos_token

print('Loading model...')
model = AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path=model_name_or_path, config=model_config)

model.resize_token_embeddings(len(tokenizer))

model.config.pad_token_id = model.config.eos_token_id

model.to(device)
print(f'Model loaded to {device}')

Loading configuration...
Loading tokenizer...
Loading model...


Downloading:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded to cuda:0


In [19]:
gpt2_classification_collator = Gpt2ClassificationCollator(used_tokenizer=tokenizer, labels=True)

print('Dealing with Train...')
train_dataset = SentimentDataset(tokenizer, train_pos, train_neg)
print('Created `train_dataset` with %d examples!'%len(train_dataset))
# Valid
print(train_dataset[0])
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=gpt2_classification_collator)
print('Created `train_dataloader` with %d batches!'%len(train_dataloader))

print()

print('Dealing with Valid...')
valid_dataset = SentimentDataset(tokenizer, dev_pos, dev_neg)
print('Created `valid_dataset` with %d examples!'%len(valid_dataset))

# Move pytorch dataset into dataloader.
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, collate_fn=gpt2_classification_collator)
print('Created `Valid_dataloader` with %d batches!'%len(valid_dataloader))


Dealing with Train...
Created `train_dataset` with 443259 examples!
{'text': 'excellent food .', 'label': tensor([1])}
Created `train_dataloader` with 13852 batches!

Dealing with Valid...
Created `valid_dataset` with 4000 examples!
Created `Valid_dataloader` with 125 batches!


In [20]:
optimizer = AdamW(model.parameters(), lr=learning_rate)
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

all_loss = {'train_loss':[], 'val_loss':[]}
all_acc = {'train_acc':[], 'val_acc':[]}



In [None]:
print('Epoch')
for epoch in range(epochs):
    print(f'Epoch {epoch}: Training on batches')

    train_labels, train_predict, train_loss = train(train_dataloader, optimizer, scheduler, device)
    train_acc = accuracy_score(train_labels, train_predict)

    print('Validation on batches')
    valid_labels, valid_predict, val_loss = validation(valid_dataloader, device)
    val_acc = accuracy_score(valid_labels, valid_predict)

    print("  train_loss: %.5f - val_loss: %.5f - train_acc: %.5f - valid_acc: %.5f"%(train_loss, val_loss, train_acc, val_acc))
    print()

    # Store the loss value for plotting the learning curve.
    all_loss['train_loss'].append(train_loss)
    all_loss['val_loss'].append(val_loss)
    all_acc['train_acc'].append(train_acc)
    all_acc['val_acc'].append(val_acc)

# Plot loss curves.
plot_dict(all_loss, use_xlabel='Epochs', use_ylabel='Value', use_linestyles=['-', '--'])

# Plot accuracy curves.
plot_dict(all_acc, use_xlabel='Epochs', use_ylabel='Value', use_linestyles=['-', '--'])

Epoch
Epoch 0: Training on batches


  8%|▊         | 1091/13852 [01:02<11:30, 18.48it/s]

In [None]:
# Get prediction form model on validation data. This is where you should use
# your test data.
true_labels, predictions_labels, avg_epoch_loss = validation(valid_dataloader, device)

# Create the evaluation report.
evaluation_report = classification_report(true_labels, predictions_labels, labels=list(labels_ids.values()), target_names=list(labels_ids.keys()))
# Show the evaluation report.
print(evaluation_report)

# Plot confusion matrix.
plot_confusion_matrix(y_true=true_labels, y_pred=predictions_labels, 
                      classes=list(labels_ids.keys()), normalize=True, 
                      magnify=0.1,
                      );

In [None]:
import pandas as pd
test_df = pd.read_csv(directory+'/test_no_label.csv')

test_dataset = test_df['Id']

def make_id_file_test(test_dataset):
    data_strings = []
    id_file_data = [sent.lower().strip() for sent in test_dataset]
    for item in id_file_data:
        data_strings.append(''.join([k for k in item]))
    return data_strings

test = make_id_file_test(test_dataset)

# %%
test[:10]

In [None]:
class SentimentTestDataset(object):
    def __init__(self, tokenizer, test):
        self.tokenizer = tokenizer
        self.texts = []

        for sent in test:
            self.texts += [test]

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        sample = self.texts[index]
        return {'text':sample}
        
# %%
test_dataset = SentimentTestDataset(tokenizer, test)



# %%
test_batch_size = 32
gpt2_classification_collator = Gpt2ClassificationCollator(used_tokenizer=tokenizer, labels=False)

print('Dealing with Test...')
test_dataset = SentimentDataset(tokenizer, dev_pos, dev_neg)
print('Created `test_dataset` with %d examples!'%len(test_dataset))

# Move pytorch dataset into dataloader.
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=gpt2_classification_collator)
print('Created `test_dataloader` with %d batches!'%len(test_dataloader))

In [None]:
def test(dataloader, device_):
    global model

    predictions_labels = []
    total_loss = 0

    model.eval()
    for batch in tqdm(dataloader, total=len(dataloader)):
        batch = {k:v.type(torch.long).to(device_) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

            loss, logits = outputs[:2]
            logits = logits.detach().cpu().numpy()

            total_loss += loss.item()
            predict_content = logits.argmax(axis=-1).flatten().tolist()
            predictions_labels += predict_content

    avg_epoch_loss = total_loss / len(dataloader)
    return predictions_labels, avg_epoch_loss

predictions_labels, avg_epoch_loss = test(test_dataloader, device)

# %%
test_df['Category'] = predictions_labels

# %%
print(predictions_labels)

# %%
test_df.to_csv(f'submission_{model_name_or_path}.csv', index=False)

# %%
!cp /content/submission.csv /content/drive/MyDrive/submission.csv

In [None]:
gpt2_classification_collator = Gpt2ClassificationCollator(used_tokenizer=tokenizer, labels=True)


print('Dealing with Test...')
test_dataset = SentimentDataset(tokenizer, dev_pos, dev_neg)
print('Created `test_dataset` with %d examples!'%len(test_dataset))

# Move pytorch dataset into dataloader.
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=gpt2_classification_collator)
print('Created `test_dataloader` with %d batches!'%len(test_dataloader))
