<a href="https://colab.research.google.com/github/ucaokylong/Some_small_projects/blob/main/Transformer_Application_Text_classification_and_BERT_Finetune_and_Vision_transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
 # import libs
import os
import time
import random
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.utils.data as data

from torchvision import datasets , models , transforms

from torchsummary import summary

import matplotlib.pyplot as plt
from PIL import Image

In [None]:
device = "cuda" if torch.cuda.is_available() else 'cpu'


class TokenAndPositionEmbedding(nn.Module):
  def __init__(self, vocab_size , embed_dim, max_length, device= "cuda" if torch.cuda.is_available() else 'cpu'):
    super().__init__()
    self.device = device
    self.word_emb = nn.Embedding(
        num_embeddings = vocab_size,
        embedding_dim = embed_dim
    )

    self.pos_emb = nn.Embedding(
        num_embeddings = max_length,
        embedding_dim = embed_dim
    )

  def forward(self,x):
    N, seq_len = x.size()
    positions = torch.arange(0, seq_len).expand(N, seq_len).to(self.device)
    output1 = self.word_emb(x)
    output2 = self.pos_emb(positions)
    output = output1 + output2
    return output

In [None]:
class TransformerEncoderBlock(nn.Module):
  def __init__(self, embed_dim, num_heads, ff_dim, dropout = 0.1):
    super().__init__()
    self.attn = nn.MultiheadAttention(
        embed_dim = embed_dim,
        num_heads=num_heads,
        batch_first = True
    )

    self.ffn = nn.Sequential(
        nn.Linear(in_features=embed_dim, out_features = ff_dim, bias = True),
        nn.ReLU(),
        nn.Linear(in_features=ff_dim, out_features = embed_dim, bias = True)
    )
    self.layernorm_1 = nn.LayerNorm(normalized_shape=embed_dim, eps = 1e-6)
    self.layernorm_2 = nn.LayerNorm(normalized_shape=embed_dim, eps = 1e-6)
    self.dropout_1 = nn.Dropout(p=dropout)
    self.dropout_2 = nn.Dropout(p=dropout)

  def forward(self, query, key, value):
    attn_output, _ = self.attn(query, key, value)
    attn_output = self.dropout_1(attn_output)
    out_1 = self.layernorm_1(query + attn_output)
    ffn_output = self.ffn(out_1)
    ffn_output = self.dropout_2(ffn_output)
    out_2 = self.layernorm_2(out_1 + ffn_output)
    return out_2


In [None]:
class TransformerEncoder( nn.Module ):
  def __init__(self , src_vocab_size , embed_dim , max_length , num_layers ,
               num_heads , ff_dim , dropout =0.1 ,
               device= "cuda" if torch.cuda.is_available() else 'cpu'):
    super().__init__()
    self.embedding = TokenAndPositionEmbedding(
        src_vocab_size, embed_dim, max_length, device)
    self.layers = nn.ModuleList(
        [
            TransformerEncoderBlock(
                embed_dim, num_heads, ff_dim, dropout
            ) for i in range(num_layers)

        ]
    )

  def forward(self, x):
    output = self.embedding(x)
    for layer in self.layers:
      output = layer(output, output, output)
    return output




In [None]:
class TransformerDecoderBlock(nn.Module):
  def __init__( self , embed_dim , num_heads , ff_dim , dropout =0.1):
    super().__init__()
    self.attn = nn.MultiheadAttention(
        embed_dim = embed_dim,
        num_heads=num_heads,
        batch_first=True
    )

    self.cross_attn = nn.MultiheadAttention(
        embed_dim = embed_dim ,
        num_heads = num_heads ,
        batch_first = True
    )

    self.ffn = nn.Sequential(
        nn.Linear( in_features = embed_dim , out_features = ff_dim , bias = True ) ,
        nn.ReLU() ,
        nn.Linear( in_features = ff_dim , out_features = embed_dim , bias = True )
    )
    self.layernorm_1 = nn.LayerNorm( normalized_shape = embed_dim , eps =1e-6)
    self.layernorm_2 = nn.LayerNorm( normalized_shape = embed_dim , eps =1e-6)
    self.layernorm_3 = nn.LayerNorm( normalized_shape = embed_dim , eps =1e-6)
    self.dropout_1 = nn.Dropout( p = dropout )
    self.dropout_2 = nn.Dropout( p = dropout )
    self.dropout_3 = nn.Dropout( p = dropout )

  def forward ( self , x , enc_output , src_mask , tgt_mask ) :
    attn_output , _ = self.attn(x , x , x , attn_mask = tgt_mask )
    attn_output = self.dropout_1( attn_output )
    out_1 = self.layernorm_1( x + attn_output )

    attn_output , _ = self.cross_attn(out_1 , enc_output , enc_output , attn_mask = src_mask)
    attn_output = self.dropout_2( attn_output )
    out_2 = self.layernorm_2( out_1 + attn_output )
    ffn_output = self.ffn( out_2 )
    ffn_output = self.dropout_2( ffn_output )
    out_3 = self.layernorm_2( out_2 + ffn_output )
    return out_3



In [None]:
class TransformerDecoder( nn.Module ):
  def __init__ ( self ,tgt_vocab_size , embed_dim , max_length , num_layers ,
                num_heads , ff_dim , dropout =0.1 ,
                 device = "cuda" if torch.cuda.is_available() else 'cpu'):
    super().__init__()
    self.embedding = TokenAndPositionEmbedding(
        tgt_vocab_size, embed_dim, max_length, device)
    self.layers = nn.ModuleList(
        [
            TransformerDecoderBlock(
                embed_dim, num_heads, ff_dim, dropout
            ) for i in range(num_layers)
        ]
    )

  def forward( self , x , enc_output , src_mask , tgt_mask ):
    output = self.embedding( x )
    for layer in self.layers:
      output = layer( output , enc_output , src_mask , tgt_mask )
    return output
















In [None]:
class Transformer(nn.Module):
  def __init__(self, src_vocab_size , tgt_vocab_size , embed_dim ,
               max_length , num_layers , num_heads , ff_dim , dropout =0.1 ,
               device= "cuda" if torch.cuda.is_available() else 'cpu'):
    super().__init__()
    self.device = device
    self.encoder = TransformerEncoder(
        src_vocab_size , embed_dim , max_length , num_layers , num_heads , ff_dim
    )
    self.decoder = TransformerDecoder(
        tgt_vocab_size , embed_dim , max_length , num_layers , num_heads , ff_dim
    )
    self.fc = nn.Linear( embed_dim , tgt_vocab_size )

  def generate_mask(self, src, tgt):
    src_seq_len = src.shape[1]
    tgt_seq_len = tgt.shape[1]

    src_mask = torch.zeros(
        ( src_seq_len , src_seq_len ),
        device=self.device).type(torch.bool)

    tgt_mask = (torch.triu(torch.ones((tgt_seq_len, tgt_seq_len), device = self.device)) ==1).transpose(0,1)
    tgt_mask = tgt_mask.float().masked_fill(tgt_mask == 0, float("-inf")).masked_fill(tgt_mask == 1, float(0.0))

    return src_mask, tgt_mask

  def forward(self, src, tgt):
    src_mask, tgt_mask = self.generate_mask(src, tgt)
    enc_output = self.encoder(src)
    dec_output = self.decoder(tgt, enc_output, src_mask, tgt_mask)
    output = self.fc(dec_output)
    return output



In [None]:
batch_size = 128
src_vocab_size = 1000
tgt_vocab_size = 2000
embed_dim = 200
max_length = 100
num_layers = 2
num_heads = 4
ff_dim = 256

model = Transformer(src_vocab_size , tgt_vocab_size ,embed_dim , max_length , num_layers , num_heads , ff_dim)
model.to(device)
src = torch.randint( high =2 , size =( batch_size , max_length ) , dtype = torch.int64)
src = src.to(device)

tgt = torch.randint(high =2 ,size =( batch_size , max_length ) , dtype = torch.int64)
tgt = tgt.to(device)
prediction = model( src , tgt )
prediction.shape # batch_size x max_length x tgt_vocab_size

torch.Size([128, 100, 2000])

In [None]:
print(src.device)
print(tgt.device)

cpu
cpu


#TEXT CLASSIFICATION

In [None]:
#download
!git clone https://github.com/congnghia0609/ntc-scv.git
!unzip ./ntc-scv/data/data_test.zip -d ./data
!unzip ./ntc-scv/data/data_train.zip -d ./data
!rm -rf ./ntc-scv



[1;30;43mKết quả truyền trực tuyến bị cắt bớt đến 5000 dòng cuối.[0m
  inflating: ./data/data_train/train/pos/31228.txt  
  inflating: ./data/data_train/train/pos/3123.txt  
  inflating: ./data/data_train/train/pos/31232.txt  
  inflating: ./data/data_train/train/pos/31233.txt  
  inflating: ./data/data_train/train/pos/31234.txt  
  inflating: ./data/data_train/train/pos/31236.txt  
  inflating: ./data/data_train/train/pos/31238.txt  
  inflating: ./data/data_train/train/pos/31240.txt  
  inflating: ./data/data_train/train/pos/31242.txt  
  inflating: ./data/data_train/train/pos/31247.txt  
  inflating: ./data/data_train/train/pos/31250.txt  
  inflating: ./data/data_train/train/pos/31251.txt  
  inflating: ./data/data_train/train/pos/31252.txt  
  inflating: ./data/data_train/train/pos/31257.txt  
  inflating: ./data/data_train/train/pos/31260.txt  
  inflating: ./data/data_train/train/pos/31261.txt  
  inflating: ./data/data_train/train/pos/31262.txt  
  inflating: ./data/data_trai

In [None]:
# load data
import os
import pandas as pd

def load_data_from_path(folder_path):
  examples = []
  for label in os.listdir(folder_path):
    full_path = os.path.join(folder_path, label)
    for file_name in os.listdir(full_path):
      file_path = os.path.join(full_path, file_name)
      with open(file_path, "r", encoding= 'utf-8') as f:
        lines = f.readlines()
        sentence = " ".join(lines)
        if label == 'neg':
          label = 0
        if label == "pos":
          label = 1
        data = {
            "sentence": sentence ,
            "label": label
        }
        examples.append(data)
  return pd.DataFrame(examples)

folder_paths = {
    "train": './data/data_train/train',
    'valid': "./data/data_train/test",
    'test' : "./data/data_test/test"
}

train_df = load_data_from_path( folder_paths['train'])
valid_df = load_data_from_path( folder_paths['valid'])
test_df = load_data_from_path( folder_paths['test'])

In [None]:
train_df.head(10)

Unnamed: 0,sentence,label
0,"Năm mới tăng_giá gấp 3 năm cũ nhé , 1 ly cafe ...",0
1,M học Hsu nên ún ở đây nhìu .\n Về mức giá thì...,0
2,"Chỗ ngồi nóng , phòng có điều_hoà nhưng chạy q...",0
3,Chất_lượng các món ăn tồi hơn cái giá của quán...,0
4,"I came here for lunch , the eatery is a like t...",0
5,Quán gần khu tôi sống . Quán ko gian giống lề_...,0
6,"Mình vs bạn ăn chè trứng bột năng bột báng , b...",0
7,Nói về quán cafe thần_thánh này thì mìh chỉ có...,0
8,Lâu lâun thèm ăn lại mà thấy ngán . Nước_chấm ...,0
9,"14/2 / 2016 mình có ghe cafe ở đây , khi ra về...",0


In [None]:
#Preprocessing
import re
import string

def preprocess_text(text):
  # remove URLs https://www.
  url_pattern = re.compile(r'https?://\s+\wwww\.\s+')
  text = url_pattern.sub(r" ", text)

  # remove HTML Tags: <>
  html_pattern = re.compile(r"<[^<>]+>")
  text = html_pattern.sub(" ", text)

  #remove puncs and digits
  replace_chars = list(string.punctuation + string.digits)
  for char in replace_chars:
    text = text.replace(char," ")

  #remove emoji
  emoji_pattern = re.compile("["
    u"\U0001F600-\U0001F64F" # emoticons
    u"\U0001F300-\U0001F5FF" # symbols & pictographs
    u"\U0001F680-\U0001F6FF" # transport & map symbols
    u"\U0001F1E0-\U0001F1FF" # flags (iOS)
    u"\U0001F1F2-\U0001F1F4" # Macau flag
    u"\U0001F1E6-\U0001F1FF" # flags
    u"\U0001F600-\U0001F64F"
    u"\U00002702-\U000027B0"
    u"\U000024C2-\U0001F251"
    u"\U0001f926-\U0001f937"
    u"\U0001F1F2"
    u"\U0001F1F4"
    u"\U0001F620"
    u"\u200d"
    u"\u2640-\u2642"
    "]+",  flags = re.UNICODE)

  text = emoji_pattern.sub(r" ", text)

  #normalize whitespace
  text = " ".join(text.split())

  # lowercasing
  text = text.lower()
  return text

train_df['sentence'] = [ preprocess_text( row['sentence']) for index , row in train_df.iterrows() ]
valid_df['sentence'] = [ preprocess_text( row['sentence']) for index , row in valid_df.iterrows() ]
test_df['sentence'] = [preprocess_text( row['sentence']) for index , row in test_df.iterrows() ]


In [None]:
#Representation
 # ! pip install -q torchtext ==0.16.0
def yield_tokens( sentences , tokenizer ):
  for sentence in sentences :
    yield tokenizer( sentence )

 # word - based tokenizer
from torchtext.data.utils import get_tokenizer
tokenizer = get_tokenizer("basic_english")

#build vocabulary
from torchtext.vocab import build_vocab_from_iterator

vocab_size = 10000
vocabulary = build_vocab_from_iterator(
    yield_tokens(train_df['sentence'], tokenizer),
    max_tokens = vocab_size,
    specials=["<pad>", "<unk>"]
)

vocabulary.set_default_index(vocabulary["<unk>"])

 # convert torchtext dataset
from torchtext.data.functional import to_map_style_dataset

def prepare_dataset(df):
  # create iterator for dataset : ( sentence , label )
  for index , row in df.iterrows():
    sentence = row['sentence']
    encoded_sentence = vocabulary( tokenizer(sentence))
    label = row['label']
    yield encoded_sentence, label

train_dataset = prepare_dataset( train_df )
train_dataset = to_map_style_dataset( train_dataset)

valid_dataset = prepare_dataset( valid_df )
valid_dataset = to_map_style_dataset( valid_dataset)

test_dataset = prepare_dataset( test_df )
test_dataset = to_map_style_dataset( test_dataset )










In [None]:
#dataloader
import torch
seq_length = 100

def collate_batch(batch):
  # create inputs , offsets , labels for batch
  sentences , labels = list(zip(*batch))
  encoded_sentences = [sentence + ([0]* (seq_length - len(sentence))) if len(sentence) < seq_length
                       else sentence[ :seq_length] for sentence in sentences]

  encoded_sentences = torch.tensor(encoded_sentences, dtype = torch.int64)
  labels = torch.tensor(labels)

  return encoded_sentences, labels

from torch.utils.data import DataLoader
batch_size = 128

train_dataloader = DataLoader(
    train_dataset,
    batch_size = batch_size,
    shuffle = True,
    collate_fn= collate_batch
)

valid_dataloader = DataLoader(
    valid_dataset,
    batch_size = batch_size,
    shuffle = False,
    collate_fn= collate_batch
)

train_dataloader = DataLoader(
    test_dataset,
    batch_size = batch_size,
    shuffle = False,
    collate_fn= collate_batch
)

In [None]:
#train epoch
import time
import torch.nn.functional as F
def train_epoch( model , optimizer , criterion , train_dataloader , device , epoch =0, log_interval = 50):
  model.train()
  total_acc, total_count = 0,0
  losses = []
  start_time = time.time()

  for idx, (inputs, labels) in enumerate(train_dataloader):
    inputs = inputs.to(device)
    labels = labels.to(device)

    optimizer.zero_grad()
    #predictions = F.softmax(model(inputs))
    predictions = model(inputs)

    #compute loss
    loss = criterion(predictions, labels)
    losses.append(loss.item())

    #backward
    loss.backward()
    optimizer.step()
    total_acc += (predictions.argmax(1) == labels).sum().item()
    total_count += labels.size(0)
    if idx % log_interval == 0 and idx > 0:
      elapsed = time.time() - start_time
      print(f"| epoch {epoch} | {idx}/{len(train_dataloader)} "
            f"| accuracy {total_acc / total_count}"
      )
      total_acc, total_count = 0,0
      start_time = time.time()

    epoch_acc = total_acc / total_count
    epoch_loss = sum(losses) / len(losses)
    return epoch_acc, epoch_loss

#evaluate
def evaluate_epoch(model, criterion, valid_dataloader, device):
  model.eval()
  total_acc, total_count = 0,0
  losses = []

  with torch.no_grad():
    for idx, (inputs, labels) in enumerate(valid_dataloader):
      inputs = inputs.to(device)
      labels = labels.to(device)
      #predictions = F.softmax(model(inputs))
      predictions = model(inputs)

      loss = criterion(predictions, labels)
      losses.append(loss.item())

      total_acc += (predictions.argmax(1) == labels).sum().item()
      total_count += labels.size(0)

  epoch_acc = total_acc / total_count
  epoch_loss = sum(losses) / len(losses)
  return epoch_acc, epoch_loss

#train
def train(model , model_name , save_model , optimizer , criterion , train_dataloader,
          valid_dataloader, num_epochs, device):
  train_accs, train_losses = [], []
  eval_accs, eval_losses = [], []
  best_loss_eval = 100
  times = []

  for epoch in range(1 , num_epochs +1):
    epoch_start_time = time.time()
    #training
    train_acc, train_loss = train_epoch(model, optimizer, criterion, train_dataloader, device, epoch)
    train_accs.append(train_acc)
    train_losses.append(train_loss)

    #Evaluation
    eval_acc, eval_loss = evaluate_epoch(model, criterion, valid_dataloader, device)
    eval_accs.append( eval_acc )
    eval_losses.append( eval_loss )

    #save best model
    if eval_loss < best_loss_eval:
      torch.save(model.state_dict(), save_model + f'/{model_name}.pt')

    times.append(time.time() - epoch_start_time)
    print("-"*59)
    print(
        f"| End of epoch {epoch} | Time : { time . time () - epoch_start_time}s | Train Accuracy {train_acc} | Train Loss { train_loss} "
        f"| Valid Accuracy {eval_acc} | Valid Loss {eval_loss} "
    )
    print ("-" * 59)

  # Load best model
  model.load_state_dict( torch.load( save_model + f'/{model_name}.pt'))
  model.eval()
  metrics = {
      "train_accuracy": train_accs,
      "train_loss": train_losses,
      "valid_accuracy": eval_accs,
      "valid_loss": eval_losses,
      "time": times
  }
  return model, metrics

#report
import matplotlib.pyplot as plt

def plot_result(num_epochs , train_accs , eval_accs , train_losses , eval_losses):
  epochs = list( range( num_epochs) )
  fig , axs = plt.subplots( nrows = 1 , ncols =2 , figsize = (12 ,6) )
  axs[0].plot( epochs , train_accs , label = " Training ")
  axs[0].plot( epochs , eval_accs , label = " Evaluation ")
  axs[1].plot( epochs , train_losses , label = " Training ")
  axs[1].plot( epochs , eval_losses , label = " Evaluation ")
  axs [0].set_xlabel(" Epochs ")
  axs [1].set_xlabel(" Epochs ")
  axs [0].set_ylabel(" Accuracy ")
  axs [1].set_ylabel(" Loss ")
  plt.legend()















In [None]:
#Modeling
class TransformerEncoderCls(nn.Module):
  def __init__(self,
               vocab_size, max_length, num_layers,
               embed_dim, num_heads, ff_dim, dropout = 0.1, device= "cuda" if torch.cuda.is_available() else 'cpu'):
    super().__init__()
    self.encoder = TransformerEncoder(
        vocab_size, embed_dim, max_length,
        num_layers, num_heads, ff_dim, dropout, device)
    self.pooling = nn.AvgPool1d(kernel_size=max_length)
    self.fc1 = nn.Linear(in_features= embed_dim, out_features=20)
    self.fc2 = nn.Linear(in_features=20, out_features=2)
    self.dropout = nn.Dropout(p = dropout)
    self.relu = nn.ReLU()

  def forward(self, x):
    output = self.encoder(x)
    output = self.pooling(output.permute(0,2,1)).squeeze()
    output = self.dropout( output )
    output = self.fc1( output )
    output = self.dropout( output )
    output = self.fc2( output )
    return output




In [None]:
import torch.optim as optim

vocab_size = 10000
max_length = 100
embed_dim = 100
num_layers = 2
num_heads = 4
ff_dim = 128
dropout = 0.1

model = TransformerEncoderCls(
    vocab_size, max_length, num_layers, embed_dim, num_heads, ff_dim, dropout, device
)
model.to(device)

criterion = torch.nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr = 0.00005)

num_epochs = 50
save_model = "./model"
os.makedirs(save_model, exist_ok = True)
model_name = "model"
model, metrics = train(
    model , model_name , save_model , optimizer , criterion , train_dataloader,
    valid_dataloader , num_epochs , device
)

  predictions = F.softmax(model(inputs))
  predictions = F.softmax(model(inputs))


-----------------------------------------------------------
| End of epoch 1 | Time : 0.55946946144104s | Train Accuracy 0.984375 | Train Loss 0.6276842951774597 | Valid Accuracy 0.5 | Valid Loss 0.69852949773209 
-----------------------------------------------------------
-----------------------------------------------------------
| End of epoch 2 | Time : 0.4897315502166748s | Train Accuracy 1.0 | Train Loss 0.6196860074996948 | Valid Accuracy 0.5 | Valid Loss 0.6994796101051041 
-----------------------------------------------------------
-----------------------------------------------------------
| End of epoch 3 | Time : 0.4870924949645996s | Train Accuracy 1.0 | Train Loss 0.6112455129623413 | Valid Accuracy 0.5 | Valid Loss 0.7004878475696226 
-----------------------------------------------------------
-----------------------------------------------------------
| End of epoch 4 | Time : 0.48160624504089355s | Train Accuracy 1.0 | Train Loss 0.6045477986335754 | Valid Accuracy 0.5

KeyboardInterrupt: 

#FINE TUNE BERT FOR TEXT CLASSIFICATION

In [1]:
# install libs
!pip install -q -U transformers datasets accelerate evaluate
# download
!git clone https://github.com/congnghia0609/ntc-scv.git
!unzip ./ntc-scv/data/data_test.zip -d ./data
!unzip ./ntc-scv/data/data_train.zip -d ./data
!rm -rf ./ntc-scv

# load data
import os


[1;30;43mKết quả truyền trực tuyến bị cắt bớt đến 5000 dòng cuối.[0m
  inflating: ./data/data_train/train/pos/31228.txt  
  inflating: ./data/data_train/train/pos/3123.txt  
  inflating: ./data/data_train/train/pos/31232.txt  
  inflating: ./data/data_train/train/pos/31233.txt  
  inflating: ./data/data_train/train/pos/31234.txt  
  inflating: ./data/data_train/train/pos/31236.txt  
  inflating: ./data/data_train/train/pos/31238.txt  
  inflating: ./data/data_train/train/pos/31240.txt  
  inflating: ./data/data_train/train/pos/31242.txt  
  inflating: ./data/data_train/train/pos/31247.txt  
  inflating: ./data/data_train/train/pos/31250.txt  
  inflating: ./data/data_train/train/pos/31251.txt  
  inflating: ./data/data_train/train/pos/31252.txt  
  inflating: ./data/data_train/train/pos/31257.txt  
  inflating: ./data/data_train/train/pos/31260.txt  
  inflating: ./data/data_train/train/pos/31261.txt  
  inflating: ./data/data_train/train/pos/31262.txt  
  inflating: ./data/data_trai

In [5]:
import pandas as pd

def load_data_from_path(folder_path):
  examples = []
  for label in os.listdir(folder_path):
    full_path = os.path.join(folder_path, label)
    for file_name in os.listdir(full_path):
      file_path = os.path.join(full_path, file_name)
      with open(file_path, "r", encoding = "utf-8") as f:
        lines = f.readlines()
        sentence = " ".join(lines)
        if label == "neg":
          label = 0
        if label == "pos":
          label = 1
        data = {
            "sentence": sentence,
            "label": label
        }
        examples.append(data)
  return pd.DataFrame(examples)


folder_paths = {
'train' : '/content/data/data_train/train' ,
'valid' : '/content/data/data_train/test',
'test' : '/content/data/data_test/test'}

train_df = load_data_from_path( folder_paths['train'])
valid_df = load_data_from_path( folder_paths['valid'])
test_df = load_data_from_path( folder_paths['test'])

# convert to Dataset Object
from datasets import Dataset , DatasetDict

raw_dataset = DatasetDict({
'train' : Dataset.from_pandas( train_df ) ,
'valid': Dataset.from_pandas( valid_df ) ,
'test': Dataset.from_pandas( test_df )
})


In [6]:
import re
import string

def preprocess_text(text):
  #remove URLs https://www.
  url_pattern = re.compile(r'https?://\s+\ wwww\.\s+')
  text = url_pattern.sub(r" ", text)

  # remove HTML Tags : <>
  html_pattern = re.compile(r'<[^<>]+>')
  text = html_pattern.sub (" ", text )

  # remove puncs and digits
  replace_chars = list ( string.punctuation + string.digits )
  for char in replace_chars:
    text = text.replace( char , " ")

  # remove emoji
  emoji_pattern = re.compile("["
    u"\U0001F600-\U0001F64F" # emoticons
    u"\U0001F300-\U0001F5FF" # symbols & pictographs
    u"\U0001F680-\U0001F6FF" # transport & map symbols
    u"\U0001F1E0-\U0001F1FF" # flags (iOS)
    u"\U0001F1F2-\U0001F1F4" # Macau flag
    u"\U0001F1E6-\U0001F1FF" # flags
    u"\U0001F600-\U0001F64F"
    u"\U00002702-\U000027B0"
    u"\U000024C2-\U0001F251"
    u"\U0001f926-\U0001f937"
    u"\U0001F1F2"
    u"\U0001F1F4"
    u"\U0001F620"
    u"\u200d"
    u"\u2640-\u2642"
    "]+", flags = re.UNICODE )

  text = emoji_pattern.sub(r" ", text)

  # normalize whitespace
  text = " ". join( text.split())

  # lowercasing
  text = text.lower()
  return text

train_df['sentence'] = [ preprocess_text( row['sentence']) for index , row in train_df.iterrows() ]
valid_df['sentence'] = [ preprocess_text( row['sentence']) for index , row in valid_df.iterrows() ]
test_df['sentence'] = [ preprocess_text( row['sentence']) for index , row in test_df.iterrows() ]








In [7]:
# tokenization
from transformers import AutoTokenizer

model_name = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained( model_name, use_fast = True)
max_seq_length = 100
max_seq_length = min( max_seq_length , tokenizer.model_max_length )

def preprocess_function( examples):
# Tokenize the texts
  result = tokenizer(
    examples["sentence"] ,
    padding ="max_length",
    max_length = max_seq_length ,
    truncation = True)
  result ["label"] = examples ['label']

  return result


# Running the preprocessing pipeline on all the datasets
processed_dataset = raw_dataset.map(
  preprocess_function ,
  batched = True ,
  desc ="Running tokenizer on dataset",
)

# collator with padding max length
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding( tokenizer = tokenizer )

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Running tokenizer on dataset:   0%|          | 0/30000 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/10000 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [8]:
from transformers import AutoConfig , AutoModelForSequenceClassification

num_labels = 2

config = AutoConfig.from_pretrained(
  model_name ,
  num_labels = num_labels ,
  finetuning_task ="text-classification")

model = AutoModelForSequenceClassification.from_pretrained(
  model_name ,config = config)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
import numpy as np
import evaluate

metric = evaluate.load("accuracy")
def compute_metrics( eval_pred ) :
  predictions , labels = eval_pred
  predictions = np.argmax( predictions , axis =1)
  result = metric.compute( predictions = predictions , references = labels )
  return result

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [11]:
from transformers import TrainingArguments , Trainer

training_args = TrainingArguments(
  output_dir ="save_model",
  learning_rate =2e-5 ,
  per_device_train_batch_size =128 ,
  per_device_eval_batch_size =128 ,
  num_train_epochs =10 ,
  evaluation_strategy ="epoch",
  save_strategy ="epoch",
  load_best_model_at_end = True)

trainer = Trainer(
  model = model ,
  args = training_args ,
  train_dataset = processed_dataset["train"] ,
  eval_dataset = processed_dataset["valid"] ,
  compute_metrics = compute_metrics ,
  tokenizer = tokenizer ,
  data_collator = data_collator ,)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.520228,0.7722
2,No log,0.463004,0.7979
3,0.422500,0.394243,0.8322
4,0.422500,0.424515,0.8393
5,0.319600,0.430169,0.8354
6,0.319600,0.444951,0.8384
7,0.245600,0.49025,0.8336
8,0.245600,0.513967,0.8349
9,0.186000,0.538388,0.8316
10,0.186000,0.539872,0.8345


TrainOutput(global_step=2350, training_loss=0.272990711293322, metrics={'train_runtime': 5200.2858, 'train_samples_per_second': 57.689, 'train_steps_per_second': 0.452, 'total_flos': 1.54166634e+16, 'train_loss': 0.272990711293322, 'epoch': 10.0})

#VISION TRANSFORMER