<a href="https://colab.research.google.com/github/tarangga/Machine-Learning/blob/main/Experiment_Covid_Classification_pretrained_indobert_base_uncased.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Download Data and Import Libraries

In [None]:
def sheet2csv(url, save_path):
  """Download the resource at ``url`` and write its raw bytes to ``save_path``.

  Args:
    url: Address to fetch (used in this notebook with a Google Sheets CSV
      export link).
    save_path: Destination file path; an existing file is overwritten.

  Raises:
    requests.HTTPError: If the server answers with a 4xx/5xx status, so an
      error page is never saved as if it were the CSV.
    requests.Timeout: If the server does not respond within 60 seconds.
  """
  import requests

  resp = requests.get(url, timeout=60)
  resp.raise_for_status()

  # The context manager closes the file even when the write itself fails.
  with open(save_path, 'wb') as output:
    output.write(resp.content)

# Fetch the sheet's CSV export and store it as 'combine.csv' in the current
# working directory; the preprocessing cell further down reads this file.
sheet2csv(
    'https://docs.google.com/spreadsheets/d/1kCwlP_LmXQm7DBagptRPHVBBQDUMmbAAhap0nKyVFyE/export?format=csv&id=1kCwlP_LmXQm7DBagptRPHVBBQDUMmbAAhap0nKyVFyE&gid=209928702',
    'combine.csv'
)

In [None]:
pip install transformers



In [None]:
import transformers
from transformers import AutoTokenizer, AutoModel, get_scheduler, AdamW

In [None]:
import numpy as np
import pandas as pd
import re
from spacy.lang.id import STOP_WORDS
from tqdm import tqdm

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
from torch.utils.data import Dataset, DataLoader
from torch import tensor, nn, optim
import torch


# Preprocessing Data

In [None]:
def deEmojify(text):
    """Return ``text`` with every run of emoji/symbol characters deleted.

    The character class is deliberately broad: taken together, the ranges
    below match every code point from U+24C2 upwards (so CJK and other
    high-range scripts are removed as well), plus a few individual
    characters below that (U+200D, U+231A, U+23CF, U+23E9).

    Args:
        text: The string to filter.

    Returns:
        A copy of ``text`` without the matched characters. Nothing is
        inserted in their place, so neighbouring characters become adjacent.
    """
    stripped_ranges = (
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # regional indicators (flags)
        "\U00002500-\U00002BEF",  # box drawing through misc symbols and arrows
        "\U00002702-\U000027B0",  # dingbats
        "\U00002702-\U000027B0",  # (listed twice in the original pattern)
        "\U000024C2-\U0001F251",  # everything from U+24C2 up to U+1F251
        "\U0001f926-\U0001f937",
        "\U00010000-\U0010ffff",  # all supplementary planes
        "\u2640-\u2642",
        "\u2600-\u2B55",
        "\u200d",  # zero-width joiner
        "\u23cf",
        "\u23e9",
        "\u231a",
        "\ufe0f",  # variation selector-16
        "\u3030",
    )
    char_class = "[" + "".join(stripped_ranges) + "]+"
    return re.sub(char_class, "", text, flags=re.UNICODE)

def clean(text, keep_placeholders=False):
  """Normalise a raw tweet into lowercase, letters-only text.

  Steps, in order: cast to ``str``, replace the HTML entity ``&amp;`` with a
  space, lowercase, remove emoji via ``deEmojify``, substitute @mentions,
  #hashtags and ``http...`` links with the placeholders ``[USERNAME]``,
  ``[HASHTAG]`` and ``[URL]``, then delete hyphens, turn every character
  outside ``0-9a-z`` into a space and finally turn digit runs into a space.

  Note on the placeholders: they are written in upper case inside brackets,
  so the final "non ``0-9a-z``" pass turns them into spaces. With the
  default ``keep_placeholders=False`` (the historical behaviour) mentions,
  hashtags and links are therefore simply removed. Pass
  ``keep_placeholders=True`` to keep the placeholder tokens in the output.
  Because the text is lowercased before the placeholders are inserted, an
  upper-case ``[URL]`` etc. can only have come from this function.

  Args:
    text: Raw tweet; any object is accepted and converted with ``str``.
    keep_placeholders: When true, ``[USERNAME]``, ``[HASHTAG]`` and ``[URL]``
      survive the final symbol stripping. Defaults to ``False``.

  Returns:
    The cleaned string. Whitespace runs are not collapsed.
  """
  def strip_symbols(fragment):
    # Hyphens are deleted (joining the two halves); every other symbol
    # becomes a space, and so does every run of digits.
    fragment = re.sub(r'[-]', '', fragment)
    fragment = re.sub(r'[^0-9a-z]', ' ', fragment)
    return re.sub(r'[0-9]+', ' ', fragment)

  text = str(text)
  text = text.replace('&amp;', ' ')
  text = text.lower()
  text = deEmojify(text)
  text = re.sub(r'@[0-9a-z]+', ' [USERNAME] ', text)
  text = re.sub(r'#[0-9a-z]+', ' [HASHTAG] ', text)
  text = re.sub(r'http\S+', ' [URL] ', text)

  if not keep_placeholders:
    return strip_symbols(text)

  # Splitting on a capturing group puts the placeholders at the odd indices;
  # only the text between them is stripped.
  pieces = re.split(r'(\[(?:USERNAME|HASHTAG|URL)\])', text)
  return ''.join(
      piece if index % 2 else strip_symbols(piece)
      for index, piece in enumerate(pieces)
  )

def clean_stopwords(text, stop_words=STOP_WORDS):
  """Drop every whitespace-separated token of ``text`` found in ``stop_words``.

  Args:
    text: Input text; converted with ``str`` before splitting.
    stop_words: Container of tokens to discard (exact, case-sensitive match).

  Returns:
    The remaining tokens joined by single spaces.
  """
  kept_tokens = filter(lambda token: token not in stop_words, str(text).split())
  return ' '.join(kept_tokens)

In [None]:
# Read the file written by the download cell. The relative path matches the
# 'combine.csv' passed to sheet2csv, so this works from any working directory.
df = pd.read_csv('combine.csv')[['TWEET', 'Arah Voting']]

# Keep labelled rows only; .copy() makes the column assignments below write to
# an independent frame instead of a slice of the one returned by read_csv.
df = df[~df['Arah Voting'].isnull()].copy()

df['Arah Voting'] = df['Arah Voting'].map({'S': 3, 'P':0, 'O':1, 'N':2})
# .map() yields NaN for any label missing from the dict, which would silently
# turn the column into floats and only fail later inside the loss function.
if df['Arah Voting'].isnull().any():
  raise ValueError("'Arah Voting' contains labels other than S, P, O and N")
df['Arah Voting'] = df['Arah Voting'].astype('int64')

df['TWEET'] = df['TWEET'].apply(clean).apply(clean_stopwords)
df.head()

Unnamed: 0,TWEET,Arah Voting
0,yixing menerima sumbanganmu pencegahan penyeba...,3
1,suga bts donasikan rp miliar korban virus koro...,3
2,d tanggal february donasi terkumpul rp berhubu...,3
3,kayaknya kampanye yg ngebantu korban virus kor...,3
4,cuman ngasih tau org yg namanya bryan emg ky v...,3


In [None]:
# Hold out 20% of the tweets for validation. The fixed random_state makes the
# split, and therefore the reported metrics, reproducible across kernel restarts.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['TWEET'], df['Arah Voting'], test_size=.2, random_state=42
)


In [None]:
# Length of the longest cleaned tweet, counted in whitespace-separated words
# (this is a word count, not the tokenizer's sub-word token count).
max(len(tweet.split()) for tweet in df['TWEET'])

49

In [None]:
class CovidDataset(Dataset):
  """Tweets tokenised once up front, paired with their integer labels.

  Each item is a ``(features, label)`` tuple where ``features`` maps every
  tokenizer output name to that sample's row and ``label`` is a 0-d tensor.

  Args:
    text: List of tweet strings.
    label: Integer-indexable sequence of class ids, same length as ``text``.
    max_length: Fixed token length every sample is padded or truncated to.
      The default of 101 matches the 101 input channels hard-coded in
      ``CovidModel``'s first ``Conv1d``; change both together.
    model_name: Hugging Face checkpoint whose tokenizer is loaded.
  """

  def __init__(self, text, label, max_length=101,
               model_name='indolem/indobert-base-uncased'):
    self.text = text
    self.label = label
    self.tokenizer = AutoTokenizer.from_pretrained(model_name)
    # truncation=True is required together with return_tensors='pt': without
    # it a single tweet longer than max_length produces rows of unequal length
    # and the conversion to a tensor fails.
    self.encode = self.tokenizer(self.text, max_length=max_length,
                                 padding='max_length', truncation=True,
                                 return_tensors='pt')

  def __getitem__(self, idx):
    features = {k: v[idx] for k, v in self.encode.items()}
    return features, tensor(self.label[idx])

  def __len__(self):
    return len(self.label)

# Build one dataset per split: texts are passed as a plain Python list,
# labels as a NumPy array.
train_dataset = CovidDataset(train_texts.tolist(), train_labels.to_numpy())
val_dataset = CovidDataset(val_texts.tolist(), val_labels.to_numpy())

Downloading:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/234k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

# Load Model

In [None]:
class CovidModel(nn.Module):
  """IndoBERT encoder followed by an LSTM, two 1-D convolutions and an MLP head.

  ``forward`` returns raw, unnormalised logits of shape ``(batch, 4)``.
  ``nn.CrossEntropyLoss`` applies log-softmax itself, so the head must not end
  in ReLU/Sigmoid: squashing the logits into [0.5, 1) leaves the softmax
  almost uniform and the network cannot express a confident prediction.

  Shape notes (follow from the layer sizes below):
    * the LSTM maps 768 features to 128 per position;
    * ``Conv1d(101, ...)`` takes dimension 1 of the LSTM output as channels,
      so inputs must be exactly 101 tokens long (the ``max_length`` used by
      ``CovidDataset``) and the convolutions slide over the 128 LSTM features;
    * two kernel-7 convolutions shrink 128 -> 122 -> 116, and
      ``AvgPool1d(116)`` reduces that to a single value per channel.
  """

  def __init__(self):
    super(CovidModel, self).__init__()
    self.indobert = AutoModel.from_pretrained("indolem/indobert-base-uncased")
    self.conv = nn.Sequential(
        nn.Conv1d(101, 256, (7,)),
        nn.Conv1d(256, 128, (7,)),
        nn.AvgPool1d(116)
    )
    # The last layer emits logits; no activation after it (see class docstring).
    self.seq = nn.Sequential(
        nn.Linear(128, 32),
        nn.ReLU(),
        nn.Linear(32, 4)
    )
    self.lstm = nn.LSTM(768, 128, 2, batch_first=True)

  def forward(self, x):
    """Compute class logits for a batch.

    Args:
      x: Dict of tokenizer outputs, unpacked as keyword arguments into the
        IndoBERT encoder.

    Returns:
      Tensor of shape ``(batch, 4)`` holding unnormalised class scores.
    """
    x = self.indobert(**x)
    x, _ = self.lstm(x['last_hidden_state'])
    x = self.conv(x)
    # Drop only the pooled length-1 axis. A bare squeeze() would also remove
    # the batch axis when a batch holds a single sample.
    x = x.squeeze(-1)
    return self.seq(x)

In [None]:
# Loss used by the training loop.
criterion = nn.CrossEntropyLoss()

# Release cached, unused GPU memory, then select the GPU when one is available.
torch.cuda.empty_cache()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# nn.Module.to() returns the module itself, so construction and device
# placement can be chained.
model = CovidModel().to(device)

# Both splits are served in shuffled mini-batches of 16 samples.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=16, shuffle=True)

Some weights of the model checkpoint at indolem/indobert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# SGD with momentum over every model parameter (encoder included).
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
n_epochs = 73
# The scheduler is stepped once per batch, so it needs the total batch count.
num_training_steps = n_epochs * len(train_loader)
# Linear decay of the learning rate over all training steps, with no warm-up.
# The scheduler must receive the optimizer *instance*; passing the `optim`
# module here raises AttributeError and leaves `lr_scheduler` undefined.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

AttributeError: ignored

In [None]:
# One pass over the training data followed by one pass over the validation
# data per epoch; prints mean training loss, training accuracy and validation
# accuracy on a single line.
for epoch in range(n_epochs):
  losses = []
  total = 0
  model.train()
  for batch in tqdm(train_loader, desc='{}/{} epochs'.format(epoch+1, n_epochs)):
    X, y = batch
    X = {k: v.to(device) for k, v in X.items()}
    y = y.to(device)

    outputs = model(X)
    loss = criterion(outputs, y)
    # Store a plain float: appending the tensor itself would keep every
    # batch's autograd graph alive until the end of the epoch.
    losses.append(loss.item())
    loss.backward()

    # Tensor-side reduction; the builtin sum() would iterate element by element.
    total += (outputs.argmax(1) == y).sum().item()

    optimizer.step()
    lr_scheduler.step()
    optimizer.zero_grad()

  print('loss :', round(tensor(losses).mean().item(), 7), end=' ')
  print('| acc :', round(total/len(train_dataset), 4), end=' ')

  model.eval()
  total = 0
  # No gradients are needed for evaluation; skipping them saves memory and time.
  with torch.no_grad():
    for batch in val_loader:
      X, y = batch
      X = {k: v.to(device) for k, v in X.items()}
      y = y.to(device)
      outputs = model(X)
      total += (outputs.argmax(1) == y).sum().item()

  print('| val_acc :', round(total/len(val_dataset), 4))
  

1/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.71it/s]


loss : 1.3716196 | acc : 0.428 | val_acc : 0.4813


2/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.72it/s]


loss : 1.3553642 | acc : 0.428 | val_acc : 0.4813


3/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.72it/s]


loss : 1.3433301 | acc : 0.428 | val_acc : 0.4813


4/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.72it/s]


loss : 1.3300116 | acc : 0.428 | val_acc : 0.4813


5/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.73it/s]


loss : 1.3237131 | acc : 0.428 | val_acc : 0.4813


6/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.72it/s]


loss : 1.3225026 | acc : 0.428 | val_acc : 0.4813


7/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.73it/s]


loss : 1.3230155 | acc : 0.428 | val_acc : 0.4813


8/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.73it/s]


loss : 1.3212284 | acc : 0.428 | val_acc : 0.4813


9/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.72it/s]


loss : 1.3221142 | acc : 0.428 | val_acc : 0.4813


10/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.73it/s]


loss : 1.3219894 | acc : 0.428 | val_acc : 0.4813


11/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.73it/s]


loss : 1.3228302 | acc : 0.428 | val_acc : 0.4813


12/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.73it/s]


loss : 1.3212798 | acc : 0.428 | val_acc : 0.4813


13/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.73it/s]


loss : 1.3201315 | acc : 0.428 | val_acc : 0.4813


14/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.73it/s]


loss : 1.3230354 | acc : 0.428 | val_acc : 0.4813


15/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.73it/s]


loss : 1.3241906 | acc : 0.428 | val_acc : 0.4813


16/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.73it/s]


loss : 1.3224536 | acc : 0.428 | val_acc : 0.4813


17/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.72it/s]


loss : 1.3227252 | acc : 0.428 | val_acc : 0.4813


18/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.73it/s]


loss : 1.322719 | acc : 0.428 | val_acc : 0.4813


19/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.73it/s]


loss : 1.3227152 | acc : 0.428 | val_acc : 0.4813


20/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.73it/s]


loss : 1.3234382 | acc : 0.428 | val_acc : 0.4813


21/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.73it/s]


loss : 1.3227154 | acc : 0.428 | val_acc : 0.4813


22/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.73it/s]


loss : 1.3205082 | acc : 0.428 | val_acc : 0.4813


23/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.73it/s]


loss : 1.3221858 | acc : 0.428 | val_acc : 0.4813


24/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.72it/s]


loss : 1.3228852 | acc : 0.428 | val_acc : 0.4813


25/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.72it/s]


loss : 1.3227998 | acc : 0.428 | val_acc : 0.4813


26/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.72it/s]


loss : 1.3234948 | acc : 0.428 | val_acc : 0.4813


27/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.72it/s]


loss : 1.3227464 | acc : 0.428 | val_acc : 0.4813


28/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.72it/s]


loss : 1.3212752 | acc : 0.428 | val_acc : 0.4813


29/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.72it/s]


loss : 1.3227292 | acc : 0.428 | val_acc : 0.4813


30/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.72it/s]


loss : 1.3234555 | acc : 0.428 | val_acc : 0.4813


31/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.72it/s]


loss : 1.323452 | acc : 0.428 | val_acc : 0.4813


32/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.72it/s]


loss : 1.3219873 | acc : 0.428 | val_acc : 0.4813


33/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.73it/s]


loss : 1.3234468 | acc : 0.428 | val_acc : 0.4813


34/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.72it/s]


loss : 1.3219832 | acc : 0.428 | val_acc : 0.4813


35/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.72it/s]


loss : 1.3219814 | acc : 0.428 | val_acc : 0.4813


36/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.72it/s]


loss : 1.3219801 | acc : 0.428 | val_acc : 0.4813


37/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.73it/s]


loss : 1.3241721 | acc : 0.428 | val_acc : 0.4813


38/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.72it/s]


loss : 1.322709 | acc : 0.428 | val_acc : 0.4813


39/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.72it/s]


loss : 1.3227081 | acc : 0.428 | val_acc : 0.4813


40/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.73it/s]


loss : 1.3212454 | acc : 0.428 | val_acc : 0.4813


41/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.72it/s]


loss : 1.3219757 | acc : 0.428 | val_acc : 0.4813


42/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.72it/s]


loss : 1.322706 | acc : 0.428 | val_acc : 0.4813


43/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.72it/s]


loss : 1.3227055 | acc : 0.428 | val_acc : 0.4813


44/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.72it/s]


loss : 1.3227051 | acc : 0.428 | val_acc : 0.4813


45/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.72it/s]


loss : 1.3219737 | acc : 0.428 | val_acc : 0.4813


46/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.72it/s]


loss : 1.3227043 | acc : 0.428 | val_acc : 0.4813


47/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.73it/s]


loss : 1.3227041 | acc : 0.428 | val_acc : 0.4813


48/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.72it/s]


loss : 1.3241661 | acc : 0.428 | val_acc : 0.4813


49/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.72it/s]


loss : 1.321972 | acc : 0.428 | val_acc : 0.4813


50/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.73it/s]


loss : 1.3219717 | acc : 0.428 | val_acc : 0.4813


51/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.73it/s]


loss : 1.3227028 | acc : 0.428 | val_acc : 0.4813


52/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.72it/s]


loss : 1.3227024 | acc : 0.428 | val_acc : 0.4813


53/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.73it/s]


loss : 1.3234334 | acc : 0.428 | val_acc : 0.4813


54/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.72it/s]


loss : 1.3234334 | acc : 0.428 | val_acc : 0.4813


55/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.73it/s]


loss : 1.3227019 | acc : 0.428 | val_acc : 0.4813


56/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.73it/s]


loss : 1.3227017 | acc : 0.428 | val_acc : 0.4813


57/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.73it/s]


loss : 1.3227015 | acc : 0.428 | val_acc : 0.4813


58/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.72it/s]


loss : 1.3212388 | acc : 0.428 | val_acc : 0.4813


59/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.72it/s]


loss : 1.3234323 | acc : 0.428 | val_acc : 0.4813


60/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.72it/s]


loss : 1.3219697 | acc : 0.428 | val_acc : 0.4813


61/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.72it/s]


loss : 1.3234322 | acc : 0.428 | val_acc : 0.4813


62/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.72it/s]


loss : 1.3219694 | acc : 0.428 | val_acc : 0.4813


63/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.72it/s]


loss : 1.3234317 | acc : 0.428 | val_acc : 0.4813


64/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.72it/s]


loss : 1.321969 | acc : 0.428 | val_acc : 0.4813


65/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.73it/s]


loss : 1.3227003 | acc : 0.428 | val_acc : 0.4813


66/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.73it/s]


loss : 1.3219689 | acc : 0.428 | val_acc : 0.4813


67/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.72it/s]


loss : 1.3241626 | acc : 0.428 | val_acc : 0.4813


68/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.72it/s]


loss : 1.3212374 | acc : 0.428 | val_acc : 0.4813


69/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.73it/s]


loss : 1.3219684 | acc : 0.428 | val_acc : 0.4813


70/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.73it/s]


loss : 1.3205059 | acc : 0.428 | val_acc : 0.4813


71/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.72it/s]


loss : 1.3219684 | acc : 0.428 | val_acc : 0.4813


72/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.72it/s]


loss : 1.323431 | acc : 0.428 | val_acc : 0.4813


73/73 epochs: 100%|██████████| 94/94 [00:54<00:00,  1.73it/s]


loss : 1.3226994 | acc : 0.428 | val_acc : 0.4813
