In [None]:
!pip install transformers



In [None]:
!pip install --upgrade transformers



In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW

import pandas as pd
import numpy as np
import random
import re
import spacy
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data import TensorDataset
import torch.nn.functional as functional
import gc
from sklearn.metrics import roc_auc_score,f1_score, confusion_matrix
import time
import datetime

In [2]:
torch.manual_seed(0)
use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')
if use_cuda:
    torch.cuda.manual_seed(0)

print("Using GPU: {}".format(use_cuda))

Using GPU: True


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
train = pd.read_csv('/content/drive/MyDrive/azhar/nlp_project/csv_dataset/csv_dataset/train.csv')
val = pd.read_csv('/content/drive/MyDrive/azhar/nlp_project/csv_dataset/csv_dataset/val.csv')
test = pd.read_csv('/content/drive/MyDrive/azhar/nlp_project/csv_dataset/csv_dataset/test.csv')

In [5]:
train.head()

Unnamed: 0,text,labels,category
0,پاکستان کے وزیراعظم عمران خان سعودی عرب کے دار...,0.0,Business
1,پاکستان اسٹاک ایکسچینج (پی ایس ایکس) میں کاروب...,0.0,Business
2,روس نے طالبان کو اسلحہ فراہم کرنے کا امریکی ال...,0.0,Business
3,انٹر بینک مارکیٹ میں ڈالر سستا ہو گیا ، سٹاک ا...,0.0,Business
4,چین نے پاکستان پر ڈالروں کی بارش کردی، ایک ارب...,0.0,Business


In [6]:
train.labels.unique()

array([0., 1.])

In [7]:
test.head()

Unnamed: 0,text,labels
0,ملکی اور غیر ملکی مارکیٹ میں سونے کی قیمت میں ...,0
1,بہار میں چمکی بخار سے مرنے والے بچوں کی تعداد ...,0
2,اکشے کمار کی والدہ کی طبعیت ناساز، آئی سی یو م...,0
3,وہ ملک جس نے عربی بولنے والی روبوٹ نرس متعارف ...,0
4,اسرائیلی سائبرکمپنی نےمتحدہ عرب امارات میں اپن...,0


In [8]:
train['labels'].value_counts()

labels
0.0    600
1.0    438
Name: count, dtype: int64

### Preprocessing

In [9]:
def preprocessing(news):
    processed_news = []  # list to store processed data
    urdu_diacritics = ['ِ', 'ٰ', 'ُ', 'ٍ', 'ً', 'َ']
    urdu_digits = ['۶', '۴', '۵', '۸', '۲', '۰', '۷', '۹', '۳', '۱']
    english_digits = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '0']
    with open('/content/drive/MyDrive/azhar/nlp_junior_project/stop_words.txt', encoding='utf-8') as f:
        stopwords = set(line.strip() for line in f)
    for sentence in news:
        #urdu diacritics removal
        for diacritic in urdu_diacritics:
            sentence = sentence.replace(diacritic, '')
        #deletion punctuation
        sentence = re.sub(r'[؛۔٫٪+=@#!؟،۔)(}{]', '', sentence)
        #symbols removed
        for digit in urdu_digits + english_digits:
            sentence = sentence.replace(digit, '')
        #stopwords removal
        words = sentence.split()
        filtered_words = [word for word in words if word not in stopwords]
        # Join the filtered words back into a sentence
        processed_sentence = ' '.join(filtered_words)
        processed_news.append(processed_sentence)
    return processed_news

train_preprocessed = preprocessing(train.text)
val_preprocessed = preprocessing(val.text)
test_preprocessed = preprocessing(test.text)

In [10]:
train['news'] = train_preprocessed
val['news'] = val_preprocessed
test['news'] = test_preprocessed

In [11]:
train

Unnamed: 0,text,labels,category,news
0,پاکستان کے وزیراعظم عمران خان سعودی عرب کے دار...,0.0,Business,پاکستان وزیراعظم عمران خان سعودی عرب دارالحکوم...
1,پاکستان اسٹاک ایکسچینج (پی ایس ایکس) میں کاروب...,0.0,Business,پاکستان اسٹاک ایکسچینج پی ایس ایکس کاروباری ہف...
2,روس نے طالبان کو اسلحہ فراہم کرنے کا امریکی ال...,0.0,Business,روس طالبان اسلحہ فراہم امریکی الزام مسترد کابل...
3,انٹر بینک مارکیٹ میں ڈالر سستا ہو گیا ، سٹاک ا...,0.0,Business,انٹر بینک مارکیٹ ڈالر سستا سٹاک ایکسچینج کاروب...
4,چین نے پاکستان پر ڈالروں کی بارش کردی، ایک ارب...,0.0,Business,چین پاکستان ڈالروں بارش ایک ارب ڈالر دیے کراچی...
...,...,...,...,...
1033,﻿ایک تازہ ترین خبر کے مطابق ايشيا کے تیز ترین ...,1.0,Technology,﻿ایک تازہ خبر مطابق ايشيا تیز سپر کمپیوٹرز جاپ...
1034,﻿\nدبئی (صباح نیوز) متحدہ عرب امارات کا دسواں ...,1.0,Technology,﻿ دبئی صباح نیوز متحدہ عرب امارات دسواں سیٹلائ...
1035,﻿پاکستان میں سورج گرہن دن بارہ بجے دیکھا جا سک...,1.0,Technology,﻿پاکستان سورج گرہن دن بارہ بجے جا سکے اتوار اي...
1036,﻿ویب ڈیسک:عمر بڑھنے کی ساتھ ساتھ انسان کی خوب ...,1.0,Technology,﻿ویب ڈیسک:عمر بڑھنے انسان خوب صورتی ميں اضافہ ...


In [12]:
#importing tokenizer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
tokenizer = AutoTokenizer.from_pretrained("urduhack/roberta-urdu-small")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/62.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/516 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.61M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.15M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [13]:
def getTensor(dataframe):
  news = dataframe.news.values
  labels = dataframe.labels.values
  processed_news = preprocessing(news)

  input_ids = []
  attention_masks = []
  for news in processed_news:
      encoded_dict = tokenizer.encode_plus(
                          news,                      # data encode.
                          add_special_tokens = True,
                          max_length = 512,           #max length retained 512
                          padding = 'max_length',
                          truncation = True,
                          return_attention_mask = True,
                          return_tensors = 'pt',     # Return pytorch tensors.
                    )


      input_ids.append(encoded_dict['input_ids'])


      attention_masks.append(encoded_dict['attention_mask'])
  #lists to tensors.
  input_ids = torch.cat(input_ids, dim=0)
  attention_masks = torch.cat(attention_masks, dim=0)
  labels = torch.tensor(labels)
  dataset = TensorDataset(input_ids, attention_masks, labels)
  return dataset

In [14]:
train_dataset = getTensor(train)
val_dataset = getTensor(val)
test_dataset = getTensor(test)

In [None]:
# from sklearn.preprocessing import LabelEncoder

# # Assuming labels are in a list or numpy array called 'labels_list'
# label_encoder = LabelEncoder()
# encoded_labels = label_encoder.fit_transform(train.labels)
# val_labels = label_encoder.fit_transform(val.labels)

In [15]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
batch_size = 16
train_dataloader = DataLoader(
            train_dataset,
            shuffle = True,
            batch_size = batch_size
        )
validation_dataloader = DataLoader(
            val_dataset,
            shuffle = False,
            batch_size = batch_size
        )

In [16]:
def format_time(elapsed):
    elapsed_rounded = int(round((elapsed)))
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [17]:
#importing pretrained model
model2 = AutoModelForSequenceClassification.from_pretrained("urduhack/roberta-urdu-small", num_labels=2)
model2.to(device)

pytorch_model.bin:   0%|          | 0.00/507M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at urduhack/roberta-urdu-small and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(52000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

###model training

In [18]:
from torch.nn import Dropout

# Dropout 20%
dropout_prob = 0.25
model2.add_module("dropout", Dropout(dropout_prob))

# L2 regularization
optimizer = AdamW(model2.parameters(), lr = 2e-5)# removed this from this line 'weight_decay=0.02, eps = 1e-8'

# Early stopping
#early_stopping_steps = 3
#early_stopping_tolerance = 0.005
training_stats = []
total_t0 = time.time()
best_accuracy = 0
early_stopping_counter = 0

epochs = 4
criterion = nn.BCELoss()
#criterion = torch.nn.BCEWithLogitsLoss (pos_weight = torch.tensor (0.5))
#criterion = torch.nn.BCEWithLogitsLoss (weight=torch.tensor([1.0, 12.0]))


import random
import numpy as np

seed_val = 16

random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

training_stats = []
total_t0 = time.time()
best_accuracy = 0
for epoch_i in range(0, epochs):
    #Training
    print("")
    print('Epoch {:} / {:}'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()
    total_train_loss = 0
    total_train_accuracy = 0
    model2.train()
    y_train_true = []
    y_train_pred = []

    for step, batch in enumerate(train_dataloader):

        input_ids = batch[0].to(device)
        input_mask = batch[1].to(device)
        labels = batch[2].to(device)
        #labels = torch.tensor(encoded_labels[step * batch_size: (step + 1) * batch_size]).to(device)
        labels = labels.type(torch.LongTensor).to(device)
        model2.zero_grad()
        out = model2(input_ids, token_type_ids=None, attention_mask=input_mask, labels=labels)
        loss = out[0]
        logits = out[1]

        total_train_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model2.parameters(), 1.0)
        optimizer.step()

        pred = torch.argmax(logits, dim = 1)
        total_train_accuracy +=  torch.sum(pred == labels).item()
        y_train_true.extend(labels.flatten().cpu().numpy())
        y_train_pred.extend(pred.flatten().cpu().numpy())

    avg_train_accuracy = total_train_accuracy / len(train_dataloader.dataset)
    avg_train_loss = total_train_loss / len(train_dataloader.dataset)
    train_conf_matrix = confusion_matrix(y_train_true, y_train_pred)
    print("  Accuracy: {}".format(avg_train_accuracy))
    print("  Training loss: {}".format(avg_train_loss))
    print("  Training confusion matrix: {}".format(train_conf_matrix))


    # Validation
    print("")
    print("Validation...")
    model2.eval()
    total_eval_accuracy = 0
    total_eval_loss = 0
    y_true = []
    y_pred = []
    y_val_true = []
    y_val_pred = []

    for batch in validation_dataloader:
        input_ids = batch[0].to(device)
        input_mask = batch[1].to(device)
        labels = batch[2].to(device)
        labels = labels.type(torch.LongTensor).to(device)
        with torch.no_grad():
            out = model2(input_ids, token_type_ids=None, attention_mask=input_mask,labels=labels)
            loss = out[0]
            logits = out[1]

        total_eval_loss += loss.item()
        pred = torch.argmax(logits, dim = 1)
        total_eval_accuracy += torch.sum(pred == labels).item()
        y_true.append(labels.flatten())
        y_pred.append(pred.flatten())
        y_val_true.extend(labels.flatten().cpu().numpy())
        y_val_pred.extend(pred.flatten().cpu().numpy())

    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader.dataset)
    print("  Accuracy: {}".format(avg_val_accuracy))
    avg_val_loss = total_eval_loss / len(validation_dataloader.dataset)
    print("  Validation loss: {}".format(avg_val_loss))
    val_conf_matrix = confusion_matrix(y_val_true, y_val_pred)
    print("  Validation confusion matrix: {}".format(val_conf_matrix))
    training_time = format_time(time.time() - t0)
    print()

    y_true = torch.cat(y_true).tolist()
    y_pred = torch.cat(y_pred).tolist()
    print("This epoch took: {:}".format(training_time))
    print('roc_auc score: ', roc_auc_score(y_true,y_pred))
    print('F1 score:',f1_score(y_true, y_pred))
    print()

    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Train Accur.': avg_train_accuracy,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
        }
    )
    print()

    if avg_val_accuracy > best_accuracy:
        best_accuracy = avg_val_accuracy

print()
print("="*10)
print("Summary")
print("Total time {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))




Epoch 1 / 4
Training...
  Accuracy: 0.6454720616570327
  Training loss: 0.03886873310930697
  Training confusion matrix: [[467 133]
 [235 203]]

Validation...
  Accuracy: 0.7366412213740458
  Validation loss: 0.040844972295160514
  Validation confusion matrix: [[140  10]
 [ 59  53]]

This epoch took: 0:01:41
roc_auc score:  0.7032738095238096
F1 score: 0.6057142857142856



Epoch 2 / 4
Training...
  Accuracy: 0.7447013487475915
  Training loss: 0.03106268605859753
  Training confusion matrix: [[475 125]
 [140 298]]

Validation...
  Accuracy: 0.7442748091603053
  Validation loss: 0.03213612086900318
  Validation confusion matrix: [[106  44]
 [ 23  89]]

This epoch took: 0:01:42
roc_auc score:  0.7506547619047619
F1 score: 0.726530612244898



Epoch 3 / 4
Training...
  Accuracy: 0.8188824662813102
  Training loss: 0.02413870827770417
  Training confusion matrix: [[510  90]
 [ 98 340]]

Validation...
  Accuracy: 0.7748091603053435
  Validation loss: 0.02989669068083736
  Validation confu

###Model Testing

In [19]:
model2.eval()

batch_size = 16
test_dataloader = DataLoader(
            test_dataset,
            shuffle = False,
            batch_size = batch_size)

total_test_accuracy = 0
total_test_loss = 0
y_true = []
y_pred = []
y_test_true = []
y_test_pred = []

for batch in test_dataloader:
    input_ids = batch[0].to(device)
    input_mask = batch[1].to(device)
    labels = batch[2].to(device)

    with torch.no_grad():
        out = model2(input_ids, token_type_ids=None, attention_mask=input_mask,labels=labels)
        loss = out[0]
        logits = out[1]

    total_test_loss += loss.item()
    pred = torch.argmax(logits, dim = 1)
    total_test_accuracy += torch.sum(pred == labels).item()
    y_true.append(labels.flatten())
    y_pred.append(pred.flatten())
    y_test_true.extend(labels.flatten().cpu().numpy())
    y_test_pred.extend(pred.flatten().cpu().numpy())

avg_test_accuracy = total_test_accuracy / len(test_dataloader.dataset)
print("Accuracy: {}".format(avg_test_accuracy))
test_conf_matrix = confusion_matrix(y_test_true, y_test_pred)
print("test conf matrix: {}".format(test_conf_matrix))
avg_test_loss = total_test_loss / len(test_dataloader.dataset)
print("loss: {}".format(avg_test_loss))


y_true = torch.cat(y_true).tolist()
y_pred = torch.cat(y_pred).tolist()
print('roc_auc: ', roc_auc_score(y_true,y_pred))
print('F1:',f1_score(y_true, y_pred))

Accuracy: 0.7166666666666667
test conf matrix: [[189  11]
 [ 74  26]]
loss: 0.053217197830478354
roc_auc:  0.6024999999999999
F1: 0.3795620437956205


In [None]:
test.labels.value_counts()

0    200
1    100
Name: labels, dtype: int64

In [None]:
val.labels.value_counts()

0.0    150
1.0    112
Name: labels, dtype: int64