<a href="https://colab.research.google.com/github/vin-thomas/BBC_news_classification/blob/main/BBC_LSTM_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
import torch
import glob
from nltk.tokenize import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

**2.0 Download and unzip the dataset**

In [None]:
URL = "http://mlg.ucd.ie/files/datasets/bbc-fulltext.zip" 
!wget -P 'Data/' $URL

--2022-01-24 07:13:30--  http://mlg.ucd.ie/files/datasets/bbc-fulltext.zip
Resolving mlg.ucd.ie (mlg.ucd.ie)... 137.43.93.132
Connecting to mlg.ucd.ie (mlg.ucd.ie)|137.43.93.132|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2874078 (2.7M) [application/zip]
Saving to: ‘Data/bbc-fulltext.zip.1’


2022-01-24 07:13:31 (4.58 MB/s) - ‘Data/bbc-fulltext.zip.1’ saved [2874078/2874078]



In [None]:
!unzip "/content/Data/bbc-fulltext.zip" -d 'Data/'

Archive:  /content/Data/bbc-fulltext.zip
replace Data/bbc/business/001.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


##**3.0 Save Labels and Content to lists**

In [None]:
# Materialise and sort the matching paths: glob yields them in arbitrary
# (filesystem-dependent) order, and a one-shot iglob iterator would be empty
# if a cell consuming it were run a second time.
file_name_iter = sorted(glob.glob('/content/Data/bbc/**/*.txt'))

In [None]:
def get_topic(file):
  """Return the second-to-last '/'-separated component of ``file``.

  For a path such as ``.../bbc/sport/001.txt`` this is the name of the
  directory holding the file (``'sport'``). A path without any '/' raises
  IndexError.
  """
  # Only the last two separators matter, so split from the right.
  tail = file.rsplit('/', 2)
  return tail[-2]

In [None]:
def get_content(file):
  """Read ``file`` in binary mode and return its entire contents as ``bytes``.

  No decoding is performed; a caller that needs text has to decode the
  returned bytes itself.
  """
  with open(file, mode='rb') as handle:
    return handle.read()

In [None]:
# Collect one topic label and one article text per file, in matching order.
labels = []
content = []

for file in file_name_iter:
  labels.append(get_topic(file))
  raw = get_content(file)
  # get_content reads in binary mode. Decode here so the later cells tokenise
  # the article text itself: str() applied to a bytes object yields its repr
  # ("b'...'", with every newline spelled as the two characters '\' and 'n'),
  # not the text.
  # NOTE(review): UTF-8 is assumed; undecodable bytes become U+FFFD -- confirm
  # the corpus encoding.
  content.append(raw.decode('utf-8', errors='replace') if isinstance(raw, bytes) else raw)

In [None]:
len(labels), len(content)

(2225, 2225)

##**4.0 Prepare a word index, dictionary**

In [None]:
# Vocabulary: every distinct token receives the next free integer id, in order
# of first appearance across the articles. n_l records each article's token count.
word2idx = {}
n_l = []
news_len = 0

for article in content:
  article_tokens = word_tokenize(str(article))
  n_l.append(len(article_tokens))
  for token in article_tokens:
    # setdefault inserts only unseen tokens; len(word2idx) is the next free id.
    word2idx.setdefault(token, len(word2idx))

# Next free id == number of ids handed out so far.
idx = len(word2idx)
      


In [None]:
# Corpus size and the token count of the longest article (from the per-article
# counts gathered while building the vocabulary).
no_of_articles = len(content)
max_len_article = max(n_l)

In [None]:
# One row per article with 100 token-id slots each, zero-initialised.
dataset = np.zeros(shape=(no_of_articles, 100), dtype=int)
dataset.shape

(2225, 100)

In [None]:
# Encode each article as a fixed-length row of token ids.
# The row length is taken from `dataset` itself instead of repeating a literal,
# and padding only goes up to that length: padding to max_len_article and then
# slicing fails whenever the longest article is shorter than a row.
# NOTE(review): 0 serves as padding/unknown id and is also the id of the first
# vocabulary word -- confirm whether a dedicated padding id is wanted.
seq_len = dataset.shape[1]
for i, item in enumerate(content):
  # Tokens beyond the row length are dropped; unknown tokens map to 0.
  idx_list = [word2idx.get(token, 0) for token in word_tokenize(str(item))[:seq_len]]
  idx_list += [0] * (seq_len - len(idx_list))
  dataset[i] = idx_list

In [None]:
# Convert the token-id matrix into a torch tensor (torch.tensor copies the data).
dataset= torch.tensor(dataset)

####**5.0 Download the pretrained vectors**

In [None]:
URL = "https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip"
FILE = "fastText"
!wget -P $FILE $URL
!unzip $FILE/crawl-300d-2M.vec.zip -d $FILE

--2022-01-24 07:57:13--  https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 172.67.9.4, 104.22.74.142, 104.22.75.142, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|172.67.9.4|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1523785255 (1.4G) [application/zip]
Saving to: ‘fastText/crawl-300d-2M.vec.zip.2’


2022-01-24 07:57:53 (36.4 MB/s) - ‘fastText/crawl-300d-2M.vec.zip.2’ saved [1523785255/1523785255]

Archive:  fastText/crawl-300d-2M.vec.zip
replace fastText/crawl-300d-2M.vec? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


###**6.0 Load the pre-trained vectors and create the embedding matrix**

In [None]:
# Open the fastText .vec file. Its first line holds two integers, read here as
# n and d; d is used as the embedding width in the following cells.
fin = open(
    '/content/fastText/crawl-300d-2M.vec',
    mode='r',
    encoding='utf-8',
    newline='\n',
    errors='ignore',
)
n, d = (int(field) for field in fin.readline().split())

In [None]:
# One d-wide row per vocabulary entry, initialised with uniform noise in
# [-0.25, 0.25).
embeddings = np.random.uniform(low=-0.25, high=0.25, size=(len(word2idx), d))

In [None]:
# Copy the pretrained vector of every vocabulary word into its embedding row.
# `count` records how many vocabulary words were found in the file (it used to
# be initialised but never incremented), and the file handle is closed once the
# scan is finished, even if a line fails to parse.
count = 0
try:
  for line in fin:
    tokens = line.rstrip().split(' ')
    word = tokens[0]
    if word in word2idx:
      embeddings[word2idx[word]] = np.array(tokens[1:], dtype=np.float32)
      count += 1
finally:
  fin.close()

In [None]:
# Convert the embedding matrix to a torch tensor. torch.tensor keeps the NumPy
# dtype, which is float64 for an array created by np.random.uniform.
embeddings= torch.tensor(embeddings)

Now we have an embedding array which holds a vector for each word in our dictionary. Further, we have 'dataset', which gives the word indices for each article.

###**7.0 Create Pytorch Dataloaders**

In [None]:
# Encode the topic strings as integer codes; `uniques` holds the distinct topic
# names, indexed by code, for mapping codes back to names.
labels, uniques = pd.factorize(labels)

In [None]:
# Wrap the integer label codes in a torch tensor (from_numpy shares memory with
# the NumPy array rather than copying it).
labels = torch.from_numpy(labels)

In [None]:
dataset.shape, labels.shape

(torch.Size([2225, 100]), torch.Size([2225]))

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the articles for validation; the fixed random_state makes the
# split repeatable for a given input order.
train_inputs, val_inputs, train_labels, val_labels = train_test_split(
    dataset,
    labels,
    test_size=0.1,
    random_state=42,
)

In [None]:
len(train_inputs), len(val_inputs)

(2002, 223)

In [None]:
from torch.utils.data import (TensorDataset, DataLoader, RandomSampler,
                              SequentialSampler)

# Number of articles per batch, shared by both loaders.
batch_size = 50

# Training set: batches are drawn in random order.
train_data = TensorDataset(train_inputs, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)

# Validation set: batches follow the dataset order.
val_data = TensorDataset(val_inputs, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, batch_size=batch_size, sampler=val_sampler)

##**8.Device**

In [None]:
# Use CUDA when it is available, otherwise fall back to the CPU; report the choice.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
Device name: Tesla T4


##**8. LSTM Model Architecture**

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [None]:
class LSTM(nn.Module):
  """LSTM text classifier on top of a pretrained embedding matrix.

  Pipeline: token ids -> embedding lookup -> stacked LSTM -> output of the
  last time step -> linear layer producing one logit per class.

  Args:
    pretrained_embedding: 2-D tensor of shape (vocab_size, embedding_dim) used
      to initialise the embedding layer. It is stored as float32 and is
      fine-tuned during training (freeze=False).
    hidden_size: width of the LSTM hidden state (default 128).
    num_layers: number of stacked LSTM layers (default 2).
    num_classes: number of output logits (default 5).
    dropout: probability of the ``self.dropout`` module (default 0.5).
  """

  def __init__(self,
               pretrained_embedding,
               hidden_size=128,
               num_layers=2,
               num_classes=5,
               dropout=0.5,
               ):
    super(LSTM, self).__init__()

    # Convert once to float32: a float64 matrix doubles the parameter memory
    # and would have to be cast on every forward pass to match the LSTM weights.
    weights = pretrained_embedding.float()
    self.embedding = nn.Embedding.from_pretrained(weights, freeze=False)

    # The LSTM input width follows the embedding matrix instead of a fixed 300.
    self.lstm = nn.LSTM(input_size=weights.shape[1],
                        hidden_size=hidden_size,
                        num_layers=num_layers,
                        batch_first=True)

    self.fc = nn.Linear(hidden_size, num_classes)

    # NOTE(review): this module is registered but not applied in forward();
    # confirm whether dropout before `fc` was intended.
    self.dropout = nn.Dropout(dropout)

  def forward(self, dataset):
    """Return class logits for a batch of token-id sequences.

    Args:
      dataset: integer tensor of token ids, indexed as (batch, sequence).

    Returns:
      Tensor of shape (batch, num_classes) holding unnormalised logits.
    """
    x_embed = self.embedding(dataset)

    # batch_first=True: the LSTM output is (batch, sequence, hidden_size).
    x, _ = self.lstm(x_embed)

    # Keep only the output at the last time step of every sequence.
    x = x[:, -1, :]

    logits = self.fc(x)

    return logits



##**9. Instantiate the LSTM Model**

In [None]:
# Build the classifier from the embedding matrix and move it to the selected
# device; ending the cell with `model` displays its layer summary.
model = LSTM(embeddings).to(device)
model

LSTM(
  (embedding): Embedding(50108, 300)
  (lstm): LSTM(300, 128, num_layers=2, batch_first=True)
  (fc): Linear(in_features=128, out_features=5, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

##**9.  Optimizer and Loss Function**

In [None]:
import torch.optim as optim

# Instantiate the Adam optimizer (learning rate 1e-3) over all model parameters
optimizer = torch.optim.Adam(model.parameters(), lr=.001)
# Cross-entropy loss, applied to the logits returned by the model
criterion = torch.nn.CrossEntropyLoss()

##**10. Train**

In [None]:
def evaluate ():
  """Compute loss and accuracy of ``model`` on ``val_dataloader``.

  Uses the notebook-level ``model``, ``val_dataloader``, ``criterion`` and
  ``device``. The model is switched to eval mode and gradients are disabled
  while the validation batches are processed; afterwards the model's previous
  train/eval mode is restored, so calling this from inside a training loop
  does not leave the model in eval mode.

  Returns:
    (val_loss, val_accuracy): loss averaged over all validation samples and
    accuracy in percent over all validation samples. Both are NaN when the
    loader yields no samples.
  """
  eval_model = model
  was_training = eval_model.training
  eval_model.eval()

  total_loss = 0.0
  total_correct = 0
  total_seen = 0

  try:
    with torch.no_grad():
      for inputs, labels in val_dataloader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        logits = eval_model(inputs)

        batch_n = labels.size(0)
        # Weight each batch by its size so a short final batch does not count
        # as much as a full one.
        # NOTE(review): assumes `criterion` returns the batch mean (the
        # default reduction of CrossEntropyLoss) -- confirm if it is changed.
        total_loss += criterion(logits, labels).item() * batch_n
        total_correct += (logits.argmax(dim=1) == labels).sum().item()
        total_seen += batch_n
  finally:
    eval_model.train(was_training)

  if total_seen == 0:
    return float('nan'), float('nan')

  val_loss = total_loss / total_seen
  val_accuracy = 100.0 * total_correct / total_seen

  return val_loss, val_accuracy


In [None]:
# No of Epochs
epoch = 10

# Per-epoch history of the training metrics
train_losses,  train_accuracy = [], []

# Loop for no of epochs
for e in range(epoch):
    # (Re-)enter train mode at the start of every epoch, in case validation
    # switched the model to eval mode.
    model.train()
    train_loss = 0
    correct = 0

    # Iterate through all the batches in each epoch.
    # The loop variables are deliberately not called `labels`: at notebook
    # level that would overwrite the full label tensor with the last batch.
    for batch_inputs, batch_labels in train_dataloader:

        # Move the token-id batch and its labels to the selected device
        batch_inputs = batch_inputs.to(device)
        batch_labels = batch_labels.to(device)

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Passing the data to the model (Forward Pass)
        outputs = model(batch_inputs)

        # Calculating the loss
        loss = criterion(outputs, batch_labels)
        train_loss += loss.item()

        # Performing backward pass (Backpropagation)
        loss.backward()

        # optimizer.step() updates the weights accordingly
        optimizer.step()

        _, predicted = torch.max(outputs, 1)
        correct += (predicted == batch_labels).sum().item()

    # Validate once per epoch (not after every batch): only the value from the
    # end of the epoch is reported.
    val_loss, val_accuracy = evaluate()

    # `train_loss` is a sum of per-batch losses, so it is averaged over the
    # number of batches; accuracy is a sample count, so it is divided by the
    # number of samples.
    train_losses.append(train_loss/len(train_dataloader))
    train_accuracy.append(100 * correct/len(train_data))
    print('epoch: {}, Train Loss:{:.6f} Train Accuracy: {:.2f} Validation loss: {:.2f} Validation accuracy: {:.8f} '.format(e+1,train_losses[-1], train_accuracy[-1], val_loss, val_accuracy))

epoch: 1, Train Loss:0.026706 Train Accuracy: 40.36 Validation loss: 0.79 Validation accuracy: 79.72173913 
epoch: 2, Train Loss:0.011409 Train Accuracy: 83.57 Validation loss: 0.96 Validation accuracy: 66.45217391 
epoch: 3, Train Loss:0.010099 Train Accuracy: 85.41 Validation loss: 0.51 Validation accuracy: 84.05217391 
epoch: 4, Train Loss:0.004983 Train Accuracy: 94.31 Validation loss: 0.43 Validation accuracy: 88.59130435 
epoch: 5, Train Loss:0.003264 Train Accuracy: 96.35 Validation loss: 0.44 Validation accuracy: 89.32173913 
epoch: 6, Train Loss:0.007537 Train Accuracy: 88.81 Validation loss: 0.50 Validation accuracy: 88.92173913 
epoch: 7, Train Loss:0.001945 Train Accuracy: 98.05 Validation loss: 0.50 Validation accuracy: 88.05217391 
epoch: 8, Train Loss:0.002257 Train Accuracy: 96.95 Validation loss: 0.43 Validation accuracy: 88.85217391 
epoch: 9, Train Loss:0.001226 Train Accuracy: 98.85 Validation loss: 0.40 Validation accuracy: 88.45217391 
epoch: 10, Train Loss:0.0005