<a href="https://colab.research.google.com/github/vin-thomas/BBC_news_classification/blob/main/BBC_cnn_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
import torch
import glob
from nltk.tokenize import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

##**Data Download**

In [2]:
URL = "http://mlg.ucd.ie/files/datasets/bbc-fulltext.zip" 
!wget -P 'Data/' $URL

--2022-02-20 09:04:53--  http://mlg.ucd.ie/files/datasets/bbc-fulltext.zip
Resolving mlg.ucd.ie (mlg.ucd.ie)... 137.43.93.132
Connecting to mlg.ucd.ie (mlg.ucd.ie)|137.43.93.132|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2874078 (2.7M) [application/zip]
Saving to: ‘Data/bbc-fulltext.zip’


2022-02-20 09:04:54 (4.52 MB/s) - ‘Data/bbc-fulltext.zip’ saved [2874078/2874078]



In [None]:
!unzip "/content/Data/bbc-fulltext.zip" -d 'Data/'

In [4]:
file_name_iter = glob.iglob('/content/Data/bbc/**/*.txt')

In [5]:
def get_topic(file):
  return file.split('/')[-2]

In [6]:
def get_content(file):
  with open(file, 'rb') as f:
    content = f.read()
    return content

In [7]:
labels = []
content = []

for file in file_name_iter:
  labels.append(get_topic(file))
  content.append (get_content(file))

##**Prepare a word index, dictionary**

In [10]:
word2idx= {}
idx = 0
n_l = []
news_len =0

for item in content:
  word_list = word_tokenize(str(item))
  n_l.append(len(word_list))
  for word in word_list:
    if word not in word2idx:
      word2idx[word]= idx
      idx += 1
      


In [12]:
max_len_article = max(n_l)
no_of_articles= len(content)


In [13]:
dataset = np.zeros((no_of_articles, max_len_article), dtype= int)
dataset.shape


(2225, 4862)

In [14]:
i=0
idx_list=[]
for item in content:
  word_list= word_tokenize(str(item))
  for token in word_list:
    if word2idx.get(token) is not None:
      idx = word2idx.get(token)
    else:
      idx= 0
    idx_list.append(idx)
  pad_list = [0]*(max_len_article- len(idx_list))
  idx_list = idx_list + pad_list
  dataset[i]= idx_list
  idx_list=[]
  i+=1

In [15]:
dataset= torch.tensor(dataset)

##**DownLoad the pretrained vectors**

In [16]:
URL = "https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip"
FILE = "fastText"
!wget -P $FILE $URL
!unzip $FILE/crawl-300d-2M.vec.zip -d $FILE

--2022-02-20 09:09:13--  https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 172.67.9.4, 104.22.74.142, 104.22.75.142, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|172.67.9.4|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1523785255 (1.4G) [application/zip]
Saving to: ‘fastText/crawl-300d-2M.vec.zip’


2022-02-20 09:10:12 (24.8 MB/s) - ‘fastText/crawl-300d-2M.vec.zip’ saved [1523785255/1523785255]

Archive:  fastText/crawl-300d-2M.vec.zip
  inflating: fastText/crawl-300d-2M.vec  


##**Load the pre-trained vectors and create the embedding matrix**

In [17]:
fin = open('/content/fastText/crawl-300d-2M.vec', 'r', encoding='utf-8', newline='\n', errors='ignore')
n, d = map(int, fin.readline().split())

In [18]:
embeddings = np.random.uniform(-0.25, 0.25, (len(word2idx), d))

In [19]:
count =0
for line in fin:
  tokens = line.rstrip().split(' ')
  word = tokens[0]
  if word in word2idx:
    embeddings[word2idx[word]] = np.array(tokens[1:], dtype=np.float32)

In [20]:
embeddings= torch.tensor(embeddings)

Now we have an embedding array wich has the vector for each word in our dictionary. Further, we have 'dataset' which gives the word index for each article

##**Create Pytorch Dataloaders**

In [21]:
labels, uniques = pd.factorize(labels)

In [22]:
labels = torch.from_numpy(labels)

In [23]:
from sklearn.model_selection import train_test_split

# Train Test Split
train_inputs, val_inputs, train_labels, val_labels = train_test_split(dataset, labels, test_size=0.1, random_state=42)

In [24]:
from torch.utils.data import (TensorDataset, DataLoader, RandomSampler,
                              SequentialSampler)


batch_size=50

# Create DataLoader for training data
train_data = TensorDataset(train_inputs, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create DataLoader for validation data
val_data = TensorDataset(val_inputs, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

##**Device**

In [25]:
if torch.cuda.is_available():       
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
Device name: Tesla T4


##**CNN Model Architecture**

In [26]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [27]:
class Model(nn.Module):
  def __init__(self,
               pretrained_embedding,
               ):

      super(Model, self).__init__()
      self.vocab_size, self.embed_dim = pretrained_embedding.shape
      self.embedding = nn.Embedding.from_pretrained(pretrained_embedding,
                                                          freeze=False)

            
        
      # Defining first convolution layer with input_channels = 300, output_channels = 100, kernel_size = 3
      self.conv1 = nn.Conv1d(in_channels=300, out_channels=100, kernel_size=3)
      # Defining second convolution layer with input channels = 300, output channels = 100, kernel_size= 4
      self.conv2 = nn.Conv1d(in_channels=300, out_channels=100, kernel_size=4)
      # Defining third convolution layer with input channels =300, output channels = 100, kernel_size =5
      self.conv3 = nn. Conv1d(in_channels= 300, out_channels= 100, kernel_size=5)

            
      # Define the Fully connected layers
      # Each of the channels will generate an element, so (100+100+100 =300)
      self.fc = nn.Linear(300, 5)
        
      self.dropout = nn.Dropout(0.5)
        
    
  def forward(self, dataset):

    x_embed = self.embedding(dataset).float()

    x_reshaped = x_embed.permute(0, 2, 1)
    
    x1 = F.relu(self.conv1(x_reshaped))
    x2 = F.relu(self.conv2(x_reshaped))
    x3 = F.relu(self.conv2(x_reshaped))

    x1_pool = F.max_pool1d(x1, kernel_size=x1.shape[2])
    x2_pool = F.max_pool1d(x2, kernel_size=x2.shape[2])
    x3_pool = F.max_pool1d(x3, kernel_size=x3.shape[2])

    
    x_fc = torch.cat([x1_pool.squeeze(dim=2), x2_pool.squeeze(dim=2), x3_pool.squeeze(dim=2)],
                         dim=1)
    logits = self.fc(self.dropout(x_fc))
    
    return logits

##**Instatiate the CNN Model**

In [28]:
model = Model(embeddings)
model = model.to(device)
model

Model(
  (embedding): Embedding(50108, 300)
  (conv1): Conv1d(300, 100, kernel_size=(3,), stride=(1,))
  (conv2): Conv1d(300, 100, kernel_size=(4,), stride=(1,))
  (conv3): Conv1d(300, 100, kernel_size=(5,), stride=(1,))
  (fc): Linear(in_features=300, out_features=5, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

##**Optimizer and Loss Function**

In [29]:
import torch.optim as optim

# Instantiate Adadelta optimizer
optimizer = optim.Adadelta(model.parameters(),
                               lr=.25,
                               rho=0.95)
criterion = nn.CrossEntropyLoss()

##**Train**

In [30]:
def evaluate ():
  eval_model = model
  eval_model.eval
  val_accuracy = []
  val_loss = []
  
  for inputs, labels in val_dataloader:
    inputs = inputs.to(device)
    labels = labels.to(device)
    logits = eval_model(inputs)
    
    loss = criterion (logits, labels)
    val_loss.append(loss.item())
    _, predicted = torch.max(logits, 1)
    
    accuracy = (predicted == labels).cpu().numpy().mean() * 100
    
    val_accuracy.append(accuracy)

  val_loss = np.mean(val_loss)
  val_accuracy = np.mean(val_accuracy)

  return val_loss, val_accuracy


In [31]:
# No of Epochs
epoch = 10

# keeping the network in train mode
model.train()
train_losses,  train_accuracy = [], []

# Loop for no of epochs
for e in range(epoch):
    train_loss = 0
    correct = 0
    # Iterate through all the batches in each epoch
    for inputs, labels in train_dataloader:

      # Convert the image and label to gpu for faster execution
      inputs = inputs.to(device)
      labels = labels.to(device)
      
      # Zero the parameter gradients
      optimizer.zero_grad()
      
      # Passing the data to the model (Forward Pass)
      outputs = model(inputs)
      
      # Calculating the loss
      loss = criterion(outputs, labels)
      train_loss += loss.item()

      # Performing backward pass (Backpropagation)
      loss.backward()

      # optimizer.step() updates the weights accordingly
      optimizer.step()

      _, predicted = torch.max(outputs, 1)
      correct += (predicted == labels).sum().item()
      val_loss, val_accuracy = evaluate()
      
    # Accuracy calculation
    train_losses.append(train_loss/len (train_data))
    train_accuracy.append(100 * correct/len(train_data))
    print('epoch: {}, Train Loss:{:.6f} Train Accuracy: {:.2f} Validation loss: {:.2f} Validation accuracy: {:.2f} '.format(e+1,train_losses[-1], train_accuracy[-1], val_loss, val_accuracy))
    

epoch: 1, Train Loss:0.027491 Train Accuracy: 49.85 Validation loss: 0.99 Validation accuracy: 63.10 
epoch: 2, Train Loss:0.012810 Train Accuracy: 87.86 Validation loss: 0.47 Validation accuracy: 87.65 
epoch: 3, Train Loss:0.006601 Train Accuracy: 93.41 Validation loss: 0.28 Validation accuracy: 92.26 
epoch: 4, Train Loss:0.004227 Train Accuracy: 95.25 Validation loss: 0.20 Validation accuracy: 94.73 
epoch: 5, Train Loss:0.003097 Train Accuracy: 96.30 Validation loss: 0.21 Validation accuracy: 94.66 
epoch: 6, Train Loss:0.002289 Train Accuracy: 97.75 Validation loss: 0.16 Validation accuracy: 95.13 
epoch: 7, Train Loss:0.001844 Train Accuracy: 98.35 Validation loss: 0.15 Validation accuracy: 96.33 
epoch: 8, Train Loss:0.001497 Train Accuracy: 98.45 Validation loss: 0.15 Validation accuracy: 96.73 
epoch: 9, Train Loss:0.001411 Train Accuracy: 99.05 Validation loss: 0.15 Validation accuracy: 96.33 
epoch: 10, Train Loss:0.001076 Train Accuracy: 98.85 Validation loss: 0.12 Validat