# Text Classification Task
In this task, you would require to claasify the BBC News text into 5 classes ['business' 'entertainment' 'politics' 'sport''tech']. For this task, the code skeleton has been given and you have to write your code below #TODO parts. Comments are given with #TODO as helpers

## Importing relevant libraries 
If any of the below list libraries is not installed already, then use "pip install #library_name" to install it

In [67]:
!pip install torch==1.6.0



In [68]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,f1_score
from sklearn.feature_extraction.text import CountVectorizer
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

## Importing BBC News Dataset
Source data from public data set on BBC news articles:
D. Greene and P. Cunningham. "Practical Solutions to the Problem of Diagonal Dominance in Kernel Document Clustering", Proc. ICML 2006. [PDF] [BibTeX].

http://mlg.ucd.ie/datasets/bbc.html

Cleaned up version of the Dataset is given as csv file with the assignment

In [69]:
data_train = pd.read_csv("bbc-text_train.csv")
data_test= pd.read_csv("bbc-text_test.csv")

In [70]:
data_train.head(16)

Unnamed: 0,category,text
0,entertainment,farrell due to make us tv debut actor colin fa...
1,business,china continues rapid growth china s economy h...
2,business,ebbers aware of worldcom fraud former worldc...
3,entertainment,school tribute for tv host carson more than 1 ...
4,tech,broadband fuels online expression fast web acc...
5,sport,cole faces lengthy injury lay-off aston villa ...
6,tech,no half measures with half-life 2 could half-l...
7,tech,microsoft takes on desktop search microsoft ha...
8,tech,long life promised for laptop pcs scientists a...
9,sport,barcelona title hopes hit by loss barcelona s ...


In [71]:
data_train['category'].value_counts()

sport            413
business         409
politics         334
tech             319
entertainment    305
Name: category, dtype: int64

## Splitting training data into Train and validation set
Note: Validation set is surrogate to test set and while training the network , we evaluate the model on validation set

In [72]:
train_x_df,val_x_df,train_y_df,val_y_df = train_test_split(data_train['text'],data_train['category'],test_size=0.2,random_state=42)
print(train_y_df.head(10))
print(train_x_df.head(10))

1278    business
745     politics
422     business
1083    business
1658    politics
198         tech
15         sport
265     business
752        sport
678        sport
Name: category, dtype: object
1278    news corp eyes video games market news corp  t...
745     talks aim to avert pension strike talks aimed ...
422     high fuel prices hit ba s profits british airw...
1083    rank  set to sell off film unit  leisure group...
1658    kennedy questions trust of blair lib dem leade...
198     dvd copy protection strengthened dvds will be ...
15      radcliffe eyes hard line on drugs paula radcli...
265     wmc profits up amid bid criticism australian m...
752     reds sink 10-man magpies titus bramble s own g...
678     williams battles to aussie title serena willia...
Name: text, dtype: object


## Encoding prediction classes/labels into integers


In [73]:
le =LabelEncoder()
le.fit(train_y_df)
print(le.classes_)
train_y=le.transform(train_y_df)
val_y=le.transform(val_y_df)
test_y=le.transform(data_test['category'])

['business' 'entertainment' 'politics' 'sport' 'tech']


## Converting News text into numerical vector using count vectorizer

In [74]:
vectorizer = CountVectorizer()
vectorizer.fit_transform(train_x_df)
train_x=vectorizer.transform(train_x_df)
val_x=vectorizer.transform(val_x_df)
test_x=vectorizer.transform(data_test['text'])
input_dim = len(test_x.toarray()[1]) #24295

In [75]:
train_x.toarray()
print(train_x)
print(len(test_x.toarray()[1]))

  (0, 1264)	1
  (0, 1342)	1
  (0, 1723)	1
  (0, 1771)	1
  (0, 1891)	1
  (0, 1919)	5
  (0, 2162)	1
  (0, 2182)	5
  (0, 2209)	1
  (0, 2280)	1
  (0, 2285)	7
  (0, 2507)	1
  (0, 2839)	1
  (0, 2900)	2
  (0, 3136)	1
  (0, 3173)	2
  (0, 3186)	1
  (0, 3473)	1
  (0, 3562)	1
  (0, 3855)	1
  (0, 3957)	1
  (0, 3997)	2
  (0, 4614)	3
  (0, 4629)	1
  (0, 4828)	1
  :	:
  (1423, 22172)	1
  (1423, 22389)	1
  (1423, 22392)	1
  (1423, 22530)	1
  (1423, 22626)	1
  (1423, 22717)	2
  (1423, 22867)	1
  (1423, 22872)	1
  (1423, 22970)	1
  (1423, 23060)	1
  (1423, 23067)	2
  (1423, 23069)	4
  (1423, 23234)	1
  (1423, 23289)	1
  (1423, 23541)	1
  (1423, 23557)	2
  (1423, 23707)	6
  (1423, 23754)	1
  (1423, 23755)	1
  (1423, 23788)	2
  (1423, 23894)	2
  (1423, 23912)	2
  (1423, 23923)	2
  (1423, 24019)	1
  (1423, 24035)	4
24295


In [76]:
class ClassificationNet(nn.Module):

    def __init__(self):
        super(ClassificationNet, self).__init__()
        '''
        Defining layers of neural network
        '''
        #TODO 1
        # input dim = 24295 = len(test_x.toarray()[1])
        # output dim = 5 = nr of categories)
        self.fc1 = nn.Linear(24295, 5)


    def forward(self, x):
        """The forward pass of the classifier
        Args:
            x_in (torch.Tensor): an input data tensor. 
                x_in.shape should be (data_points, num_features)
        Returns:
            the resulting tensor.
        """
        #TODO 2
        x = torch.sigmoid(self.fc1(x))
        return x


net =ClassificationNet()

# TODO 3 (define learning rate)
w = 0.001


# TODO 4 (Construct/define an optimizer object)
#optimizer similiar "adaptive gradient descent" 
optimizer = optim.Adam(net.parameters(), lr=0.05)

# TODO 5 (Construct an loss/criterion object)
criterion = nn.BCELoss() #BCELoss() is suited if classifie 0 or 1

#TODO 6 (define number of epochs/ number of training iteration)
epochs= 30

#converting train and validation set arrays to tensor
train_x_tensor=torch.tensor(train_x.toarray()).float()
train_y_tensor=torch.tensor(train_y)
val_x_tensor=torch.tensor(val_x.toarray()).float()
val_y_tensor=torch.tensor(val_y)


def evaluation_metrics(predict_y,ground_truth_y):
    '''
    Returns accuracy and f1 score metrics for evaluation
    '''
    accuracy=accuracy_score(ground_truth_y,predict_y)
    f1score=f1_score(ground_truth_y,predict_y,average='macro')
    
    return accuracy,f1score

## Training Loop

In [77]:
for i in range(epochs):
    
    # the training routine is these 5 steps:
    
    # TODO 7 step 1. zero the gradients
    optimizer.zero_grad()
    
    # TODO 8 step 2. compute the output
    output = net(train_x_tensor)
    
    
    # TODO 9 step 3. compute the loss (name this as "loss")
    #given y-val (category 0-4) construct a vector(1,5) initalized to 0, then set vector[y-val] = 1
    #add the vectors to an array 
    #transform array to tensor
    train_y_adapted = []
    for ele in train_y:
        ele_res = [0,0,0,0,0]
        ele_res[ele] = 1
        train_y_adapted.append(ele_res)
    print(train_y_adapted)
    
    train_y_adapted_tensor = torch.tensor(train_y_adapted)
    loss = criterion(output, train_y_adapted_tensor.float())
    
    # TODO 10 use loss to produce gradients
    #calc gardients
    loss.backward()
    
    # TODO 11 use optimizer to take gradient step
    optimizer.step() 
    
    
    with torch.no_grad():
        # validation set evaluation:
        
        # TODO 11 compute the output
        output_val = net(val_x_tensor)

        
        # TODO 12 compute the loss (name this as "loss_val")
        #given y-val (category 0-4) construct a vector(1,5) initalized to 0, then set vector[y-val] = 1
        #add the vectors to an array 
        #transform array to tensor
        val_y_adapted = []
        print(val_y.size())
        for ele in val_y:
            ele_res = [0,0,0,0,0]
            ele_res[ele] = 1
            val_y_adapted.append(ele_res) 

        val_y_adapted_tensor = torch.tensor(val_y_adapted)
        loss_val = criterion(output_val, val_y_adapted_tensor.float())
        
        # TODO 13 compute the prediction
        # take as prediction the category which has the hight value
        prediction = []
        for idx, val1 in enumerate(output_val):
            max = -1
            max_pos = 0
            for idx2, ele in enumerate(val1):
                if ele > max:
                    max = ele
                    max_pos = idx2
            prediction.append(max_pos)
            
        
        # TODO 14 Use the "evaluation_metrics" function to find accuracy and f1 score and name this as               "accuracy","f1score")
        (accuracy, f1score) = evaluation_metrics(val_y_tensor, prediction)

        
        
        print('Epoch %d/%d - Loss_train: %.3f   loss_val: %.3f   accuracy_val: %.3f f1score_val: %.3f   '% \
            (i + 1, epochs,loss.item(),loss_val.item(),accuracy,f1score))

, 0, 0, 0, 1], [0, 0, 0, 0, 1], [0, 0, 0, 1, 0], [1, 0, 0, 0, 0], [0, 0, 0, 1, 0], [1, 0, 0, 0, 0], [1, 0, 0, 0, 0], [0, 0, 0, 1, 0], [0, 0, 0, 1, 0], [0, 0, 1, 0, 0], [0, 0, 1, 0, 0], [0, 0, 1, 0, 0], [0, 0, 0, 1, 0], [0, 1, 0, 0, 0], [0, 0, 0, 1, 0], [0, 1, 0, 0, 0], [0, 1, 0, 0, 0], [1, 0, 0, 0, 0], [0, 0, 0, 0, 1], [0, 0, 0, 1, 0], [0, 0, 1, 0, 0], [0, 1, 0, 0, 0], [1, 0, 0, 0, 0], [0, 0, 0, 1, 0], [0, 1, 0, 0, 0], [0, 0, 0, 1, 0], [0, 0, 0, 1, 0], [0, 0, 0, 1, 0], [0, 0, 0, 1, 0], [0, 0, 1, 0, 0], [0, 1, 0, 0, 0], [0, 0, 0, 1, 0], [1, 0, 0, 0, 0], [0, 0, 0, 0, 1], [0, 1, 0, 0, 0], [0, 0, 0, 1, 0], [0, 1, 0, 0, 0], [0, 1, 0, 0, 0], [0, 0, 0, 0, 1], [1, 0, 0, 0, 0], [0, 0, 1, 0, 0], [0, 1, 0, 0, 0], [0, 0, 0, 1, 0], [0, 0, 0, 0, 1], [0, 1, 0, 0, 0], [1, 0, 0, 0, 0], [0, 0, 1, 0, 0], [0, 0, 0, 0, 1], [0, 0, 0, 0, 1], [0, 0, 0, 0, 1], [0, 0, 0, 0, 1], [0, 0, 0, 1, 0], [0, 0, 0, 1, 0], [0, 1, 0, 0, 0], [0, 0, 1, 0, 0], [0, 1, 0, 0, 0], [1, 0, 0, 0, 0], [0, 1, 0, 0, 0], [0, 1, 0, 0, 0],

## Test set Prediction and Evaluation

In [78]:
# converting test set arrays to tensor
test_x_tensor=torch.tensor(test_x.toarray()).float()
test_y_tensor=torch.tensor(test_y)

with torch.no_grad():
    # Test set evaluation:
    
    # TODO 15 compute the output
    output_test = net(test_x_tensor)
    
    # TODO 16 compute the prediction
    prediction = []
    for idx, val1 in enumerate(output_test):
        max = -1
        max_pos = 0
        for idx2, ele in enumerate(val1):
            if ele > max:
                max = ele
                max_pos = idx2
        prediction.append(max_pos)
    
    # TODO 17 Use the "evaluation_metrics" function to find accuracy and f1 score
    (accuracy_test, f1score_test) = evaluation_metrics(test_y_tensor, prediction)
    
    print('Accuracy_test: %.3f f1score_val: %.3f   '% (accuracy_test,f1score_test))

Accuracy_test: 0.971 f1score_val: 0.971   
