# BBC News Articles

## Task 1: Exploratory Data Analytics
###### (a) Load the dataset and construct a feature vector for each article in the dataset. You need to report the number of articles and the number of extracted features. Show 5 example articles with their extracted features using a dataframe.
###### (b) Conduct term frequency analysis and report three plots: (i) the top-50 term frequency distribution across the entire dataset, (ii) the term frequency distribution for each class of articles, and (iii) the class distribution.

Setup

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import random

Import Data

In [2]:
# Load the training articles. Every column is read as a string (dtype=str),
# the first line of the file is used as the header row, and empty fields are
# registered as missing values.
df = pd.read_csv(
    "train.csv",
    skiprows=0,
    header=0,
    na_values="",
    dtype=str,
)
# Preview the first rows as the cell output.
df.head()

Unnamed: 0,ArticleId,Text,Category
0,1976,lifestyle governs mobile choice faster better ...,tech
1,1797,french honour director parker british film dir...,entertainment
2,1866,fockers fuel festive film chart comedy meet fo...,entertainment
3,1153,housewives lift channel 4 ratings debut us tel...,entertainment
4,342,u2 desire number one u2 three prestigious gram...,entertainment


Vectorize Data

In [3]:
# Build two bag-of-words representations of the articles (raw term counts and
# TF-IDF weights), once for the whole dataset and once for 5 example articles.
articles_text = df["Text"].to_numpy()

# Select 5 example articles for task 1. A dedicated, seeded generator makes the
# same articles come out on every Restart & Run All, and leaves the global
# `random` state untouched for any other cell that relies on it.
SAMPLE_SEED = 42
random_sample = random.Random(SAMPLE_SEED).sample(list(articles_text), 5)

## APPROACH ONE ## (raw term counts)
# fit_transform learns the vocabulary and encodes the documents in one pass;
# it yields the same matrix as calling fit() followed by transform().
vectorizer1 = CountVectorizer()
vector1 = vectorizer1.fit_transform(articles_text)

# NOTE: the sample gets its own vectorizer, so its vocabulary (and therefore
# its feature columns) covers only the 5 sampled articles, not the full dataset.
vectorizer1_sample = CountVectorizer()
vector1_sample = vectorizer1_sample.fit_transform(random_sample)

# summarize encoded vector
print("Method 1")
print(f'article vector\n {vector1.toarray()}')
print(f'\narticle vector (5 articles)\n {vector1_sample.toarray()}')

## APPROACH TWO ## (TF-IDF weights)
vectorizer2 = TfidfVectorizer()
vector2 = vectorizer2.fit_transform(articles_text)

vectorizer2_sample = TfidfVectorizer()
vector2_sample = vectorizer2_sample.fit_transform(random_sample)

# summarize encoded vector
print('\n', "Method 2")
print(f'article vector\n {vector2.toarray()}')
print(f'\narticle vector (5 articles)\n {vector2_sample.toarray()}')
# Rows of the encoded matrix are articles; columns are extracted features.
print('\nArticles:', vector2.shape[0], ', Extracted Features:', vector2.shape[1])

Method 1
article vector
 [[0 1 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]

article vector (5 articles)
 [[0 0 0 ... 0 0 1]
 [2 0 1 ... 0 0 0]
 [0 0 0 ... 2 0 0]
 [1 0 0 ... 0 1 0]
 [2 2 0 ... 0 0 0]]

 Method 2
article vector
 [[0.         0.02011467 0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]

article vector (5 articles)
 [[0.         0.         0.         ... 0.         0.         0.05714108]
 [0.07651776 0.         0.05712739 ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.10146309 0.         0.        ]
 [0.02856

# Task 3

Use this code for Task 3(a). 

In [4]:
from math import floor

# Fractions of the articles used for training; one table column per fraction.
m = [0.1, 0.3, 0.5, 0.7, 0.9]

NUM_ARTICLES = vector1.shape[0]

# Densify the feature matrix and read the labels ONCE. Neither depends on the
# split fraction or on the model, so recomputing them inside every loop
# iteration only repeats an expensive dense conversion.
VECTOR_ARRAY = vector1.toarray() # change this to vector 2 if needed
CATEGORY_LABELS = df["Category"].to_list()


def split_by_fraction(features, labels, train_fraction):
    """Split features and labels into a leading training part and a trailing test part.

    The first ``floor(train_fraction * features.shape[0])`` rows become the
    training set and all remaining rows become the test set. Rows keep their
    original order (no shuffling).

    Parameters:
        features: 2-D array with one row per article.
        labels: sequence of labels aligned with the rows of ``features``.
        train_fraction: fraction of rows assigned to the training set.

    Returns:
        Tuple ``(X_train, X_test, Y_train, Y_test)``.
    """
    train_length = floor(train_fraction * features.shape[0])
    return (
        features[:train_length],
        features[train_length:],
        labels[:train_length],
        labels[train_length:],
    )


# Per-model accuracy rows, keyed by model label. The result tables are built
# from these in one step at the end instead of concatenating onto empty frames
# (deprecated in recent pandas and a source of object-dtype columns).
training_rows = {}
testing_rows = {}

# LOGISTIC REGRESSION
training_accuracies = {}
testing_accuracies = {}
for m_value in m:
    X_train, X_test, Y_train, Y_test = split_by_fraction(VECTOR_ARRAY, CATEGORY_LABELS, m_value)

    # train your model here

    # put your accuracy calc here
    training_accuracies[m_value] = 0
    testing_accuracies[m_value] = 0

training_rows['LR'] = training_accuracies
testing_rows['LR'] = testing_accuracies

# NAIVE BAYES
training_accuracies = {}
testing_accuracies = {}
for m_value in m:
    X_train, X_test, Y_train, Y_test = split_by_fraction(VECTOR_ARRAY, CATEGORY_LABELS, m_value)

    # train your model here

    # put your accuracy calc here
    training_accuracies[m_value] = 0
    testing_accuracies[m_value] = 0

training_rows['NB'] = training_accuracies
testing_rows['NB'] = testing_accuracies

# SVM
training_accuracies = {}
testing_accuracies = {}
for m_value in m:
    X_train, X_test, Y_train, Y_test = split_by_fraction(VECTOR_ARRAY, CATEGORY_LABELS, m_value)

    # train your model here

    # put your accuracy calc here
    training_accuracies[m_value] = 0
    testing_accuracies[m_value] = 0

training_rows['SVM'] = training_accuracies
testing_rows['SVM'] = testing_accuracies

# NN (explicitly *not* nearest neighbour)
# NOTE(review): the label does not say which model this is -- presumably a
# neural network; confirm before filling in the training code.
training_accuracies = {}
testing_accuracies = {}
for m_value in m:
    X_train, X_test, Y_train, Y_test = split_by_fraction(VECTOR_ARRAY, CATEGORY_LABELS, m_value)

    # train your model here

    # put your accuracy calc here
    training_accuracies[m_value] = 0
    testing_accuracies[m_value] = 0

training_rows['NN'] = training_accuracies
testing_rows['NN'] = testing_accuracies

# One row per model, one column per training fraction.
model_labels = list(training_rows)
training_output = pd.DataFrame(list(training_rows.values()), index=model_labels, columns=m)
testing_output = pd.DataFrame(list(testing_rows.values()), index=model_labels, columns=m)

# Side-by-side train/test table; shared column labels get the suffixes below.
pd.merge(training_output, testing_output, left_index=True, right_index=True, suffixes=('_train', '_test'))

Unnamed: 0,0.1_train,0.3_train,0.5_train,0.7_train,0.9_train,0.1_test,0.3_test,0.5_test,0.7_test,0.9_test
LR,0,0,0,0,0,0,0,0,0,0
NB,0,0,0,0,0,0,0,0,0,0
SVM,0,0,0,0,0,0,0,0,0,0
NN,0,0,0,0,0,0,0,0,0,0
