In [39]:
#Importing Libraries
import copy
import numpy as np
import re
import nltk
nltk.download('stopwords')
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import VarianceThreshold
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

[nltk_data] Downloading package stopwords to /home/vs/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [40]:
#Loading of data
titles = []
categories = []
with open('dsjVoxArticles.tsv','r') as tsv:
    count = 0;

    for line in tsv:
        a = line.strip().split('\t')[:3]
        if a[2] in ['Business & Finance', 'Health Care', 'Science & Health', 'Politics & Policy', 'Criminal Justice']:
            title = a[0].lower()
            title = re.sub('\s\W',' ',title)
            title = re.sub('\W\s',' ',title)
            titles.append(title)
            categories.append(a[2])

In [41]:
#Now spliting the data 

title_tr, title_te, category_tr, category_te = train_test_split(titles,categories)
title_tr, title_de, category_tr, category_de = train_test_split(title_tr,category_tr)
print("Training: ",len(title_tr))
print("Developement: ",len(title_de),)
print("Testing: ",len(title_te))


Training:  1779
Developement:  594
Testing:  792


In [42]:
# Vectorize the data using Bag of words (BOW)

tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+")
stop_words = nltk.corpus.stopwords.words("english")
vectorizer = CountVectorizer(tokenizer=tokenizer.tokenize, stop_words=stop_words)

vectorizer.fit(iter(title_tr))
Xtr = vectorizer.transform(iter(title_tr))
Xde = vectorizer.transform(iter(title_de))
Xte = vectorizer.transform(iter(title_te))

encoder = LabelEncoder()
encoder.fit(category_tr)
Ytr = encoder.transform(category_tr)
Yde = encoder.transform(category_de)
Yte = encoder.transform(category_te)

In [43]:
#Feature Reduction
# We can check the variance of the feature and drop them based on a threshold
print("Number of features before reduction : ", Xtr.shape[1])
selection = VarianceThreshold(threshold=0.001)
Xtr_whole = copy.deepcopy(Xtr)
Ytr_whole = copy.deepcopy(Ytr)
selection.fit(Xtr)
Xtr = selection.transform(Xtr)
Xde = selection.transform(Xde)
Xte = selection.transform(Xte)
print("Number of features after reduction : ", Xtr.shape[1])


Number of features before reduction :  4270
Number of features after reduction :  1790


In [44]:
nb = MultinomialNB()
nb.fit(Xtr, Ytr)
pred_nb = nb.predict(Xde)
print(classification_report(Yde, pred_nb, target_names=encoder.classes_))


                    precision    recall  f1-score   support

Business & Finance       0.58      0.34      0.43        87
  Criminal Justice       0.71      0.45      0.55        82
       Health Care       0.65      0.49      0.56        67
 Politics & Policy       0.57      0.77      0.66       230
  Science & Health       0.64      0.64      0.64       128

         micro avg       0.61      0.61      0.61       594
         macro avg       0.63      0.54      0.57       594
      weighted avg       0.62      0.61      0.60       594



In [47]:
#Predicting test data using Multinomial Naive Bayesian
pred_final = nb.predict(Xte)
print(classification_report(Yte, pred_final, target_names=encoder.classes_))


                    precision    recall  f1-score   support

Business & Finance       0.63      0.38      0.47       108
  Criminal Justice       0.70      0.51      0.59       100
       Health Care       0.51      0.42      0.46        72
 Politics & Policy       0.64      0.80      0.71       353
  Science & Health       0.69      0.65      0.67       159

         micro avg       0.64      0.64      0.64       792
         macro avg       0.63      0.55      0.58       792
      weighted avg       0.64      0.64      0.63       792

