In [1]:
# Importing required libraries
from bs4 import BeautifulSoup
import requests
import csv
import nltk
import pandas as pd
import re
import numpy as np
from nltk.tokenize import word_tokenize
nltk.download('punkt')
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import RegexpTokenizer
nltk.download('wordnet')
nltk.download('words')
nltk.download('omw-1.4')
import string
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import PorterStemmer
nltk.download('averaged_perceptron_tagger')
import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [2]:
# Load the BBC News training CSV and drop the ArticleId column
df = pd.read_csv("BBC News Train.csv", encoding="ISO-8859-1").drop(columns=['ArticleId'])
df.head()

Unnamed: 0,Text,Category
0,worldcom ex-boss launches defence lawyers defe...,business
1,german business confidence slides german busin...,business
2,bbc poll indicates economic gloom citizens in ...,business
3,lifestyle governs mobile choice faster bett...,tech
4,enron bosses in $168m payout eighteen former e...,business


In [3]:
# Function to remove punctuations
def removePunctuation(text):
    """Return `text` with every character listed in `string.punctuation` deleted.

    Characters are removed outright (not replaced by a space), so a
    hyphenated word such as "ex-boss" becomes "exboss".
    """
    # One translation pass deletes all punctuation characters at once.
    deletion_table = str.maketrans("", "", string.punctuation)
    return text.translate(deletion_table)

# Strip punctuation from every article; the 'Text' column is overwritten
df['Text'] = df['Text'].map(removePunctuation)
df.head()

Unnamed: 0,Text,Category
0,worldcom exboss launches defence lawyers defen...,business
1,german business confidence slides german busin...,business
2,bbc poll indicates economic gloom citizens in ...,business
3,lifestyle governs mobile choice faster bett...,tech
4,enron bosses in 168m payout eighteen former en...,business


In [4]:
# Function to remove stopwords
def removeStopwords(text):
    """Return `text` with stop words removed and the remaining words joined by single spaces.

    The text is split on whitespace. Each word is looked up in the
    module-level `stop_words` collection, which must be defined before this
    function is called. Words that are kept retain their original casing.
    An empty or whitespace-only input yields an empty string.
    """
    # The lookup uses the lowercased word because, in this notebook, this step
    # runs before the lowercasing step; a case-sensitive comparison would let
    # capitalised stop words such as "The" slip through a lowercase list.
    return " ".join(word for word in text.split() if word.lower() not in stop_words)

# Build the English stop-word lookup once as a set, then filter every article
stop_words = {*stopwords.words("english")}
df['Text'] = df['Text'].map(removeStopwords)
df.head()

Unnamed: 0,Text,Category
0,worldcom exboss launches defence lawyers defen...,business
1,german business confidence slides german busin...,business
2,bbc poll indicates economic gloom citizens maj...,business
3,lifestyle governs mobile choice faster better ...,tech
4,enron bosses 168m payout eighteen former enron...,business


In [5]:
# Function to lowercase
def tolower(text):
    """Return a lowercased copy of `text`."""
    return text.lower()

# Lowercase every article; the 'Text' column is overwritten
df['Text'] = df['Text'].map(tolower)
df.head()

Unnamed: 0,Text,Category
0,worldcom exboss launches defence lawyers defen...,business
1,german business confidence slides german busin...,business
2,bbc poll indicates economic gloom citizens maj...,business
3,lifestyle governs mobile choice faster better ...,tech
4,enron bosses 168m payout eighteen former enron...,business


In [6]:
# Function to perform tokenization
def tokenization(text):
  """Split `text` into tokens with NLTK's `word_tokenize` and return the result."""
  return word_tokenize(text)

# Replace each article string with its list of tokens
df['Text'] = df['Text'].map(tokenization)
df.head()

Unnamed: 0,Text,Category
0,"[worldcom, exboss, launches, defence, lawyers,...",business
1,"[german, business, confidence, slides, german,...",business
2,"[bbc, poll, indicates, economic, gloom, citize...",business
3,"[lifestyle, governs, mobile, choice, faster, b...",tech
4,"[enron, bosses, 168m, payout, eighteen, former...",business


In [7]:
# Function to perform lemmatization
def lemmatize(text):
  """Lemmatize a list of tokens using their part-of-speech tags.

  Parameters:
    text: sequence of token strings (one tokenized document).

  Returns:
    A list with one lemma per input token, in the same order.
  """
  # First letter of the tagger's tag -> WordNet POS code. Adjective tags
  # (JJ, JJR, JJS) start with 'J' while WordNet's adjective code is 'a', so
  # comparing the first letter directly against 'a' never selects the
  # adjective lemmatizer; the explicit mapping fixes that.
  tag_to_wordnet = {'j': 'a', 'r': 'r', 'n': 'n', 'v': 'v'}
  wordNetLemmatizer = WordNetLemmatizer()
  return [
      # Any other tag falls back to noun, as before.
      wordNetLemmatizer.lemmatize(word, tag_to_wordnet.get(tag[:1].lower(), 'n'))
      for word, tag in pos_tag(text)
  ]

# Replace each token list with its lemmatized counterpart
df['Text'] = df['Text'].map(lemmatize)
df.head()

Unnamed: 0,Text,Category
0,"[worldcom, exboss, launch, defence, lawyer, de...",business
1,"[german, business, confidence, slide, german, ...",business
2,"[bbc, poll, indicate, economic, gloom, citizen...",business
3,"[lifestyle, governs, mobile, choice, faster, w...",tech
4,"[enron, boss, 168m, payout, eighteen, former, ...",business


In [8]:
# Function to calculate TF_ICF
def tficf(df):
  """Compute TF-ICF (term frequency x inverse class frequency) weights.

  Parameters:
    df: DataFrame with a 'Category' column (class label per document) and a
        'Text' column (iterable of tokens per document).

  Returns:
    Nested dict `tf_icf[word][category]` equal to
    (occurrences of `word` in all documents of `category`)
      * log10(number_of_classes / number_of_classes_containing_word).
    Every word has an entry for every category. A word that occurs in all
    classes gets weight 0 everywhere.
  """
  classes = df['Category'].unique()
  # Derived from the data instead of the previous hard-coded 5, so the
  # weights stay correct for any number of categories.
  num_classes = len(classes)

  # tf[word][c] = occurrences of `word` across all documents of class c.
  # Only classes in which the word actually occurs get a key.
  # Iterating class by class keeps the vocabulary in the same insertion
  # order as before, which fixes the column order of frames built from it.
  tf = {}
  for c in classes:
    for words in df.loc[df['Category'] == c, 'Text']:
      for word in words:
        per_class = tf.setdefault(word, {})
        per_class[c] = per_class.get(c, 0) + 1

  tf_icf = {}
  for word, per_class in tf.items():
    # Class frequency is simply the number of classes that have a count.
    icf = np.log10(num_classes / len(per_class))
    tf_icf[word] = {c: per_class.get(c, 0) * icf for c in classes}
  return tf_icf

# NOTE(review): the weights are computed over the full dataset, including the
# rows that later form the test split, so the evaluation is not fully held-out.
d=tficf(df)

In [9]:
# Converting the TF_ICF to dataframe (rows = categories, columns = words).
# The raw dict is not printed: it holds one entry per vocabulary word and
# floods the output; the head() preview below is enough to inspect it.
tf_icf_df=pd.DataFrame(d)
tf_icf_df.head()



Unnamed: 0,worldcom,exboss,launch,defence,lawyer,defend,former,chief,bernie,ebbers,...,butter,playboyz,maniac,granada,cheeky,cassette,grandma,verbalicious,stereophonics,rapport
business,37.74438,1.39794,0.0,0.0,0.0,0.0,0.0,0.0,5.59176,31.45365,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tech,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
politics,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
sport,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
entertainment,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.39794,0.69897,0.69897,0.69897,0.69897,0.69897,0.69897,0.69897,0.69897,0.69897


In [10]:
# Making an empty (all-NaN) DataFrame with one row per document (df.index)
# and one column per unique word (the columns of tf_icf_df)
cols=tf_icf_df.columns
ro=df.index
temp_list=[]
X=pd.DataFrame(temp_list,columns=cols,index=ro)
X.head()

Unnamed: 0,worldcom,exboss,launch,defence,lawyer,defend,former,chief,bernie,ebbers,...,butter,playboyz,maniac,granada,cheeky,cassette,grandma,verbalicious,stereophonics,rapport
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


In [11]:
# Filling the values of TF_ICF as per the words occurring document wise
# NOTE(review): each row is filled using that row's own Category label, so the
# features of rows later used for testing already encode their true class
# (label leakage); the reported scores should be read with that in mind.
col_pos = {word: j for j, word in enumerate(X.columns)}
for i, (c, text) in enumerate(zip(df['Category'], df['Text'])):
  for word in text:
    # Single-step positional write. The chained form X.iloc[i][word] = ...
    # assigns into an intermediate row object and is not guaranteed to
    # update X itself.
    X.iat[i, col_pos[word]] = tf_icf_df.at[c, word]
X=X.fillna(0)
X.head()

Unnamed: 0,worldcom,exboss,launch,defence,lawyer,defend,former,chief,bernie,ebbers,...,butter,playboyz,maniac,granada,cheeky,cassette,grandma,verbalicious,stereophonics,rapport
0,37.74438,1.39794,0.0,0.0,0.0,0.0,0.0,0.0,5.59176,31.45365,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Function to perform splitting of data
def split(X,y,i):
  """Split features and labels sequentially into a leading train part and a trailing test part.

  No shuffling is performed: the first int(i * len(X)) items become the
  training portion and the remainder the test portion.

  Parameters:
    X: sliceable collection of samples (e.g. DataFrame, Series, list).
    y: sliceable collection of labels, same length as X.
    i: fraction of the data used for training, between 0 and 1 inclusive.

  Returns:
    (x_train, x_test, y_train, y_test)

  Raises:
    ValueError: if X and y differ in length or i is outside [0, 1].
  """
  if len(X) != len(y):
    # Slicing both at the same position would silently misalign samples and labels.
    raise ValueError("X and y must have the same length")
  if not 0 <= i <= 1:
    raise ValueError("split fraction must be between 0 and 1")
  nd = int(i * len(X))
  return X[:nd], X[nd:], y[:nd], y[nd:]

In [13]:
# Training the Naive Bayes classifier
# split() slices sequentially without shuffling: the first 70% of the rows
# are used for training and the remaining 30% for testing.
from sklearn.naive_bayes import MultinomialNB
x_train, x_test, y_train, y_test = split(X,df['Category'],0.7)
mnb = MultinomialNB()
mnb.fit(x_train, y_train)
# Predicted labels for the held-back rows; scored in a later cell.
y_pred = mnb.predict(x_test)

In [14]:
# Calculating the probability of each category based on the frequency of documents in the training set
n=len(y_train)

# Tally the five known categories; any other label is ignored.
label_counts={'business':0,'tech':0,'politics':0,'sport':0,'entertainment':0}
for label in y_train:
  if label in label_counts:
    label_counts[label]+=1
cb,ct,cp,cs,ce=label_counts.values()

# Relative frequency of each category among the training documents
prob_b,prob_t,prob_p,prob_s,prob_e=cb/n,ct/n,cp/n,cs/n,ce/n

print("Probability of business : ",prob_b)
print("Probability of tech : ",prob_t)
print("Probability of politics : ",prob_p)
print("Probability of sport : ",prob_s)
print("Probability of entertainment : ",prob_e)


Probability of business :  0.22722914669223393
Probability of tech :  0.17833173537871524
Probability of politics :  0.18024928092042186
Probability of sport :  0.23873441994247363
Probability of entertainment :  0.17545541706615533


In [15]:
# Calculation of the various metrics
def score_metrics(y_test,y_pred):
  """Print accuracy and macro-averaged precision, recall and F1 for the predictions.

  Parameters:
    y_test: true labels.
    y_pred: predicted labels, aligned with `y_test`.

  Nothing is returned; the four scores are written to stdout.
  """
  report = (
      ('Accuracy : ', accuracy_score(y_test, y_pred)),
      ("Precision : ", precision_score(y_test, y_pred, average='macro')),
      ('Recall : ', recall_score(y_test, y_pred, average='macro')),
      ('F1 score : ', f1_score(y_test, y_pred, average='macro')),
  )
  for caption, value in report:
    print(caption, value)


# Convert the label containers to plain lists before scoring
y_test, y_train = list(y_test), list(y_train)
score_metrics(y_test,y_pred)

Accuracy :  0.9977628635346756
Precision :  0.998
Recall :  0.9976744186046511
F1 score :  0.997825384231097


In [16]:
# Function to perform stemming
def stemming(text):
    """Return the Porter stem of every token in `text`.

    Parameters:
        text: iterable of token strings (one tokenized document).

    Returns:
        A list with one stem per input token, in the same order.
    """
    stemmer = PorterStemmer()
    # The previous version also built a space-joined sentence that was never
    # used; only the list of stems was (and is) returned.
    return [stemmer.stem(word) for word in text]

# NOTE(review): at this point df['Text'] already holds lemmatized tokens, and
# df is reloaded from the CSV before the next experiment runs, so this result
# appears to be unused — confirm whether this line is still needed.
df['Text'] = df['Text'].apply(stemming)

In [17]:
# Function to perform the above actions for different split sizes
def diff(df,split_size):
  """Build TF-ICF features for `df`, train MultinomialNB and print the test metrics.

  Parameters:
    df: DataFrame with a 'Text' column (token list per document) and a
        'Category' column (class label per document).
    split_size: fraction of rows (taken from the top, unshuffled) used for
        training; the remaining rows are used for testing.

  Nothing is returned; the scores are printed by `score_metrics`.

  NOTE(review): the feature value of each word is looked up with the
  document's own Category, and the weights come from the whole of `df`, so
  test rows carry information about their true label (label leakage). The
  printed scores are therefore optimistic.
  """
  tf_icf_df = pd.DataFrame(tficf(df))
  cols = tf_icf_df.columns

  # Position lookups so the fill loop works on a plain array. The previous
  # chained write `X.iloc[i][word] = ...` assigned into an intermediate row
  # object, which is not guaranteed to update the frame and is slow.
  weights = tf_icf_df.to_numpy(dtype=float)
  col_pos = {word: j for j, word in enumerate(cols)}
  row_pos = {category: k for k, category in enumerate(tf_icf_df.index)}

  # Cells for words absent from a document stay 0 (formerly NaN -> fillna(0)).
  features = np.zeros((len(df), len(cols)))
  for i, (cat, text) in enumerate(zip(df['Category'], df['Text'])):
    k = row_pos[cat]
    for word in text:
      j = col_pos[word]
      features[i, j] = weights[k, j]
  X = pd.DataFrame(features, columns=cols, index=df.index)

  x_train, x_test, y_train, y_test = split(X,df['Category'],split_size)
  mnb = MultinomialNB()
  mnb.fit(x_train, y_train)
  y_pred = mnb.predict(x_test)
  score_metrics(y_test,y_pred)


In [18]:
# Experimenting with stemming instead of lemmatization
# (the split size here is 0.6, whereas the lemmatization baseline above used 0.7)
df = pd.read_csv("BBC News Train.csv", encoding="ISO-8859-1").drop(columns=['ArticleId'])
for step in (removePunctuation, removeStopwords, tolower, tokenization, stemming):
  df['Text'] = df['Text'].apply(step)
diff(df,0.6)


Accuracy :  0.9983221476510067
Precision :  0.9984848484848484
Recall :  0.9983606557377049
F1 score :  0.998416498458746


In [19]:
# Experimenting with stemming instead of lemmatization, split size 0.8
df = pd.read_csv("BBC News Train.csv", encoding="ISO-8859-1").drop(columns=['ArticleId'])
for step in (removePunctuation, removeStopwords, tolower, tokenization, stemming):
  df['Text'] = df['Text'].apply(step)
diff(df,0.8)

Accuracy :  0.9966442953020134
Precision :  0.9971014492753623
Recall :  0.9966101694915255
F1 score :  0.9968307442759997


In [20]:
# Experimenting with lemmatization and split size 0.5
df = pd.read_csv("BBC News Train.csv", encoding="ISO-8859-1").drop(columns=['ArticleId'])
for step in (removePunctuation, removeStopwords, tolower, tokenization, lemmatize):
  df['Text'] = df['Text'].apply(step)
diff(df,0.5)

Accuracy :  0.9986577181208054
Precision :  0.9987341772151899
Recall :  0.9986842105263157
F1 score :  0.9987050133584787


In [21]:
# Experimenting by using TF-IDF instead of TF-ICF (stemmed tokens, split size 0.7)
df = pd.read_csv("BBC News Train.csv", encoding="ISO-8859-1").drop(columns=['ArticleId'])
for step in (removePunctuation, removeStopwords, tolower, tokenization, stemming):
  df['Text'] = df['Text'].apply(step)

X_train, X_test, y_train, y_test = split(df['Text'], df['Category'],0.7)

# The token lists are joined back into space-separated strings for the vectorizer
X_train_joined = list(map(' '.join, X_train))
X_test_joined = list(map(' '.join, X_test))

# The vectorizer is fitted on the training documents only and then applied to the test documents
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train_joined)
X_test_tfidf = vectorizer.transform(X_test_joined)

mnb = MultinomialNB()
mnb.fit(X_train_tfidf, y_train)
y_pred = mnb.predict(X_test_tfidf)
score_metrics(y_test,y_pred)

Accuracy :  0.970917225950783
Precision :  0.9713831281500453
Recall :  0.9699074465586094
F1 score :  0.9703292402737617
