In [1]:
import pandas as pd
import numpy as np

In [None]:
# Colab-only helper for uploading the CSV through the browser.
# The import is guarded so that a full "Run All" also works outside Colab,
# where the dataset is read from the working directory by pd.read_csv below.
try:
    from google.colab import files
except ImportError:
    # google.colab is not installed: nothing to upload.
    uploaded = None
else:
    uploaded = files.upload()

In [None]:
# import io
# df = pd.read_csv(io.BytesIO(uploaded['BBC News Train.csv']))

In [2]:
# Load the training CSV; the relative path is resolved against the current working directory.
df = pd.read_csv('BBC News Train.csv')

The columns in the DataFrame:

In [3]:
# Print every column label of the frame on its own line.
for column_name in list(df):
    print(column_name)

ArticleId
Text
Category


* **Cleaning the text by removing punctuation, stop words, and converting all text to lowercase.**

* **Tokenizing the text by splitting it into words.**

* **Performing stemming to reduce words to their root form.**


In [4]:
import string
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
# Resources that are identical for every document; filled on first use so they are
# not rebuilt for each of the rows passed through clean_text().
_CLEAN_TEXT_CACHE = {}


def _clean_text_resources():
    """Return the cached stop-word set, Porter stemmer and punctuation-stripping table."""
    if not _CLEAN_TEXT_CACHE:
        _CLEAN_TEXT_CACHE["stop_words"] = frozenset(stopwords.words('english'))
        _CLEAN_TEXT_CACHE["stemmer"] = PorterStemmer()
        _CLEAN_TEXT_CACHE["punct_table"] = str.maketrans("", "", string.punctuation)
    return _CLEAN_TEXT_CACHE


def clean_text(text):
    """Normalise one document into a list of stemmed tokens.

    Steps, in order: lowercase, delete every character in ``string.punctuation``,
    tokenize with ``nltk.word_tokenize``, drop English stop words, Porter-stem
    what is left.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        Stemmed tokens in document order.

    Notes
    -----
    The previous version re-joined the filtered words and tokenized them a second
    time before stemming; the words are already tokens at that point, so the
    text is now tokenized once.
    """
    resources = _clean_text_resources()
    stop_words = resources["stop_words"]
    stemmer = resources["stemmer"]

    normalized = text.lower().translate(resources["punct_table"])
    return [
        stemmer.stem(token)
        for token in nltk.word_tokenize(normalized)
        if token not in stop_words
    ]

Before:

In [6]:
# Text preview of the first rows before the "Text" column is cleaned.
print(df.head())

   ArticleId                                               Text  Category
0       1833  worldcom ex-boss launches defence lawyers defe...  business
1        154  german business confidence slides german busin...  business
2       1101  bbc poll indicates economic gloom citizens in ...  business
3       1976  lifestyle  governs mobile choice  faster  bett...      tech
4        917  enron bosses in $168m payout eighteen former e...  business


In [7]:
# Replace each article's text with its list of stemmed tokens.
# clean_text() starts with text.lower(), so re-running this cell on a column that
# already holds token lists would raise; values that are already lists are kept as-is,
# which makes the cell safe to execute more than once.
df["Text"] = df["Text"].apply(
    lambda doc: doc if isinstance(doc, list) else clean_text(doc)
)

After:

In [8]:
# Same preview after cleaning: "Text" now holds the token lists returned by clean_text().
print(df.head())

   ArticleId                                               Text  Category
0       1833  [worldcom, exboss, launch, defenc, lawyer, def...  business
1        154  [german, busi, confid, slide, german, busi, co...  business
2       1101  [bbc, poll, indic, econom, gloom, citizen, maj...  business
3       1976  [lifestyl, govern, mobil, choic, faster, bette...      tech
4        917  [enron, boss, 168m, payout, eighteen, former, ...  business


In [9]:
# Features are the token lists, targets are the category labels.
X, y = df['Text'], df['Category']

In [10]:
from sklearn.model_selection import train_test_split
# Shuffled hold-out split with 30% of the rows for testing and a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=89,
)

**Implementing the TF-ICF weighting scheme.**

In [11]:
# Term counts per category:
#   TF[term][category] = number of times `term` occurs in training documents
#   labelled `category` (repeats inside one document are all counted).
TF = {}
n = len(X_train)

for doc_pos in range(n):
    label = y_train.iloc[doc_pos]

    for term in X_train.iloc[doc_pos]:
        counts_for_term = TF.setdefault(term, {})
        counts_for_term[label] = counts_for_term.get(label, 0) + 1

In [12]:
import math

# TF-ICF weight of every term for every category:
#   weight(term, c) = TF[term][c] * log10(N / number of categories that contain term)
# where N is the number of distinct categories in the training labels.
# The five per-category branches that used to be copy-pasted are now a single loop.
TF_ICF_CATEGORIES = ["business", "entertainment", "politics", "sport", "tech"]
N = y_train.nunique()

TF_ICF = []
for term, counts in TF.items():
    # Inverse category frequency; it is 0 when the term occurs in all N categories.
    ICF_base = math.log10(N / len(counts))

    row = [term]
    for cat_name in TF_ICF_CATEGORIES:
        # A category in which the term never occurs gets an exact 0.
        row.append(counts[cat_name] * ICF_base if cat_name in counts else 0)
    TF_ICF.append(row)

# One row per term: the term itself followed by its weight in each category.
TF_ICF_df = pd.DataFrame(TF_ICF, columns=["term"] + TF_ICF_CATEGORIES)

In [13]:
# First ten rows of the TF-ICF table: one row per term, one weight column per category.
TF_ICF_df.head(10)

Unnamed: 0,term,business,entertainment,politics,sport,tech
0,consum,5.523871,0.19382,0.29073,0.0,12.792122
1,drive,0.0,0.0,0.0,0.0,0.0
2,french,0.0,0.0,0.0,0.0,0.0
3,economi,37.714287,0.0,9.983194,0.0,0.221849
4,franc,0.0,0.0,0.0,0.0,0.0
5,econom,33.72101,0.0,7.986555,0.0,0.443697
6,growth,17.540712,0.19382,0.67837,0.0,2.81039
7,acceler,1.77479,0.0,0.443697,0.0,0.221849
8,last,0.0,0.0,0.0,0.0,0.0
9,three,0.0,0.0,0.0,0.0,0.0


### Training Naive Bayes

##### Probability of each category, based on the share of training documents that belong to that category.

In [14]:
# Class priors: fraction of training documents carrying each category label.
n_1 = len(y_train)

# First pass: count documents per label, in order of first appearance.
docs_per_label = {}
for doc_pos in range(n_1):
    label = y_train.iloc[doc_pos]
    docs_per_label[label] = docs_per_label.get(label, 0) + 1

# Second pass: turn the counts into relative frequencies.
prob_cat = {label: count / n_1 for label, count in docs_per_label.items()}

print(prob_cat)
    

{'business': 0.22818791946308725, 'sport': 0.23106423777564716, 'entertainment': 0.18216682646212848, 'politics': 0.1840843720038351, 'tech': 0.174496644295302}


##### Probability of Each feature given Each category based on the TF-ICF values of that feature in documents belonging to that category.

Assumption: we estimate the probability of a feature (term) $t$ given a category $c$ as the term's share of that category's total TF-ICF weight:

$$P(t \mid c) = \frac{\text{TF-ICF}(t, c)}{\sum_{t'} \text{TF-ICF}(t', c)}$$

where the sum in the denominator runs over all features $t'$ in the vocabulary.

In [15]:
# Normalising constant per category: the sum of that category's TF-ICF column over all terms.
sum_cat_tf_icf = {
    label: TF_ICF_df[label].sum()
    for label in ('business', 'entertainment', 'politics', 'sport', 'tech')
}

In [16]:
# Work on a copy so TF_ICF_df keeps the raw weights, then divide every category
# column by that category's total so each value is a share of its column's sum.
pb_TF_ICF = TF_ICF_df.copy()

for label in ('business', 'entertainment', 'politics', 'sport', 'tech'):
    pb_TF_ICF[label] = pb_TF_ICF[label] / sum_cat_tf_icf[label]

In [17]:
# Raw TF-ICF weights again (pb_TF_ICF was built from a copy, so this frame is unchanged).
TF_ICF_df.head(10)

Unnamed: 0,term,business,entertainment,politics,sport,tech
0,consum,5.523871,0.19382,0.29073,0.0,12.792122
1,drive,0.0,0.0,0.0,0.0,0.0
2,french,0.0,0.0,0.0,0.0,0.0
3,economi,37.714287,0.0,9.983194,0.0,0.221849
4,franc,0.0,0.0,0.0,0.0,0.0
5,econom,33.72101,0.0,7.986555,0.0,0.443697
6,growth,17.540712,0.19382,0.67837,0.0,2.81039
7,acceler,1.77479,0.0,0.443697,0.0,0.221849
8,last,0.0,0.0,0.0,0.0,0.0
9,three,0.0,0.0,0.0,0.0,0.0


In [18]:
# First ten rows of the normalised table: each weight divided by its category's column total.
pb_TF_ICF.head(10)

Unnamed: 0,term,business,entertainment,politics,sport,tech
0,consum,0.000927,3.2e-05,6.3e-05,0.0,0.001959
1,drive,0.0,0.0,0.0,0.0,0.0
2,french,0.0,0.0,0.0,0.0,0.0
3,economi,0.006331,0.0,0.002158,0.0,3.4e-05
4,franc,0.0,0.0,0.0,0.0,0.0
5,econom,0.00566,0.0,0.001726,0.0,6.8e-05
6,growth,0.002944,3.2e-05,0.000147,0.0,0.00043
7,acceler,0.000298,0.0,9.6e-05,0.0,3.4e-05
8,last,0.0,0.0,0.0,0.0,0.0
9,three,0.0,0.0,0.0,0.0,0.0


In [19]:
# Category labels, listed in the same order as the category columns of TF_ICF_df.
category=['business','entertainment','politics','sport','tech']

In [20]:
# Lookup table for prediction: pb_term[term][category] holds the normalised
# TF-ICF share of `term` within `category`, copied row by row from pb_TF_ICF.
pb_term = {}

for row_pos in range(len(pb_TF_ICF)):
    row = pb_TF_ICF.iloc[row_pos]
    pb_term[row['term']] = {label: row[label] for label in category}

print(pb_term)

{'consum': {'business': 0.0009272262213464036, 'entertainment': 3.173916860584688e-05, 'politics': 6.284581816032492e-05, 'sport': 0.0, 'tech': 0.0019594835747407317}, 'drive': {'business': 0.0, 'entertainment': 0.0, 'politics': 0.0, 'sport': 0.0, 'tech': 0.0}, 'french': {'business': 0.0, 'entertainment': 0.0, 'politics': 0.0, 'sport': 0.0, 'tech': 0.0}, 'economi': {'business': 0.006330647088908161, 'entertainment': 0.0, 'politics': 0.002158022542468226, 'sport': 0.0, 'tech': 3.398255508858707e-05}, 'franc': {'business': 0.0, 'entertainment': 0.0, 'politics': 0.0, 'sport': 0.0, 'tech': 0.0}, 'econom': {'business': 0.005660343279494356, 'entertainment': 0.0, 'politics': 0.001726418033974581, 'sport': 0.0, 'tech': 6.796511017717414e-05}, 'growth': {'business': 0.0029443499309420884, 'entertainment': 3.173916860584688e-05, 'politics': 0.00014664024237409146, 'sport': 0.0, 'tech': 0.00043049260354152447}, 'acceler': {'business': 0.0002979128041839135, 'entertainment': 0.0, 'politics': 9.59

### Model

In [48]:
def predict(query, categories=None, priors=None, term_probs=None):
    """Return the best-scoring category for one tokenized document.

    The score of a category ``c`` is

        log(1 + priors[c]) + sum(log(1 + term_probs[t][c]) for t in query)

    where tokens missing from ``term_probs`` are skipped. The first category with
    the highest score wins ties.

    Parameters
    ----------
    query : iterable of str
        Tokens of the document. It is consumed once, so one-shot iterables work.
    categories : sequence of str, optional
        Candidate labels, in tie-breaking order. Defaults to the notebook-level
        ``category`` list.
    priors : dict, optional
        Maps label -> prior probability. Defaults to the notebook-level ``prob_cat``.
    term_probs : dict, optional
        Maps term -> {label: probability}. Defaults to the notebook-level ``pb_term``.

    Returns
    -------
    str or None
        The winning label, or ``None`` when ``categories`` is empty.

    Notes
    -----
    NOTE(review): ``log(1 + p)`` is not the ``log(p)`` term of a textbook Naive
    Bayes posterior; it is kept so the scoring matches the rest of this notebook.
    Confirm whether that is intended before changing it.
    """
    if categories is None:
        categories = category
    if priors is None:
        priors = prob_cat
    if term_probs is None:
        term_probs = pb_term

    # Resolve each token once instead of once per category; unseen tokens carry no evidence.
    known_rows = [term_probs[term] for term in query if term in term_probs]

    best_label = None
    best_score = 0
    for cat in categories:
        # log1p(x) == log(1 + x) but stays accurate for the very small shares stored here.
        # A share of exactly 0 contributes log1p(0) == 0, so no special case is needed.
        score = np.log1p(priors[cat])
        for row in known_rows:
            score += np.log1p(row[cat])

        if best_label is None or score > best_score:
            best_label = cat
            best_score = score

    return best_label

In [49]:
def model(x, y_true):
    """Predict a category for every document in ``x``.

    Parameters
    ----------
    x : object supporting ``len()`` and positional ``.iloc[i]`` access
        Each element is passed unchanged to ``predict``.
    y_true : any
        Accepted for call compatibility; this function does not read it.

    Returns
    -------
    list
        One ``predict`` result per element of ``x``, in positional order.
    """
    return [predict(x.iloc[pos]) for pos in range(len(x))]



### Testing the Model

In [50]:
# Predict a category for every held-out document, then print the full list of predictions.
y_pred=model(X_test, y_test)
print(y_pred)

['entertainment', 'sport', 'sport', 'politics', 'sport', 'sport', 'entertainment', 'tech', 'sport', 'business', 'business', 'sport', 'tech', 'sport', 'entertainment', 'business', 'politics', 'sport', 'politics', 'sport', 'business', 'politics', 'tech', 'tech', 'sport', 'politics', 'sport', 'entertainment', 'business', 'tech', 'business', 'sport', 'tech', 'sport', 'business', 'tech', 'sport', 'entertainment', 'tech', 'sport', 'entertainment', 'business', 'business', 'sport', 'sport', 'tech', 'sport', 'sport', 'politics', 'tech', 'sport', 'entertainment', 'sport', 'business', 'politics', 'sport', 'sport', 'entertainment', 'entertainment', 'business', 'politics', 'business', 'entertainment', 'tech', 'tech', 'business', 'sport', 'tech', 'sport', 'entertainment', 'business', 'politics', 'sport', 'business', 'sport', 'tech', 'sport', 'sport', 'business', 'sport', 'business', 'politics', 'business', 'tech', 'tech', 'sport', 'politics', 'entertainment', 'entertainment', 'entertainment', 'busin

#### Performance Metrics

In [51]:
from sklearn.metrics import accuracy_score, precision_score, recall_score,f1_score 

# Precision, recall and F1 share the same label list and macro averaging,
# so those keyword arguments are written once and reused.
macro_opts = {"labels": category, "average": 'macro'}

print("Accuracy Score:", accuracy_score(y_test, y_pred))
print("Precision Score: ", precision_score(y_test, y_pred, **macro_opts))
print("Recall Score:", recall_score(y_test, y_pred, **macro_opts))
print("F1 Score:", f1_score(y_test, y_pred, **macro_opts))

Accuracy Score: 0.8098434004474273
Precision Score:  0.8686597102167877
Recall Score: 0.7933298327421372
F1 Score: 0.8053600372586922
