In [None]:
import pandas as pd
import nltk
nltk.download('wordnet')
nltk.download('stopwords')
import numpy as np

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
df = pd.read_csv("bbc-text.csv")
df.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [None]:
df['text'] = df['text'].apply(lambda s: s.lower())
df.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [None]:
import string
puncs = string.punctuation
def remove_punc(s):
  ret = "".join([c for c in s if c not in puncs])
  return ret
df['text'] = df['text'].apply(remove_punc)
df.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [None]:
from nltk import word_tokenize
df['text'] = df['text'].apply(word_tokenize)
df.head()

Unnamed: 0,category,text
0,tech,"[tv, future, in, the, hands, of, viewers, with..."
1,business,"[worldcom, boss, left, books, alone, former, w..."
2,sport,"[tigers, wary, of, farrell, gamble, leicester,..."
3,sport,"[yeading, face, newcastle, in, fa, cup, premie..."
4,entertainment,"[ocean, s, twelve, raids, box, office, ocean, ..."


In [None]:
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
lemmatizer = WordNetLemmatizer() 

def lematize_words(word_list):
  ret = []
  for i in range(len(word_list)):
    if word_list[i] not in stop_words:
      ret.append(lemmatizer.lemmatize(word_list[i]))
  return ret

df['text'] = df['text'].apply(lematize_words)
df.head()

Unnamed: 0,category,text
0,tech,"[tv, future, hand, viewer, home, theatre, syst..."
1,business,"[worldcom, bos, left, book, alone, former, wor..."
2,sport,"[tiger, wary, farrell, gamble, leicester, say,..."
3,sport,"[yeading, face, newcastle, fa, cup, premiershi..."
4,entertainment,"[ocean, twelve, raid, box, office, ocean, twel..."


In [None]:
all_words = set()
def store_words(word_list):
  for word in word_list:
    all_words.add(word)
  return word_list

df['text'] = df['text'].apply(store_words)

In [None]:
df.head()

Unnamed: 0,category,text
0,tech,"[tv, future, hand, viewer, home, theatre, syst..."
1,business,"[worldcom, bos, left, book, alone, former, wor..."
2,sport,"[tiger, wary, farrell, gamble, leicester, say,..."
3,sport,"[yeading, face, newcastle, fa, cup, premiershi..."
4,entertainment,"[ocean, twelve, raid, box, office, ocean, twel..."


In [None]:
rows, cols = df.shape
train_df = df.iloc[:int(rows*0.9), :].copy()
test_df = df.iloc[int(rows*0.9):, :].copy()

In [None]:
idx_to_word = list(all_words)
word_to_idx = dict()
for i in range(len(idx_to_word)):
  word_to_idx[idx_to_word[i]] = i

In [None]:
prior = list()
valcnt = train_df.category.value_counts()
for i in range(len(valcnt.index)):
  prior.append([valcnt.index[i], np.log(valcnt.values[i])])
  
prior = np.array(prior)
prior

array([['sport', '6.148468295917647'],
       ['business', '6.124683390894205'],
       ['politics', '5.929589143389895'],
       ['entertainment', '5.8664680569332965'],
       ['tech', '5.8522024797744745']], dtype='<U18')

In [None]:
target_to_idx = dict()
for i in range(prior.shape[0]):
  target_to_idx[prior[i,0]] = i

In [None]:
prob_x_given_y = np.zeros((prior.shape[0], len(all_words)))
def calc_x_given_y(row):
  target_idx = target_to_idx[row[0]]
  for word in row[1]:
    word_idx = word_to_idx[word]
    # print(target_idx, word_idx)
    prob_x_given_y[target_idx, word_idx] += 1
  return row    

_ = train_df.apply(calc_x_given_y, axis=1)

In [76]:
prob_x_given_y += 0.01
prob_x_given_y = (np.log(prob_x_given_y)).astype(np.float64) - (prior[:,1].reshape(prior.shape[0], 1)).astype(np.float64)

In [77]:
prob_x_given_y[:3,:3]

array([[-9.14420057, -5.4306285 , -9.14420057],
       [-5.4068436 , -9.12041566, -9.12041566],
       [-8.92532142, -8.92532142, -8.92532142]])

In [83]:
def predict(row):
  prob = prior[:, 1].reshape(-1, 1).copy().astype(np.float64)
  for target_idx in range(5):
    for word in row[1]:
      word_idx = word_to_idx[word]
      prob[target_idx, 0] += prob_x_given_y[target_idx, word_idx].astype(np.float64)
  return np.argmax(prob)

In [84]:
y_pred_train = np.array(train_df.apply(predict, axis=1))
y_train = np.array(train_df['category'].apply(lambda x: target_to_idx[x]))
accuracy = np.sum(y_pred_train == y_train)/y_pred_train.shape[0]
accuracy

0.9925074925074925

In [85]:
y_pred_train[:10]

array([4, 1, 0, 0, 3, 2, 2, 0, 0, 3])

In [86]:
y_train[:10]

array([4, 1, 0, 0, 3, 2, 2, 0, 0, 3])

In [87]:
y_pred_test = np.array(test_df.apply(predict, axis=1))
y_test = np.array(test_df['category'].apply(lambda x: target_to_idx[x]))
accuracy = np.sum(y_pred_test == y_test)/y_test.shape[0]
accuracy

0.9237668161434978