In [71]:
# ! pip install nltk
# ! pip install Wikipedia-API

In [72]:
import wikipediaapi
import re
import nltk
from nltk.stem import PorterStemmer
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yasme\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## loading Data

In [73]:
wiki_wiki = wikipediaapi.Wikipedia('en')

## number of leading summary characters kept per article, by class
SPORT_SUMMARY_CHARS = 650
LANGUAGE_SUMMARY_CHARS = 750

## class 1 (sports)

## for training: look up each page and keep only the head of its summary
football = wiki_wiki.page('Football').summary[0:SPORT_SUMMARY_CHARS]
basketball = wiki_wiki.page('Basketball').summary[0:SPORT_SUMMARY_CHARS]
baseball = wiki_wiki.page('Baseball').summary[0:SPORT_SUMMARY_CHARS]
tennis = wiki_wiki.page('Tennis').summary[0:SPORT_SUMMARY_CHARS]
volly = wiki_wiki.page('Volleyball').summary[0:SPORT_SUMMARY_CHARS]

## for testing
swimming = wiki_wiki.page('Swimming (sport)').summary[0:SPORT_SUMMARY_CHARS]

## class 2 (programming language)

## for training
python = wiki_wiki.page('Python (programming language)').summary[0:LANGUAGE_SUMMARY_CHARS]
c_pluse_pluse = wiki_wiki.page('C++').summary[0:LANGUAGE_SUMMARY_CHARS]
java = wiki_wiki.page('Java (programming language)').summary[0:LANGUAGE_SUMMARY_CHARS]
c_sharp = wiki_wiki.page('C Sharp (programming language)').summary[0:LANGUAGE_SUMMARY_CHARS]
c = wiki_wiki.page('C (programming language)').summary[0:LANGUAGE_SUMMARY_CHARS]

## for testing
javascript = wiki_wiki.page('JavaScript').summary[0:LANGUAGE_SUMMARY_CHARS]


In [74]:
## training documents, one list per class
class1 = [football, basketball, baseball, tennis, volly]
class2 = [python, c_pluse_pluse, c_sharp, java, c]

## held-out documents, one per class
test1 = [swimming]
test2 = [javascript]

## labels line up with `data`: all class1 documents first, then all class2 documents
y_train = ["sports"] * len(class1) + ["programming language"] * len(class2)
data = class1 + class2

## pre-processing Data

In [75]:
## 1. remove punctuation
## 2. split the text into words
## 3. does not remove duplicate words
## 4. convert to lower case
def prepare_Data(text: str,punctuations=r'''!()-[]{};:'"\,<>./?@#$%^&*_“~''') -> list:
    """Clean one document and split it into lower-case words.

    Steps, in order:
      1. delete every character that appears in ``punctuations`` (the
         characters are removed, not replaced by a space, so ``"a-b"``
         becomes ``"ab"``);
      2. delete every word that contains at least one digit;
      3. lower-case the text and split it on whitespace.

    Duplicate words are kept.

    Args:
        text: the raw document as a single string.
        punctuations: container of characters to strip; any object that
            supports ``in`` for single characters works.

    Returns:
        The list of remaining words, in document order. Empty input yields
        an empty list.
    """
    # Single pass over the text instead of one full str.replace per
    # punctuation character encountered.
    stripped = "".join(ch for ch in text if ch not in punctuations)

    # This pattern consumes every digit together with the word around it,
    # so no separate "remove remaining digits" pass is needed afterwards.
    stripped = re.sub(r'\w*\d\w*', '', stripped)

    # str.split() with no argument collapses whitespace runs and never
    # produces empty strings.
    return stripped.lower().split()


## return the sorted unique words of a document
def getTokens(text: list) -> list:
    """Return the distinct words of ``text`` in ascending sorted order.

    Args:
        text: list of words (duplicates allowed).

    Returns:
        A new sorted list in which every word of ``text`` appears once.
    """
    # A set is already duplicate-free, so no second de-duplication pass
    # (and its quadratic list-membership test) is required.
    return sorted(set(text))

## return stemmed words
def stemming(word):
  """Return the Porter stem of ``word`` (a fresh PorterStemmer is built per call)."""
  return PorterStemmer().stem(word)

## return the final list of pre-processed words
def preprocessing(Data,string):
  """Turn a list of raw documents into one flat list of stemmed tokens.

  Each document is cleaned with ``prepare_Data``, reduced to its sorted
  distinct words with ``getTokens``, and every word is then stemmed with
  ``stemming``. The per-document results are concatenated in document order.

  Args:
      Data: list of raw document strings.
      string: ``"all"`` to drop repeated stems from the concatenated list
          (first occurrence wins); any other value keeps repeats.

  Returns:
      The flat list of stems.
  """
  stems = [
      stemming(word)
      for document in Data
      for word in getTokens(prepare_Data(document))
  ]

  if string == "all":
    # dict keys keep insertion order, so this retains first occurrences only
    return list(dict.fromkeys(stems))
  return stems

In [76]:
## "all" de-duplicates the stems across every document; any other mode keeps repeats across documents
tokens_list = preprocessing(data,"all") ## vocabulary: distinct stems over class1 and class2
class1_tokens = preprocessing(class1,"class") ## stems of the class1 documents, repeats kept
class2_tokens = preprocessing(class2,"class") ## stems of the class2 documents, repeats kept

swim = preprocessing(test1,"class") ## stems of test document 1
JS = preprocessing(test2,"class") ## stems of test document 2

## Naive Bayes Model

In [77]:
## calculate p(ci)
def class_prob(doc_label, y_train):
  """Return the fraction of labels in ``y_train`` that equal ``doc_label``.

  Raises ZeroDivisionError when ``y_train`` is empty.
  """
  matches = sum(1 for label in y_train if label == doc_label)
  return matches / len(y_train)

## calculate the p(word|classi)
def conditional_prob(tokens_list,tokens_class_list,ndigits=3):
  """Estimate p(word | class) for every vocabulary word with add-one smoothing.

  For each distinct word ``w`` in ``tokens_list``::

      p(w | class) = (count(w in tokens_class_list) + 1) / (N + V)

  where ``N`` is the number of class tokens that belong to the vocabulary
  and ``V`` is the number of distinct vocabulary words.

  Args:
      tokens_list: vocabulary words. Repeated entries are counted once, so
          the unrounded probabilities always sum to 1.
      tokens_class_list: tokens observed in the class (repeats allowed).
          Tokens that are not in the vocabulary are ignored.
      ndigits: number of decimals each probability is rounded to. Pass
          ``None`` to keep full precision; rounding can collapse very small
          probabilities to 0.0 when the vocabulary is large.

  Returns:
      dict mapping each vocabulary word to its probability, in order of
      first appearance in ``tokens_list``.
  """
  ## 1. count class occurrences of every vocabulary word in one pass
  class_count = dict.fromkeys(tokens_list, 0)
  for token in tokens_class_list:
    if token in class_count:
      class_count[token] += 1

  ## 2. smoothing denominator: total counted tokens + vocabulary size
  denominator = sum(class_count.values()) + len(class_count)

  ## 3. calculate the p(word|classi)
  p_w_c = {}
  for word, count in class_count.items():
    pwc = (count + 1) / denominator
    if ndigits is not None:
      pwc = round(pwc, ndigits)
    p_w_c[word] = pwc

  return p_w_c


def predict(test_doc,pwc1,pwc2,pc1,pc2,words_c1,word_c2):
  """Print the more likely class of ``test_doc`` under a two-class Naive Bayes model.

  The score of a class is ``log p(class) + sum(log p(w | class))`` over the
  tokens of ``test_doc`` that are listed in that class's word list. Working
  in log space is equivalent to comparing the products of the
  probabilities, but it cannot underflow to 0.0 on long documents (which
  would make a ratio of the two products divide by zero).

  Args:
      test_doc: list of tokens of the document to classify.
      pwc1: dict word -> p(word | class 1).
      pwc2: dict word -> p(word | class 2).
      pc1: prior p(class 1).
      pc2: prior p(class 2).
      words_c1: words for which ``pwc1`` is consulted; other tokens are skipped.
      word_c2: words for which ``pwc2`` is consulted; other tokens are skipped.

  Returns:
      None. Prints ``"sport"`` when class 1 scores strictly higher,
      otherwise ``"programming lanuage"``.
  """
  import math

  def _log(p):
    # A zero probability makes the whole product zero: represent it as -inf.
    return math.log(p) if p > 0 else -math.inf

  def _log_score(prior, cond_probs, known_words):
    # log p(class) + sum of log p(w|class) over the known tokens
    known = set(known_words)
    score = _log(prior)
    for token in test_doc:
      if token in known:
        score += _log(cond_probs[token])
    return score

  ## log p(class1|document) up to a shared constant
  score_c1 = _log_score(pc1, pwc1, words_c1)
  ## log p(class2|document) up to a shared constant
  score_c2 = _log_score(pc2, pwc2, word_c2)

  ## score_c1 > score_c2 is the log-space form of p(c1|doc) / p(c2|doc) > 1
  if score_c1 > score_c2:
    print("sport")
  else:
    print("programming lanuage")

In [78]:
## per-class word likelihoods over the shared vocabulary
pwc1 = conditional_prob(tokens_list, class1_tokens)  ## p(w|c1)
pwc2 = conditional_prob(tokens_list, class2_tokens)  ## p(w|c2)

## class priors
pc1 = class_prob("sports", y_train)                # p(c1)
pc2 = class_prob("programming language", y_train)  # p(c2)

words = list(pwc1)   ## words that have a p(w|c1) entry
words2 = list(pwc2)  ## words that have a p(w|c2) entry

## merge both tables into one dict keyed by (class label, word)
p_w_c = {}
for keys, values in pwc1.items():
    p_w_c[("sport", keys)] = values

for keys, values in pwc2.items():
    p_w_c[("programming lanuage", keys)] = values

## save the probabilities

In [79]:
import os

## save weights to file, one "((label, word), probability)" tuple per line
## - `with` closes the handle even if a write raises
## - an explicit utf-8 encoding keeps non-ASCII tokens writable on every platform
with open("probabilities.txt", "w", encoding="utf-8") as file:
    for item in p_w_c.items():
        file.write(f"{item}\n")

## predict

In [80]:
## classify the two held-out documents; predict prints the chosen label
predict(swim,pwc1,pwc2,pc1,pc2,words,words2)
predict(JS,pwc1,pwc2,pc1,pc2,words,words2)

sport
programming lanuage
