In [2]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk
import math
from urllib.parse import quote
import pandas as pd

In [3]:
def getPage(url):
#     url = "https://en.wikipedia.org/wiki/New_Delhi"
    page = urlopen(url)
    page = page.read().decode("utf-8")
    return page

In [4]:
def preProcess(page):
    soup = BeautifulSoup(page, 'html.parser')
    clean_soup = soup.get_text()
    
    whitespaces = re.compile("\n[\n]+\n")
    refs = re.compile("\[.*?\]")
    page = re.sub(whitespaces, "\n", clean_soup)
    page = re.sub(refs, " ", page)
    
    paras = ["".join(list(s.strings)).strip()+"\n" for s in soup.find_all('p')]
    paras = [s for s in paras if s != "\n"] # paras
    cleaned_text = [s for s in paras if s!="\n"]
    cleaned_text = "\n".join(cleaned_text)
    return cleaned_text

In [5]:
sports = []
sports_topics = ['Olympic_Games','Paralympic_Games','Sport_in_India','Euro']

In [6]:
edu = []
edu_topics = ['The_Language_of_Goldfish','International_Conference_on_Software_Engineering','Education_of_Generation_Z','Prison_education']


In [9]:
docs = []
data = {
    "docs":[],
    "category":[] 
}
for topic in sports_topics:
    cleaned_text = preProcess(getPage("https://en.wikipedia.org/wiki/"+topic))
    docs.append(cleaned_text)
    data["docs"].append(cleaned_text)
    data["category"].append("sports")
for topic in edu_topics:
    cleaned_text = preProcess(getPage("https://en.wikipedia.org/wiki/"+topic))
    docs.append(cleaned_text)
    data["docs"].append(cleaned_text)
    data["category"].append("education")

In [13]:
import pandas as pd

df = pd.DataFrame(data)
df

Unnamed: 0,docs,category
0,The modern Olympic Games or Olympics [a][1] ar...,sports
1,"The Paralympic Games or Paralympics, also know...",sports
2,India has a history of sports dating back to t...,sports
3,The euro (symbol: €; currency code: EUR) is th...,sports
4,The Language of Goldfish is a young adult nove...,education
5,The International Conference on Software Engin...,education
6,"Generation Z (or Gen Z for short), colloquiall...",education
7,Prison education is any educational activity t...,education


In [14]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

unigram_vectorizer = CountVectorizer(strip_accents="ascii")
bigram_vectorizer = CountVectorizer(ngram_range=(2,2), strip_accents="ascii")
tfidf_vectorizer = TfidfVectorizer(strip_accents="ascii")

In [15]:
X = df["docs"]
y = df["category"]

In [16]:
unigram_matrix = unigram_vectorizer.fit_transform(X).toarray()
bigram_matrix = bigram_vectorizer.fit_transform(X).toarray()
tfidf_matrix = tfidf_vectorizer.fit_transform(X).toarray()

In [17]:
unigram_matrix

array([[ 0,  6,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  1,  0],
       [ 0,  3,  1, ...,  0,  0,  1],
       ...,
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0, 10,  0, ...,  1,  0,  0],
       [ 0,  6,  0, ...,  0,  0,  0]])

In [18]:
unique_words = [None for _ in range(len(unigram_vectorizer.vocabulary_))]

for word, idx in unigram_vectorizer.vocabulary_.items(): 
    unique_words[idx] = word

uni_df = pd.DataFrame(unigram_matrix, columns=unique_words)
uni_df.head()

Unnamed: 0,00,000,000m,010,03,06,079,10,100,100150,...,zen,zero,zeus,zibby,zimbabwe,zonal,zone,zoomers,zorn,zurich
0,0,6,0,0,0,0,0,2,3,0,...,0,1,5,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,4,1,0,...,0,0,0,0,0,0,1,0,1,0
2,0,3,1,1,1,1,0,9,5,1,...,1,0,0,0,0,1,0,0,0,1
3,2,1,0,0,0,0,0,3,4,0,...,0,0,0,0,1,0,2,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [19]:
unique_bigrams = [None for _ in range(len(bigram_vectorizer.vocabulary_))]

for word, idx in bigram_vectorizer.vocabulary_.items(): 
    unique_bigrams[idx] = word

bi_df = pd.DataFrame(bigram_matrix, columns=unique_bigrams).astype("float")

for idx, row in bi_df.iterrows():
    for bigram in unique_bigrams:
        prev = bigram.split(" ")[0]
        prev_count = uni_df[prev][idx] #takingcount of first word
        if prev_count != 0:
            row[bigram] /= prev_count #conditional probability
bi_df.head()

Unnamed: 0,00 or,00 see,000 athletes,000 for,000 foreign,000 french,000 in,000 inmates,000 kilometres,000 meerut,...,zeus whose,zibby oneal,zimbabwe abandoned,zonal national,zone factor,zone germany,zone would,zoomers is,zorn of,zurich switzerland
0,0.0,0.0,0.166667,0.333333,0.0,0.0,0.166667,0.0,0.0,0.0,...,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.333333,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.5,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.5,0.5,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
labels = [None for _ in range(len(tfidf_vectorizer.vocabulary_))]

for word, idx in tfidf_vectorizer.vocabulary_.items(): 
    labels[idx] = word
    
tfidf_df = pd.DataFrame(tfidf_matrix, columns=labels)
tfidf_df.head()

Unnamed: 0,00,000,000m,010,03,06,079,10,100,100150,...,zen,zero,zeus,zibby,zimbabwe,zonal,zone,zoomers,zorn,zurich
0,0.0,0.005081,0.0,0.0,0.0,0.0,0.0,0.001508,0.002262,0.0,...,0.0,0.001509,0.007545,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006646,0.001661,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.002786,0.0,0.003325,0.0
2,0.0,0.002184,0.001297,0.001297,0.001297,0.001297,0.0,0.005833,0.003241,0.001297,...,0.001297,0.0,0.0,0.0,0.0,0.001297,0.0,0.0,0.0,0.001297
3,0.00611,0.001715,0.0,0.0,0.0,0.0,0.0,0.00458,0.006107,0.0,...,0.0,0.0,0.0,0.0,0.003055,0.0,0.005121,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.056048,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
from sklearn.naive_bayes import MultinomialNB

y = df["category"].apply(lambda x: 0 if x =="Sports" else 1)
naive_bayes = MultinomialNB()

In [42]:
new_doc = pd.Series(["The primary education in India is divided into two parts, namely Lower Primary and Upper Primary"])
new_doc

0    The primary education in India is divided into...
dtype: object

In [43]:
out = unigram_vectorizer.transform(new_doc).toarray()
uni_test = pd.DataFrame(out, columns=unigram_vectorizer.get_feature_names())
uni_test

Unnamed: 0,00,000,000m,010,03,06,079,10,100,100150,...,zen,zero,zeus,zibby,zimbabwe,zonal,zone,zoomers,zorn,zurich
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [44]:
# unigram prediction
naive_bayes.fit(uni_df, y)
pred = naive_bayes.predict(uni_test)[0];
if pred == 0: print("Sports")
else: print("Education")

Education


In [50]:
out = bigram_vectorizer.transform(new_doc).toarray()
bi_test = pd.DataFrame(out, columns=unique_bigrams)
for bigram in unique_bigrams:
    prev = bigram.split(" ")[0]
    prev_count = uni_test[prev][0]
    if prev_count != 0:
        row[bigram] /= prev_count
bi_test

Unnamed: 0,00 or,00 see,000 athletes,000 for,000 foreign,000 french,000 in,000 inmates,000 kilometres,000 meerut,...,zeus whose,zibby oneal,zimbabwe abandoned,zonal national,zone factor,zone germany,zone would,zoomers is,zorn of,zurich switzerland
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [46]:
#bigram prediction
naive_bayes.fit(bi_df, y)
pred = naive_bayes.predict(bi_test)[0];
if pred == 0: print("Sports")
else: print("Education")

Education


In [51]:
out = tfidf_vectorizer.transform(new_doc).toarray()
tfid_test = pd.DataFrame(out, columns=tfidf_vectorizer.get_feature_names())
tfid_test

Unnamed: 0,00,000,000m,010,03,06,079,10,100,100150,...,zen,zero,zeus,zibby,zimbabwe,zonal,zone,zoomers,zorn,zurich
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [48]:
#tfidf prediction
naive_bayes.fit(tfidf_df, y)
pred = naive_bayes.predict(tfid_test)[0];
if pred == 0: print("Sports")
else: print("Education")

Education
