### Problem :- Scrape n documents of two categories from Wikipdia (say sports and education)

- Preprocess and clean all the documents
- Prepare 
    - Unigram count Matrix
    - Bigram Probability Matrix
    - TF-IDF Matrix
- Apply appropriate Naive Bayes classification 

In [2]:
from bs4 import BeautifulSoup
import requests
import os.path as path
import math

In [3]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize 
from nltk.stem import PorterStemmer
import pandas as pd

### Fetch Scrape N documents of two categories from wikipedia


In [4]:
URLS={
    "football":["https://en.wikipedia.org/wiki/Football","https://en.wikipedia.org/wiki/American_football","https://en.wikipedia.org/wiki/Association_football","https://en.wikipedia.org/wiki/Australian_rules_football","https://en.wikipedia.org/wiki/Gaelic_football"],
    "algorithm":["https://en.wikipedia.org/wiki/Algorithm","https://en.wikipedia.org/wiki/Analysis_of_algorithms","https://en.wikipedia.org/wiki/Computational_complexity","https://en.wikipedia.org/wiki/Worst-case_complexity","https://en.wikipedia.org/wiki/Average-case_complexity"],
}
BASE_URL="Documents"
stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 
'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 
'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 
'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves',
 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are',
 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing',
 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 
'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 
'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 
'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 
'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 
's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y',
 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 
'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn',
 "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 
'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]


### Helper Functions

In [5]:
def getHTMLFromURL(url):
    page = requests.get(url).content
    soup = BeautifulSoup(page, 'html.parser')
    return soup;

def getBodyTextFromHTML(soup):
    paragraphs = soup.find_all('p')
    text = '\n'.join([para.get_text().strip() for para in paragraphs])
    return text;

def writeInTextFile(topic,filename,text):
    file = open(path.join(BASE_URL,topic,filename),"w")
    file.write(text)
    file.close()

def scrapeFunction():
    for topic in URLS:
        fileno=1;
        for url in URLS[topic]:
            soup = getHTMLFromURL(url)
            text = getBodyTextFromHTML(soup)
            filename = f"{fileno}.txt"
            writeInTextFile(topic,filename,text)
            print("Done for ",topic,filename)
            fileno=fileno+1
        

### Getting text from the urls provided

In [6]:

scrapeFunction()

Done for  football 1.txt
Done for  football 2.txt
Done for  football 3.txt
Done for  football 4.txt
Done for  football 5.txt
Done for  algorithm 1.txt
Done for  algorithm 2.txt
Done for  algorithm 3.txt
Done for  algorithm 4.txt
Done for  algorithm 5.txt


### Preperations

#### Unigram Count Matrix

In [7]:
unique_word_set={}
unique_word_dict={}

def getUniqueWords(text):
    unique_words={}
    words = word_tokenize(text)
    for word in words:
        word=word.lower()
        if word in stop_words:
            continue
        unique_word_set[word]=1
        if word in unique_words:
            unique_words[word]=unique_words[word]+1
        else:
            unique_words[word]=1
    return unique_words;

def getUniqueWordsFromFiles():
    for topic in URLS:
        if topic not in unique_word_dict:
            unique_word_dict[topic]={}
        for i in range(1,6):
            filename = f"{i}.txt"
            file = open(path.join(BASE_URL,topic,filename),"r")
            text = file.read()
            file.close()
            unique_words=getUniqueWords(text);

            for word in unique_words:
                if word in unique_word_dict[topic]:
                    unique_word_dict[topic][word]=unique_word_dict[topic][word]+unique_words[word]
                else:
                    unique_word_dict[topic][word]=1
                    

    print("Total Unique Words from files are ",len(unique_word_set))



In [8]:
getUniqueWordsFromFiles()

Total Unique Words from files are  6668


In [9]:
# for word in unique_word_set:
#     print("word",end="\t\t")
#     print(word,end="\t\t");

# print()

# for topic in unique_word_dict:
#     print(topic,end="\t")
#     for word in unique_word_set:
#         if word not in unique_word_dict[topic]:
#             print(0,end="\t\t")
#         else:
#             print(unique_word_dict[topic][word],end="\t\t")
#     print()

df = pd.DataFrame(unique_word_dict)
print(df.transpose().fillna(0))

           football  family   team  sports  involve       ,  varying  degrees  \
football      475.0     2.0  139.0    28.0      2.0  1593.0      2.0      3.0   
algorithm       0.0     0.0    0.0     0.0      0.0   396.0      0.0      0.0   

           kicking   ball  ...  good-on-average  weaker  1993  feigenbaum  \
football      38.0  214.0  ...              0.0     0.0   0.0         0.0   
algorithm      0.0    0.0  ...              1.0     1.0   1.0         1.0   

           fortnow  non-adaptive  reductions  2003  bogdanov  trevisan  
football       0.0           0.0         0.0   0.0       0.0       0.0  
algorithm      1.0           1.0         1.0   1.0       1.0       1.0  

[2 rows x 6668 columns]


#### Bigram 

In [10]:
unique_bigram_set={}
unique_bigram_dict={}
unique_bigram_length={}

def getUniqueBigramsWords(text):
    unique_words={}
    words = word_tokenize(text)
    prev="<string>"
    for word in words:
        word=word.lower()
        if word in stop_words:
            continue
        bigram=prev+" "+word
        unique_bigram_set[bigram]=1
        if bigram in unique_words:
            unique_words[bigram]=unique_words[bigram]+1
        else:
            unique_words[bigram]=1
        prev=word
    return unique_words;


def getUniqueBigramsFromFiles():
    for topic in URLS:
        if topic not in unique_bigram_length:
                unique_bigram_length[topic]=0
        if topic not in unique_bigram_dict:
                unique_bigram_dict[topic]={}
        for i in range(1,6):
            filename = f"{i}.txt"
            file = open(path.join(BASE_URL,topic,filename),"r")
            text = file.read()
            file.close()
            unique_bigrams=getUniqueBigramsWords(text);
            
            for bigram in unique_bigrams:
                if bigram in unique_bigram_dict[topic]:
                    unique_bigram_dict[topic][bigram]=unique_bigram_dict[topic][bigram]+unique_bigrams[bigram]
                else:
                    unique_bigram_dict[topic][bigram]=1
                
                unique_bigram_length[topic]+=1;
                    

    print("Total Unique Words from files are ",len(unique_bigram_set))

In [11]:
getUniqueBigramsFromFiles();

Total Unique Words from files are  27228


In [12]:

# print("bigram",end="\t\t")
# for bigram in unique_bigram_set:
#     print(bigram,end="\t\t");

# print()

# for topic in unique_bigram_dict:
#     print(topic,end="\t")
#     for bigram in unique_bigram_set:
#         if bigram not in unique_bigram_dict[topic]:
#             print(0,end="\t\t")
#         else:
#             unique_bigram_dict[topic][bigram]=unique_bigram_dict[topic][bigram]/unique_bigram_length[topic]
#             print(unique_bigram_dict[topic][bigram],end="\t\t")
#     print()

df = pd.DataFrame(unique_bigram_dict)
df.transpose().fillna(0)

Unnamed: 0,<string> football,football family,family team,team sports,sports involve,"involve ,",", varying",varying degrees,"degrees ,",", kicking",...,association made,made average-case,complexity via,via reductions,] literature,literature average,complexity includes,includes following,following work,work :
football,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
algorithm,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### Bigram Probability

In [13]:
unique_bigram_prob_dict={}
for topic in unique_bigram_dict:
    unique_bigram_prob_dict[topic]={}
    for bigram in unique_bigram_set:
        unique_bigram_prob_dict[topic][bigram]=0;
        if bigram in unique_bigram_dict[topic]:
            unique_bigram_prob_dict[topic][bigram]=unique_bigram_dict[topic][bigram]/unique_bigram_length[topic]

In [14]:
df=pd.DataFrame(unique_bigram_prob_dict)
df.transpose().fillna(0)

Unnamed: 0,<string> football,football family,family team,team sports,sports involve,"involve ,",", varying",varying degrees,"degrees ,",", kicking",...,association made,made average-case,complexity via,via reductions,] literature,literature average,complexity includes,includes following,following work,work :
football,4.4e-05,4.4e-05,4.4e-05,4.4e-05,4.4e-05,4.4e-05,4.4e-05,4.4e-05,4.4e-05,0.000175,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
algorithm,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000109,0.000109,0.000109,0.000109,0.000109,0.000109,0.000109,0.000109,0.000109,0.000109


### TF IDF Term Frequency-Inverse Document Frequency

In [15]:
unique_word_dict_tfidf={}
unique_word_doc_count={}
def getUniqueWordsFromFiles():
    for topic in URLS:
        unique_word_dict_tfidf[topic]={}
        unique_word_doc_count[topic]={}
        for i in range(1,6):
            filename = f"{i}.txt"
            file = open(path.join(BASE_URL,topic,filename),"r")
            text = file.read()
            file.close()
            unique_words=getUniqueWords(text);
            
            unique_word_dict_tfidf[topic][i]={}
            unique_word_doc_count[topic][i]=0

            for word in unique_words:
                unique_word_dict_tfidf[topic][i][word]=unique_words[word]
                unique_word_doc_count[topic][i]+=unique_words[word]


In [16]:
getUniqueWordsFromFiles();

for topic in unique_word_dict_tfidf:
    for i in unique_word_dict_tfidf[topic]:
        for word in unique_word_dict_tfidf[topic][i]:
            total=len(unique_word_dict_tfidf[topic])
            count=0
            for j in unique_word_dict_tfidf[topic]:
                if word in unique_word_dict_tfidf[topic][j]:
                    count+=1
            idf=math.log10(total/count)
            tf=unique_word_dict_tfidf[topic][i][word]/unique_word_doc_count[topic][i]
            unique_word_dict_tfidf[topic][i][word]=tf*idf;

pd.DataFrame(unique_word_dict_tfidf['football']).fillna(0)
pd.DataFrame(unique_word_dict_tfidf['algorithm']).fillna(0)

Unnamed: 0,1,2,3,4,5
mathematics,0.001488,0.000000,0.000000,0.00000,0.000000
computer,0.000000,0.000000,0.000000,0.00000,0.000000
science,0.000063,0.000079,0.000048,0.00023,0.000000
",",0.000000,0.000000,0.000000,0.00000,0.000000
algorithm,0.000000,0.000000,0.000000,0.00000,0.000000
...,...,...,...,...,...
2003,0.000000,0.000000,0.000000,0.00000,0.000512
bogdanov,0.000000,0.000000,0.000000,0.00000,0.000512
trevisan,0.000000,0.000000,0.000000,0.00000,0.000512
unlikely,0.000000,0.000000,0.000000,0.00000,0.000512


### Naive Bayes Classification

In [17]:
my_sentence="I like algorithms and football"
def removeStopWords(sentence):
    words = word_tokenize(sentence)
    new_sentence=""
    for word in words:
        if word.lower() not in stop_words:
            new_sentence+=word+" "
    return new_sentence;

words=word_tokenize(removeStopWords(my_sentence))

In [18]:
prob1=1
prob2=1;
total_word1=0;
total_word2=0;

for word in unique_word_dict['football']:
    total_word1+=unique_word_dict['football'][word];

for word in unique_word_dict['algorithm']:
    total_word2+=unique_word_dict['algorithm'][word];

for word in words:

    # for class football
    if word in unique_word_dict['football']:
        prob1*=unique_word_dict['football'][word]/(total_word1);
    else:
        # laplace 
        prob1*=(1)/(len(unique_word_dict['football'])+total_word1)

    # for class algorithm
    if word in unique_word_dict['algorithm']:
        prob2*=unique_word_dict['algorithm'][word]/(total_word2);
    else:
        # laplace 
        prob2*=(1)/(len(unique_word_dict['algorithm'])+total_word2)

print('Football : ',(prob1/(prob1+prob2))*100)
print('Algorithm : ',(prob2/(prob1+prob2))*100)

Football :  45.28292963730315
Algorithm :  54.71707036269685
