### Problem :- Scrape n documents of two categories from Wikipdia (say sports and education)

- Preprocess and clean all the documents
- Prepare 
    - Unigram count Matrix
    - Bigram Probability Matrix
    - TF-IDF Matrix
- Apply appropriate Naive Bayes classification 

In [7]:
from bs4 import BeautifulSoup
import requests
import os.path as path
import math

In [8]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize 
from nltk.stem import PorterStemmer
import pandas as pd

### Fetch Scrape N documents of two categories from wikipedia


In [9]:
URLS={
    "football":["https://en.wikipedia.org/wiki/Football","https://en.wikipedia.org/wiki/American_football","https://en.wikipedia.org/wiki/Association_football","https://en.wikipedia.org/wiki/Australian_rules_football","https://en.wikipedia.org/wiki/Gaelic_football"],
    "algorithm":["https://en.wikipedia.org/wiki/Algorithm","https://en.wikipedia.org/wiki/Analysis_of_algorithms","https://en.wikipedia.org/wiki/Computational_complexity","https://en.wikipedia.org/wiki/Worst-case_complexity","https://en.wikipedia.org/wiki/Average-case_complexity"],
}
BASE_URL="Documents"


### Helper Functions

In [10]:
def getHTMLFromURL(url):
    page = requests.get(url).content
    soup = BeautifulSoup(page, 'html.parser')
    return soup;

def getBodyTextFromHTML(soup):
    paragraphs = soup.find_all('p')
    text = '\n'.join([para.get_text().strip() for para in paragraphs])
    return text;

def writeInTextFile(topic,filename,text):
    file = open(path.join(BASE_URL,topic,filename),"w")
    file.write(text)
    file.close()

def scrapeFunction():
    for topic in URLS:
        fileno=1;
        for url in URLS[topic]:
            soup = getHTMLFromURL(url)
            text = getBodyTextFromHTML(soup)
            filename = f"{fileno}.txt"
            writeInTextFile(topic,filename,text)
            print("Done for ",topic,filename)
            fileno=fileno+1
        

### Getting text from the urls provided

In [11]:

scrapeFunction()

Done for  football 1.txt
Done for  football 2.txt
Done for  football 3.txt
Done for  football 4.txt
Done for  football 5.txt
Done for  algorithm 1.txt
Done for  algorithm 2.txt
Done for  algorithm 3.txt
Done for  algorithm 4.txt
Done for  algorithm 5.txt


### Preperations

#### Unigram Count Matrix

In [12]:
unique_word_set={}
unique_word_dict={}

def getUniqueWords(text):
    unique_words={}
    words = word_tokenize(text)
    for word in words:
        word=word.lower()
        unique_word_set[word]=1
        if word in unique_words:
            unique_words[word]=unique_words[word]+1
        else:
            unique_words[word]=1
    return unique_words;

def getUniqueWordsFromFiles():
    for topic in URLS:
        if topic not in unique_word_dict:
            unique_word_dict[topic]={}
        for i in range(1,6):
            filename = f"{i}.txt"
            file = open(path.join(BASE_URL,topic,filename),"r")
            text = file.read()
            file.close()
            unique_words=getUniqueWords(text);

            for word in unique_words:
                if word in unique_word_dict[topic]:
                    unique_word_dict[topic][word]=unique_word_dict[topic][word]+unique_words[word]
                else:
                    unique_word_dict[topic][word]=1
                    

    print("Total Unique Words from files are ",len(unique_word_set))



In [13]:
getUniqueWordsFromFiles()

Total Unique Words from files are  6790


In [14]:
# for word in unique_word_set:
#     print("word",end="\t\t")
#     print(word,end="\t\t");

# print()

# for topic in unique_word_dict:
#     print(topic,end="\t")
#     for word in unique_word_set:
#         if word not in unique_word_dict[topic]:
#             print(0,end="\t\t")
#         else:
#             print(unique_word_dict[topic][word],end="\t\t")
#     print()

df = pd.DataFrame(unique_word_dict)
print(df.transpose().fillna(0))

           football     is      a  family     of   team  sports   that  \
football      475.0  368.0  724.0     2.0  901.0  139.0    28.0  130.0   
algorithm       0.0  224.0  224.0     0.0  308.0    0.0     0.0  114.0   

           involve       ,  ...  good-on-average  weaker  1993  feigenbaum  \
football       2.0  1593.0  ...              0.0     0.0   0.0         0.0   
algorithm      0.0   396.0  ...              1.0     1.0   1.0         1.0   

           fortnow  non-adaptive  reductions  2003  bogdanov  trevisan  
football       0.0           0.0         0.0   0.0       0.0       0.0  
algorithm      1.0           1.0         1.0   1.0       1.0       1.0  

[2 rows x 6790 columns]


#### Bigram 

In [15]:
unique_bigram_set={}
unique_bigram_dict={}
unique_bigram_length={}

def getUniqueBigramsWords(text):
    unique_words={}
    words = word_tokenize(text)
    prev="<string>"
    for word in words:
        word=word.lower()
        bigram=prev+" "+word
        unique_bigram_set[bigram]=1
        if bigram in unique_words:
            unique_words[bigram]=unique_words[bigram]+1
        else:
            unique_words[bigram]=1
        prev=word
    return unique_words;


def getUniqueBigramsFromFiles():
    for topic in URLS:
        if topic not in unique_bigram_length:
                unique_bigram_length[topic]=0
        if topic not in unique_bigram_dict:
                unique_bigram_dict[topic]={}
        for i in range(1,6):
            filename = f"{i}.txt"
            file = open(path.join(BASE_URL,topic,filename),"r")
            text = file.read()
            file.close()
            unique_bigrams=getUniqueBigramsWords(text);
            
            for bigram in unique_bigrams:
                if bigram in unique_bigram_dict[topic]:
                    unique_bigram_dict[topic][bigram]=unique_bigram_dict[topic][bigram]+unique_bigrams[bigram]
                else:
                    unique_bigram_dict[topic][bigram]=1
                
                unique_bigram_length[topic]+=1;
                    

    print("Total Unique Words from files are ",len(unique_bigram_set))

In [16]:
getUniqueBigramsFromFiles();

Total Unique Words from files are  31807


In [17]:

# print("bigram",end="\t\t")
# for bigram in unique_bigram_set:
#     print(bigram,end="\t\t");

# print()

# for topic in unique_bigram_dict:
#     print(topic,end="\t")
#     for bigram in unique_bigram_set:
#         if bigram not in unique_bigram_dict[topic]:
#             print(0,end="\t\t")
#         else:
#             unique_bigram_dict[topic][bigram]=unique_bigram_dict[topic][bigram]/unique_bigram_length[topic]
#             print(unique_bigram_dict[topic][bigram],end="\t\t")
#     print()

df = pd.DataFrame(unique_bigram_dict)
df.transpose().fillna(0)

Unnamed: 0,<string> football,football is,is a,a family,family of,of team,team sports,sports that,that involve,"involve ,",...,between average-case,and worst-case,complexity via,via reductions,literature of,of average,complexity includes,includes the,following work,work :
football,1.0,44.0,19.0,2.0,2.0,2.0,1.0,1.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
algorithm,0.0,0.0,17.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### Bigram Probability

In [18]:
unique_bigram_prob_dict={}
for topic in unique_bigram_dict:
    unique_bigram_prob_dict[topic]={}
    for bigram in unique_bigram_set:
        unique_bigram_prob_dict[topic][bigram]=0;
        if bigram in unique_bigram_dict[topic]:
            unique_bigram_prob_dict[topic][bigram]=unique_bigram_dict[topic][bigram]/unique_bigram_length[topic]

In [19]:
df=pd.DataFrame(unique_bigram_prob_dict)
df.transpose().fillna(0)

Unnamed: 0,<string> football,football is,is a,a family,family of,of team,team sports,sports that,that involve,"involve ,",...,between average-case,and worst-case,complexity via,via reductions,literature of,of average,complexity includes,includes the,following work,work :
football,3.4e-05,0.00149,0.000644,6.8e-05,6.8e-05,6.8e-05,3.4e-05,3.4e-05,6.8e-05,3.4e-05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
algorithm,0.0,0.0,0.001404,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8.3e-05,8.3e-05,8.3e-05,8.3e-05,8.3e-05,8.3e-05,8.3e-05,8.3e-05,8.3e-05,8.3e-05


### TF IDF Term Frequency-Inverse Document Frequency

In [20]:
unique_word_dict_tfidf={}
unique_word_doc_count={}
def getUniqueWordsFromFiles():
    for topic in URLS:
        unique_word_dict_tfidf[topic]={}
        unique_word_doc_count[topic]={}
        for i in range(1,6):
            filename = f"{i}.txt"
            file = open(path.join(BASE_URL,topic,filename),"r")
            text = file.read()
            file.close()
            unique_words=getUniqueWords(text);
            
            unique_word_dict_tfidf[topic][i]={}
            unique_word_doc_count[topic][i]=0

            for word in unique_words:
                unique_word_dict_tfidf[topic][i][word]=unique_words[word]
                unique_word_doc_count[topic][i]+=unique_words[word]


In [21]:
getUniqueWordsFromFiles();

for topic in unique_word_dict_tfidf:
    for i in unique_word_dict_tfidf[topic]:
        for word in unique_word_dict_tfidf[topic][i]:
            total=len(unique_word_dict_tfidf[topic])
            count=0
            for j in unique_word_dict_tfidf[topic]:
                if word in unique_word_dict_tfidf[topic][j]:
                    count+=1
            idf=math.log10(total/count)
            tf=unique_word_dict_tfidf[topic][i][word]/unique_word_doc_count[topic][i]
            unique_word_dict_tfidf[topic][i][word]=tf*idf;

pd.DataFrame(unique_word_dict_tfidf['football']).fillna(0)
pd.DataFrame(unique_word_dict_tfidf['algorithm']).fillna(0)

Unnamed: 0,1,2,3,4,5
in,0.000000,0.000000,0.00000,0.000000,0.00000
mathematics,0.001005,0.000000,0.00000,0.000000,0.00000
and,0.000000,0.000000,0.00000,0.000000,0.00000
computer,0.000000,0.000000,0.00000,0.000000,0.00000
science,0.000043,0.000057,0.00003,0.000163,0.00000
...,...,...,...,...,...
2003,0.000000,0.000000,0.00000,0.000000,0.00033
bogdanov,0.000000,0.000000,0.00000,0.000000,0.00033
trevisan,0.000000,0.000000,0.00000,0.000000,0.00033
unlikely,0.000000,0.000000,0.00000,0.000000,0.00033


### Naive Bayes Classification

In [22]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

In [25]:
def getTextDocuments():
    X=[]
    y=[]
    for topic in URLS:
        for i in range(1,6):
            filename = f"{i}.txt"
            file = open(path.join(BASE_URL,topic,filename),"r")
            text = file.read()
            file.close()
            X.append(text)
            y.append(topic)
    return X,y

X,y=getTextDocuments()

# Step 4: Convert the text data into numerical vectors
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(X)

# Step 5: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 6: Apply Naive Bayes algorithm
clf = MultinomialNB()
clf.fit(X_train, y_train)

# Step 7: Predict on the test data
y_pred = clf.predict(X_test)

# Step 8: Evaluate the model
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 1.0


In [34]:
# Your own sentence
my_sentence = "football algorithm"

# Transform your sentence into vectors
my_sentence_vectorized = vectorizer.transform([my_sentence])

# Predict the label of your sentence
my_label_pred = clf.predict(my_sentence_vectorized)

# Get class probabilities for your sentence
my_label_proba = clf.predict_proba(my_sentence_vectorized)

print(my_label_pred, my_label_proba)

['algorithm'] [[0.52621964 0.47378036]]
