In [1]:
import os
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import codecs
import pandas as pd
import numpy as np

In [2]:
"""stops is a set containing all the english stopwords and punctuations containing all the punctuations and later
adding all the punctuations to the stops set..."""

stops = set(stopwords.words('english'))
punctuations = list(string.punctuation)
stops.update(punctuations)

"""clean_words accepts all the words in the document,,,,firstly converting all the words to lower case....
secondly removing stop words from it...and after that removing numeric characters 
from the list ..if there is any..and finally returning the whole list keeping / as a delimiter with 
all the cleaned words..."""

def clean_words(words):
    new_words = [w.lower() for w in words]  #converting all words to lower case
    new_words2 = [w for w in new_words if not w in stops]  #removing stop words
    new_words3 = [w.strip() for w in new_words2 if w.isalpha()]  #removing numeric values
    final_set = '/'.join(new_words3)
    return final_set

In [4]:
""" file_locations returns the list of dictionaries where each dictionary contains location of each document..
keys in the dictionary are root folder of file....filename....to which category it belongs ...and ....path
of that file...."""


def file_locations(folder_name):
    all_paths = []
    walk_func = os.walk(folder_name, topdown = False)
    for head, directory, file_list in walk_func :#traversing through each folders
        for file in file_list:  #traversing  through each files inside that folder
            dict_doc = {}   #creating dictionary for each file
            dir_name = head.split('/')[-1]
            dict_doc['head'] = head  
            dict_doc['path'] = head+"/"+file
            dict_doc['file_name'] = file
            dict_doc['class_name'] = dir_name
            all_paths.append(dict_doc)
    return all_paths

In [5]:
""" clean_docs accepts all the paths of the documents...opens them...and it tokenizes all whole content 
of the document and calls the clean_words function for removing useless words and after that it creates a new
file with the same name at different location...and writes all the cleaned words within that new file which is returned 
by the clean_words function"""


def clean_docs(document_paths) :
    i = 0
    print("Processing..",end="")
    for doc_path in document_paths:   #traversing through ecah file
        path = doc_path['path']
        with codecs.open(path, 'r', encoding='utf-8',errors='ignore') as fdata:  #opening the file
            text = fdata.read()  #reading the data within that file
            tokens = word_tokenize(text)  #tokenizing the whole content within that file
            clean_tokens = clean_words(tokens)  #removing the useless words
            i += 1
            
            clean_folder = doc_path['head'].replace('20_newsgroups','cleaned_content/20_newsgroups')  #renaming the name of root folder of that file
            os.makedirs(clean_folder,exist_ok = True)   #creating cleaned_content folder if it is not there..
            
            clean_files_location = clean_folder + "/" +doc_path['file_name'] + '.txt'  #allocating new path to the file...
            
            with open(clean_files_location, 'wb') as data :  #creating a new file with the same name but at different location
                data.write(bytes(clean_tokens,'utf8')) #writing the cleaned words within that file...
                data.close()   #closing the file....
            
            if i % 400 == 0 :
                print("..",end="")
    print("Done!")

In [6]:
document_paths = file_locations("20_newsgroups")   #extracting location of all documents

In [7]:
""" calling clean_docs function which creates a new folder by the name cleaned_content..which contains 
all the documents by the same name containing only the useful words separated by delimiter / """

print("Please wait...it may take some time...")
clean_docs(document_paths)  #creating new files...

Please wait...it may take some time...
Processing....................................................................................................Done!


In [9]:
new_paths = file_locations("cleaned_content")  #extracting location of cleaned files....

"""created a dataframe containg 2 columns class-name i.e to which category doc belongs and path of that document"""

main_df = pd.DataFrame(new_paths)   
main_df.drop(["head","file_name"],axis=1,inplace = True)

In [10]:
"""Provided all the categories a unique numeric value...i.e Y and all the paths of the documents is X ...after
that splited X and Y using sklearn into test and train.."""

class_list = list(set(main_df['class_name'].values))   #extracting names of all the classes...
class_dict = { class_list[i]:i for i in range(len(class_list))}   #assigning numbers to each class name
main_df['class_value'] = [class_dict[name] for name in main_df['class_name'].values]   #adding new column containing the values that is provided to each classes

X = main_df['path']  #pulling out X FROM THE dataest
Y = main_df['class_value']  #pulling out Y from the dataset

from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state = 0) #spliting into train and test

In [11]:
""" selecting_words function scans the content of all the new documents and creates a dictionary out of that...
where all the words are keys and frequency of that word is the value..
AFTER THAT....created 2 lists out of that dictionary 
first ones name is selected_features containing only those words whose frequency is 16 or more than 16
last feature name of Selected_feature list is class_num which is appended as extra at last ..will be used later in
representing category of the document....
Second ones name is Selected_featues_freq containing frequency of all those words....

and it finally returns the selected_features list.."""

def selecting_words(X_train):
    main_vocab = {}   #dictionary contaning frequency of each word
    i = 0
    print("Creating Dictionary.",end="")
    for path in X_train:   #traversing through each training file
        with codecs.open(path, 'r', encoding='utf-8',errors='ignore') as fdata:   #opening the file
            text = fdata.read().split('/')    
            i += 1
            for word in text:  #traversing through each words within that file...
                main_vocab[word] = main_vocab.get(word,0) + 1   
            if i % 700 == 0 :
                print("..",end="")
    
    sorted_vocab = sorted(main_vocab, key = main_vocab.get , reverse=True)   #sorting the dictionary w.r.t values
    selected_features = []
    selected_features_freq = []
    for word in sorted_vocab:  #traversing through each word and removing those having frequency less than 16
        if main_vocab[word] < 16:
            continue
        selected_features.append(word)
        selected_features_freq.append(main_vocab[word])
    selected_features.append("class_num")  #adding one more column...which will contain the class to which the document belongs to...
    print("Done!")
    return selected_features

selected_features = selecting_words(X_train)

Creating Dictionary...........................................Done!


In [15]:
""" dataset_frequency accepts selected_features...X_data(document paths)...and Y-data(document categories)...
this function actually creates a dataset...where columns are all the selected features ...and each rows represents 
each document....rows contains the frequency of each feature for a particular document...and last column of the 
dataset depicts the category of that document ....and finally returning the dataset"""

def dataset_frequency(selected_features,x_data,y_data):
    
    selected_words_dictionary = {word : True for word in selected_features}   #creating a dictionary and assigning true values to each word...to be used later..
    file_locations = x_data.values
    file_class = y_data.values
    frequency_set = []
    print("Creating Dataset.",end="")
    a = 0
    for file in file_locations:   #traversing through each files..
        freq_file = []
        freq_dict = {}
        
        with codecs.open(file, 'r', encoding='utf-8',errors='ignore') as fdata:
            text = fdata.read().split('/')  #reading the data
            for word in text:  #traversing through the content
                freq_dict[word] = freq_dict.get(word,0) + 1
            for word in selected_features:   #traversing through all the features
                if word not in freq_dict:   #if feature is not in dictionary assigning zzero value to that feature and adding that key to dict
                    freq_dict[word] = 0
            if a % 300 == 0:
                print(".",end="")
        freq_file = [freq_dict[word] for word in selected_features]  #creating the final list ...in proper sequence as the sequence of words in selected features list..
        freq_file[-1] = file_class[a]  #appending the class value to the last feature column that was assigned earlier
        frequency_set.append(freq_file)   #appending the list to the frequency set...
        a += 1
    print("Wait...",end="")
    df = pd.DataFrame(frequency_set, columns = selected_features)   #creating a final dataset with that frequency set..
    print("Done!")
    return df

In [13]:
training_dataset = dataset_frequency(selected_features,X_train,Y_train)  #creating dataset for training data

Creating Dataset...................................................Done!


In [16]:
testing_dataset = dataset_frequency(selected_features,X_test,Y_test)  #creating dataset for testing data

Creating Dataset..................Wait...Done!


In [25]:
x_train = training_dataset.drop(["class_num"],axis = 1) #separating x_train out of the training dataset

In [26]:
y_train = training_dataset["class_num"]   #separating y_train out of the training dataset

In [27]:
x_test = testing_dataset.drop(["class_num"],axis = 1)  #separating x_test out of the testing dataset

In [28]:
y_test = testing_dataset["class_num"]  #separating y_test out of testing dataset

In [29]:
"""the fit function is used for creating a dictionary where each category will be a key and each key  will be having 
an another dictionary...which will consist of all the features as keys..and values of those keys will
be the frequency of occurance of that feature within that particular category or class...
..this function  accepts training dataset...and returns the dictionary..."""

def fit(df):
    prob_dictionary = {}  
    for target in set(df["class_num"]):  #traversing through all the classes
        prob_dictionary[target] = {}   #creating dict for each
        temp = df[df["class_num"] == target]   #creating a new dataset from the main one ...which contains rows of that particular class only..
        total_count = 0   
        for word in range(len(df.columns)-1):   #traversing through each features
            prob_dictionary[target][df.columns[word]] = temp[df.columns[word]].sum()  #adding all the frequency under that feature and assigning it to dictionary
            total_count += prob_dictionary[target][df.columns[word]]   #maintaining total_count of words under that class
        prob_dictionary[target]["total_count"] = total_count   #adding total count to the dictionary
    return prob_dictionary   

prob_dictionary = fit(training_dataset)  #calling the above function....

In [30]:
"""this function probablity...accepts the dictionary that is returned by above function and the x-train...
It returns a dictionary where each categories or classes to which each documnets belongs to.... depicts keys..
....and value of each key is list ........where each element of that list contains probablity of occurance 
of the feature within that particular class..."""

def probablity(prob_dictionary,x_train):
    probablity_dict = {}
    no_of_features = len(x_train.columns)   
    for target in prob_dictionary.keys():  #traversing through each classes ...
        words_probablity_array = []   #creating a list for each
        for j in x_train.columns:   #traversing through each feature... 
            prob_word = prob_dictionary[target][j]+1/(prob_dictionary[target]["total_count"]+no_of_features)   #calculating probablity of that feature...
            words_probablity_array.append(prob_word)  
        probablity_dict[target] = np.log(np.array(words_probablity_array))  #coverting the list to numpy array ...finding out the log of each elements and assigning it to the dictionary under that particular class..
    return probablity_dict

probablity_dict = probablity(prob_dictionary,x_train)   #calling the above function

In [31]:
""" this function predict2 returns the class to which a document belongs to"""

def predict2(doc,prob_dictionary,probablity_dict,y_train):
    class_counts = y_train.value_counts().to_dict()   #calculating the occurance of each class in y_train...
    total_class_counts = y_train.shape[0]  #calculating total class counts...
    max_class = 0  #for storing the class_name
    max_value = -1.334   #for storing the probablity of being to that class
    test_array = np.array(doc)
    for target in probablity_dict:   #traversing through each classes 
        probablity = test_array * probablity_dict[target]   #multiplying the probablity array and frequency array...
        final_proba = probablity.sum() * (class_counts[target]/total_class_counts)   #calculating the final probablity of being to that particular class for that document..
        if max_value < final_proba:   #comparing the probablity with max one...if it is greater than max...assigning the new value to max...
            max_value = final_proba
            max_class = target
    return max_class    #returns the class with max probablity for that particular document

In [32]:
"""this function is used for predicting the values for x_test..."""

def predict(prob_dictionary,probablity_dict,x_test,y_train):
    Y_predict = []
    a = 0
    for doc in x_test.values:   #traversing through each rows of x_test
        Y_predict.append(predict2(doc,prob_dictionary,probablity_dict,y_train))    #calling the above function...and appending the returned value class to the y_predict..
        a += 1
        if a % 100 == 0:
            print(a,"Files Processed")
    return Y_predict   

In [33]:
y_predict = predict(prob_dictionary,probablity_dict,x_test,y_train)   #calling the above function...

100 Files Processed
200 Files Processed
300 Files Processed
400 Files Processed
500 Files Processed
600 Files Processed
700 Files Processed
800 Files Processed
900 Files Processed
1000 Files Processed
1100 Files Processed
1200 Files Processed
1300 Files Processed
1400 Files Processed
1500 Files Processed
1600 Files Processed
1700 Files Processed
1800 Files Processed
1900 Files Processed
2000 Files Processed
2100 Files Processed
2200 Files Processed
2300 Files Processed
2400 Files Processed
2500 Files Processed
2600 Files Processed
2700 Files Processed
2800 Files Processed
2900 Files Processed
3000 Files Processed
3100 Files Processed
3200 Files Processed
3300 Files Processed
3400 Files Processed
3500 Files Processed
3600 Files Processed
3700 Files Processed
3800 Files Processed
3900 Files Processed
4000 Files Processed
4100 Files Processed
4200 Files Processed
4300 Files Processed
4400 Files Processed
4500 Files Processed
4600 Files Processed
4700 Files Processed
4800 Files Processed
4

In [35]:
from sklearn.metrics import classification_report, confusion_matrix
print(" ---Results-----Manually performed code.. ")
print("Precision....Recall...F1-score for the predicted data")
print("-----------------------------------------------------")
print(classification_report(y_test,y_predict)) 
print("Confusion Matrix for the predicted data....")
print("-----------------------------------------------------")
print(confusion_matrix(y_test,y_predict))  #for seeing how many documents are predicted correctly....

 ---Results-----Manually performed code.. 
Precision....Recall...F1-score for the predicted data
-----------------------------------------------------
             precision    recall  f1-score   support

          0       0.61      0.54      0.57       243
          1       0.86      0.66      0.75       261
          2       0.89      0.83      0.86       270
          3       0.56      0.78      0.65       258
          4       0.78      0.61      0.69       249
          5       0.78      0.93      0.85       238
          6       0.88      0.89      0.89       246
          7       0.99      0.83      0.91       283
          8       0.89      0.95      0.92       271
          9       0.83      0.76      0.79       256
         10       0.97      0.89      0.93       249
         11       0.92      0.51      0.65       261
         12       0.73      0.95      0.82       237
         13       0.97      0.95      0.96       264
         14       0.80      0.81      0.81       236


In [36]:
#USING MULTINOMIAL NAIVE BAYES

In [37]:
""" extracting the values from the datasets for input in Multinomial Naive Bayes classifier..."""
x_train2 = x_train.values   
x_test2 = x_test.values
y_train2 = y_train.values
y_test2 = y_test.values

In [38]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB(alpha=0.01)    
clf.fit(x_train2, y_train2)  #fiting the data within the classifier

MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)

In [39]:
Y_pred = clf.predict(x_test2)    #predicting the data

In [40]:
print(" Results using Multinomial Naive Bayes classifier")
print("Precision....Recall...F1-score for the predicted data")
print("-----------------------------------------------------")
print(classification_report(y_test2,Y_pred))
print("Confusion Matrix for the predicted data....")
print("-----------------------------------------------------")
print(confusion_matrix(y_test2,Y_pred))

 Results using Multinomial Naive Bayes classifier
Precision....Recall...F1-score for the predicted data
-----------------------------------------------------
             precision    recall  f1-score   support

          0       0.64      0.55      0.59       243
          1       0.83      0.85      0.84       261
          2       0.89      0.91      0.90       270
          3       0.74      0.72      0.73       258
          4       0.89      0.55      0.68       249
          5       0.94      0.92      0.93       238
          6       0.96      0.89      0.92       246
          7       0.94      0.94      0.94       283
          8       0.94      0.94      0.94       271
          9       0.83      0.89      0.86       256
         10       0.97      0.96      0.96       249
         11       0.80      0.84      0.82       261
         12       0.98      0.93      0.96       237
         13       0.96      0.98      0.97       264
         14       0.78      0.88      0.83    

-----------------------------------:CONCLUSION:-------------------------------------
-----------------------------------:COMPARISON :------------------------------------
As we know that...The above task is performed in two methods....firstly..using manually performed code...and secondly using multinomial naive bayes...
RESULTS:------
**1st...USING ---Manually performed code...:
Precision - 0.81,
Recall - 0.80,
F1-Score - 0.80,
**2nd...USING ---Inbuilt Multinomial Naive Bayes classifier...:
Precision - 0.85,
Recall - 0.85,
F1-Score - 0.85,

**From the results we can see that inbuilt multinomial naive bayes is providing better results than the manually written code.