In [387]:
import pandas as pd
import html
import pprint
import re
from sklearn.pipeline import Pipeline
from nltk.stem.snowball import *
from html.parser import HTMLParser
from sklearn.feature_extraction.text import TfidfVectorizer
import sklearn.model_selection
from sklearn.model_selection import*
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK stop-word corpus (a no-op when it is already present locally).
nltk.download('stopwords')
# NOTE: this rebinds the name `stopwords` from the imported corpus reader to a
# plain list of English stop words; later cells use the list form.
stopwords = stopwords.words('english')
# Extra corpus-specific stop words. TfidfVectorizer lower-cases its input by
# default, so the capitalised 'Reuter' can never match a token; the lowercase
# form is added next to it (the original entry is kept for compatibility).
stopwords.extend(['Reuter','reuter','zero','one','two','three','four','five','six','seven','eight','nine','ten','may','also','across','among','beside','however','yet','within'])

import warnings
warnings.filterwarnings('ignore')

import re

[nltk_data] Downloading package stopwords to C:\Users\Supreme
[nltk_data]     Leader\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [388]:
# Defining Class which contains methods to parse the SGML Files


class ReutersParser(HTMLParser):
    """
    ReutersParser subclasses HTMLParser and is used to read the SGML
    files associated with the Reuters-21578 categorised test collection.

    `parse` is a generator that yields one (topics, body) tuple per
    <REUTERS> element. Because the input is fed to the parser in chunks,
    the parser keeps internal state recording which tags have been
    "entered" and not yet "exited": the in_body, in_topics and
    in_topic_d boolean members.
    """
    def __init__(self, encoding='latin-1'):
        """
        Initialise the superclass (HTMLParser) and the per-document state.

        encoding -- codec used to decode byte chunks passed to `parse`
                    (latin-1 by default).
        """
        super().__init__()
        self._reset()
        # Completed (topics, body) tuples waiting to be yielded by parse().
        # Created here so that feed() can also be used without parse().
        self.docs = []
        self.encoding = encoding

    def _reset(self):
        """
        Clear all per-document state. Called on initialisation and every
        time a (topics, body) tuple has been emitted, so that the next
        document starts from a clean slate.
        """
        self.in_body = False
        self.in_topics = False
        self.in_topic_d = False
        self.body = ""
        self.topics = []
        self.topic_d = ""

    def parse(self, fd):
        """
        Iterate over `fd` chunk by chunk (line by line for a file object),
        feed each chunk to the parser and yield every document that was
        completed by that chunk.

        fd -- an iterable of chunks. Byte chunks are decoded with
              self.encoding; str chunks (e.g. a file opened in text
              mode) are fed to the parser unchanged.

        Yields (topics, body) tuples, where `topics` is a list of strings
        and `body` is the body text with whitespace runs collapsed.
        """
        self.docs = []
        for chunk in fd:
            if isinstance(chunk, (bytes, bytearray)):
                chunk = chunk.decode(self.encoding)
            self.feed(chunk)
            yield from self.docs
            self.docs = []
        # close() makes the base parser process any buffered input; emit
        # whatever documents that completes instead of dropping them.
        self.close()
        yield from self.docs
        self.docs = []

    def handle_starttag(self, tag, attrs):
        """
        Record that a tag of interest has been entered by setting the
        matching state flag. Every other tag (including <REUTERS> itself)
        needs no action on entry.
        """
        if tag == "body":
            self.in_body = True
        elif tag == "topics":
            self.in_topics = True
        elif tag == "d":
            self.in_topic_d = True

    def handle_endtag(self, tag):
        """
        React to a closing tag.

        </REUTERS>: collapse every run of whitespace in the body to a
        single space, store the (topics, body) tuple and reset the
        per-document state.

        </BODY> and </TOPICS>: clear the corresponding state flag.

        </D>: append the text collected for this <D> element to the
        "topics" list and clear the buffer. Note that this happens for
        every <D> element, whether or not it is nested inside <TOPICS>;
        callers that only want real topics must filter the list.
        """
        if tag == "reuters":
            self.body = re.sub(r'\s+', r' ', self.body)
            self.docs.append( (self.topics, self.body) )
            self._reset()
        elif tag == "body":
            self.in_body = False
        elif tag == "topics":
            self.in_topics = False
        elif tag == "d":
            self.in_topic_d = False
            self.topics.append(self.topic_d)
            self.topic_d = ""

    def handle_data(self, data):
        """
        Append character data to the buffer of whichever element is
        currently open: the body text takes precedence, otherwise the
        current <D> entry. Data outside both is ignored.
        """
        if self.in_body:
            self.body += data
        elif self.in_topic_d:
            self.topic_d += data
if __name__ == "__main__":
    # Build the list of the 22 Reuters SGML files (reut2-000.sgm .. reut2-021.sgm)
    # and create the parser
    files = [r"C:\Users\Supreme Leader\Desktop\DMML 2/reut2-%03d.sgm" % r for r in range(0, 22)]
    parser = ReutersParser()

    # Parse every file and collect all generated (topics, body) tuples in
    # one list. Each file is opened in binary mode (the parser decodes the
    # bytes itself) and closed as soon as it has been consumed.
    docs = []
    for fn in files:
        with open(fn, 'rb') as fd:
            docs.extend(parser.parse(fd))
            
def obtain_topic_tags(path=r"C:\Users\Supreme Leader\Desktop\DMML 2\all-topics-strings.lc.txt"):
    """
    Read the topic list file and return the topic names it contains.

    path -- location of the topic list, one topic name per line. Defaults
            to the location used originally, so calling the function
            without arguments behaves as before.

    Returns a list of strings with surrounding whitespace (including the
    trailing newline) stripped. Blank lines are skipped so that the empty
    string is never treated as a topic name.
    """
    # The context manager guarantees the file handle is closed.
    with open(path) as topic_file:
        stripped = (line.strip() for line in topic_file)
        return [name for name in stripped if name]

In [389]:
#  Reads all of the documents and creates a new list of two-tuples
#  that contains only the topic list and the body text.
#  It removes all geographic features and only 
#  retains those documents which have at least one non-geographic
#  topic.
    

def filter_doc_list_through_topics(topics, docs):
    """
    Keep only the documents that carry at least one known topic.

    topics -- iterable of accepted topic names.
    docs   -- iterable of (labels, body) pairs, `labels` being a list of
              strings and `body` the article text.

    Returns a new list of (labels, body) tuples in the original order.
    For each document the labels are reduced to those found in `topics`
    (order and duplicates preserved); a document is dropped when it has
    no labels, when none of its labels is accepted, or when its body is
    the empty string.
    """
    # A set gives constant-time membership tests instead of scanning the
    # whole topic list for every label.
    accepted = set(topics)

    ref_docs = []
    for labels, body in docs:
        if labels == [] or labels == "":
            continue
        kept = [label for label in labels if label in accepted]
        if kept != [] and body != "":
            ref_docs.append((kept, body))

    return ref_docs
                    
                    

In [390]:
# Load the list of accepted topic names from the topic list file.
topics= obtain_topic_tags()

In [391]:
# Keep only documents with a non-empty body and at least one label from `topics`;
# every label that is not in the topic list is stripped.
ref_docs = filter_doc_list_through_topics(topics, docs)

In [392]:
# Convert the filtered documents into a two-column DataFrame for processing:
# 'Label' holds each article's list of topics, 'Text' holds its body text.
df = pd.DataFrame(ref_docs, columns = ['Label', 'Text'])

In [393]:
# Stem the articles word by word, so that different inflections of a word
# (e.g. "amusement", "amusing") are reduced to a shared stem.

# Tokenise every article into runs of word characters. Iterating the column
# directly avoids building a Series per row as iterrows() does.
t = [re.findall(r'\w+', text) for text in df['Text']]

stemmer = SnowballStemmer('english')

# Stem each token and re-join the tokens of an article with single spaces.
j = [' '.join(stemmer.stem(word) for word in tokens) for tokens in t]

In [394]:
# Put the stemmed articles back into the data frame: the raw text is
# overwritten in place, leaving the columns as ['Label', 'Text'].
df['Text'] = j

In [395]:
# We format the training data alone so that, in the resulting data frame, each article has exactly one corresponding topic.
# If an article has multiple topics assigned to it, say (d,c1,c2,c3), we convert it into multiple rows (d,c1),(d,c2),(d,c3).

def FormatTraindata(X_train,y_train):
    """
    Flatten multi-label training data into one (article, label) row per label.

    X_train -- Series named 'Text' holding the article texts.
    y_train -- Series named 'Label' holding, per article, the collection
               of labels assigned to it.

    Returns a DataFrame with columns 'Text' and 'Label'. An article with
    n labels contributes n rows (the text repeated once per label, in
    label order); an article with no labels contributes none.
    """
    # Align texts and labels side by side on their shared index.
    paired = pd.concat([X_train, y_train], axis=1)

    texts = []
    labels = []
    for _, row in paired.iterrows():
        article, assigned = row['Text'], row['Label']
        # len() is evaluated first, exactly as a size check would be, so
        # an unsized label value still fails loudly here.
        if len(assigned) >= 1:
            for label in assigned:
                texts.append(article)
                labels.append(label)

    flat = pd.DataFrame()
    flat['Text'] = texts
    flat['Label'] = labels
    return flat


In [396]:
# Mapping Labels back to the predicted data after applying a threshold

def thresholdcomp(prediction,T,thr):
    """
    Turn per-class scores into label lists by thresholding.

    prediction -- sequence of rows, one per article; each row holds one
                  score per class.
    T          -- class labels, indexed like the columns of `prediction`.
    thr        -- a label is selected when its score is strictly greater
                  than this value.

    Returns a list with one list of selected labels per row. The number
    of columns inspected is taken from the first row.
    """
    return [
        [T[col] for col in range(len(prediction[0])) if scores[col] > thr]
        for scores in prediction
    ]

In [397]:
# Compute the Jaccard index, a measure of how well the predicted labels overlap the true ones: True Positive / (True Positive + False Negative + False Positive)

def Jacardscr(ans,y_test):
    """
    Mean Jaccard index between predicted and true label sets.

    ans    -- sequence of predicted label collections, one per sample.
    y_test -- iterable of true label collections, in the same order.

    For each sample the score is |true & predicted| / |true | predicted|.
    When both sets are empty the sample counts as a perfect match (1.0),
    the usual convention for the Jaccard index of two empty sets, rather
    than raising ZeroDivisionError.

    Returns the average over all samples as a float, or 0.0 when `ans`
    is empty.
    """
    if len(ans) == 0:
        return 0.0

    # Materialise once: avoids rebuilding the list for every sample and
    # makes one-shot iterables (generators) work.
    truth = list(y_test)

    js = 0
    for i in range(len(ans)):
        actual = set(truth[i])
        predicted = set(ans[i])
        union = actual | predicted
        if union:
            js += len(actual & predicted) / len(union)
        else:
            js += 1.0

    return js / len(ans)

In [404]:
# Perform K fold cross validation on the data where k is a user defined parameter. 
# In k-fold cross-validation, the original sample is randomly partitioned into k equal sized subsamples.
# Of the k subsamples, a single subsample is retained as the validation data for testing the model, 
# and the remaining k − 1 subsamples are used as training data. 

# The initial data, processed into a data frame with two columns, Articles (data points) and Labels (classes),
# is also an argument to the function.

# The Probabilistic threshold 'thr' for predicting class labels is passed as an argument as well


def kfold(df,k,thr,random_state=None):
    """
    Run shuffled k-fold cross validation of a TF-IDF + one-vs-rest
    multinomial naive Bayes pipeline and score each fold.

    df           -- DataFrame with a 'Text' column (article text) and a
                    'Label' column (the collection of labels per article).
    k            -- number of folds.
    thr          -- probability threshold; a label is predicted for an
                    article when its probability is strictly greater.
    random_state -- optional seed passed to KFold so that the shuffled
                    split can be reproduced. The default (None) keeps the
                    original, unseeded behaviour.

    Returns a list of k DataFrames, one per fold, with the columns
    'Text', 'Actual lab' and 'Predicted' for the test articles.

    Side effects: prints the train/test indices, the fold size and the
    Jaccard score of every fold, and writes each fold's DataFrame as a
    tab-separated CSV file to a hard-coded location.
    """
    kf = KFold(k, shuffle=True, random_state=random_state)

    Out = []

    # Split the data into training data and test data
    for fold, (trainindex, testindex) in enumerate(kf.split(df['Text'])):
        print ('Train:', trainindex,'Test:',testindex)
        X_train,X_test = df['Text'].iloc[trainindex], df['Text'].iloc[testindex]
        y_train,y_test = df['Label'].iloc[trainindex], df['Label'].iloc[testindex]

        # One (article, label) row per label, so a single-label classifier
        # can be fitted on multi-label data.
        Q = FormatTraindata(X_train,y_train)

        # We sort the labels in alphabetical order so as to later map them
        # back to the predicted probabilities based on column indices.
        T = sorted(set(Q['Label']))

        # One vs rest strategy involves training a single classifier per class, with the samples of that class as
        # positive samples and all other samples as negatives. This strategy requires the
        # base classifiers to produce a real-valued confidence score for its decision, rather than just a class label;
        # discrete class labels alone can lead to ambiguities, where multiple classes are predicted for a single sample

        # Define a pipeline combining a text feature extractor with a multi label classifier (One vs rest);
        # a pipeline allows performing a sequence of different transformations.
        # NOTE(review): alpha=0 requests no smoothing at all; confirm this is
        # intended, since unseen features then get zero likelihood.
        NB_pipeline = Pipeline([
                    ('tfidf', TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,3), norm='l2',stop_words = stopwords)),
                    ('clf', OneVsRestClassifier(MultinomialNB(alpha=0,
                        fit_prior=True, class_prior=None))),
                ])

        NB_pipeline.fit(Q['Text'],Q['Label']) # Fit the pipeline to the training data

        # Predict the probabilities associated with each class for an article,
        # output as an array
        prediction = NB_pipeline.predict_proba(X_test)

        # Map the labels to the articles based on the probability threshold,
        # i.e. a class is assigned when the probability of the article
        # belonging to it is higher than the specified threshold.
        ans = thresholdcomp(prediction,T,thr)

        # Collect the test articles next to their true and predicted labels.
        h = pd.DataFrame({'Text': X_test, 'Actual lab': y_test})
        h["Predicted"] = ans
        Out.append(h)

        print("len = ",len(ans))

        js = Jacardscr(ans,y_test)    # Compute the Jaccard index

        print (js) # The Jaccard index, a measure of prediction quality, is output

        h.to_csv("C:/Users/Supreme Leader/Desktop/Output-threshold = "+" " +str(thr)+" "+" Jacscore = "+" "+str(js)+" " +str(fold)+".csv",sep='\t')

    # A list with one data frame per fold, each containing the article, the
    # actual labels and the predicted labels.
    return(Out)

In [405]:
Out = kfold(df,7,0.33)

Train: [    0     2     3 ... 10372 10373 10376] Test: [    1     7    13 ... 10366 10374 10375]
len =  1483
0.8330553390566877
Train: [    0     1     2 ... 10374 10375 10376] Test: [    6     8    10 ... 10358 10359 10360]
len =  1483
0.8236200751372702
Train: [    0     1     2 ... 10373 10374 10375] Test: [    4    14    32 ... 10367 10370 10376]
len =  1483
0.8261945142592475
Train: [    1     2     4 ... 10374 10375 10376] Test: [    0     3     5 ... 10344 10357 10368]
len =  1482
0.8236230265835528
Train: [    0     1     3 ... 10374 10375 10376] Test: [    2    11    31 ... 10364 10371 10373]
len =  1482
0.8302537861748387
Train: [    0     1     2 ... 10374 10375 10376] Test: [   12    24    25 ... 10361 10365 10369]
len =  1482
0.8434483645009957
Train: [    0     1     2 ... 10374 10375 10376] Test: [    9    19    28 ... 10338 10347 10372]
len =  1482
0.8152641218430695
