This notebook performs multi-class classification of documents that belong to 20 classes.

In [4]:
# Importing the libraries
import os
import pandas as pd
import numpy as np
import sklearn 
import scipy.sparse as sp
%matplotlib inline  
# Don't use plt.show() as it opens a new window and blocks the evaluation of cell. 
import matplotlib.pyplot as plt

In [5]:
# Changing the default display options.
# Fully-qualified option names are used on purpose: abbreviated patterns such as
# "max_r" or "max_columns" can match more than one registered option in newer
# pandas releases, in which case set_option raises an OptionError.
pd.set_option("display.max_rows",3000) # Show up to 3000 rows before truncating
pd.set_option("display.max_columns",51000) # Show up to 51000 columns before truncating
pd.set_option("display.expand_frame_repr",False) # Do not wrap wide frames across several lines
np.set_printoptions(threshold=np.inf) # Never summarise numpy arrays when printing
np.set_printoptions(suppress=True) # Prevent printing in scientific notation

#  Multiclass Document Classification

Reading in the dataset from different folders. 

In [20]:
# This function reads the documents from the class folders and stores them as lists

def readData(data_dir='./data'):
    """Read every document below ``data_dir`` and label it by its folder.

    The first directory yielded by ``os.walk`` (``data_dir`` itself) only
    supplies the class names. Every directory visited after it is treated as one
    class and receives the next integer code (0 for the 1st, 1 for the 2nd, ...).

    Parameters
    ----------
    data_dir : str, optional
        Root folder holding one sub-directory per class. Defaults to './data'.

    Returns
    -------
    docs_list : list of str
        Contents of every file that was read.
    target_recoded : list of int
        Numeric class code for the document at the same position in ``docs_list``.
    """
    docs_list=[]
    target_recoded=[]
    class_names=[]
    class_code=-1
    for (dirpath,subdirs,files) in os.walk(data_dir):
        # os.walk honours in-place changes to the sub-directory list, so sorting it
        # makes the visiting order (and therefore the class codes) reproducible.
        subdirs.sort()
        if (class_code<0):
            # Root folder: remember the class names and skip any loose files in it.
            class_names=list(subdirs)
            class_code+=1
            continue
        for file_name in sorted(files):
            # The with-block closes each file handle as soon as it has been read.
            with open(os.path.join(dirpath,file_name)) as handle:
                docs_list.append(handle.read())
            target_recoded.append(class_code)
        class_code+=1

    # Taking some sense of data
    print("Total no. of documents:",len(docs_list))
    print("Total Classes:", len(class_names))
    if class_names:
        # Guarded so an empty or missing data folder does not divide by zero.
        print("Average no. of documents in a class",len(docs_list)/len(class_names))
    return docs_list,target_recoded




# Training and test split

def split(docs_list,target_recoded):
    """Split the documents and their labels into training and test sets.

    70% of the samples go to training and 30% to testing; the fixed
    ``random_state`` makes the split repeatable between runs.

    Parameters
    ----------
    docs_list : list
        The documents.
    target_recoded : list
        The class code of each document, in the same order as ``docs_list``.

    Returns
    -------
    train_X, test_X, train_Y, test_Y
        The four pieces returned by ``train_test_split``.
    """
    # sklearn.cross_validation was removed from scikit-learn; the function now
    # lives in sklearn.model_selection. Fall back for very old installations.
    try:
        from sklearn.model_selection import train_test_split
    except ImportError:
        from sklearn.cross_validation import train_test_split
    train_X, test_X,train_Y,test_Y = train_test_split(docs_list, target_recoded, test_size=0.30, random_state=42)

    return train_X, test_X,train_Y,test_Y




# Cleaning the doc files

def cleaningDocs(doc,stem='L'):  # 'S' for Stemming, 'L' for Lemmatization
    """Clean one document string and return it as a space-separated string.

    Steps:
    i)   Replace every non-alphabetic character with a space
    ii)  Convert to lower case and split into words (tokenization)
    iii) Remove English stop words (NLTK's stop-word list)
    iv)  Normalise each remaining word by stemming or lemmatization

    Parameters
    ----------
    doc : str
        Raw document text.
    stem : str, optional
        'S' applies the Porter stemmer; any other value (default 'L') applies
        the WordNet lemmatizer.

    Returns
    -------
    str
        The normalised words joined by single spaces.
    """
    import re
    from nltk.corpus import stopwords
    from nltk.stem import PorterStemmer, WordNetLemmatizer

    # Removing punctuations and other non alphabetic characters
    alphabets_only=re.sub(r'[^a-zA-Z]'," ",doc)

    # Converting to lower case and splitting the words(tokenization)
    words_lower=alphabets_only.lower().split()

    # Build the stop-word set once per document; the lookup inside the
    # comprehension is then a cheap set membership test instead of rebuilding
    # the whole set for every single word.
    stop_words=set(stopwords.words("english"))
    useful_words=[w for w in words_lower if w not in stop_words]

    # Doing Stemming or Lemmatization (Normalising the text)
    if (stem=='S'):  # Choosing between Stemming ('S') and Lemmatization ('L')
        normalise=PorterStemmer().stem
    else:
        normalise=WordNetLemmatizer().lemmatize

    return " ".join([normalise(w) for w in useful_words])
    
    
    
    
def processing(file,stem="L"):
    """Clean a collection of documents (training or test).

    Parameters
    ----------
    file : iterable of str
        The raw documents to clean.
    stem : str, optional
        Passed straight to ``cleaningDocs`` to choose the normalisation.

    Returns
    -------
    list of str
        One cleaned string per input document, in the same order.
    """
    return [cleaningDocs(raw_doc,stem) for raw_doc in file]




# Creating Bag of words feature vectors for each document

def bagofWords(X,Y,max_feature=5000,type="count"):
    """Turn documents into bag-of-words feature vectors.

    Parameters
    ----------
    X : iterable of str
        The documents to vectorise.
    Y : sequence
        The labels; converted to a numpy array.
    max_feature : int, optional
        Passed to the vectorizer as ``max_features``.
    type : str, optional
        "count" selects ``CountVectorizer``; any other value selects
        ``TfidfVectorizer``.

    Returns
    -------
    tuple
        (result of ``fit_transform`` on X, labels as numpy array, fitted vectorizer)
    """
    from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

    # Pick the vectorizer class first, then build it with the shared settings.
    vectorizer_cls = CountVectorizer if type=="count" else TfidfVectorizer
    vectorizer = vectorizer_cls(analyzer = "word",max_features = max_feature)

    features = vectorizer.fit_transform(X)
    labels = np.array(Y)
    return features, labels, vectorizer


    
class learner(object):
    """Runs several classifiers with k-fold cross-validation on one training set.

    Intended order of use: construct, call ``kFold()``, call any of ``GNB()``,
    ``RF()``, ``SGD()``, then ``test_performance()`` on held-out data.

    Note: after a cross-validation run, ``self.gnb`` / ``self.rf`` / ``self.sgd``
    hold the model fitted on the LAST fold's training portion only; that is the
    model ``test_performance`` evaluates.
    """

    def __init__(self,train_X,train_Y,k=5):
        """Store the training features, training labels and number of folds."""
        self.k=k
        self.train_X=train_X
        self.train_Y=train_Y

    def kFold(self):
        """Split the training set into k folds.

        Stores a list of (train_indices, test_indices) pairs in ``self.k_fold``.
        """
        n_samples=self.train_X.shape[0]
        # sklearn.cross_validation was removed from scikit-learn; prefer
        # sklearn.model_selection and fall back only for very old installations.
        try:
            from sklearn.model_selection import KFold
        except ImportError:
            from sklearn import cross_validation
            folds=cross_validation.KFold(n=n_samples,n_folds=self.k)
        else:
            folds=KFold(n_splits=self.k).split(np.arange(n_samples))
        # Materialised so that every algorithm can iterate over the same folds.
        self.k_fold=list(folds)

    @staticmethod
    def _dense(X):
        """Return X as a plain ndarray (sparse input is expanded, not np.matrix)."""
        return X.toarray() if hasattr(X,"toarray") else np.asarray(X)

    def _cross_validate(self,make_model):
        """Fit a fresh model on every fold; return (last fitted model, fold accuracies)."""
        if not hasattr(self,"k_fold"):
            # Allow running an algorithm without an explicit kFold() call first.
            self.kFold()
        scores=[]
        model=None
        for train_indices, test_indices in self.k_fold:
            train_X_cv=self._dense(self.train_X[train_indices])
            train_Y_cv=self.train_Y[train_indices]
            test_X_cv=self._dense(self.train_X[test_indices])
            test_Y_cv=self.train_Y[test_indices]

            model=make_model()
            scores.append(model.fit(train_X_cv,train_Y_cv).score(test_X_cv,test_Y_cv))
        return model,scores

    def GNB(self):
        """Method to implement Multi-class Gaussian Naive Bayes"""
        from sklearn.naive_bayes import GaussianNB

        self.gnb,scores_gnb=self._cross_validate(GaussianNB)
        print("The mean accuracy of GaussianNaive Bayes on CV data is:", np.mean(scores_gnb))

    def RF(self):
        """Method to implement Multi-class RandomForest"""
        from sklearn.ensemble import RandomForestClassifier

        self.rf,scores_rf=self._cross_validate(
            lambda: RandomForestClassifier(n_estimators=150,criterion='entropy'))
        print("The mean accuracy of Random Forests on CV data is:", np.mean(scores_rf))

    def SGD(self):
        """Method to implement Multi-class SVM using Stochastic Gradient Descent"""
        from sklearn.linear_model import SGDClassifier

        self.sgd,scores_sgd=self._cross_validate(
            lambda: SGDClassifier(loss='hinge',penalty='l2'))
        print("The mean accuracy of Stochastic Gradient Descent Classifier on CV data is:", np.mean(scores_sgd))

    def test_performance(self,test_X,test_Y):
        """Print accuracy, classification report and confusion matrix on test data.

        Requires GNB(), RF() and SGD() to have been run, since it reads the
        fitted models from ``self.gnb``, ``self.rf`` and ``self.sgd``.
        """
        from sklearn import metrics

        # (attribute holding the model, name in the accuracy line, name in the metrics line)
        reports=(
            ("gnb","GNB","GNB"),
            ("rf","Random Forest","RandomForest"),
            ("sgd","SGD","SGD"),
        )
        for attribute,accuracy_name,metrics_name in reports:
            model=getattr(self,attribute)
            # Predict once and reuse the result for all three reports.
            predictions=model.predict(test_X)
            print("The accuracy of "+accuracy_name+" on test data is:", metrics.accuracy_score(test_Y,predictions))
            print('Classification Metrics for '+metrics_name)
            print(metrics.classification_report(test_Y, predictions))
            print("Confusion matrix")
            print(metrics.confusion_matrix(test_Y, predictions))
    
    

Cleaning original docs by doing text preprocessing

In [6]:
# Reading Datasets
docs_list,target_Y=readData()

('Total no. of documents:', 18834)
('Total Classes:', 20)
('Average no. of documents in a class', 941)


In [7]:
#Splitting the dataset into training and test
train_X, test_X,train_Y,test_Y=split(docs_list,target_Y)


In [8]:
# Cleaning the training docs files
cleaned_train_X=processing(train_X)

In [26]:
# Creating bag of words
train_X, train_Y,vectorizer=bagofWords(cleaned_train_X,train_Y,type="tfidf")

(13183, 5000)

In [28]:
# Setting up the learner on the training data and creating the CV folds
# (kFold() fills obj.k_fold, which GNB(), RF() and SGD() iterate over)
obj=learner(train_X,train_Y)
obj.kFold()

In [35]:
# Running Gaussian Naive Bayes
obj.GNB()

('The mean accuracy of GaussianNaive Bayes on CV data is:', 0.6906625280601778)


In [31]:
#Running Random Forests
obj.RF()

('The mean accuracy of Random Forests on CV data is:', 0.76158766658437793)


In [29]:
# Running Stochastic Gradient Descent SVM
obj.SGD()

('The mean accuracy of Stochastic Gradient Descent Classifier on CV data is:', 0.85739335693812158)


In [37]:
# Converting the test data into its bag-of-words representation.

# Cleaning the test docs files (same preprocessing function as the training docs)
test_X=processing(test_X)
# Vectorising with the vectorizer returned by bagofWords for the training docs (transform only, no re-fit)
test_X=vectorizer.transform(test_X)
test_Y=np.array(test_Y)

In [36]:
# Printing out the classification metric for GNB, Random Forest and Stochastic gradient descent SVM
obj.test_performance(test_X.toarray(),test_Y)

('The accuracy of GNB on test data is:', 0.68837373916121036)
Classification Metrics for GNB
             precision    recall  f1-score   support

          0       0.77      0.77      0.77       220
          1       0.52      0.55      0.53       303
          2       0.52      0.46      0.49       280
          3       0.53      0.54      0.53       286
          4       0.60      0.58      0.59       275
          5       0.68      0.61      0.64       299
          6       0.62      0.48      0.54       302
          7       0.70      0.62      0.66       293
          8       0.86      0.71      0.78       329
          9       0.88      0.77      0.82       305
         10       0.91      0.88      0.90       310
         11       0.74      0.90      0.81       279
         12       0.59      0.54      0.57       310
         13       0.68      0.71      0.70       309
         14       0.66      0.84      0.74       294
         15       0.79      0.82      0.80       306
     

# II) Finding the descrambled Words from a list of words

Given a list of scrambled words and a word list, we need to find all the words in the word list that are a
descrambled form (a rearrangement of the letters) of a word in the scrambled word list 

e.g. : Scrambled list: ['dgo']  
       Wordlist=['dog','god','ddgoi']  
    >> print---> 'dog' 'god'
   

In [355]:
# function to find descrambled words

def descramble(scrambled_list,word_list):
    """Print each word of word_list that is a rearrangement of a scrambled word.

    For every entry of scrambled_list (in order), every entry of word_list
    (in order) is printed if it uses exactly the same letters the same number
    of times. Nothing is returned.

    eg. Scrambled list: ['dgo']
        Wordlist=['dog','god','ddgoi']
        >>> 'dog' 'god'
    """
    for scrambled in scrambled_list:
        if not scrambled:
            # An empty scrambled word never produces a match.
            continue
        letters=sorted(scrambled)
        for candidate in word_list:
            # Two words are rearrangements of each other exactly when their
            # sorted letters are identical.
            if sorted(candidate)==letters:
                print(candidate)

                    
# Example run: 'dgo' matches 'dog' and 'god', 'man' matches 'nam', 'ddii' matches nothing
check=['dgo','man','ddii']
wordlist=['dog','god','ddgoi','nam','dddi']
descramble(check,wordlist)

dog
god
nam
