## EXTRACTION OF DATA FROM XML FILES

In [21]:
# EXTRACTION OF DATA FROM XML FILE
import glob
import os

import numpy
import numpy as np
import nltk
nltk.download('wordnet')
from lxml import etree
import pandas as pd
from nltk.corpus import stopwords
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, davies_bouldin_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler, LabelBinarizer, LabelEncoder
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
from keras.layers import Input, Dense, Dropout
from keras.models import Model
from gensim.models import word2vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn import preprocessing
from keras import Sequential


dataColumns = ["headline", "text", "bip:topics", "dc.date.published", "itemID", "XML_File_Name"]
clusterDataframeList = []
rows = []
paragraph = ""
bipTopicList = []
vec = CountVectorizer(stop_words=None)
vectorizer = TfidfVectorizer(stop_words='english', max_features=10000, max_df=0.5, use_idf=True)
enhancedDFList = []


def dataExtraction():
    dir = '/Users/vishn/Data'
    for file in glob.iglob(os.path.join(dir, '*.xml')):
        paragraph = ""
        bipTopicCode = ""
        path, fileName = os.path.split(file)  # Obtained File name
        data = etree.parse(file)
        root = data.getroot()
        itemId = data.getroot().attrib.get("itemid")  # Obtained item ID
        headline = data.find("headline").text
        textNode = data.find("text")
        for node in textNode:
            paragraph = paragraph + node.text  # Obtained text
        dcPublishedNode = root.findall("./metadata/dc[@element='dc.date.published']")
        if dcPublishedNode is not None:
            published_date = dcPublishedNode[0].attrib.get("value")  # obtained dc.date.published
        else:
            published_date = "NONE"
        bipNode = root.findall("./metadata/codes[@class='bip:topics:1.0']/code")
        text = removeStopWords(paragraph)  # removing stop words
        if bipNode is not None:
            for innercodes in bipNode:
                bipTopicCode = innercodes.attrib.get("code")  # obtained bip:topic code
                rows.append({"itemID": itemId, "XML_File_Name": fileName, "headline": headline, "text": text,
                             "dc.date.published": published_date, "bip:topics": bipTopicCode})
                uniqueBipTopics(bipTopicCode)
                break
        else:
            bipTopicCode = "NONE"
            rows.append({"itemID": itemId, "XML_File_Name": fileName, "headline": headline, "text": text,
                         "dc.date.published": published_date, "bip:topics": bipTopicCode})

    customDataFrame = pd.DataFrame(rows, columns=dataColumns)
    return customDataFrame


def uniqueBipTopics(topic):
    if topic not in bipTopicList:
        bipTopicList.append(topic)
    return bipTopicList


def removeStopWords(text):
    stop_words = set(stopwords.words('english'))
    text_tokens = word_tokenize(text)
    filtered_sentence_list = [w for w in text_tokens if w not in stop_words]
    filtered_lemmatized_list = lemmatization(filtered_sentence_list)
    filtered_stemmed_list = stemming(filtered_lemmatized_list)
    filtered_lemmatized_sentence = ' '.join(filtered_stemmed_list)
    return filtered_lemmatized_sentence


def stemming(sentence):
    ps = PorterStemmer()
    stemmed_words = []
    for w in sentence:
        stemmed_words.append(ps.stem(w))
    return stemmed_words


def lemmatization(filtered_sentence):
    lem = WordNetLemmatizer()
    lemmatized_words = []
    for w in filtered_sentence:
        lemmatized_words.append(lem.lemmatize(w))
    return lemmatized_words
rawDataFrame = dataExtraction()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vishn\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [22]:
rawDataFrame.head()


Unnamed: 0,headline,text,bip:topics,dc.date.published,itemID,XML_File_Name
0,Canadian Occidental mounts rival Wascana bid.,canadian occident petroleum ltd. emerg tuesday...,C181,1997-03-18,326914,326914newsML.xml
1,"Gruma, Maseca to receive syndicated loan - bank.",bank america launch three-year $ 120 million s...,C173,1997-03-18,326915,326915newsML.xml
2,Too early to call Krupp bid hostile - Deutsche...,deutsch bank AG manag board member rolf breuer...,C18,1997-03-18,326916,326916newsML.xml
3,"FOCUS - Euro bourses fret over Wall St, electi...",european bours fell tuesday even wall street o...,M11,1997-03-18,326917,326917newsML.xml
4,"French stocks fall, Alcatel posts big gain.",french share close lower tuesday second consec...,G152,1997-03-18,326918,326918newsML.xml


### CLUSTERING DOCUMENTS USING K MEANS

In [23]:
# PERFORMING CLUSTERING ON DOCUMENTS
import warnings
warnings.filterwarnings('ignore')
def clustering(clusterDataFrame):
    docVecList = []
    textData = clusterDataFrame["text"]
    bipTopics = clusterDataFrame["bip:topics"]

    # The goal of doc2vec is to create a numeric representation of a document
    tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(textData)]
    max_epochs = 20
    # vec_size is used to set dimension of the vector. below vec size indicates the representation
    # of document in 20 components
    vec_size = 20
    alpha = 0.025 #Learning rate
    model = Doc2Vec(size=vec_size, alpha=alpha, min_alpha=0.00025, min_count=1, dm=1)
    #dm defines the training algorithm. If dm=1 means ‘distributed memory’ (PV-DM) and dm =0 means ‘distributed bag of words’
    # (PV-DBOW). Distributed Memory model preserves the word order in a document whereas Distributed Bag of words just uses the bag of
    # words approach, which doesn’t preserve any word order.
    model.build_vocab(tagged_data)
    for epoch in range(max_epochs):
        model.train(tagged_data,
                    total_examples=model.corpus_count,
                    epochs=model.iter)
        # decrease the learning rate
        model.alpha -= 0.0002
        # fix the learning rate, no decay
        model.min_alpha = model.alpha
    for i in range((len(tagged_data))):
        docVecList.append(model.docvecs[i])


    featureDataFrame = pd.DataFrame(data=docVecList)
    min_max_scaler = preprocessing.MinMaxScaler()
    scaledFeatureArray = min_max_scaler.fit_transform(featureDataFrame)
    scaledFeatureDataFrame = pd.DataFrame(data=scaledFeatureArray)
    km = KMeans(n_clusters=10) #TAKEN 10 CLUSTERS
    km.fit(scaledFeatureDataFrame)
    clusters = km.labels_
    scaledFeatureDataFrame['cluster_ID'] = clusters
    scaledFeatureDataFrame['labels'] = bipTopics
    # clusterQuality(clusters,featureData,featureDataFrame)
    # return FeatureDataFrame
    return scaledFeatureDataFrame,clusters,featureDataFrame

receivedClusterDataframe, receivedClusters, receivedFeatureData = clustering(rawDataFrame)

In [24]:
receivedClusterDataframe.head() # PRINING THE DATAFRAME AFTER CLUSTERING WITH cluster_id INCLUDED AS COLUMN

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,cluster_ID,labels
0,0.622538,0.545301,0.75223,0.619713,0.468593,0.576692,0.548873,0.620881,0.475798,0.055148,...,0.461509,0.519127,0.273841,0.269771,0.746221,0.503637,0.47997,0.662316,2,C181
1,0.685125,0.524099,0.719955,0.669576,0.425134,0.459874,0.579058,0.727017,0.528013,0.267745,...,0.472078,0.627598,0.290054,0.46658,0.78793,0.253786,0.502149,0.637086,2,C173
2,0.502486,0.50551,0.686061,0.556233,0.161481,0.281391,0.133655,0.689424,0.568082,0.252426,...,0.489413,0.623307,0.288272,0.325249,0.763247,0.418129,0.672976,0.523832,9,C18
3,0.443644,0.521659,0.757898,0.549458,0.224391,0.182809,0.311964,0.773963,0.556796,0.278611,...,0.460008,0.650015,0.460365,0.419992,0.722694,0.489154,0.746901,0.500162,4,M11
4,0.409186,0.572101,0.740241,0.530068,0.2137,0.212124,0.31262,0.744341,0.420685,0.30609,...,0.460553,0.580583,0.396706,0.394423,0.767854,0.383323,0.809262,0.548098,4,G152


### DIVIDING DATA ACCORDING TO CLUSTERS

In [25]:
#DIVIDING THE DATAFRAME ACCORDING TO  CLUSTERS
def clusterProcessing(receivedClusterDataframe):
    uniqueClusterIDs = receivedClusterDataframe.cluster_ID.unique()
    uniqueClusterIDs.sort()
    for id in uniqueClusterIDs:
        clusterWiseDF = receivedClusterDataframe.loc[receivedClusterDataframe['cluster_ID'] == id]
        clusterDataframeList.append(clusterWiseDF)
    return clusterDataframeList

clusterDataframeList = clusterProcessing(receivedClusterDataframe)
frameListforEnhance = clusterDataframeList[:]


In [26]:
clusterDataframeList # Cluster data is maintained in the respective index of the List

[              0         1         2         3         4         5         6  \
 28     0.622508  0.316628  0.742297  0.663221  0.350849  0.173662  0.438305   
 29     0.541481  0.410301  0.758444  0.614542  0.389700  0.267358  0.412344   
 30     0.629755  0.489406  0.746179  0.636506  0.390949  0.293430  0.343126   
 34     0.490809  0.393096  0.734348  0.496565  0.293426  0.128871  0.521706   
 37     0.586118  0.328403  0.721332  0.574498  0.310174  0.142900  0.366615   
 ...         ...       ...       ...       ...       ...       ...       ...   
 48235  0.581130  0.312766  0.615109  0.562801  0.222225  0.245799  0.508059   
 48236  0.552760  0.288122  0.632354  0.548497  0.209318  0.244996  0.516977   
 48237  0.453025  0.361840  0.805629  0.659009  0.303432  0.316699  0.572791   
 48238  0.605836  0.360852  0.682861  0.726450  0.387849  0.218006  0.417745   
 48250  0.604980  0.426017  0.709831  0.677164  0.303783  0.216087  0.514777   
 
               7         8         9  

### APPLYING CLASSIFIERS FOR EACH CLUSTERS

In [27]:
def applyClassifier(receivedDFList):
    for df in receivedDFList:
        clusterNumber = df['cluster_ID'].unique()
        clusterNumber = clusterNumber[0]
        
        Xtr, Xte, Ytr, Yte, target = trainTestSplit(df)
        #
        if clusterNumber == 0:
            # Artificial Neural Network(ANN) uses the processing of the brain as a basis 
            # to develop algorithms that can be used to model complex patterns and prediction problems.
            print("Neural Networks")
            print("========================================")
            neuralNetwork_model = MLPClassifier(solver='lbfgs', alpha=1e-5,
                                                hidden_layer_sizes=(60,), random_state=1, max_iter=500)
            trainedClassifier = neuralNetwork_model.fit(Xtr, Ytr)
#SVM constructs a hyperplane in multidimensional space to separate different classes. 
#SVM generates optimal hyperplane in an iterative manner, which is used to minimize an error.
        elif clusterNumber == 1:
            print("SVC")
            print("========================================")
            SVC_model = SVC(kernel='sigmoid', gamma=0.1, C=0.1)
            trainedClassifier = SVC_model.fit(Xtr, Ytr)

        elif clusterNumber == 2:
            #A decision tree is a flowchart-like structure in which each internal node represents a “test” on an attribute
            # each branch represents the outcome of the test, and each leaf node represents a class label.
            #The paths from root to leaf represent classification rules.
            print("Decision Trees")
            print("========================================")
            decionTree_model = DecisionTreeClassifier(criterion='entropy', max_depth=5, min_samples_split=0.2,
                                                      min_samples_leaf=0.2)
            trainedClassifier = decionTree_model.fit(Xtr, Ytr)
            
        # Random forest is an ensemble method in which a classifier is constructed by combining several different Independent base classifiers.

        elif clusterNumber == 3:
            print("Random Forest")
            print("========================================")
            randomForest_model = RandomForestClassifier(n_estimators=10, max_depth=3, min_samples_split=0.4,
                                                        min_samples_leaf=0.2)
            trainedClassifier = randomForest_model.fit(Xtr, Ytr)
        # KNN is a non-parametric and lazy learning algorithm. Non-parametric means there is no assumption for underlying data distribution.
        elif clusterNumber == 4:
            print("KNearestNeighbors")
            print("=========================================")
            KNN_model = KNeighborsClassifier(n_neighbors=5)
            trainedClassifier = KNN_model.fit(Xtr, Ytr)
        # Naive Bayes is a classification algorithm for binary (two-class) and multi-class classification problems.
        elif clusterNumber == 5:
            print("Guassian Naive Bayes")
            print("=========================================")
            GNB_model = GaussianNB()
            trainedClassifier = GNB_model.fit(Xtr, Ytr)
        # Multinomial Naive Bayes calculates likelihood to be count of an word/token 

        elif clusterNumber == 6:
            print("Guassian Multinomial Naive Bayes")
            print("=========================================")
            GMNB_model = MultinomialNB()
            trainedClassifier = GMNB_model.fit(Xtr, Ytr)

        # We use only a single training example for calculation of gradient and update parameters.
        elif clusterNumber == 7:
            print("Stochastic Gradient Descent")
            print("=========================================")
            SGD_model = SGDClassifier(loss='modified_huber',shuffle=True,random_state=101)
            trainedClassifier = SGD_model.fit(Xtr, Ytr)

        # AdaBoost is a popular boosting technique which helps you combine multiple “weak classifiers” into a single “strong classifier”.
        elif clusterNumber == 8:
            print("ADA-Boost")
            print("=========================================")
            ADAB_model = AdaBoostClassifier(n_estimators=50,learning_rate=1)
            trainedClassifier = ADAB_model.fit(Xtr, Ytr)
        #It’s a sub-class of ensemble machine learning algorithms wherein we use multiple weak models and aggregate the predictions we get from each of them to get the final prediction. 
        elif clusterNumber == 9:
            print("Bagging")
            print("=========================================")
            Bagging_model = BaggingClassifier(n_estimators=50)
            trainedClassifier = Bagging_model.fit(Xtr, Ytr)


        else:
            print("end")
            break
            
        calculateMetrics(trainedClassifier, Xte, Yte)
    return target
        



def trainTestSplit(dataFrame):
    target = dataFrame['labels']
    splitDF = dataFrame.iloc[:,:-1]
    X_train, X_test, y_train, y_test = train_test_split(splitDF, target, test_size=0.2, random_state=101)
    return X_train, X_test, y_train, y_test, target

def calculateMetrics(trainedReceivedClassifier, XtestData, YtestData):
    predictor = trainedReceivedClassifier.predict(XtestData)
    confusionMatrix = confusion_matrix(predictor, YtestData)
    accuracy = accuracy_score(predictor, YtestData)
    ClassificationReport = classification_report(predictor, YtestData)
    print("Accuracy : ", accuracy)
    print("Classification Report :")
    print(ClassificationReport)
    
    
target = applyClassifier(frameListforEnhance)

Neural Networks
Accuracy :  0.6172140430351076
Classification Report :
              precision    recall  f1-score   support

         C11       0.00      0.00      0.00         2
         C12       0.33      0.33      0.33         3
         C13       0.57      0.54      0.56        79
         C14       0.00      0.00      0.00         0
         C15       0.17      0.33      0.22         3
        C151       0.00      0.00      0.00         0
         C16       0.00      0.00      0.00         1
         C17       0.00      0.00      0.00         2
         C18       0.25      0.21      0.23        14
        C183       0.00      0.00      0.00         0
         C21       0.00      0.00      0.00         7
         C23       0.00      0.00      0.00         0
         C24       0.18      0.33      0.24         6
         C31       0.29      0.46      0.35        13
         C34       0.00      0.00      0.00         1
         C41       0.25      1.00      0.40         1
         C

Accuracy :  0.732532751091703
Classification Report :
              precision    recall  f1-score   support

         C11       0.00      0.00      0.00         3
         C13       0.00      0.00      0.00         2
         C14       0.00      0.00      0.00         3
         C15       0.61      0.73      0.67        71
         C17       0.00      0.00      0.00         1
         C18       0.00      0.00      0.00         0
         C21       0.00      0.00      0.00         1
         C24       0.00      0.00      0.00         1
         C31       0.35      0.60      0.44        10
         C41       0.00      0.00      0.00         0
         E11       0.33      0.22      0.27         9
         E12       0.33      0.43      0.38        37
         E13       0.00      0.00      0.00         0
        E131       0.00      0.00      0.00         0
         E14       0.00      0.00      0.00         2
         E21       0.25      1.00      0.40         1
         E51       0.57    

### MEASURING QUALITY OF CLUSTERING

In [28]:
def clusterQuality(receivedclusters,receivedfeatureData,receivedfeatureDataFrame):
    receivedbipTopics = receivedfeatureDataFrame.labels
    # The knowledge of ground truth classes is known and hence the measure taking those classes into consideration is used
    # To compare both evaluation based on ground truth tables and independent of them, silhouette score is used
    # which resulted in biased scoring.
    
    
    print("Cluster Quality Scores")
    print("with ground truth labels")
    print("==========================")
    
    #The Rand Index computes a similarity measure between two clusterings by considering all pairs of samples 
    #and counting pairs that are assigned in the same or different clusters in the predicted and true clusterings.
    print("Adjusted Rand index")
    print(metrics.adjusted_rand_score(receivedbipTopics, receivedclusters))
    
    
    #The Mutual Information is a measure of the similarity between two labels of the same data.
    #This Mutualinformation score is useful to check whether the clustering algorithm meets an important requirement:
    #a cluster should contain only samples belonging to a single class.
    print("Mutual Information based scores")
    print(metrics.adjusted_mutual_info_score(receivedbipTopics, receivedclusters))
    
    
    #A perfectly homogeneous clustering is one where each cluster has data-points belonging to the same class label. 
    #Homogeneity describes the closeness of the clustering algorithm to this perfection.
    print("Homogeneity score")
    print(metrics.homogeneity_score(receivedbipTopics, receivedclusters))
    
    
    #Completness score purpose is to provide a piece of information about the assignment of samples belonging to the same class.
    #More precisely, a good clustering algorithm should assign all samples with the same true label to the same cluster.
    print("completeness score")
    print(metrics.completeness_score(receivedbipTopics, receivedclusters))
    
    
    #The V-Measure is defined as the harmonic mean of homogeneity and completeness of the clustering
    # No assumption is made on the cluster structure: can be used to compare clustering algorithms such as k-means which assumes isotropic 
    # blob shapes with results of spectral clustering algorithms which can find cluster with “folded” shapes.
    print("V Measure Score")
    print(metrics.v_measure_score(receivedbipTopics, receivedclusters))
    
    
    #The Fowlkes-Mallows Score is an evaluation metric to evaluate the similarity among clusterings obtained after applying different clustering algorithms. 
    print("Fowlkes-Mallows scores")
    print(metrics.fowlkes_mallows_score(receivedbipTopics, receivedclusters))
    print("\n\n")
    print("without ground truth labels")
    print("============================")
    print("silhouette")
    print(metrics.silhouette_score(receivedfeatureData, receivedclusters, metric='euclidean'))
    
clusterQuality(receivedClusters, receivedFeatureData, receivedClusterDataframe)

Cluster Quality Scores
with ground truth labels
Adjusted Rand index
0.2070943083585578
Mutual Information based scores
0.2906931087278954
Homogeneity score
0.2922848919683959
completeness score
0.3652891571282194
V Measure Score
0.3247345359055094
Fowlkes-Mallows scores
0.2874215022954677



without ground truth labels
silhouette
0.05735247516532827


### FEATURE EXTRACTION USING AUTO-ENCODERS

In [29]:
#Here Auto encoder is used for feature extraction.
#Autoencoder is an unsupervised artificial neural network that learns how to efficiently compress and encode data then learns how to reconstruct the data back from the reduced encoded representation to a representation that is as close to the original input as possible.
#Autoencoder, by design, reduces data dimensions by learning how to ignore the noise in the data.
#It consists of 4 parts , one is encoder in which model learns how to reduce the input dimensions and compress the input data into encoded representation.
#Bottle neck , this layer contains the compressed representation of nput data
#Decoder , in which model learns how to recontruct the data from encoded representation to be close to the original input as possible.
#I decided to use ReLu as the activation function for the encoding stage and Softmax for the decoding stage.
#In here I have declared 3 hidden layers in the encoded stage and 3 hidden layers in the decoded stage.

def enhancedFeatureExtraction(receivedFeatureData):
    print("Enhancing Features using AutoEncoder..........")
    x = Input(shape=(receivedFeatureData.shape[1],))
    # 3 hidden layers are implemented
    hidden_1en = Dense(2048, activation='relu')(x)
    hidden_2en = Dense(1024, activation='relu')(hidden_1en)
    hidden_3en = Dense(512, activation='relu')(hidden_2en)
    h = Dense(128, activation='relu')(hidden_3en)
    hidden_1dec = Dense(512, activation='relu')(h)
    hidden_2dec = Dense(1024, activation='relu')(hidden_1dec)
    hidden_3dec = Dense(2048, activation='relu')(hidden_2dec)
    r = Dense(receivedFeatureData.shape[1], activation='sigmoid')(hidden_3dec)
    autoencoder = Model(x, r)
    autoencoder.compile(optimizer='adam', loss='mse')
    Xtraut, Xteaut, Ytraut, Yteaut = train_test_split(receivedFeatureData, receivedFeatureData, test_size=0.3, random_state=101)
    autoencoder.fit(Xtraut, Ytraut,
                    epochs=30,
                    batch_size=200,
                    shuffle=True,
                    verbose=0,
                    validation_data=(Xteaut, Yteaut))
    compressedData = autoencoder.predict(receivedFeatureData)
    return compressedData

for lf in clusterDataframeList:
 enhancedDFrame = lf.iloc[:, :-2]
 clusterandLabel = lf.iloc[:, -2:]
 compressedFrame = enhancedFeatureExtraction(enhancedDFrame) # PERFORMING AUTO ENCODER FOR DIFFERENT CLUSTER DATA
 compressedDataFrame = pd.DataFrame(data=compressedFrame)
 clusterandLabel.reset_index(drop=True, inplace=True)
 compressedDataFrame.reset_index(drop=True, inplace=True)
 compressedDataFrame = pd.concat([compressedDataFrame,clusterandLabel],axis=1)
 enhancedDFList.append(compressedDataFrame) 
    
print(enhancedDFList)# PRINTING THE EXTRACTED FEATURES USING AUTO ENCODER

Enhancing Features using AutoEncoder..........
Enhancing Features using AutoEncoder..........
Enhancing Features using AutoEncoder..........
Enhancing Features using AutoEncoder..........
Enhancing Features using AutoEncoder..........
Enhancing Features using AutoEncoder..........
Enhancing Features using AutoEncoder..........
Enhancing Features using AutoEncoder..........
Enhancing Features using AutoEncoder..........
Enhancing Features using AutoEncoder..........
[             0         1         2         3         4         5         6  \
0     0.612590  0.308493  0.717187  0.682659  0.362452  0.153676  0.408199   
1     0.528015  0.401692  0.726005  0.640563  0.361598  0.247508  0.436721   
2     0.581098  0.482747  0.718352  0.694340  0.379208  0.253293  0.387036   
3     0.493961  0.421248  0.764261  0.511699  0.293629  0.158896  0.528732   
4     0.590432  0.362342  0.721932  0.588542  0.329968  0.159757  0.358458   
...        ...       ...       ...       ...       ...       

### APPLYING CLASSIFIERS AFTER FEATURE EXTRACTION

In [30]:
applyClassifier(enhancedDFList) # Comparing performance after using autoencoder

Neural Networks
Accuracy :  0.5662514156285391
Classification Report :
              precision    recall  f1-score   support

         C11       0.00      0.00      0.00         0
         C12       0.67      0.33      0.44         6
         C13       0.48      0.48      0.48        75
         C14       0.00      0.00      0.00         0
         C15       0.17      0.08      0.11        13
        C151       0.00      0.00      0.00         0
         C16       0.00      0.00      0.00         1
         C17       0.00      0.00      0.00         1
         C18       0.00      0.00      0.00         6
        C183       0.00      0.00      0.00         0
         C21       0.14      0.17      0.15         6
         C23       0.00      0.00      0.00         0
         C24       0.45      0.45      0.45        11
         C31       0.33      0.64      0.44        11
         C34       0.00      0.00      0.00         0
         C41       0.25      1.00      0.40         1
         C

Guassian Multinomial Naive Bayes
Accuracy :  0.937888198757764
Classification Report :
              precision    recall  f1-score   support

         C12       0.00      0.00      0.00         0
         C15       0.00      0.00      0.00         0
         C21       0.00      0.00      0.00         0
         C22       0.00      0.00      0.00         0
         C24       0.00      0.00      0.00         0
        CCAT       0.00      0.00      0.00         0
        GCAT       1.00      0.94      0.97       483
        GDIP       0.00      0.00      0.00         0
         M11       0.00      0.00      0.00         0
         M12       0.00      0.00      0.00         0
         M13       0.00      0.00      0.00         0
         M14       0.00      0.00      0.00         0

    accuracy                           0.94       483
   macro avg       0.08      0.08      0.08       483
weighted avg       1.00      0.94      0.97       483

Stochastic Gradient Descent
Accuracy :  0.7144

0        C18
1        C11
2        C17
3        C31
4        E12
        ... 
5248     M11
5249     C14
5250     C15
5251     C33
5252    C152
Name: labels, Length: 5253, dtype: object

### DEEP NEURAL NETWORKS

In [31]:
# Major Differences
# Deep Neural Network using 3 layers is implemented and moreover the feature extraction is enhanced using auto encoder
# Doc2Vec is used for vectorization of documents apart from assignment one as TF-IDF generates sparse matrix which is ineffic
# Neural networks with 3 hidden layers are used to enhance the performance.
def deepNeuralNet(rXtr, rXte, rYtr, rYte):
    print("Deep Neural Network using Enhanced Features")
    le = LabelEncoder()
    rYtr = le.fit_transform(rYtr)
    rYte = le.fit_transform(rYte)
    dup = numpy.unique(rYtr)
    classifier = Sequential()
    # First Hidden Layer
    classifier.add(Dense(512, activation='relu', input_dim=rXtr.shape[1]))
    classifier.add(Dropout(0.5))
    # Second  Hidden Layer
    classifier.add(Dense(512, activation='relu', input_dim=512))
    classifier.add(Dropout(0.5))
    classifier.add(Dense(512, activation='relu', input_dim=512))
    classifier.add(Dropout(0.5))
    classifier.add(Dense(512, activation='relu', input_dim=512))
    classifier.add(Dropout(0.5))
    # Output Layer
    classifier.add(Dense(dup.size, activation='softmax'))
    classifier.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    classifier.fit(rXtr, rYtr,validation_data=(rXte, rYte), batch_size=1024, epochs=10)
    loss,accuracy = classifier.evaluate(rXtr,rYtr)
    print("accuracy for Deep Neural Networks is")
    print(accuracy)
    print("Loss in Deep Neural Networks is")
    print(loss)
    
def trainTestSplit(dataFrame):
    targetF = dataFrame['labels']
    splitDFF = dataFrame.iloc[:,:-1]
    X_train, X_test, y_train, y_test = train_test_split(splitDFF, targetF, test_size=0.2, random_state=101)
    return X_train, X_test, y_train, y_test
    
Xtr, Xte, Ytr, Yte = trainTestSplit(enhancedDFList[0])
deepNeuralNet(Xtr, Xte, Ytr, Yte)

Deep Neural Network using Enhanced Features
Train on 3532 samples, validate on 883 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
accuracy for Deep Neural Networks is
0.287089467048645
Loss in Deep Neural Networks is
2.709868862026815
