In [1]:
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

In [2]:
# https://stackoverflow.com/a/47091490/4084039
import re

def decontracted(phrase):
    # specific
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

In [3]:
# Combining all the above stundents 
from tqdm import tqdm
# tqdm is for printing the status bar
def textpreprocess(sentance):
    #remove words with numbers python: https://stackoverflow.com/a/18082370/4084039
    sentance = re.sub("\S*\d\S*", "", str(sentance)).strip()
    #remove spacial character: https://stackoverflow.com/a/5843547/4084039
    sentance = re.sub('[^A-Za-z]+', ' ', sentance)
    stoplist = set(stopwords.words('english'))
    #remove stop words
    sentance = [e.lower() for e in sentance.split() if e.lower() not in stoplist]
    return sentance

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [7]:
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from sklearn import utils
import gensim
from gensim.models.doc2vec import TaggedDocument
import re

In [8]:
def label_sentences(corpus, label_type):
    """
    Gensim's Doc2Vec implementation requires each document/paragraph to have a label associated with it.
    We do this by using the TaggedDocument method. The format will be "TRAIN_i" or "TEST_i" where "i" is
    a dummy index of the post.
    """
    labeled = []
    for i, v in enumerate(corpus):
        label = label_type + '_' + str(i)
        labeled.append(TaggedDocument(v, [label]))
    return labeled

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [10]:
df = pd.read_csv('all_logs_preprocessed.csv', usecols = [1,2])
X_train, X_test, y_train, y_test = train_test_split(df.Log_Data, df.Label, random_state=0, test_size=0.3)
X_train = label_sentences(X_train, 'Train')
X_test = label_sentences(X_test, 'Test')
df.head()

Unnamed: 0,Log_Data,Label
0,"['mlu', 'e', 'status', 'device', 'ready', 'n',...",DeviceNotReady_BelowMR
1,"['deferred', 'odntodo', 'odnwip', 'n', 'mlu', ...",DeviceNotReady_BelowMR
2,"['user', 'kdriver', 'starting', 'input', 'powe...",DeviceNotReady_BelowMR
3,"['mlu', 'mlu', 'adding', 'waiter', 'obj', 'run...",DeviceNotReady_BelowMR
4,"['user', 'tldlistener', 'start', 'operation', ...",MCC_BadCRC


In [11]:
model = Doc2Vec.load("Doc2VecModel\d2v_1000.model")

In [12]:
def get_vectors(model, corpus_size, vectors_size, vectors_type):
    """
    Get vectors from trained doc2vec model
    :param doc2vec_model: Trained Doc2Vec model
    :param corpus_size: Size of the data
    :param vectors_size: Size of the embedding vectors
    :param vectors_type: Training or Testing vectors
    :return: list of vectors
    """
    vectors = np.zeros((corpus_size, vectors_size))
    for i in range(0, corpus_size):
        prefix = vectors_type + '_' + str(i)
        vectors[i] = model.docvecs[prefix]
    return vectors

Doc2Vec with vector size 1000

In [14]:
train_vectors_dbow = get_vectors(model, len(X_train), 1000, 'Train')
test_vectors_dbow = get_vectors(model, len(X_test), 1000, 'Test')

In [15]:
train_vectors_dbow[:2]

array([[ 0.51286131,  0.24583107, -0.13014413, ..., -0.10543926,
        -0.01474785,  0.05175322],
       [ 0.03392103, -0.38855737, -0.06742401, ...,  0.07175257,
         0.01580577,  0.06185701]])

# Logistic Regression ( 91.6% Accuracy )

In [16]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg.fit(train_vectors_dbow, y_train)



LogisticRegression(C=100000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='warn', n_jobs=1, penalty='l2', random_state=None,
          solver='warn', tol=0.0001, verbose=0, warm_start=False)

In [17]:
logreg = logreg.fit(train_vectors_dbow, y_train)
y_pred = logreg.predict(test_vectors_dbow)



In [18]:
from sklearn.metrics import accuracy_score, confusion_matrix
print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=list(set(df.Label))))

accuracy 0.9166666666666666


  'precision', 'predicted', average, warn_for)


                                 precision    recall  f1-score   support

                            MCR       0.00      0.00      0.00         1
                raid_data_error       0.67      1.00      0.80         2
Weka_Collects_Miniport_Slowness       1.00      1.00      1.00         2
                     MCC_BadCRC       1.00      1.00      1.00         2
                  MCC_CacheLost       1.00      1.00      1.00         3
         DeviceNotReady_BelowMR       1.00      1.00      1.00         2

                      micro avg       0.92      0.92      0.92        12
                      macro avg       0.78      0.83      0.80        12
                   weighted avg       0.86      0.92      0.88        12



In [19]:
print("------------------------")
print("Y_test            Y_pred")
print("------------------------")
for i,j in zip(y_test,y_pred):
    if i == j:
        print(i+"  ==  "+j)
    else:
        print("!!!! "+i+"  !=  "+j)
    

------------------------
Y_test            Y_pred
------------------------
MCR  ==  MCR
MCR  ==  MCR
MCC_CacheLost  ==  MCC_CacheLost
MCC_BadCRC  ==  MCC_BadCRC
Weka_Collects_Miniport_Slowness  ==  Weka_Collects_Miniport_Slowness
raid_data_error  ==  raid_data_error
MCC_CacheLost  ==  MCC_CacheLost
MCC_BadCRC  ==  MCC_BadCRC
!!!! DeviceNotReady_BelowMR  !=  MCC_BadCRC
raid_data_error  ==  raid_data_error
Weka_Collects_Miniport_Slowness  ==  Weka_Collects_Miniport_Slowness
Weka_Collects_Miniport_Slowness  ==  Weka_Collects_Miniport_Slowness


# Stochastic Gradient Descent (83.3% Accuracy)

In [20]:
from sklearn.linear_model import SGDClassifier
sgd = SGDClassifier(max_iter=100, tol=1e-3, random_state = 101)
sgd.fit(train_vectors_dbow, y_train)
y_pred_sgd = sgd.predict(test_vectors_dbow)
print('accuracy %s' % accuracy_score(y_pred_sgd, y_test))
print(classification_report(y_test, y_pred_sgd,target_names=list(set(df.Label))))

accuracy 0.8333333333333334
                                 precision    recall  f1-score   support

                            MCR       0.00      0.00      0.00         1
                raid_data_error       0.50      0.50      0.50         2
Weka_Collects_Miniport_Slowness       1.00      1.00      1.00         2
                     MCC_BadCRC       1.00      1.00      1.00         2
                  MCC_CacheLost       1.00      1.00      1.00         3
         DeviceNotReady_BelowMR       1.00      1.00      1.00         2

                      micro avg       0.83      0.83      0.83        12
                      macro avg       0.75      0.75      0.75        12
                   weighted avg       0.83      0.83      0.83        12



In [21]:
print("------------------------")
print("Y_test            Y_pred")
print("------------------------")
for i,j in zip(y_test,y_pred_sgd):
    if i == j:
        print(i+"  ==  "+j)
    else:
        print("!!! "+i+"  !=  "+j)

------------------------
Y_test            Y_pred
------------------------
MCR  ==  MCR
MCR  ==  MCR
MCC_CacheLost  ==  MCC_CacheLost
MCC_BadCRC  ==  MCC_BadCRC
Weka_Collects_Miniport_Slowness  ==  Weka_Collects_Miniport_Slowness
raid_data_error  ==  raid_data_error
MCC_CacheLost  ==  MCC_CacheLost
!!! MCC_BadCRC  !=  DeviceNotReady_BelowMR
!!! DeviceNotReady_BelowMR  !=  MCC_BadCRC
raid_data_error  ==  raid_data_error
Weka_Collects_Miniport_Slowness  ==  Weka_Collects_Miniport_Slowness
Weka_Collects_Miniport_Slowness  ==  Weka_Collects_Miniport_Slowness


# K-Nearest Neighbours (66.6% Accuracy)

In [22]:
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=len(set(df.Label)))
neigh.fit(train_vectors_dbow, y_train)
y_pred_neigh = neigh.predict(test_vectors_dbow)
print('accuracy %s' % accuracy_score(y_pred_neigh, y_test))
print(classification_report(y_test, y_pred_neigh,target_names=list(set(df.Label))))

accuracy 0.6666666666666666


  'precision', 'predicted', average, warn_for)


                                 precision    recall  f1-score   support

                            MCR       0.00      0.00      0.00         1
                raid_data_error       0.00      0.00      0.00         2
Weka_Collects_Miniport_Slowness       1.00      0.50      0.67         2
                     MCC_BadCRC       0.67      1.00      0.80         2
                  MCC_CacheLost       0.60      1.00      0.75         3
         DeviceNotReady_BelowMR       1.00      1.00      1.00         2

                      micro avg       0.67      0.67      0.67        12
                      macro avg       0.54      0.58      0.54        12
                   weighted avg       0.59      0.67      0.60        12



In [23]:
print("------------------------")
print("Y_test            Y_pred")
print("------------------------")
for i,j in zip(y_test,y_pred_neigh):
    if i == j:
        print(i+"  ==  "+j)
    else:
        print("!!!! "+i+"  !=  "+j)

------------------------
Y_test            Y_pred
------------------------
MCR  ==  MCR
MCR  ==  MCR
MCC_CacheLost  ==  MCC_CacheLost
!!!! MCC_BadCRC  !=  Weka_Collects_Miniport_Slowness
Weka_Collects_Miniport_Slowness  ==  Weka_Collects_Miniport_Slowness
raid_data_error  ==  raid_data_error
!!!! MCC_CacheLost  !=  MCR
!!!! MCC_BadCRC  !=  DeviceNotReady_BelowMR
!!!! DeviceNotReady_BelowMR  !=  Weka_Collects_Miniport_Slowness
raid_data_error  ==  raid_data_error
Weka_Collects_Miniport_Slowness  ==  Weka_Collects_Miniport_Slowness
Weka_Collects_Miniport_Slowness  ==  Weka_Collects_Miniport_Slowness


# DecisionTreeClassifier (33.3% Accuracy)

In [24]:
from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier(random_state=101, max_features = None, max_depth = 10, min_samples_leaf = 4)
dtree.fit(train_vectors_dbow, y_train)
y_pred_dtree = dtree.predict(test_vectors_dbow)
print('accuracy %s' % accuracy_score(y_pred_dtree, y_test))
print(classification_report(y_test, y_pred_dtree,target_names=list(set(df.Label))))

accuracy 0.3333333333333333


  'precision', 'predicted', average, warn_for)


                                 precision    recall  f1-score   support

                            MCR       0.00      0.00      0.00         1
                raid_data_error       0.50      0.50      0.50         2
Weka_Collects_Miniport_Slowness       0.20      0.50      0.29         2
                     MCC_BadCRC       0.00      0.00      0.00         2
                  MCC_CacheLost       0.00      0.00      0.00         3
         DeviceNotReady_BelowMR       1.00      1.00      1.00         2

                      micro avg       0.33      0.33      0.33        12
                      macro avg       0.28      0.33      0.30        12
                   weighted avg       0.28      0.33      0.30        12



In [25]:
print("------------------------")
print("Y_test            Y_pred")
print("------------------------")
for i,j in zip(y_test,y_pred_dtree):
    if i == j:
        print(i+"  ==  "+j)
    else:
        print("!!!! "+i+"  !=  "+j)

------------------------
Y_test            Y_pred
------------------------
!!!! MCR  !=  MCC_CacheLost
!!!! MCR  !=  MCC_CacheLost
!!!! MCC_CacheLost  !=  DeviceNotReady_BelowMR
!!!! MCC_BadCRC  !=  MCC_CacheLost
!!!! Weka_Collects_Miniport_Slowness  !=  DeviceNotReady_BelowMR
raid_data_error  ==  raid_data_error
MCC_CacheLost  ==  MCC_CacheLost
MCC_BadCRC  ==  MCC_BadCRC
!!!! DeviceNotReady_BelowMR  !=  MCC_BadCRC
raid_data_error  ==  raid_data_error
!!!! Weka_Collects_Miniport_Slowness  !=  MCC_CacheLost
!!!! Weka_Collects_Miniport_Slowness  !=  DeviceNotReady_BelowMR


# Random Forest (33.3% Accuracy)

In [26]:
from sklearn.ensemble import RandomForestClassifier
rfm = RandomForestClassifier(n_estimators=100, max_depth=10,random_state=101, oob_score = True, max_features = None,
                             min_samples_leaf = 4, n_jobs = -1)
rfm.fit(train_vectors_dbow, y_train)
y_pred_rfm = dtree.predict(test_vectors_dbow)
print('accuracy %s' % accuracy_score(y_pred_rfm, y_test))
print(classification_report(y_test, y_pred_rfm,target_names=list(set(df.Label))))

accuracy 0.3333333333333333


  'precision', 'predicted', average, warn_for)


                                 precision    recall  f1-score   support

                            MCR       0.00      0.00      0.00         1
                raid_data_error       0.50      0.50      0.50         2
Weka_Collects_Miniport_Slowness       0.20      0.50      0.29         2
                     MCC_BadCRC       0.00      0.00      0.00         2
                  MCC_CacheLost       0.00      0.00      0.00         3
         DeviceNotReady_BelowMR       1.00      1.00      1.00         2

                      micro avg       0.33      0.33      0.33        12
                      macro avg       0.28      0.33      0.30        12
                   weighted avg       0.28      0.33      0.30        12



In [27]:
print("------------------------")
print("Y_test            Y_pred")
print("------------------------")
for i,j in zip(y_test,y_pred_rfm):
    if i == j:
        print(i+"  ==  "+j)
    else:
        print("!!!! "+i+"  !=  "+j)

------------------------
Y_test            Y_pred
------------------------
!!!! MCR  !=  MCC_CacheLost
!!!! MCR  !=  MCC_CacheLost
!!!! MCC_CacheLost  !=  DeviceNotReady_BelowMR
!!!! MCC_BadCRC  !=  MCC_CacheLost
!!!! Weka_Collects_Miniport_Slowness  !=  DeviceNotReady_BelowMR
raid_data_error  ==  raid_data_error
MCC_CacheLost  ==  MCC_CacheLost
MCC_BadCRC  ==  MCC_BadCRC
!!!! DeviceNotReady_BelowMR  !=  MCC_BadCRC
raid_data_error  ==  raid_data_error
!!!! Weka_Collects_Miniport_Slowness  !=  MCC_CacheLost
!!!! Weka_Collects_Miniport_Slowness  !=  DeviceNotReady_BelowMR


# Support Vector Machine (83.3 %  Accuracy)

In [28]:
from sklearn.svm import SVC
svm = SVC(kernel='linear', random_state = 101, C = 1)
svm.fit(train_vectors_dbow, y_train)
y_pred_svm = svm.predict(test_vectors_dbow)
print('accuracy %s' % accuracy_score(y_pred_svm, y_test))
print(classification_report(y_test, y_pred_svm,target_names=list(set(df.Label))))

accuracy 0.8333333333333334


  'precision', 'predicted', average, warn_for)


                                 precision    recall  f1-score   support

                            MCR       0.00      0.00      0.00         1
                raid_data_error       0.67      1.00      0.80         2
Weka_Collects_Miniport_Slowness       1.00      0.50      0.67         2
                     MCC_BadCRC       0.67      1.00      0.80         2
                  MCC_CacheLost       1.00      1.00      1.00         3
         DeviceNotReady_BelowMR       1.00      1.00      1.00         2

                      micro avg       0.83      0.83      0.83        12
                      macro avg       0.72      0.75      0.71        12
                   weighted avg       0.81      0.83      0.79        12



In [29]:
print("------------------------")
print("Y_test            Y_pred")
print("------------------------")
for i,j in zip(y_test,y_pred_svm):
    if i == j:
        print(i+"  ==  "+j)
    else:
        print("!!!! "+i+"  !=  "+j)

------------------------
Y_test            Y_pred
------------------------
MCR  ==  MCR
MCR  ==  MCR
MCC_CacheLost  ==  MCC_CacheLost
MCC_BadCRC  ==  MCC_BadCRC
Weka_Collects_Miniport_Slowness  ==  Weka_Collects_Miniport_Slowness
raid_data_error  ==  raid_data_error
!!!! MCC_CacheLost  !=  MCR
MCC_BadCRC  ==  MCC_BadCRC
!!!! DeviceNotReady_BelowMR  !=  MCC_BadCRC
raid_data_error  ==  raid_data_error
Weka_Collects_Miniport_Slowness  ==  Weka_Collects_Miniport_Slowness
Weka_Collects_Miniport_Slowness  ==  Weka_Collects_Miniport_Slowness
