In [2]:
# import libraries
import pandas as pd
import numpy as np
from sqlalchemy import create_engine

import re
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords


from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tharakan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\tharakan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tharakan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [78]:
engine = create_engine('sqlite:///elog_data_2011.db')
df = pd.read_sql_table('elog_data_2011', engine)

In [79]:
df.tail(10)

Unnamed: 0,elogid,title,text,tag,title_and_text
6903,554308,"Laser heater motor delay has lost ""home"" position",Put in Laser heater for diagnostics and switch...,LCLS,"Laser heater motor delay has lost ""home"" posit..."
6904,554298,Garth at CXI is seeing lower Intensity than GD...,Garth is hoping that Jim Turner identified the...,LCLS,Garth at CXI is seeing lower Intensity than GD...
6905,554292,Jim Craft advisory on CR2 LI21,Jim Craft has left a note on CR 2 of LI21 that...,LCLS,Jim Craft advisory on CR2 LI21 Jim Craft has l...
6906,554290,"repointUndulatorLine(0,0,-45,-45)",,LCLS,"repointUndulatorLine(0,0,-45,-45)"
6907,554265,Switching to Matlab UND launch. Fast Und launc...,,LCLS,Switching to Matlab UND launch. Fast Und launc...
6908,554263,"Model manager hangs on ""gold"" stage",,LCLS,"Model manager hangs on ""gold"" stage"
6909,554261,BBA Round 2 complete,,LCLS,BBA Round 2 complete
6910,554258,Swapped 30-6 for 30-4,Will LEM the change in; seems to be alright so...,LCLS,Swapped 30-6 for 30-4 Will LEM the change in; ...
6911,554257,1st Round BBA done,,LCLS,1st Round BBA done
6912,554250,Emittance Measurements - 9491 eV (14.673 GeV...,<table class=emittanceTable>\n<tr><th></th><th...,LCLS,Emittance Measurements - 9491 eV (14.673 GeV...


> Questions to be answere later: Does it perform better all lower or all uppercase??

In [80]:
# Turning it all to lower case and removing newline stuff

df['title_and_text'] = df['title_and_text'].str.lower()
df['title_and_text'] = df['title_and_text'].str.replace('\n', ' ')
df.tail(10)

Unnamed: 0,elogid,title,text,tag,title_and_text
6903,554308,"Laser heater motor delay has lost ""home"" position",Put in Laser heater for diagnostics and switch...,LCLS,"laser heater motor delay has lost ""home"" posit..."
6904,554298,Garth at CXI is seeing lower Intensity than GD...,Garth is hoping that Jim Turner identified the...,LCLS,garth at cxi is seeing lower intensity than gd...
6905,554292,Jim Craft advisory on CR2 LI21,Jim Craft has left a note on CR 2 of LI21 that...,LCLS,jim craft advisory on cr2 li21 jim craft has l...
6906,554290,"repointUndulatorLine(0,0,-45,-45)",,LCLS,"repointundulatorline(0,0,-45,-45)"
6907,554265,Switching to Matlab UND launch. Fast Und launc...,,LCLS,switching to matlab und launch. fast und launc...
6908,554263,"Model manager hangs on ""gold"" stage",,LCLS,"model manager hangs on ""gold"" stage"
6909,554261,BBA Round 2 complete,,LCLS,bba round 2 complete
6910,554258,Swapped 30-6 for 30-4,Will LEM the change in; seems to be alright so...,LCLS,swapped 30-6 for 30-4 will lem the change in; ...
6911,554257,1st Round BBA done,,LCLS,1st round bba done
6912,554250,Emittance Measurements - 9491 eV (14.673 GeV...,<table class=emittanceTable>\n<tr><th></th><th...,LCLS,emittance measurements - 9491 ev (14.673 gev...


In [81]:
# Forgot to create dummy variables
df = pd.concat([df,pd.get_dummies(df.tag, drop_first = True)], axis=1)
#df = df.drop(['FACET'], axis = 1)   #Taken care of by above line

In [82]:
df.head()

Unnamed: 0,elogid,title,text,tag,title_and_text,LCLS
0,137674,EPICS LCLS IOC heartbeat fault.,,LCLS,epics lcls ioc heartbeat fault.,1
1,137670,RR monitor to 2 Hz,A. Prinz approves this change. Approval grante...,LCLS,rr monitor to 2 hz a. prinz approves this chan...,1
2,148112,PARANOIA Restart,PARANOIA was restarted with changes to handle ...,LCLS,paranoia restart paranoia was restarted with c...,1
3,145825,Errorlog re: LCLS SOLN 121,15-MAY-2007 20:24:11 %CAU-E-EPICS_MSG_PEP CM...,LCLS,errorlog re: lcls soln 121 15-may-2007 20:24:1...,1
4,144311,Rack Location of LCLS BX01/BX02 Breaker,As one can see from Electrical Safety label th...,LCLS,rack location of lcls bx01/bx02 breaker as one...,1


In [83]:
X = df['title_and_text'] 
Y = df.drop(['text','title','elogid', 'title_and_text','tag'], axis = 1)

In [84]:
# Creat tokenizer function
def tokenize(x):
    
    # Generating list of stop words
    stop_words = set(stopwords.words('english'))
    
    # Separate sentance into individual words
    no_punctuation_x = re.sub(r"[^a-zA-Z0-9]"," ", x)
    word_token = word_tokenize(no_punctuation_x)
    
    # Lemmatizing each word and added cleaned words to clean_words
    lemmatizer = WordNetLemmatizer()
    clean_words = []
    for word in word_token:
        clean_words.append(lemmatizer.lemmatize(word.lower().strip()))

    # Return lematized words that are indeed words and are not in stopwords list
    final_token = [w for w in clean_words if w not in stop_words]
    return final_token

In [85]:
# Proof that it works. Watch out for cells with tables tho, there is a lot of useless information in them.
#tokenize(df.at[6912,'title_and_text'])

In [86]:
pipeline = Pipeline([
    ('vectorizer', CountVectorizer(tokenizer = tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(RandomForestClassifier()))    
])

In [87]:
X_train, X_test, y_train, y_test = train_test_split(X,Y)
pipeline.fit(X_train, y_train)



Pipeline(memory=None,
         steps=[('vectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function toke...
                 MultiOutputClassifier(estimator=RandomForestClassifier(bootstrap=True,
                                                                        class_weight=None,
                                                                        criterion='gini',
                                                                   

In [88]:
y_pred = pipeline.predict(X_test)

In [89]:
# y_pred is not a data frame, let's make it into a data frame so we can see things more easily
df_ypred = pd.DataFrame(y_pred, columns = y_test.columns)
df_ypred.head()

Unnamed: 0,LCLS
0,1
1,1
2,1
3,0
4,1


In [90]:
# Will give "scoring" for each tag. Right now it's only one, but this allows us to check in case we have >2 programs
for col in y_test.columns:
    print(col)
    print(classification_report(y_test[col].values, df_ypred[col]))

LCLS
              precision    recall  f1-score   support

           0       0.89      0.69      0.78       310
           1       0.94      0.98      0.96      1419

    accuracy                           0.93      1729
   macro avg       0.91      0.84      0.87      1729
weighted avg       0.93      0.93      0.93      1729



In [91]:
pipeline.get_params()

{'memory': None,
 'steps': [('vectorizer',
   CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                   dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                   lowercase=True, max_df=1.0, max_features=None, min_df=1,
                   ngram_range=(1, 1), preprocessor=None, stop_words=None,
                   strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                   tokenizer=<function tokenize at 0x00000255A2E16DC8>,
                   vocabulary=None)),
  ('tfidf',
   TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)),
  ('clf',
   MultiOutputClassifier(estimator=RandomForestClassifier(bootstrap=True,
                                                          class_weight=None,
                                                          criterion='gini',
                                                          max_depth=None,
                                                          m

In [113]:
example = pd.Series('facet dump')
pipeline.predict(example)

array([[0]], dtype=uint8)

In [112]:
df[df['text'].str.contains('FACET') == True]

Unnamed: 0,elogid,title,text,tag,title_and_text,LCLS
588,515398,FACET Summary:,* Recover beam to FACET dump & scav ext. l...,FACET,facet summary: * recover beam to facet dum...,0
617,515325,FACET SW release: IOCManager,Released new version of IOCManager and booted ...,FACET,facet sw release: iocmanager released new vers...,0
667,515109,FACET Summary - Day Shift,<b>FACET Delivery</b>: \n<u>delivered</u>: 7 h...,FACET,facet summary - day shift <b>facet delivery</b...,0
685,515030,FACET tuning halted,Due to the SB17 issue and the Sector 9 klystro...,FACET,facet tuning halted due to the sb17 issue and ...,0
697,514946,FACET beam re-established.,Low current (3e9) beam in Linac to Sector 19 a...,FACET,facet beam re-established. low current (3e9) b...,0
...,...,...,...,...,...,...
4648,542081,Fixed problem with BGRP macros,Turns out that the Operator Maintenance panel ...,LCLS,fixed problem with bgrp macros turns out that ...,1
4837,541345,* Re: MDL Slow feedback is working..sort of..,Note - this SCP MDL feedback is only adjusting...,LCLS,* re: mdl slow feedback is working..sort of.. ...,1
4904,540487,RSWCF 5196 and 5198 APPROVED,FACET modulators:\n19-8 commission newly insta...,FACET,rswcf 5196 and 5198 approved facet modulators:...,0
6074,548669,"* Re: 28-6 has bad HV capacitor, won't be fixe...",Al Owens OKed PEM to cannibalize a modulator f...,LCLS,"* re: 28-6 has bad hv capacitor, won't be fixe...",1


In [104]:
df.at[4142,'title_and_text']

'ep01 mps hlam enable fault with ep01 stoppers in we think this should be bypassed when we put in the positron vault stoppers-- the hlam disable is correctly bypassed and the magnet came back on.'