In [24]:
# import libraries
import pandas as pd
import numpy as np
import sqlite3

import re
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords


from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.decomposition import TruncatedSVD

from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier

[nltk_data] Downloading package punkt to /Users/vikram/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/vikram/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vikram/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Extracting 'elog_all_data' table from 'elog_data.db' database
conn = sqlite3.connect(r'/Users/vikram/Projects/Elog/data/elog_data.db')
c = conn.cursor()
c.execute('SELECT * FROM elog_all_data')
df = pd.DataFrame(c.fetchall(), columns=['elogid', 'tag', 'text', 'title', 'title_and_text'])
df.shape

(251193, 5)

> We want to implement a form of unsupervised learning that can <i>ideally</i> cluster these entries into 2 groups (LCLS and FACET). This will allow us to train our data on a much larger sample size across a much bigger time period, which will hopefully make our model more robust

### Tokenize Function

In [18]:
# Creat tokenizer function
def tokenize(x):
    
    # Generating list of stop words
    stop_words = set(stopwords.words('english'))
    
    # Separate sentance into individual words
    no_punctuation_x = re.sub(r"[^a-zA-Z0-9]"," ", x)
    word_token = word_tokenize(no_punctuation_x)
    
    # Lemmatizing each word and added cleaned words to clean_words
    lemmatizer = WordNetLemmatizer()
    clean_words = []
    for word in word_token:
        clean_words.append(lemmatizer.lemmatize(word.lower().strip()))

    # Return lematized words that are indeed words and are not in stopwords list
    final_token = [w for w in clean_words if w not in stop_words]
    return final_token

### LSA Method

Let's practice on a smaller subset of the data fram at first (let's say 100 random entries) and only test it on the title. We can then move onto incorporating the text after we get a working model

In [16]:
# practice with first 100 entries.
df_abbr = df.sample(n = 100)
df_abbr.head()

Unnamed: 0,elogid,tag,text,title,title_and_text
143742,672269,LCLS,,BSY valves IV2-6 closed at Baker's request,BSY valves IV2-6 closed at Baker's request
135342,651285,LCLS,There are a ton of plots in the LCLSlog - we i...,* Re: L1X amplitude two-state,* Re: L1X amplitude two-state There are a ton ...
211774,858993,LCLS,,Bypassing A-line gauge PS3 to pump down with P...,Bypassing A-line gauge PS3 to pump down with P...
207501,851132,LCLS,XTCAV will likely need to be rephased when we ...,"PEM/AMRF working on XTCAV TWT, tightening PAD ...","PEM/AMRF working on XTCAV TWT, tightening PAD ..."
144972,675854,LCLS,Pedro is alone tonight.\nHe has a list from sw...,Touched base with PEM for tonight's plans.,Touched base with PEM for tonight's plans. Ped...


In [23]:
# Creating a bag of words for our 100 log entry sample
vectorizer = CountVectorizer(tokenizer = tokenize)       # Calls our tokenize function written above
bag_of_words = vectorizer.fit_transform(df_abbr.title)   # Fitting just the title column for now
bag_of_words.todense()                                   # Visual verification of bag of words matrix

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 1, ..., 0, 0, 0],
        [0, 1, 0, ..., 0, 0, 0]], dtype=int64)

In [28]:
# Applying SVD down to 2 components for our bag_of_words
svd = TruncatedSVD(n_components = 2)
lsa = svd.fit_transform(bag_of_words)