# Elog Tagging

The goal is to automatically assign the correct tag to each elog entry. To do this we need to:
* Scrape the data logbook
* Clean the data (drop duplicates, keep only entries with tags, make sure the text body is in the proper format, ...)
* Save the data in an easy-to-access format for the NLP pipeline

In [54]:
import pandas as pd
import numpy as np
import requests
import time
from datetime import datetime
from sqlalchemy import create_engine

In [55]:
def get_data(s, e, timeout=60):
    '''
    --- Imports data from Elog and stores it in a workable format ---
    INPUT
        s: start time as unix timestamp
        e: end time as unix time stamp
        timeout: seconds to wait for the server before giving up
                 (passed straight to requests.get)
    RETURN
        df: dataframe of uncleaned data between selected time range
    RAISES
        requests.exceptions.HTTPError: if the server answers with an error status
        requests.exceptions.Timeout: if the server does not answer within `timeout`
    '''

    # api-endpoint
    URL = "https://mccelog.slac.stanford.edu/elog/dev/mgibbs/dev_elog_display_json.php"

    PARAMS = {'logbook': 'MCC', 'start': s, 'end': e}

    # Without a timeout a stalled connection would block the whole scrape forever
    r = requests.get(url=URL, params=PARAMS, timeout=timeout)

    # Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON
    r.raise_for_status()

    # extracting data in json format
    data = r.json()

    # Turning list of json objects into dataframe
    return pd.DataFrame.from_records(data)

In [56]:
def clean_data(df, only_tags = True):
    '''
    --- Cleans data frame ---
    INPUT
        df: dataframe (not cleaned)
        only_tags: Boolean that determines if we are only keeping entries with tags or if we are keeping all entries
    RETURN
        df: dataframe (cleaned), re-indexed from 0.
            NOTE: when only_tags is set and df has no 'tag' column at all, the
            integer 0 is returned instead of a dataframe. join_all_data relies on
            this sentinel (it checks isinstance(..., pd.DataFrame)), so it is kept.
    '''
    if only_tags:
        # Checks to make sure there are even entries with a tag in the specified month
        if 'tag' not in df.columns:
            return 0

        # Dropping rows without any tags (these rows are useless for us)
        df = df[df['tag'].notnull()]

    # Dropping useless columns (the surviving columns keep their original order)
    important_cols = {'title', 'text', 'elogid', 'tag', 'superseded_by'}
    df = df.drop(columns=[col for col in df.columns if col not in important_cols])

    # Dropping all rows where superseded_by is not null to essentially drop
    # duplicates, then dropping the superseded_by column itself. The column is
    # missing when the input has no such key (e.g. an empty frame), so guard
    # against it instead of raising KeyError.
    if 'superseded_by' in df.columns:
        df = df[df['superseded_by'].isnull()]
        df = df.drop(columns=['superseded_by'])

    # Same guard for elogid: an empty frame has no columns at all
    if 'elogid' in df.columns:
        df = df.drop_duplicates(subset='elogid', keep='first')

    # Reset the index
    return df.reset_index(drop=True)

In [57]:
def join_all_data(only_tags = True):
    '''
    --- Builds one giant dataframe by concating data frames together one month at a time ---
    INPUT
        only_tags: if True, only tagged entries are kept and the scrape stops
                   after December 2011; if False, all entries are kept and the
                   scrape runs through December 2018.
    RETURN
        df: Cleaned dataframe of entries starting April 2007 and ending
            December 2011 (only_tags) or December 2018 (all entries).
    '''
    first_year, first_month = 2007, 4
    # Tagged scrapes end with 2011, so don't even visit later years
    last_year = 2011 if only_tags else 2018

    # Seed with an empty frame so the result always has the expected columns,
    # even if no month yields any data.
    frames = [pd.DataFrame(columns=['elogid', 'title', 'text', 'tag'])]

    for year in range(first_year, last_year + 1):
        for month in range(1, 13):
            if year == first_year and month < first_month:
                continue

            # The window ends on the first day of the following month;
            # December rolls over into January of the next year.
            if month == 12:
                next_year, next_month = year + 1, 1
            else:
                next_year, next_month = year, month + 1

            # NOTE(review): naive datetimes are interpreted in the local
            # timezone by .timestamp() -- confirm that is what the API expects.
            s = datetime(year, month, 1, 0, 0).timestamp()
            e = datetime(next_year, next_month, 1, 0, 0).timestamp()
            df_temp = clean_data(get_data(s, e), only_tags)

            # clean_data returns 0 (not a dataframe) when the month has no tags
            if isinstance(df_temp, pd.DataFrame):
                print(f"{month}/{year}:  {df_temp.shape[0]}")
                frames.append(df_temp)

    # One concat at the end instead of re-copying the growing frame every month
    return pd.concat(frames, ignore_index=True)

In [86]:
# Function to save the data as .db file
def save_data(df, database_filename, only_tags = True, n = 3):
    '''
    --- Saves a dataframe into one or several SQLite .db files ---
    INPUT
        df: dataframe to save
        database_filename: base name (no extension); also used as the table name
        only_tags: if True, everything is written to '<database_filename>.db';
                   otherwise df is split into n row-chunks written to
                   '<database_filename>0.db' ... '<database_filename>{n-1}.db'
        n: number of chunks/files when only_tags is False (ignored otherwise)
    RAISES
        ValueError: if n is not a positive integer when splitting
    '''
    if only_tags:
        _write_sqlite_table(df, database_filename, database_filename + '.db')
        return

    if n < 1:
        raise ValueError('n must be a positive integer, got ' + str(n))

    # Same chunk sizes np.array_split produces (the first len(df) % n chunks get
    # one extra row), but sliced with iloc so every chunk stays a DataFrame.
    base_size, extra = divmod(len(df), n)
    start = 0
    for i in range(n):
        stop = start + base_size + (1 if i < extra else 0)
        _write_sqlite_table(df.iloc[start:stop], database_filename,
                            database_filename + str(i) + '.db')
        start = stop


def _write_sqlite_table(frame, table_name, db_path):
    '''Writes frame as table_name into the SQLite file db_path and releases the engine.'''
    engine = create_engine('sqlite:///' + db_path)
    try:
        frame.to_sql(table_name, engine, index=False)
    finally:
        # Release the connection pool so the .db file is not kept open
        engine.dispose()

### Below is the main function that will actually compile the data and save it

In [83]:
def main():
    '''
    Will go through all the necessary steps to extract the data from the elog, clean it, and save the data
    in an SQL database

    ---Settings--- These are local variables set at the top of the function body
    (not arguments), as there are really 2 possible outcomes that you want
    only_tags: If you want only tagged data (through 2011), set to True. If you want all the data,
                set to False
    names: Current names for saving the cleaned Data.
                names[0] --> name for tagged data, names[1] --> name for all data

    ---Output--- Returns DataFrame locally (used for testing). Main output is data stored in .db file
    '''

    # Set these variables prior to running main function. See main() documentation
    only_tags = False
    names = ['elog_data_2011', 'elog_all_data']

    # Extracts/Cleans Data
    df = join_all_data(only_tags)
    df['title_and_text'] = df['title'].str.cat(df['text'], sep=" ")

    # Saves data to database: tagged data goes into a single file,
    # the full data set is split across 3 files.
    if only_tags:
        name, extra_args = names[0], ()
    else:
        name, extra_args = names[1], (False, 3)

    try:
        save_data(df, name, *extra_args)
    except Exception as err:
        # Best effort: keep the scraped dataframe even if saving fails, but
        # report the real cause instead of assuming the file already exists.
        print('Could not save ' + str(name)
              + ' (a file/table with that name may already exist): ' + repr(err))
    return df

In [88]:
# Re-save the full data set held in `df`, this time split across 10 files
names = ['elog_data_2011', 'elog_all_data']
save_data(df, names[1], only_tags=False, n=10)

In [63]:
# Running this will scrape, clean, and save the data that we want to collect.
# main() also returns the compiled DataFrame, which is kept in `df`.
df = main()

4/2007:  3413


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




5/2007:  2603
6/2007:  4149
7/2007:  4353
8/2007:  3939
9/2007:  584
10/2007:  121
11/2007:  422
12/2007:  3000
1/2008:  3721
2/2008:  3447
3/2008:  2844
4/2008:  1486
5/2008:  1607
6/2008:  1537
7/2008:  1439
8/2008:  1211
9/2008:  408
10/2008:  613
11/2008:  1609
12/2008:  1410
1/2009:  1832
2/2009:  1999
3/2009:  1172
4/2009:  2326
5/2009:  2137
6/2009:  2206
7/2009:  2144
8/2009:  2491
9/2009:  2258
10/2009:  2120
11/2009:  2033
12/2009:  1320
1/2010:  312
2/2010:  198
3/2010:  364
4/2010:  2323
5/2010:  2498
6/2010:  2258
7/2010:  2370
8/2010:  1889
9/2010:  1755
10/2010:  2042
11/2010:  2051
12/2010:  1142
1/2011:  2013
2/2011:  1763
3/2011:  739
4/2011:  307
5/2011:  1336
6/2011:  2735
7/2011:  2217
8/2011:  2172
9/2011:  1844
10/2011:  1816
11/2011:  1805
12/2011:  941
1/2012:  1768
2/2012:  1778
3/2012:  2531
4/2012:  2339
5/2012:  2710
6/2012:  2320
7/2012:  1733
8/2012:  639
9/2012:  868
10/2012:  1803
11/2012:  1694
12/2012:  1123
1/2013:  1654
2/2013:  1845
3/2013:  2486
4

In [66]:
# Preview the first rows of the compiled DataFrame
df.head()

Unnamed: 0,elogid,tag,text,title,title_and_text
0,143932,,"BaBar, ROW","MCC Shift Change: Owl Shift, Tuesday, 01-May-2007","MCC Shift Change: Owl Shift, Tuesday, 01-May-2..."
1,143922,,Swing Shift</font></h2>\n\t<table>\n\t\t<tbody...,Swing Shift Summary,Swing Shift Summary Swing Shift</font></h2>\n\...
2,143919,,\n\n Luminosity of 7973 x10^30/cm^2s and Spec...,"LUM: 7973, HER: 1601, LER: 2501, SPLUM: 3.44","LUM: 7973, HER: 1601, LER: 2501, SPLUM: 3.44 \..."
3,143917,,Successful after PEM breakered off BX01/02.\n,LCLS injector vault to controlled access.,LCLS injector vault to controlled access. Succ...
4,143916,,He's done for the night. He has been having tr...,* RE: C. Rivetta performing LER grow/damp meas...,* RE: C. Rivetta performing LER grow/damp meas...


Have to be able to deal with:
* Tables
* Special characters (e.g. newlines)