In [1]:
import pandas as pd
import numpy as np
import requests
import time
from datetime import datetime
from sqlalchemy import create_engine

In [2]:
def get_data(s, e, timeout=300):
    '''
    --- Imports data from Elog and stores it in a workable format ---
    INPUT
        s: start time as unix timestamp
        e: end time as unix time stamp
        timeout: seconds to wait for the server before giving up
                 (forwarded to requests.get); defaults to 300
    RETURN
        df: dataframe of uncleaned data between selected time range
    RAISES
        requests.exceptions.HTTPError: the server answered with a 4xx/5xx status
        requests.exceptions.Timeout: the server did not answer within `timeout`
        ValueError (or a subclass): the response body is not valid JSON
    '''

    # api-endpoint
    URL = "https://mccelog.slac.stanford.edu/elog/dev/mgibbs/dev_elog_display_json.php"

    PARAMS = {'logbook': 'MCC', 'start': s, 'end': e}

    # sending get request and saving the response as response object.
    # Without a timeout a stalled server would block the caller forever.
    r = requests.get(url=URL, params=PARAMS, timeout=timeout)

    # Fail loudly on HTTP errors instead of trying to parse an error page
    r.raise_for_status()

    # extracting data in json format
    data = r.json()

    # Turning list of json objects into dataframe
    df = pd.DataFrame.from_records(data)

    return df

In [3]:
def clean_data(df):
    '''
    --- Cleans data frame ---
    Keeps only tagged, non-superseded entries, restricted to the columns
    elogid / title / text / tag, with one row per elogid.
    INPUT
        df: dataframe (not cleaned); it is not modified
    RETURN
        df: dataframe (cleaned), with a fresh 0..n-1 index
        0:  if the input has no 'tag' column at all (callers test for this
            with isinstance(result, pd.DataFrame))
    RAISES
        KeyError: if the input has a 'tag' column but no 'elogid' column
    '''
    # Checks to make sure there are even entries with a tag in the specified month
    if 'tag' not in df.columns:
        return 0

    important_cols = {'title', 'text', 'elogid', 'tag', 'superseded_by'}

    # Dropping rows without any tags (these rows are useless for us)
    df = df[df['tag'].notna()]

    # Dropping useless columns in one step, preserving the original column order
    df = df.loc[:, [col for col in df.columns if col in important_cols]]

    # Dropping all rows where superseded_by is not null to essentially drop
    # duplicates, then drop the superseded_by column itself. The column is
    # only present when the fetched records carried that key, so guard
    # against its absence instead of raising KeyError.
    if 'superseded_by' in df.columns:
        df = df[df['superseded_by'].isna()]
        df = df.drop(columns='superseded_by')

    # Keep a single row per elog entry
    df = df.drop_duplicates(subset='elogid', keep='first')

    # Reset the index
    return df.reset_index(drop=True)

In [4]:
def _month_bounds(year, month):
    '''
    Return (start, end) unix timestamps for the given calendar month, where
    end is midnight on the first day of the following month.
    '''
    start = datetime(year, month, 1, 0, 0)
    if month == 12:
        # December rolls over into January of the next year
        end = datetime(year + 1, 1, 1, 0, 0)
    else:
        end = datetime(year, month + 1, 1, 0, 0)
    # NOTE: these are naive datetimes, so .timestamp() interprets them in the
    # local time zone of the machine running the notebook.
    return start.timestamp(), end.timestamp()


def join_data_2011():
    '''
    --- Builds one giant dataframe by concating data frames together one month at a time ---
    Each month is fetched with get_data() and cleaned with clean_data();
    months for which clean_data() does not return a dataframe (no tags at
    all) are skipped. A progress line "month/year:  row_count" is printed
    for every month that yields a dataframe.
    RETURN
        df: Cleaned dataframe of tagged entries from April 2007 - December 2011.
            Always has the columns elogid, title, text, tag (in that order),
            even when no month produced any rows.
    '''
    base_columns = ['elogid', 'title', 'text', 'tag']
    monthly_frames = []

    for year in range(2007, 2012):
        for month in range(1, 13):
            # Data collection starts in April 2007
            if year == 2007 and month < 4:
                continue

            s, e = _month_bounds(year, month)
            df_temp = clean_data(get_data(s, e))

            # Checks to make sure cleaned dataframe actually has any tags
            if isinstance(df_temp, pd.DataFrame):
                print(str(month)+'/'+str(year) + ':  ' + str(df_temp.shape[0]))
                # Empty frames contribute no rows, so leave them out of the concat
                if not df_temp.empty:
                    monthly_frames.append(df_temp)

    if not monthly_frames:
        return pd.DataFrame(columns=base_columns)

    # Concatenate once at the end; concatenating inside the loop re-copies
    # the whole accumulated frame on every iteration.
    df = pd.concat(monthly_frames, ignore_index=True)

    # Guarantee the four base columns exist and come first, in a fixed order
    extra_columns = [col for col in df.columns if col not in base_columns]
    return df.reindex(columns=base_columns + extra_columns)

In [5]:
def save_data(df, database_filename, if_exists='replace'):
    '''
    --- Saves the data into an SQLite database ---
    INPUT
        df: dataframe to store
        database_filename: used both as the file name (with '.db' appended)
                           and as the name of the table inside that file
        if_exists: what to do when the table already exists; forwarded to
                   DataFrame.to_sql ('fail', 'replace' or 'append').
                   Defaults to 'replace' so re-running the notebook
                   overwrites the previous export instead of raising.
    '''
    engine = create_engine('sqlite:///'+database_filename+'.db')
    try:
        df.to_sql(database_filename, engine, index=False, if_exists=if_exists)
    finally:
        # Release the engine's pooled connections even if the write failed
        engine.dispose()

In [6]:
def main():
    '''
    Run the whole pipeline end to end: build the cleaned elog dataframe
    with join_data_2011() and persist it through save_data() under the
    name 'elog_data_2011'.
    '''
    save_data(join_data_2011(), 'elog_data_2011')

In [None]:
# Entry point of the notebook: calling main() builds the cleaned elog
# dataframe via join_data_2011() and hands it to save_data() under the
# name 'elog_data_2011'.
main()