In [None]:
import os, glob
import pandas as pd
import numpy as np
import datetime, re
import seaborn as sns
import matplotlib.pyplot as plt

## Append output files into four main files

In [None]:
# Append all output data into 4 data frames (one per workbook sheet of interest).
exphis,opehis,acchis,inshis = pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
def com(exp,ope,acc,ins):
    """Append the sheets of every '*-output.xlsx' workbook in the working directory.

    Parameters
    ----------
    exp, ope, acc, ins : pandas.DataFrame
        Frames to extend with the 'ExperimentHistory', 'OperationHistory',
        'AccessoryHistory' and 'Instruments' sheets respectively.

    Returns
    -------
    tuple of pandas.DataFrame
        (exp, ope, acc, ins) with the rows of all workbooks appended. When no
        workbook matches, the inputs are returned unchanged.

    Notes
    -----
    * Every occurrence of 'ANON' is replaced by the first two characters of the
      workbook file name (regex replace over the whole sheet).
    * Workbooks are read in sorted file-name order so the row order of the
      result is reproducible across platforms.
    * Frames are collected in lists and concatenated once per sheet instead of
      re-copying the accumulated frame for every workbook.
    """
    exp_cols = ['ExperimentId', 'ExperimentName', 'StartTime', 'EndTime', 'Source',
                'HostAddress', 'User', 'Project', 'Application', 'ApplicationVersion',
                'InstrumentType', 'InstrumentSerialNumber','Size', 'ReactorPosition']
    sheet_order = ('ExperimentHistory', 'OperationHistory', 'AccessoryHistory', 'Instruments')
    collected = dict(zip(sheet_order, ([exp], [ope], [acc], [ins])))

    for name in sorted(glob.glob('*-output.xlsx')):
        sheets = pd.read_excel(name, sheet_name=None, engine='openpyxl')
        for sheet_name, df in sheets.items():
            if sheet_name not in collected:
                continue  # other sheets are never used; skip the costly replace
            df = df.replace('ANON', name[:2], regex=True)
            if sheet_name == 'ExperimentHistory':
                df = df[exp_cols]
            collected[sheet_name].append(df)

    def _stack(frames):
        # Nothing appended -> hand back the caller's frame untouched.
        if len(frames) == 1:
            return frames[0]
        return pd.concat(frames, ignore_index=True)

    return tuple(_stack(collected[sheet]) for sheet in sheet_order)
exphis,opehis,acchis,inshis = com(exphis,opehis,acchis,inshis)

In [None]:
# Quick column / dtype / non-null overview of the combined experiment history.
exphis.info()

### Experiment History

In [None]:
#info check
#exphis.info()
#df.info()
#exphis.head()

In [None]:
def tidy_exphis(exphis_df, first_year=2018, last_year=2023):
    """Clean the combined experiment history and derive per-experiment / per-user features.

    Parameters
    ----------
    exphis_df : pandas.DataFrame
        Combined 'ExperimentHistory' rows. Must contain ExperimentId, StartTime,
        EndTime, User, Project, Application and InstrumentSerialNumber.
    first_year, last_year : int, optional
        Inclusive window on the StartTime year. The defaults (2018..2023)
        reproduce the previously hard-coded ``2018 <= year <= 2018 + 5``.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified), sorted by ExperimentId with a
        fresh RangeIndex, with these columns added:
        CompanyName, StartYear, Exp_durationinMin, StartDay, EndDay, Overnight,
        Validity, UserPerYear, UserType, ExperimentType, AppuserType, NewUser.

    Notes
    -----
    * Unparseable / out-of-range timestamps become NaT; rows whose StartTime is
      NaT fall outside the year window and are dropped.
    * A missing EndTime gives a NaN duration: Validity and Overnight are 0 and
      ExperimentType is left missing (previously this raised a length-mismatch
      error).
    * Validity uses ``duration > 5`` while 'Trash' uses ``duration < 5``, so a
      run of exactly 5 minutes is neither valid nor 'Trash' (kept as in the
      original; NOTE(review): confirm which boundary is intended).
    * NewUser is 'Exped User' when the (User, StartYear) pair occurs on more
      than one row, otherwise 'New User'.
    """
    df = exphis_df.copy()

    # Coerce instead of astype: the sheets may contain invalid dates.
    for col in ('StartTime', 'EndTime'):
        df[col] = pd.to_datetime(df[col], errors='coerce')

    df = df.fillna(value={'Application': 'Touchscreen'})
    df = df[df['Application'] != 'icontrol labmax']  # exact, case-sensitive match
    df = df[df['StartTime'].dt.year.between(first_year, last_year)]
    df = df.sort_values(by='ExperimentId').reset_index(drop=True)

    # Fill unknown User / Project / serial number with company-prefixed placeholders.
    company = df['ExperimentId'].str[0:2]
    df['User'] = df['User'].fillna(company + '-User' + '-Unknown')
    df['Project'] = df['Project'].fillna(company + '-Project' + '-Unknown')
    df['InstrumentSerialNumber'] = df['InstrumentSerialNumber'].fillna(
        company + '-' + df['Application'] + '-Unknown')
    df['Application'] = df['Application'].str.upper()
    df['CompanyName'] = company

    # Per-experiment features.
    df['StartYear'] = df['StartTime'].dt.year
    df['Exp_durationinMin'] = (df['EndTime'] - df['StartTime']).dt.total_seconds() / 60
    df['StartDay'] = df['StartTime'].dt.strftime('%a')
    df['EndDay'] = df['EndTime'].dt.strftime('%a')
    # Comparisons against NaT / NaN are False, so incomplete rows get 0.
    df['Overnight'] = (df['StartTime'].dt.normalize() < df['EndTime'].dt.normalize()).astype(int)
    df['Validity'] = (df['Exp_durationinMin'] > 5).astype(int)

    # Number of experiments per user and year. Naming the Series explicitly
    # avoids relying on the pandas-version-dependent column name that
    # value_counts().reset_index() produces (0 before 2.0, 'count' after).
    keys = ['User', 'StartYear']
    per_year = df.groupby(keys).size().rename('UserPerYear').reset_index()
    df = df.merge(per_year, on=keys, how='left')

    n_exp = df['UserPerYear']
    user_type = pd.Series(
        np.where(n_exp >= 50, 'Heavy', np.where(n_exp <= 5, 'Rare', 'Moderate')),
        index=df.index)
    df['UserType'] = user_type.where(n_exp.notna())

    # Left-closed bins reproduce the original "< 5 / < 60 / < 480 / < 1800" chain.
    df['ExperimentType'] = pd.cut(
        df['Exp_durationinMin'],
        bins=[-np.inf, 5, 60, 480, 1800, np.inf],
        labels=['Trash', 'Short Experiment', 'Day Experiment',
                'Overnight Experiment', 'Long Experiment'],
        right=False,
    ).astype(object)

    # Application mix per user and year. No margins: the row total is computed
    # directly, which avoids the artificial 'All' row, and absent applications
    # count as zero instead of raising KeyError.
    app_counts = df.pivot_table(values='ExperimentId', index=keys, columns='Application',
                                aggfunc='count', fill_value=0)
    total = app_counts.sum(axis=1)
    touchscreen = app_counts['TOUCHSCREEN'] if 'TOUCHSCREEN' in app_counts else 0
    icontrol = app_counts['ICONTROL'] if 'ICONTROL' in app_counts else 0
    advanced = total - touchscreen - icontrol
    app_user = pd.DataFrame(
        {'AppuserType': np.where(touchscreen / total >= 0.9, 'TSonly',
                                 np.where(advanced >= 3, 'Adv', 'Basic'))},
        index=app_counts.index,
    ).reset_index()
    df = df.merge(app_user, on=keys, how='left')

    df['NewUser'] = np.where(df.duplicated(subset=keys, keep=False), 'Exped User', 'New User')
    return df

### Operation History

In [None]:
# Lookup table read from 'OperationType.csv'; tidy_opehis left-joins it onto the
# operation history on the 'OperationType' column.
opstype = pd.read_csv('OperationType.csv')
#dict = opstype.value_counts().reset_index().drop(columns=0)

def tidy_opehis(opehis_df, opstype_df=None, first_year=2018, last_year=2023):
    """Clean the combined operation history and attach the operation-type lookup.

    Parameters
    ----------
    opehis_df : pandas.DataFrame
        Combined 'OperationHistory' rows. Must contain StartTime, OperationType
        and StartedOnTouchScreen.
    opstype_df : pandas.DataFrame, optional
        Lookup table with an 'OperationType' column. Defaults to the
        module-level ``opstype`` frame loaded from 'OperationType.csv'.
    first_year, last_year : int, optional
        Inclusive window on the StartTime year. The defaults (2018..2023)
        reproduce the previously hard-coded ``2018 <= year <= 2018 + 5``.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh RangeIndex, the lookup columns merged in
        (left join) and a 'StartedOn' label column.

    Notes
    -----
    * Unparseable / out-of-range StartTime values become NaT and are dropped by
      the year filter.
    * Fully duplicated lookup rows are removed before merging so they cannot
      multiply operation rows.
    * 'StartedOn' is 'Touchscreen' only where StartedOnTouchScreen == 1; every
      other value, including missing, is labelled 'iControl'.
    """
    if opstype_df is None:
        opstype_df = opstype  # module-level lookup table

    df = opehis_df.copy()
    df['StartTime'] = pd.to_datetime(df['StartTime'], errors='coerce')
    df = df[df['StartTime'].dt.year.between(first_year, last_year)]
    df = df.merge(opstype_df.drop_duplicates(), on='OperationType', how='left')
    df['StartedOn'] = np.where(df['StartedOnTouchScreen'] == 1, 'Touchscreen', 'iControl')
    return df

### Call Functions

In [None]:
# Run both cleaning steps on the combined sheets.
exphisfull = tidy_exphis(exphis)
opehisfull = tidy_opehis(opehis)

# Export the four combined tables to CSV in the working directory.
exphisfull.to_csv('combined_ExperimentHistory.csv')
opehisfull.to_csv('combined_OperationHistory.csv')
# File name fixed from 'cmobined_...' so all exports share the 'combined_' prefix.
acchis.to_csv('combined_AccessoryHistory.csv')
inshis.to_csv('combined_Instruments.csv')

### User Experiences Profile

In [None]:
# One row per user / year profile: number of experiments plus the first and last
# experiment start time in that year.
# TODO: double-check whether 'NewUser' should instead be derived from the user's
# first experiment date (currently: a user/year pair seen more than once is
# labelled 'Exped User', otherwise 'New User').
profile_keys = ['User','NewUser','AppuserType','CompanyName','StartYear']
userfreq = exphisfull[profile_keys + ['StartTime','UserPerYear']]
# 'UserPerYear' already holds the per-user-year count repeated on every row, so
# summing it would give roughly count**2; count the rows of each group instead.
uf_grouped = userfreq.groupby(profile_keys).agg({'UserPerYear':'count','StartTime':['min','max']})
uf_grouped.columns = ['ExpDonePerYear','1stExp','LastExp'] #df.columns.map(''.format)
uf_grouped = uf_grouped.reset_index()
uf_grouped.to_csv('combined_UsersExperienceProfile.csv')
#uf_grouped.head()

### Overlap Experiments of iControl and EasyMax

In [None]:
sort_cols = ['InstrumentSerialNumber','User','StartTime','EndTime'] #always sorting using these four columns
group_cols = ['InstrumentSerialNumber','User']

#1. keep valid EasyMax experiments with a known reactor position, restricted to
#   instrument/user pairs that have more than one such experiment
easymax = exphisfull[exphisfull['Validity']==1][['InstrumentSerialNumber','CompanyName','InstrumentType','User','ExperimentName','ReactorPosition','StartTime','EndTime']]
easymax = easymax.sort_values(sort_cols)
easymax = easymax[easymax['InstrumentType'].str.contains(r'EasyMax', na=False)]
easymax = easymax.dropna(subset=['ReactorPosition'])
easymax = easymax[easymax[group_cols].duplicated(keep=False)]

#2. keep instrument/user pairs that ran experiments on at least two reactor positions
#   (counting distinct positions avoids the former two-column rename, which raised
#   whenever the data did not contain exactly two distinct positions)
positions_used = easymax.groupby(group_cols)['ReactorPosition'].nunique()
tworpsexp = positions_used[positions_used >= 2].reset_index()[group_cols]
f = tworpsexp.merge(easymax, on = group_cols)

#3. find overlap experiments by shifting the start time and RP
def overlap_experiments(df, group_cols=('InstrumentSerialNumber', 'User')):
    """Return experiments that overlap in time on different reactor positions.

    Within each ``group_cols`` group (instrument serial number and user by
    default) the rows are ordered by StartTime, then EndTime. Two consecutive
    rows form an overlapping pair when the later one starts before the earlier
    one ends and the two rows are on different ReactorPosition values. Both
    rows of every such pair are returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ``group_cols`` columns plus ReactorPosition, StartTime
        and EndTime (datetime columns).
    group_cols : sequence of str, optional
        Columns identifying one instrument/user combination.

    Returns
    -------
    pandas.DataFrame
        The overlapping rows with the same columns as ``df``, sorted by
        ``group_cols`` + StartTime + EndTime, with a fresh RangeIndex.

    Notes
    -----
    The previous implementation compared ``StartTime > previous EndTime``, which
    selects runs that do *not* overlap, raised IndexError when the last row was
    flagged, and compared neighbouring rows across instrument/user boundaries.
    Only consecutive rows are compared; a long run overlapping a later,
    non-adjacent run is not reported.
    """
    keys = list(group_cols)
    ordered = df.sort_values(keys + ['StartTime', 'EndTime']).reset_index(drop=True)

    # shift(-1) inside the group: the "next" row never belongs to another group.
    by_group = ordered.groupby(keys, sort=False)
    next_start = by_group['StartTime'].shift(-1)
    next_position = by_group['ReactorPosition'].shift(-1)

    # True on the first row of an overlapping pair (comparisons with NaT are False).
    starts_pair = (
        (next_start < ordered['EndTime'])
        & next_position.notna()
        & (next_position != ordered['ReactorPosition'])
    )
    # The partner is the row directly below, because groups are contiguous after sorting.
    ends_pair = starts_pair.shift(1, fill_value=False)

    return ordered[starts_pair | ends_pair].reset_index(drop=True)

# Run the overlap detection on the EasyMax candidates in `f` and export the result.
overlaps = overlap_experiments(f)
overlaps.to_csv('combined_overlaps.csv')

### Experiment Summary Table 

In [None]:
def get_exp_summary(df, overlaps_df=None, max_runs=10):
    """Summarise repeated, valid experiment names by the applications they ran on.

    Parameters
    ----------
    df : pandas.DataFrame
        Tidy experiment history. Must contain ExperimentName, ExperimentId,
        StartYear, CompanyName, Application, InstrumentType and Validity.
    overlaps_df : pandas.DataFrame, optional
        Frame with an 'ExperimentName' column listing overlapping EasyMax
        experiments. Defaults to the module-level ``overlaps`` frame.
    max_runs : int, optional
        Only name/year/company rows with fewer than this many runs are kept
        (default 10, the previously hard-coded limit).

    Returns
    -------
    pandas.DataFrame
        One row per (ExperimentName, StartYear, CompanyName) with
        * one boolean column per Application (True if at least one run used it),
        * 'Total': number of runs,
        * 'EasyMaxOverlap': True / False for names that ran on an EasyMax
          instrument, or the string 'NaN' for names that never did.

    Notes
    -----
    * 'EasyMaxOverlap' is now computed from the summary's own ExperimentName
      column; it used to be taken from ``df`` and aligned by row index, which
      paired unrelated rows.
    * Application columns are selected by name rather than by the fixed
      position window ``3:12``; 'Total' keeps the run count.
    * The row total is computed directly, so no artificial 'Total' margin row
      is created.
    """
    if overlaps_df is None:
        overlaps_df = overlaps  # module-level result of overlap_experiments

    index_cols = ['ExperimentName', 'StartYear', 'CompanyName']
    # Parentheses matter: '&' binds tighter than '=='.
    repeated_valid = df['ExperimentName'].duplicated(keep=False) & (df['Validity'] == 1)
    counts = df[repeated_valid].pivot_table(index=index_cols, columns='Application',
                                            values='ExperimentId', aggfunc='count')

    df_pivot = counts.notna()
    df_pivot['Total'] = counts.sum(axis=1).astype(int)
    df_pivot = (df_pivot[df_pivot['Total'] < max_runs]
                .reset_index()
                .rename_axis(None, axis=1))

    names = df_pivot['ExperimentName']
    easymax_names = df.loc[df['InstrumentType'].str.contains(r'EasyMax', na=False), 'ExperimentName']
    # Object dtype so the 'NaN' marker can sit next to True / False.
    overlap_flag = names.isin(overlaps_df['ExperimentName']).astype(object)
    df_pivot['EasyMaxOverlap'] = overlap_flag.where(names.isin(easymax_names), 'NaN')
    return df_pivot
# Build the experiment summary from the tidy experiment history and export it.
exp_summary = get_exp_summary(exphisfull)
exp_summary.to_csv('combined_experiment_summary.csv')
#exp_summary.head(20)

In [None]:
#spot check
# df[df['ExperimentName']=='A1-ExperimentName-012-000731'][['InstrumentSerialNumber','CompanyName','InstrumentType','User','ExperimentName','ReactorPosition','StartTime','EndTime']]

Overlaps are evaluated per instrument serial number and user; an overlap requires experiments on two different reactor positions.

In [None]:
# Experiments recorded both by iControl and by at least one other application.

# Reduce StartTime to a calendar-day string so "same day" can be matched exactly.
same_day = exphisfull.assign(StartTime=exphisfull['StartTime'].dt.strftime("%Y-%m-%d"))

# Keep every row whose (name, user, day) combination occurs more than once.
is_repeat = same_day.duplicated(subset=['ExperimentName','User','StartTime'], keep=False)

# Presence table: one row per experiment name, one boolean column per application.
app_presence = same_day[is_repeat].pivot_table(
    index='ExperimentName',
    columns='Application',
    values='ExperimentId',
    aggfunc='count',
).notnull()

# Names that were run on ICONTROL and on more than one application overall.
with_icontrol = app_presence[app_presence['ICONTROL']]
apps_per_name = with_icontrol.sum(axis=1)
multi_app_names = apps_per_name[apps_per_name > 1].index

# All rows of the tidy history that belong to those names.
dupexps = exphisfull[exphisfull['ExperimentName'].isin(multi_app_names)].reset_index(drop=True)

# EasyMax subset of the duplicated experiments, ordered by id and name.
on_easymax = dupexps['InstrumentType'].str.contains('EasyMax',na=False)
easymaxdups = dupexps[on_easymax].reset_index(drop=True)
easymaxdups = easymaxdups.sort_values(['ExperimentId','ExperimentName'])
#easymaxdups.to_csv('combined_dupexps.csv')
#easymaxdups[easymaxdups[['ExperimentName','User']].duplicated(keep=False)].sort_values('ExperimentName')[easymaxdups['Validity']==1]