In [None]:
import os
import pandas as pd
# folder (relative to the working directory) holding the .dta files read below
directory = "raw"

In [None]:
# load the interview actions file
df = pd.read_stata(os.path.join(directory, 'interview__actions.dta'))
# the file stores date and time as separate text columns; join them into one timestamp
df['datetime'] = pd.to_datetime(df['date'] + ' ' + df['time'])

# keep each row's index label from the actions file so that rows picked later can be
# traced back to it (the index does not depend on any grouping, so it is read directly)
df['interview_actions_idx'] = df.index

# we assume the interview__actions file to be sorted by survey solutions, if not add row below
# df.sort_values(['interview__id', 'datetime'], inplace=True)

# get the index of the first reject/review event for each interview
df_rejected = df[df['action'].isin(['RejectedBySupervisor', 'RejectedByHQ', 'OpenedBySupervisor', 'OpenedByHQ'])]
# Series indexed by interview__id holding the index label of its earliest reject/review row;
# interviews that never had such an event do not appear in it
first_reject_idx = df_rejected.groupby('interview__id')['datetime'].idxmin()

# get dataframe where each row is the last 'Completed' event prior to first reject event for each interview
def get_last_complete(group):
    """Return the last 'Completed' row of one interview before its first reject/review event.

    Intended for ``df.groupby('interview__id').apply(...)``: ``group`` holds the
    action rows of a single interview and ``group.name`` is its interview id.

    Rows are compared by index label, not by ``datetime``, so the result is only
    meaningful when the rows are stored in chronological order.

    Returns:
        A one-row DataFrame, or a zero-row DataFrame that keeps ``group``'s
        columns when the interview has no qualifying 'Completed' event.
    """
    # index label of this interview's first reject/review row; None when the
    # interview has no such event, in which case all of its rows are considered
    reject_idx = first_reject_idx.get(group.name)
    if reject_idx is not None:
        group = group.loc[group.index < reject_idx]
    completed = group[group['action'] == 'Completed']
    # tail(1) is the last matching row, or an empty frame with the original
    # columns (instead of a column-less frame) when nothing matched
    return completed.tail(1)

# one row per interview: its last 'Completed' event before the first reject/review event
# (interviews without such an event contribute no row here)
last_complete_df = df.groupby('interview__id').apply(get_last_complete)

# create a new dataframe with interviewer and datetime
# NOTE: interview__id is not selected as a column; the merge further down expects it to be
# available as an index level left by groupby().apply()
interview_df = last_complete_df[['originator', 'datetime', 'responsible__name', 'interview_actions_idx']].copy()
interview_df.columns = ['interviewer', 'datetime', 'supervisor', 'interview_actions_idx']

# Add column for the total number of interviews per interviewer
interview_df['total_interviews'] = interview_df.groupby('interviewer')['interviewer'].transform('count')

# Add column for the sequential number of each interview per interviewer
interview_df.sort_values(['interviewer', 'datetime'], inplace=True)
interview_df['interview_sequence'] = interview_df.groupby('interviewer').cumcount() + 1

# Add column for the date of each interview
interview_df['date'] = interview_df['datetime'].dt.date

# Calculate the number of calendar days passed since the interviewer's first date.
# Timestamps are cut back to midnight before subtracting so that the time of day cannot
# shift the count (e.g. an evening completion followed by one the next morning is 1 day, not 0).
completion_day = interview_df['datetime'].dt.normalize()
first_completion_day = completion_day.groupby(interview_df['interviewer']).transform('min')
interview_df['days_since_start'] = (completion_day - first_completion_day).dt.days


# bring in columns from the diagnostics file
diagnostics_df = pd.read_stata(os.path.join(directory, 'interview__diagnostics.dta'))
interview_df = interview_df.merge(diagnostics_df, on='interview__id', how='outer')  # or left join, if we want to filter out those who do not have a completed event


#interview_df = interview_df.sort_values(by='interview_actions_idx')
interview_df

# variables
interviewer: interviewer who first completed the interview (note: the interview may be reassigned to another interviewer afterwards, but this is rare)
datetime: the datetime of the first completion, i.e. the last 'Completed' event before the first reject/review event (note: an interview may be rejected and marked as complete again, several times)
supervisor: the supervisor assigned to interviewer at the time (note: field organisation may differ)
interview_actions_idx: index of the row on the interview_actions file from which information was picked
total_interviews: total number of interviews done by the interviewer
interview_sequence: sequential number by interviewer of the interview, by first completion date
date: date of first completion
days_since_start: number of days between the interviewer's first completion in the data set and the first completion of this interview
the remaining variables come from the interview__diagnostics file

