# Applications processing Automation

The purpose of this code is to automate the first trivial filtering steps in the processing of the applications for the TReND in Africa Computational Neuroscience and Machine Learning Basics course.

This code is organized as a set of functions to be applied as a processing pipeline on the application responses data (See [documentation](https://docs.google.com/document/d/1n4pMEOgMuenuFpN6zXQtZlpYFXwPat2P4-SzZaN8mFg/edit?usp=drivesdk)).

### **How to use (as a developer):**
Just clone the Github repository and get into the business!\
If you have Anaconda and Jupyter installed locally, you can clone the repository directly onto your machine. Otherwise, you can clone it into Google Colab.
(In either case, don't forget to regularly pull and push changes).

### **How to use (as a reviewer):**
If you are on Github now, open this notebook in Google Colab, or clone the whole repo locally, so you can run the cells. In case of running it in Colab, don't forget to save and download the resulting Excel sheet of the processed responses into a local folder.

In [79]:
import numpy as np
import pandas as pd

In [80]:
# NOTE: download the responses Excel sheets from Google Drive and place them in the same
# folder as this code before running. This is not needed when working from a clone of the
# GitHub repo (everything is organized in the repo).

# TODO: Load data directly from Google Drive.

# File name of the real students sheet (the dummy data below is used instead):
#DATA_DIR = './Copy of Answers_Application_form_TReND_Comp_Neuro_FIRSTPASS.xlsx'
STD_DATA_DIR = './dummy_students_responses_data.xlsx'
REF_DATA_DIR = './dummy_references_responses_data.xlsx'

# Raw students' responses and raw references' responses.
std_raw_responses_df = pd.read_excel(STD_DATA_DIR)
ref_raw_responses_df = pd.read_excel(REF_DATA_DIR)

# Bookkeeping columns appended to the students sheet, every cell starting out as None:
#   'Flag'                        -> "flagged" or None
#   'Notes'                       -> text with the reasons for flagging
#   'Recommendation Letter 1 / 2' -> letters assigned during reference matching
for _new_column in ('Flag', 'Notes', 'Recommendation Letter 1', 'Recommendation Letter 2'):
    std_raw_responses_df[_new_column] = None

# Bookkeeping columns for the references sheet. A reference response is flagged when the
# same reference submits more than one letter (the last letter submitted is kept).
for _new_column in ('Flag', 'Notes', 'Matched'):
    ref_raw_responses_df[_new_column] = None

In [81]:
ref_raw_responses_df.columns

Index(['Email Address', 'Student Code', 'Student First Name',
       'Student Last Name', 'Letter', 'Flag', 'Notes', 'Matched'],
      dtype='object')

In [82]:
# Lookup table: column position -> column name of the students sheet.
# Use it as the reference whenever a column has to be addressed by its index.

questions_dict = dict(enumerate(std_raw_responses_df.columns))
questions_dict

{0: 'Timestamp',
 1: 'Email address',
 2: 'First Name',
 3: 'Last Name',
 4: 'Unnamed: 4',
 5: 'Unnamed: 5',
 6: 'Unnamed: 6',
 7: 'Career Stage',
 8: 'Name of current University or Research Institution',
 9: 'Undergraduate degree (completed or ongoing, eg. Neuroscience, Mathematics)',
 10: "Master's degree (completed or ongoing, if applicable, eg. Neuroscience, Mathematics)",
 11: 'PhD degree (completed or ongoing, if applicable, eg. Neuroscience, Mathematics)',
 12: 'Current research focus or research focus of the last research project you were engaged in (if applicable)',
 13: 'Why would you like to attend the course? (2000 characters max)',
 14: 'How do you think you could contribute to the course?  (2000 characters max)',
 15: 'At the end of the first week the students will start a short individual research project. What would be your dream project?  (2000 characters max)',
 16: 'Please attach a 1-page CV in pdf format (documents longer than one page will be discarded). If you hav

In [83]:
# Integer column positions used on the student responses DataFrame.
std_idcs = dict(
    email_idx=1,
    firstname_idx=2,
    lastname_idx=3,
    flag_idx=18,
    notes_idx=19,
    first_recomm_letter_idx=20,
    second_recomm_letter_idx=21,
    first_ref_email_idx=22,
    second_ref_email_idx=23,
)

# Column names used on the reference responses DataFrame.
ref_idcs = dict(
    email_idx='Email Address',
    std_email_idx='Student Code',
    std_firstname_idx='Student First Name',
    std_lastname_idx='Student Last Name',
    flag_idx='Flag',
    notes_idx='Notes',
    letter_idx='Letter',
)

# Columns whose text is compared between the two sheets (lower-cased before matching).
std_str_qs = [std_idcs[key] for key in ('email_idx', 'firstname_idx', 'lastname_idx')]
ref_str_qs = [
    ref_idcs[key]
    for key in ('email_idx', 'std_email_idx', 'std_firstname_idx', 'std_lastname_idx')
]

# Positions of the essay-question columns whose answer length is checked.
# Choose these carefully: they must point at the essay answers.
essay_qs = list(range(13, 16))

# Minimum and maximum number of words allowed in an essay answer.
# Note: the same limits apply to all essay questions.
MIN_WORDS_NUM, MAX_WORDS_NUM = 20, 300

## Utility functions

In [84]:
def word_count(answer):
    """
    Takes a specific answer (cell) of a specific essay question and returns the answer's number of words.

    Words are the whitespace-separated chunks of the answer. A missing answer
    (None or a float NaN) counts as 0 words instead of raising; any other
    non-string value is converted to text before counting.

    params :
        answer: content of one answer cell
    returns:
        the number of words (int)
    """

    # ``answer != answer`` is only True for NaN.
    if answer is None or (isinstance(answer, float) and answer != answer):
        return 0

    return len(str(answer).split())

def to_lowercase(std_df, std_str_qs, ref_df, ref_str_qs):
    """
    For more rigid string comparisons, convert all answers needed for comparison to lowercase.

    Both DataFrames are copied first, so the frames passed in by the caller are
    left untouched (the same convention the other pipeline steps follow).

    params :
        std_df     : students responses data (DataFrame)
        std_str_qs : columns of std_df to convert (list)
        ref_df     : references responses data (DataFrame)
        ref_str_qs : columns of ref_df to convert (list)
    returns:
        (lowercase_std_df, lowercase_ref_df): copies with the listed columns lower-cased
    """

    lowercase_std_df = std_df.copy()
    lowercase_ref_df = ref_df.copy()

    for q in std_str_qs:
        lowercase_std_df[q] = lowercase_std_df[q].str.lower()

    for q in ref_str_qs:
        lowercase_ref_df[q] = lowercase_ref_df[q].str.lower()

    return lowercase_std_df, lowercase_ref_df


def set_flag(responses_df, email):
    """
    Sets the 'flag' column value to "flagged" for a response chosen by its 'Email address'.

    This modifies the DataFrame itself (i.e. change in place) and returns nothing.
    Every row whose column labelled 1 equals ``email`` is flagged.

    params :
        responses_df: the responses data with integer column labels (DataFrame)
        email       : the 'Email address' value of the response(s) to flag
    """

    # 1 = 'Email address' column, 18 = 'flag' column.
    # pandas documents boolean *Series* masks as unsupported with .iloc,
    # so the mask is handed over as a plain boolean array.
    is_target_row = (responses_df[1] == email).to_numpy()
    responses_df.iloc[is_target_row, 18] = "flagged"

    
def leave_note(responses_df, response_index, note_text):
    """
    Appends a note to the 'Notes' column.

    The note is written to the cell at row position ``response_index`` and
    column position 19 of a copy of ``responses_df``:
      * an empty cell (None or NaN) receives ``note_text`` as is,
      * otherwise ``note_text`` is appended after " - ", unless the existing
        note already contains that text.

    params :
        responses_df   : the responses data (DataFrame)
        response_index : row position of the response (int)
        note_text      : the note to add (str)
    returns:
        edited_responses_df: a copy of responses_df with the note added
    """

    edited_responses_df = responses_df.copy()

    current_note = edited_responses_df.iloc[response_index, 19]

    # NaN means "no note yet" just like None does; without this check the
    # result would read "nan - <note>".
    if pd.isna(current_note):
        edited_responses_df.iloc[response_index, 19] = note_text
    elif note_text not in str(current_note):
        edited_responses_df.iloc[response_index, 19] = str(current_note) + " - " + note_text

    return edited_responses_df
    
    
def column_names_to_indices(df):
    """
    Relabels the columns of ``df`` from column names to integer indices.

    The mapping is taken from the module-level ``questions_dict``: the n-th name
    stored there becomes label n. The renamed frame is returned.
    """

    name_to_position = {}
    for position, name in enumerate(questions_dict.values()):
        name_to_position[name] = position

    return df.rename(columns=name_to_position)


def indices_to_column_names(df):
    """
    Relabels the columns of ``df`` from integer indices back to column names.

    The mapping is taken from the module-level ``questions_dict``: label n becomes
    the n-th name stored there. The renamed frame is returned.
    """

    position_to_name = dict(enumerate(questions_dict.values()))

    return df.rename(columns=position_to_name)


def remove_flagged(df, flag_index=18):
    """
    Removes flagged rows (responses).

    A row is dropped when its value in the flag column equals 'flagged';
    rows with any other value (including None / NaN) are kept.

    params :
        df         : the responses data (DataFrame)
        flag_index : label of the flag column; defaults to 18, the flag column
                     used by the rest of the pipeline
    returns:
        processed_df: a new DataFrame without the flagged rows
    """

    processed_df = df[df[flag_index] != 'flagged']

    return processed_df



## Main Pipeline

In [85]:
def remove_duplicates(responses_df):
    """
    Removes duplicated rows (responses) based on 'Email address' and keeps the last response submitted.
    Note: Some students may make changes to their responses and submit a new one,
    which is why this function keeps the last response submitted and removes the preceding ones.

    TODO: Check with the organizers what else is an adequate action.

    params :
        responses_df: the responses data (DataFrame)
    returns:
        edited_responses_df: a new DataFrame with duplicates removed (responses_df is not modified)
    """

    # 1 = 'Email address' column.
    # drop_duplicates returns a new frame; its result has to be kept, otherwise
    # nothing is removed.
    edited_responses_df = responses_df.drop_duplicates(subset=[1], keep='last')

    return edited_responses_df


def flag_duplicates(responses_df):
    """
    Flags superseded responses: when one 'Email address' appears more than once, the
    last response submitted is left alone and every earlier one is flagged.
    Note: students may revise their answers and submit again, hence the last
    submission is the one that stays unflagged.

    TODO: Check with the organizers what else is an adequate action.

    params :
        responses_df: the responses data (DataFrame)
    returns:
        a copy of responses_df with the 'flag' and 'notes' columns updated
    """

    flagged_df = responses_df.copy()

    # 1 = 'Email address' column. duplicated(keep='last') is True for every
    # earlier repeat, so its negation marks the rows to leave untouched.
    is_latest = ~flagged_df.duplicated(subset=[1], keep='last').to_numpy()

    # Series.where keeps the current value where the condition is True and
    # substitutes the replacement everywhere else.
    # 18 = 'flag' column, 19 = 'notes' column
    for column, replacement in ((18, "flagged"), (19, "A duplicated response")):
        flagged_df[column] = flagged_df[column].where(is_latest, replacement)

    return flagged_df


def flag_duplicate_refs(ref_responses_df):
    """
    Flags duplicated reference responses, i.e. the same reference ('Email Address')
    submitting several letters for the same student ('Student Code'). The last
    submitted letter stays unflagged; every earlier one is flagged with a note.

    TODO: Check with the organizers what else is an adequate action.

    params :
        ref_responses_df: the references responses data (DataFrame)
    returns:
        a copy of ref_responses_df with the 'Flag' and 'Notes' columns updated
    """

    flagged_refs_df = ref_responses_df.copy()

    # duplicated(keep='last') is True for every earlier repeat of the
    # (reference, student) pair, so its negation marks the rows to leave untouched.
    pair_columns = ['Email Address', 'Student Code']
    is_latest = ~flagged_refs_df.duplicated(subset=pair_columns, keep='last').to_numpy()

    # Series.where keeps the current value where the condition is True and
    # substitutes the replacement everywhere else.
    replacements = (
        ('Flag', "flagged"),
        ('Notes', "Submitted more than one letter for the same student"),
    )
    for column, replacement in replacements:
        flagged_refs_df[column] = flagged_refs_df[column].where(is_latest, replacement)

    return flagged_refs_df


def flag_short(responses_df, essay_qs):
    """
    Flags responses containing an insufficiently short essay answer, i.e. one with
    fewer than MIN_WORDS_NUM words, and leaves a note on them.

    params :
        responses_df: the responses data (DataFrame)
        essay_qs    : positions of the essay-question columns (list)
    returns:
        a copy of responses_df with short answers flagged
    """

    result_df = responses_df.copy()

    # Visit every essay answer of every response.
    for position in range(len(result_df)):
        for essay_col in essay_qs:
            answer_text = str(result_df.iloc[position, essay_col])

            if word_count(answer_text) >= MIN_WORDS_NUM:
                continue

            # 18 = 'flag' column
            result_df.iloc[position, 18] = "flagged"
            result_df = leave_note(result_df, position, "Insufficient short answer/s")

    return result_df
                    

# Should we flag long answers ??
def flag_long(responses_df, essay_qs):
    """
    Flags responses containing an extremely long essay answer, i.e. one with more
    than MAX_WORDS_NUM words, and leaves a note on them.

    params :
        responses_df: the responses data (DataFrame)
        essay_qs    : positions of the essay-question columns (list)
    returns:
        a copy of responses_df with long answers flagged
    """

    result_df = responses_df.copy()

    # Visit every essay answer of every response.
    for position in range(len(result_df)):
        for essay_col in essay_qs:
            answer_text = str(result_df.iloc[position, essay_col])

            if word_count(answer_text) <= MAX_WORDS_NUM:
                continue

            # 18 = 'flag' column
            result_df.iloc[position, 18] = "flagged"
            result_df = leave_note(result_df, position, "Extremely long answer/s")

    return result_df


def match_refs_based_on_stdn_email(std_responses_df, ref_responses_df):
    """
    Matches reference letters with the student they support, comparing the student's
    e-mail (column 1 of the students data) with the references' 'Student Code' column.
    Flags the student response if fewer than the required number of letters was found,
    and leaves a note.

    For every student that does not already have both letters assigned:
      * no letter found   -> flagged, note "Got no recommendation letters"
      * one letter found  -> flagged, note "Got only one recommendation letter";
                             the letter goes to column 20
      * two letters found -> first one goes to column 20, last one to column 21
      * more than two     -> a note is left (no flag); only reference rows whose 'Flag'
                             is None are used: first one to column 20, last one to column 21

    Only the reference rows actually used for a student get "matched" in their
    'Matched' column.

    params :
        std_responses_df : students responses data (DataFrame)
        ref_responses_df : references responses data (DataFrame)
    returns:
        edited_std_responses_df: a copy of std_responses_df with letters assigned and
                                 responses lacking letters flagged
        ref_responses_df: a copy of ref_responses_df with duplicated letters flagged
                          and the used letters marked in the 'Matched' column
    """

    ref_responses_df = flag_duplicate_refs(ref_responses_df)
    edited_std_responses_df = std_responses_df.copy()

    for row_index in range(len(edited_std_responses_df)):

        # 20 / 21 = first / second recommendation letter columns: nothing to do
        # when both were already assigned.
        if edited_std_responses_df.iloc[row_index, 20] is not None and edited_std_responses_df.iloc[row_index, 21] is not None:
            continue

        # 1 = 'Email address' column
        student_email = edited_std_responses_df.iloc[row_index, 1]
        is_for_student = (ref_responses_df['Student Code'] == student_email).to_numpy()
        matching_refs = list(ref_responses_df.index[is_for_student])
        letters_count = len(matching_refs)

        # Flag student response if BOTH of their references didn't submit any letter
        if letters_count == 0:
            edited_std_responses_df.iloc[row_index, 18] = "flagged"
            edited_std_responses_df = leave_note(edited_std_responses_df, row_index, "Got no recommendation letters")
            continue

        # Flag student response if ANY of their references didn't submit any letter,
        # and assign the one submitted letter to that student
        if letters_count == 1:
            edited_std_responses_df.iloc[row_index, 18] = "flagged"
            edited_std_responses_df = leave_note(edited_std_responses_df, row_index, "Got only one recommendation letter")

            for ref_index in matching_refs:
                edited_std_responses_df.iloc[row_index, 20] = ref_responses_df.loc[ref_index, 'Letter']
                ref_responses_df.loc[ref_index, 'Matched'] = "matched"
            continue

        if letters_count > 2:
            # Some reference submitted more than once: leave a note and only consider
            # the rows that flag_duplicate_refs left unflagged.
            #edited_std_responses_df.iloc[row_index, 18] = "flagged"
            edited_std_responses_df = leave_note(edited_std_responses_df, row_index,
                                                 "Some reference/s submitted more than two letters (The last was taken)")
            usable_refs = [ref_index for ref_index in matching_refs
                           if ref_responses_df.loc[ref_index, 'Flag'] is None]
        else:
            # Exactly two letters: both are used.
            usable_refs = matching_refs

        # The first usable letter becomes the first recommendation letter ...
        if usable_refs:
            first_ref = usable_refs[0]
            edited_std_responses_df.iloc[row_index, 20] = ref_responses_df.loc[first_ref, 'Letter']
            ref_responses_df.loc[first_ref, 'Matched'] = "matched"

        # ... and, once a first letter is present, the last usable letter becomes the second.
        if edited_std_responses_df.iloc[row_index, 20] is not None:
            for ref_index in usable_refs:
                edited_std_responses_df.iloc[row_index, 21] = ref_responses_df.loc[ref_index, 'Letter']
                ref_responses_df.loc[ref_index, 'Matched'] = "matched"

    return edited_std_responses_df, ref_responses_df


def match_refs_based_on_stdn_name(std_responses_df, ref_responses_df):
    """
    Matches reference letters with the student they support, comparing the student's
    first and last name (columns 2 and 3 of the students data) with the references'
    'Student First Name' and 'Student Last Name' columns. Flags the student response if
    fewer than the required number of letters was found, and leaves a note.

    A reference row belongs to a student only when BOTH names are equal. For every
    student that does not already have both letters assigned:
      * no letter found   -> flagged, note "Got no recommendation letters"
      * one letter found  -> flagged, note "Got only one recommendation letter";
                             the letter goes to column 20
      * two letters found -> first one goes to column 20, last one to column 21
      * more than two     -> a note is left (no flag); only reference rows whose 'Flag'
                             is None are used: first one to column 20, last one to column 21

    Only the reference rows actually used for a student get "matched" in their
    'Matched' column.

    params :
        std_responses_df : students responses data (DataFrame)
        ref_responses_df : references responses data (DataFrame)
    returns:
        edited_std_responses_df: a copy of std_responses_df with letters assigned and
                                 responses lacking letters flagged
        ref_responses_df: a copy of ref_responses_df with duplicated letters flagged
                          and the used letters marked in the 'Matched' column
    """

    ref_responses_df = flag_duplicate_refs(ref_responses_df)
    edited_std_responses_df = std_responses_df.copy()

    for row_index in range(len(edited_std_responses_df)):

        # 20 / 21 = first / second recommendation letter columns: nothing to do
        # when both were already assigned.
        if edited_std_responses_df.iloc[row_index, 20] is not None and edited_std_responses_df.iloc[row_index, 21] is not None:
            continue

        # 2 = 'First Name' column, 3 = 'Last Name' column
        first_name = edited_std_responses_df.iloc[row_index, 2]
        last_name = edited_std_responses_df.iloc[row_index, 3]

        # Counting full-name pairs (rather than testing each name on its own) also covers
        # the case where a name appears among the references but the pair does not.
        is_for_student = (
            (ref_responses_df['Student First Name'] == first_name)
            & (ref_responses_df['Student Last Name'] == last_name)
        ).to_numpy()
        matching_refs = list(ref_responses_df.index[is_for_student])
        letters_count = len(matching_refs)

        # Flag student response if BOTH of their references didn't submit any letter
        if letters_count == 0:
            edited_std_responses_df.iloc[row_index, 18] = "flagged"
            edited_std_responses_df = leave_note(edited_std_responses_df, row_index, "Got no recommendation letters")
            continue

        # Flag student response if ANY of their references didn't submit any letter,
        # and assign the one submitted letter to that student
        if letters_count == 1:
            edited_std_responses_df.iloc[row_index, 18] = "flagged"
            edited_std_responses_df = leave_note(edited_std_responses_df, row_index, "Got only one recommendation letter")

            for ref_index in matching_refs:
                edited_std_responses_df.iloc[row_index, 20] = ref_responses_df.loc[ref_index, 'Letter']
                ref_responses_df.loc[ref_index, 'Matched'] = "matched"
            continue

        if letters_count > 2:
            # Some reference submitted more than once: leave a note and only consider
            # the rows that flag_duplicate_refs left unflagged.
            #edited_std_responses_df.iloc[row_index, 18] = "flagged"
            edited_std_responses_df = leave_note(edited_std_responses_df, row_index,
                                                 "Some reference/s submitted more than two letters (The last was taken)")
            usable_refs = [ref_index for ref_index in matching_refs
                           if ref_responses_df.loc[ref_index, 'Flag'] is None]
        else:
            # Exactly two letters: both are used.
            usable_refs = matching_refs

        # The first usable letter becomes the first recommendation letter ...
        if usable_refs:
            first_ref = usable_refs[0]
            edited_std_responses_df.iloc[row_index, 20] = ref_responses_df.loc[first_ref, 'Letter']
            ref_responses_df.loc[first_ref, 'Matched'] = "matched"

        # ... and, once a first letter is present, the last usable letter becomes the second.
        if edited_std_responses_df.iloc[row_index, 20] is not None:
            for ref_index in usable_refs:
                edited_std_responses_df.iloc[row_index, 21] = ref_responses_df.loc[ref_index, 'Letter']
                ref_responses_df.loc[ref_index, 'Matched'] = "matched"

    return edited_std_responses_df, ref_responses_df


def match_refs_based_on_ref_email(std_responses_df, ref_responses_df):
    """
    Matches reference letters with the student they support, using the e-mail addresses of
    the two references each student named. Flags the student response if a named reference
    did not submit anything, and leaves a note.

    The first named reference fills column 20 and the second fills column 21. A slot that
    already holds a letter is left alone. For an empty slot:
      * the reference's e-mail is absent from 'Email Address' -> the student is flagged
        and a note is left
      * otherwise the letter is taken from the reference row whose 'Email Address',
        'Student First Name' and 'Student Last Name' all agree with the student; if several
        rows agree, the last one whose 'Flag' is None is preferred. If no row agrees on
        the names, the slot stays empty and nothing is flagged.

    Only the reference row actually used gets "matched" in its 'Matched' column.

    NOTE(review): the positions of the students' reference e-mail columns are read from
    std_idcs ('first_ref_email_idx' / 'second_ref_email_idx'); this assumes the students
    data really has columns at those positions - TODO confirm before enabling this step.
    NOTE(review): these two columns are not part of std_str_qs, so they are not
    lower-cased by to_lowercase - confirm the e-mail comparison is meant to be exact.

    params :
        std_responses_df : students responses data (DataFrame)
        ref_responses_df : references responses data (DataFrame)
    returns:
        edited_std_responses_df: a copy of std_responses_df with letters assigned and
                                 responses lacking letters flagged
        ref_responses_df: a copy of ref_responses_df with duplicated letters flagged
                          and the used letters marked in the 'Matched' column
    """

    ref_responses_df = flag_duplicate_refs(ref_responses_df)
    edited_std_responses_df = std_responses_df.copy()

    # (letter column, column with the reference's e-mail, note when nothing was submitted)
    letter_slots = (
        (20, std_idcs['first_ref_email_idx'], "Didn't get a letter from their first reference"),
        (21, std_idcs['second_ref_email_idx'], "Didn't get a letters from their second reference"),
    )

    # All students are checked for their first reference, then for their second one.
    for letter_col, ref_email_col, missing_note in letter_slots:

        for row_index in range(len(edited_std_responses_df)):

            # This slot already holds a letter.
            if edited_std_responses_df.iloc[row_index, letter_col] is not None:
                continue

            ref_email = edited_std_responses_df.iloc[row_index, ref_email_col]
            is_from_reference = ref_responses_df['Email Address'] == ref_email

            # Flag student response if the specified reference didn't submit their letter
            if not is_from_reference.any():
                edited_std_responses_df.iloc[row_index, 18] = "flagged"
                edited_std_responses_df = leave_note(edited_std_responses_df, row_index, missing_note)
                continue

            # The reference may have written for several students: the names must agree too.
            # 2 = 'First Name' column, 3 = 'Last Name' column
            is_match = (
                is_from_reference
                & (ref_responses_df['Student First Name'] == edited_std_responses_df.iloc[row_index, 2])
                & (ref_responses_df['Student Last Name'] == edited_std_responses_df.iloc[row_index, 3])
            ).to_numpy()
            candidate_refs = list(ref_responses_df.index[is_match])

            if not candidate_refs:
                continue

            # Prefer the rows flag_duplicate_refs left unflagged (the latest submission).
            unflagged_refs = [ref_index for ref_index in candidate_refs
                              if ref_responses_df.loc[ref_index, 'Flag'] is None]
            chosen_ref = (unflagged_refs or candidate_refs)[-1]

            edited_std_responses_df.iloc[row_index, letter_col] = ref_responses_df.loc[chosen_ref, 'Letter']
            ref_responses_df.loc[chosen_ref, 'Matched'] = "matched"

    return edited_std_responses_df, ref_responses_df


def match_references(std_responses_df, ref_responses_df):
    """
    Runs the reference-matching stages one after the other: first by the student's
    e-mail, then by the student's name on the result of the first stage.

    params :
        std_responses_df : students responses data (DataFrame)
        ref_responses_df : references responses data (DataFrame)
    returns:
        the students data and the references data as returned by the last stage
    """

    matched_std_df, matched_ref_df = match_refs_based_on_stdn_email(std_responses_df, ref_responses_df)
    matched_std_df, matched_ref_df = match_refs_based_on_stdn_name(matched_std_df, matched_ref_df)
    # Matching by the reference's e-mail (match_refs_based_on_ref_email) is not enabled.

    return matched_std_df, matched_ref_df

In [86]:
def main(std_responses_df, ref_responses_df):
    """
    Runs the whole processing pipeline and saves the results as Excel files.

    Steps: rename the student columns to indices, flag duplicated / too short / too long
    responses, lower-case the columns used for comparison, match the reference letters,
    then write "filtered_responses.xlsx" (all responses) and
    "filtered_responses_with_flagged_removed.xlsx" (flagged responses removed), both with
    the original column names put back.

    params :
        std_responses_df : students responses data (DataFrame)
        ref_responses_df : references responses data (DataFrame)
    returns:
        a 5-tuple: processed students data (index-named), the same with original column
        names, processed students data without flagged rows (index-named), the same with
        original column names, and the processed references data
    """

    indexed_df = column_names_to_indices(std_responses_df)

    # Flagging stages, each one fed with the output of the previous stage.
    after_duplicates = flag_duplicates(indexed_df)
    after_short = flag_short(after_duplicates, essay_qs)
    after_long = flag_long(after_short, essay_qs)

    std_lowercase, ref_lowercase = to_lowercase(after_long, std_str_qs, ref_responses_df, ref_str_qs)
    std_final, ref_final = match_references(std_lowercase, ref_lowercase)

    # All responses, original column names restored.
    std_final_named = indices_to_column_names(std_final)
    std_final_named.to_excel("filtered_responses.xlsx")

    # Only the unflagged responses, original column names restored.
    std_unflagged = remove_flagged(std_final)
    std_unflagged_named = indices_to_column_names(std_unflagged)
    std_unflagged_named.to_excel("filtered_responses_with_flagged_removed.xlsx")

    return std_final, std_final_named, std_unflagged, std_unflagged_named, ref_final

In [87]:
# Run the pipeline on the raw students / references sheets. main() also writes
# "filtered_responses.xlsx" and "filtered_responses_with_flagged_removed.xlsx".
responses_df_final, named_responses_df_final, responses_df_final_flagged_removed, named_responses_df_final_flagged_removed, ref_responses_df_final = main(std_raw_responses_df, ref_raw_responses_df)

In [88]:
responses_df_final.iloc[:, 18:]

Unnamed: 0,18,19,20,21
0,flagged,A duplicated response - Got only one recommend...,PDF 1,
1,flagged,A duplicated response - Got only one recommend...,PDF 1,
2,flagged,Insufficient short answer/s - Got only one rec...,PDF 1,
3,flagged,Insufficient short answer/s,PDF 2,PDF 15
4,flagged,Insufficient short answer/s,PDF 3,PDF 16
5,flagged,Insufficient short answer/s - Got only one rec...,PDF 4,PDF 17
6,flagged,Insufficient short answer/s - Got only one rec...,PDF 5,PDF 18
7,flagged,Insufficient short answer/s,PDF 6,PDF 19
8,flagged,Insufficient short answer/s - Got only one rec...,PDF 7,
9,flagged,Insufficient short answer/s - Extremely long a...,PDF 8,PDF 21


In [89]:
responses_df_final.iloc[:, 1:2]

Unnamed: 0,1
0,student1@gmail.com
1,student1@gmail.com
2,student1@gmail.com
3,amihretu@andrew.cmu.edu
4,asare.dorcas18@gmail.com
5,ashabilane@gmail.com
6,asma.ahmed1701@alexmed.edu.eg
7,bodoi@umat.edu.gh
8,atadanas@gmail.com
9,kiagebrenda@gmail.com


In [90]:
responses_df_final.iloc[:, 2:4]

Unnamed: 0,2,3
0,abigaile,ukunze dukorerimana
1,abigaile,ukunze dukorerimana
2,abigaile,ukunze dukorerimana
3,arisema mezgebe,mihretu
4,asare,dorcas
5,ashabilan,ebrahim
6,asmaa,reda
7,benjamin,odoi
8,benjamin sogodam,atadana
9,brenda,nyarango


In [91]:
ref_responses_df_final

Unnamed: 0,Email Address,Student Code,Student First Name,Student Last Name,Letter,Flag,Notes,Matched
0,reference1@gmail.com,student1@gmail.com,abigaile,ukunze dukorerimana,PDF 1,,,matched
1,reference2@gmail.com,amihretu@andrew.cmu.edu,arisema mezgebe,mihretu,PDF 2,,,matched
2,reference3@gmail.com,asare.dorcas18@gmail.com,asare,dorcas,PDF 3,,,matched
3,reference4@gmail.com,i don't care,ashabilan,ebrahim,PDF 4,,,matched
4,reference5@gmail.com,i don't care,asmaa,reda,PDF 5,,,matched
5,reference6@gmail.com,bodoi@umat.edu.gh,benjamin,odoi,PDF 6,,,matched
6,reference7@gmail.com,atadanas@gmail.com,benjamin sogodam,atadana,PDF 7,,,matched
7,reference8@gmail.com,kiagebrenda@gmail.com,brenda,nyarango,PDF 8,,,matched
8,reference9@gmail.com,brianashiundu000@gmail.com,brian,mboya,PDF 9,,,matched
9,reference10@gmail.com,ambambaakambabrunodupon@gmail.com,bruno dupon,ambamba akamba,PDF 10,,,matched


In [92]:
responses_df_final_flagged_removed.iloc[:, 18:]

Unnamed: 0,18,19,20,21
15,,,PDF 11,PDF 24
16,,,PDF 12,PDF 25
18,,Some reference/s submitted more than two lette...,PDF 14,PDF 29


In [93]:
responses_df_final_flagged_removed.iloc[:, 1:2]

Unnamed: 0,1
15,dawoudusman6@gmail.com
16,dikubaashley@gmail.com
18,student3@gmail.com


In [94]:
responses_df_final_flagged_removed.iloc[:, 2:4]

Unnamed: 0,2,3
15,dawoud,usman
16,dikuba,alicia ashley
18,elizabeth,mbirbah
