# Applications Processing Automation

(*By: [@mahmoud-elmakki](https://github.com/mahmoud-elmakki)*)

The purpose of this code is to automate the first trivial filtering steps in the processing of the applications for the TReND in Africa Computational Neuroscience and Machine Learning Basics course.

This code is organized as a set of functions to be applied as a processing pipeline on the application responses data (See [documentation](https://docs.google.com/document/d/1n4pMEOgMuenuFpN6zXQtZlpYFXwPat2P4-SzZaN8mFg/edit?usp=drivesdk)).

### **How to use (as a developer):**
Just clone the Github repository and get into the business!\
If you have Anaconda and Jupyter installed locally, you can clone the repository directly onto your machine. Otherwise, you can clone it into Google Colab.
(In either case, don't forget to regularly pull and push changes).

### **How to use (as a reviewer):**
If you are on Github now, open this notebook in Google Colab, or clone the whole repo locally, so you can run the cells. In case of running it in Colab, don't forget to save and download the resulting Excel sheet of the processed responses into a local folder.

In [None]:
import numpy as np
import pandas as pd

import os

In [None]:
# Note that you have to download the responses data Excel sheet from Google Drive and put it in the same folder as the code.
# You don't have to do this if you cloned the Github repo (all will be organized in the repo).

# TODO: Load data directly from Google Drive.

# Loading students responses data
STD_DATA_DIR = './responses_data/TReND Comp Neuro application form Rwanda 2024 (Responses).xlsx'
std_raw_responses_df = pd.read_excel(STD_DATA_DIR)

# Loading references responses data
REF_DATA_DIR = './responses_data/Recommendation-Letter-Portal.xlsx'
ref_raw_responses_df = pd.read_excel(REF_DATA_DIR)

# Adding four helper columns to the students responses DataFrame (initialized with None for all cells).
std_raw_responses_df['Flag'] = None  # String ("flagged" or None)
std_raw_responses_df['Notes'] = None  # String (text of notes == reasons for flagging)
std_raw_responses_df['Recommendation Letter 1'] = None
std_raw_responses_df['Recommendation Letter 2'] = None

# Helper columns of the references responses DataFrame.
# A reference response is flagged if the referee submits more than one letter (the last letter submitted is kept).
ref_raw_responses_df['Flag'] = None
ref_raw_responses_df['Notes'] = None
ref_raw_responses_df['Matched'] = "unmatched"

# Just specify the folder name - the code will create the directory.
RESULTS_FOLDER_NAME = "filtered_responses"
RESULTS_DIR = os.path.join(os.getcwd(), RESULTS_FOLDER_NAME)

# exist_ok=True makes re-running the cell safe without a separate existence check.
os.makedirs(RESULTS_DIR, exist_ok=True)

In [None]:
# Specify the minimum and maximum number of words allowed in answers to essay questions.
# Note: These limits apply to all essay questions.

MIN_WORDS_NUM = 10
MAX_WORDS_NUM = 500

In [None]:
# Inspect the column names of the references responses DataFrame.
ref_raw_responses_df.columns

In [None]:
# Positional lookup table {column position: column name} for the students
# responses; use it as a reference when choosing column indices.

std_questions_dict = dict(enumerate(std_raw_responses_df.columns))
std_questions_dict

In [None]:
# Positional lookup table {column position: column name} for the references
# responses; use it as a reference when choosing column indices.

ref_questions_dict = dict(enumerate(ref_raw_responses_df.columns))
ref_questions_dict

In [None]:
# Column positions used from the students responses DataFrame.
# Positions 28-31 are the helper columns appended after loading
# ('Flag', 'Notes' and the two recommendation-letter columns).
std_idcs = dict(
    email_idx=1,
    firstname_idx=2,
    lastname_idx=3,
    ref=dict(
        first_ref_email_idx=25,
        second_ref_email_idx=27,
    ),
    flag_idx=28,
    notes_idx=29,
    first_recomm_letter_idx=30,
    second_recomm_letter_idx=31,
)

# Column positions used from the references responses DataFrame.
# The nested 'std' entry holds the columns describing the supported student.
ref_idcs = dict(
    email_idx=1,
    std=dict(
        firstname_idx=3,
        lastname_idx=4,
        email_idx=5,
    ),
    letter_idx=7,
    flag_idx=8,
    notes_idx=9,
    matched_idx=10,
)

# Name columns (first name, last name) in each DataFrame.
std_names = [std_idcs[key] for key in ('firstname_idx', 'lastname_idx')]
ref_names = [ref_idcs['std'][key] for key in ('firstname_idx', 'lastname_idx')]

# String columns that get normalized before comparison (emails followed by names).
std_str_qs = [std_idcs['email_idx'], *std_names]
ref_str_qs = [ref_idcs['email_idx'], ref_idcs['std']['email_idx'], *ref_names]

# Carefully specify the columns to be processed (mostly responses for essay questions).
essay_qs = [20, 21, 22]

## Utility functions

In [None]:
def word_count(answer):
    """
    Counts the whitespace-separated words in one answer (cell) of an essay question.

    params :
        answer: the answer text (str)
    returns:
        the number of words in the answer (int)
    """
    words = answer.split()
    return len(words)


def to_lowercase(std_df, std_str_qs, ref_df, ref_str_qs):
    """
    Lowercases the given string columns of both DataFrames so that later
    string comparisons are case-insensitive.

    Both DataFrames are modified in place and also returned.

    params :
        std_df     : students responses data (DataFrame)
        std_str_qs : columns of std_df to convert (list)
        ref_df     : references responses data (DataFrame)
        ref_str_qs : columns of ref_df to convert (list)
    returns:
        std_df, ref_df: the same two DataFrames with the columns lowercased
    """
    for frame, columns in ((std_df, std_str_qs), (ref_df, ref_str_qs)):
        for column in columns:
            frame[column] = frame[column].str.lower()

    return std_df, ref_df


def to_uppercase(std_df, std_str_qs, ref_df, ref_str_qs):
    """
    Title-cases the given string columns of both DataFrames (first letter of
    each word capitalized), e.g. to make lowercased names presentable again.

    Note: despite its name, this applies title case, not full upper case.
    Both DataFrames are modified in place and also returned.

    params :
        std_df     : students responses data (DataFrame)
        std_str_qs : columns of std_df to convert (list)
        ref_df     : references responses data (DataFrame)
        ref_str_qs : columns of ref_df to convert (list)
    returns:
        std_df, ref_df: the same two DataFrames with the columns title-cased
    """
    for frame, columns in ((std_df, std_str_qs), (ref_df, ref_str_qs)):
        for column in columns:
            frame[column] = frame[column].str.title()

    return std_df, ref_df


def remove_spaces(std_df, std_str_qs, ref_df, ref_str_qs):
    """
    Strips every space character from the given string columns (names and
    emails) of both DataFrames.

    Both DataFrames are modified in place and also returned.

    params :
        std_df     : students responses data (DataFrame)
        std_str_qs : columns of std_df to clean (list)
        ref_df     : references responses data (DataFrame)
        ref_str_qs : columns of ref_df to clean (list)
    returns:
        std_df, ref_df: the same two DataFrames with spaces removed
    """
    for frame, columns in ((std_df, std_str_qs), (ref_df, ref_str_qs)):
        for column in columns:
            frame[column] = frame[column].str.replace(" ", "")

    return std_df, ref_df


def set_flag(responses_df, email):
    """
    Sets the 'Flag' column to "flagged" for every response whose email address equals `email`.

    This modifies the DataFrame itself (i.e. changes it in place) and returns nothing.

    params :
        responses_df: students responses data with index-named columns (DataFrame)
        email       : the email address identifying the response/s to flag (str)
    """
    # .iloc does not accept a boolean Series as a row mask, so convert the
    # comparison result to a plain NumPy boolean array first.
    matches_email = (responses_df[std_idcs['email_idx']] == email).to_numpy()
    responses_df.iloc[matches_email, std_idcs['flag_idx']] = "flagged"

    
def leave_note(responses_df, response_index, note_text):
    """
    Appends a note to the 'Notes' column of one response.

    The input DataFrame is not modified; an edited copy is returned.
    A note that is already contained in the existing notes text is not added again.

    params :
        responses_df  : students responses data (DataFrame)
        response_index: positional row index of the response (int)
        note_text     : the note to add (str)
    returns:
        edited_responses_df: a copy of responses_df with the note added
    """
    edited_responses_df = responses_df.copy()
    notes_col = std_idcs['notes_idx']
    existing_note = edited_responses_df.iloc[response_index, notes_col]

    # Check for an empty cell first: comparing against str(None) / str(nan)
    # would make notes such as "on" look already present, or produce "nan. ..." text.
    if existing_note is None or pd.isna(existing_note):
        edited_responses_df.iloc[response_index, notes_col] = note_text
    elif note_text not in str(existing_note):
        edited_responses_df.iloc[response_index, notes_col] = str(existing_note) + ". " + note_text

    return edited_responses_df
    
    
def column_names_to_indices(df, indices_dict):
    """
    Renames the columns of df from their names to integer positions.

    params :
        df          : the DataFrame to rename (DataFrame)
        indices_dict: mapping whose values are the column names, in positional order (dict)
    returns:
        a renamed copy of df; df itself is left untouched
    """
    names = list(indices_dict.values())
    name_to_position = dict(zip(names, range(len(names))))

    return df.rename(columns=name_to_position)


def indices_to_column_names(df, indices_dict):
    """
    Renames the columns of df from integer positions back to their names.

    params :
        df          : the DataFrame to rename (DataFrame)
        indices_dict: mapping whose values are the column names, in positional order (dict)
    returns:
        a renamed copy of df; df itself is left untouched
    """
    position_to_name = dict(enumerate(indices_dict.values()))

    return df.rename(columns=position_to_name)


def remove_flagged(df):
    """
    Drops the flagged rows (responses) from the students responses.

    params :
        df: students responses data with index-named columns (DataFrame)
    returns:
        a new DataFrame without the rows whose 'Flag' column equals "flagged"
    """
    is_flagged = df[std_idcs['flag_idx']] == 'flagged'
    flagged_labels = df[is_flagged].index

    return df.drop(flagged_labels)


def get_unmatched_letters(ref_responses_df):
    """
    Selects the recommendation letters that were not matched to any student.

    params :
        ref_responses_df: references responses data with index-named columns (DataFrame)
    returns:
        the rows whose 'Matched' column equals "unmatched"
    """
    is_unmatched = ref_responses_df[ref_idcs['matched_idx']] == "unmatched"

    return ref_responses_df.loc[is_unmatched]


def get_std_by_email(std_responses_df, email):
    """
    Returns the student responses whose email address equals `email`.
    """
    same_email = std_responses_df[std_idcs['email_idx']] == email

    return std_responses_df.loc[same_email]


def get_ref_by_email(ref_responses_df, email):
    """
    Returns the reference responses whose referee email address equals `email`.
    """
    same_email = ref_responses_df[ref_idcs['email_idx']] == email

    return ref_responses_df.loc[same_email]


def get_std_by_email_from_ref(ref_responses_df, std_email):
    """
    Returns the reference responses written for the student with email `std_email`.
    """
    same_student = ref_responses_df[ref_idcs['std']['email_idx']] == std_email

    return ref_responses_df.loc[same_student]


def get_std_by_firstname_from_ref(ref_responses_df, firstname):
    """
    Returns the reference responses whose student first name equals `firstname`.
    """
    same_firstname = ref_responses_df[ref_idcs['std']['firstname_idx']] == firstname

    return ref_responses_df.loc[same_firstname]


def get_std_by_lastname_from_ref(ref_responses_df, lastname):
    """
    Returns the reference responses whose student last name equals `lastname`.
    """
    same_lastname = ref_responses_df[ref_idcs['std']['lastname_idx']] == lastname

    return ref_responses_df.loc[same_lastname]


def get_std_by_firstname_and_lastname_from_ref(ref_responses_df, firstname, lastname):
    """
    Returns the reference responses whose student first AND last name match.

    params :
        ref_responses_df: references responses data with index-named columns (DataFrame)
        firstname       : the student's first name to look for (str)
        lastname        : the student's last name to look for (str)
    returns:
        the matching rows of ref_responses_df
    """
    # Both masks must be built from the DataFrame passed in, not from a
    # global result of a previous pipeline run.
    same_firstname = ref_responses_df[ref_idcs['std']['firstname_idx']] == firstname
    same_lastname = ref_responses_df[ref_idcs['std']['lastname_idx']] == lastname

    return ref_responses_df.loc[same_firstname & same_lastname]

## Main Pipeline

In [None]:
def remove_duplicates(responses_df):
    """
    Removes duplicated rows (responses) based on 'Email address' and keeps the last response submitted.
    Note: Some students may make changes to their responses and submit a new one,
    this is why this function keeps the last response submitted and removes preceding ones.

    TODO: Check with the organizers what else is an adequate action.

    params :
        responses_df: the responses data (DataFrame)
    returns:
        edited_responses_df: A new DataFrame with duplicates removed (responses_df is left untouched)
    """

    # drop_duplicates returns a new DataFrame (it is not in place by default),
    # so its result has to be the value that is returned.
    edited_responses_df = responses_df.drop_duplicates(subset=[std_idcs['email_idx']], keep='last')

    return edited_responses_df


def flag_duplicates(responses_df):
    """
    Flags duplicated rows (responses) based on 'Email address' and keeps the last response submitted.
    Note: Some students may make changes to their responses and submit a new one,
    this is why this function keeps the last response submitted, flags the preceding ones, and leaves a note.

    TODO: Check with the organizers what else is an adequate action.

    params :
        responses_df: the responses data (DataFrame)
    returns:
        edited_responses_df: An edited copy of responses_df with the 'Flag' and 'Notes' columns updated
    """

    edited_responses_df = responses_df.copy()

    flag_col = std_idcs['flag_idx']
    notes_col = std_idcs['notes_idx']

    # True for every response that is followed by a later one with the same email.
    # The email column comes from std_idcs so it stays in sync with the rest of the pipeline.
    is_duplicate = edited_responses_df.duplicated(subset=[std_idcs['email_idx']], keep='last').to_numpy()

    # Series.where keeps the current value where the condition is True and
    # writes the replacement where it is False, hence the inverted mask.
    edited_responses_df[flag_col] = edited_responses_df[flag_col].where(~is_duplicate, "flagged")
    edited_responses_df[notes_col] = edited_responses_df[notes_col].where(~is_duplicate, "A duplicated response")

    return edited_responses_df


def flag_duplicate_refs(ref_responses_df):
    """
    Flags duplicated reference responses (i.e. a referee submitting multiple letters
    for the same student), keeping the last submitted one unflagged.

    TODO: Check with the organizers what else is an adequate action.

    params :
        ref_responses_df: the references responses data (DataFrame)
    returns:
        flagged_refs_df: An edited copy of ref_responses_df with the 'Flag' and 'Notes' columns updated
    """

    flagged_refs_df = ref_responses_df.copy()

    flag_col = ref_idcs['flag_idx']
    notes_col = ref_idcs['notes_idx']
    # A duplicate is the same referee email writing for the same student email.
    key_columns = [ref_idcs['email_idx'], ref_idcs['std']['email_idx']]

    # True for the rows to keep as they are (i.e. NOT an earlier duplicate).
    keep_as_is = ~flagged_refs_df.duplicated(subset=key_columns, keep='last').to_numpy()

    # Series.where keeps the value where the condition holds and replaces it elsewhere.
    flagged_refs_df[flag_col] = flagged_refs_df[flag_col].where(keep_as_is, "flagged")
    flagged_refs_df[notes_col] = flagged_refs_df[notes_col].where(
        keep_as_is,
        "Submitted more than one letter for the same student"
    )

    return flagged_refs_df


def flag_short(responses_df, essay_qs):
    """
    Flags responses containing an insufficiently short answer (fewer words than
    MIN_WORDS_NUM) to any of the given essay questions, and leaves a note.

    params :
        responses_df: the responses data (DataFrame)
        essay_qs    : positional indices of the essay question columns (list)
    returns:
        flagged_df: An edited copy of responses_df with short answers flagged
    """

    flagged_df = responses_df.copy()
    flag_col = std_idcs['flag_idx']

    for position in range(len(flagged_df)):
        # Cells are converted with str() before counting, so non-string values are counted too.
        answers = (str(flagged_df.iloc[position, question]) for question in essay_qs)

        if any(word_count(text) < MIN_WORDS_NUM for text in answers):
            flagged_df.iloc[position, flag_col] = "flagged"
            flagged_df = leave_note(flagged_df, position, "Insufficient short answer/s")

    return flagged_df
                    

# Should we flag long answers ??
def flag_long(responses_df, essay_qs):
    """
    Flags responses containing an extremely long answer (more words than
    MAX_WORDS_NUM) to any of the given essay questions, and leaves a note.

    params :
        responses_df: the responses data (DataFrame)
        essay_qs    : positional indices of the essay question columns (list)
    returns:
        flagged_df: An edited copy of responses_df with long answers flagged
    """

    flagged_df = responses_df.copy()
    flag_col = std_idcs['flag_idx']

    for position in range(len(flagged_df)):
        # Cells are converted with str() before counting, so non-string values are counted too.
        answers = (str(flagged_df.iloc[position, question]) for question in essay_qs)

        if any(word_count(text) > MAX_WORDS_NUM for text in answers):
            flagged_df.iloc[position, flag_col] = "flagged"
            flagged_df = leave_note(flagged_df, position, "Extremely long answer/s")

    return flagged_df


def match_refs_based_on_stdn_email(std_responses_df, ref_responses_df):
    """
    Matches recommendation letters to students by comparing the student's email address with the
    student email given by the referee, flags the student response if it got fewer than two
    letters, and leaves a note.

    The number of letters of a student is the number of reference rows carrying the student's email.
    Depending on that number (0, 1, 2 or more than 2) the student is flagged and/or gets the letter/s
    copied into the two recommendation-letter columns. Every letter that is copied is marked "matched".

    params :
        std_responses_df : students responses data (DataFrame)
        ref_responses_df : references responses data (DataFrame)
    returns:
        edited_std_responses_df: An edited copy of std_responses_df with unsatisfied conditions for recommendation letters flagged
        ref_responses_df: The references data (after flag_duplicate_refs) with the "Matched" column marked for letters that were successfully matched.
    """
     
    # Mark earlier letters of the same referee for the same student as flagged (rows are kept, not removed).
    ref_responses_df = flag_duplicate_refs(ref_responses_df)
    edited_std_responses_df = std_responses_df.copy()
    
    for row_index in range(len(edited_std_responses_df)):
        
        # Skip students that already have both letter cells filled.
        if edited_std_responses_df.iloc[row_index, std_idcs['first_recomm_letter_idx']] is not None and edited_std_responses_df.iloc[row_index, std_idcs['second_recomm_letter_idx']] is not None:
            continue
            
        # Flag student response if BOTH of their references didn't submit any letter
        
        if edited_std_responses_df.iloc[row_index, std_idcs['email_idx']] not in ref_responses_df[ref_idcs['std']['email_idx']].values:
            
            edited_std_responses_df.iloc[row_index, std_idcs['flag_idx']] = "flagged"
            edited_std_responses_df = leave_note(edited_std_responses_df, row_index, "Got no recommendation letters")
        
        # Flag student response if ANY of their references didn't submit any letter
        # Assign the one submitted letter to that student
        # NOTE(review): value_counts() counts every reference row with this student email, including
        # rows flagged as duplicates above - confirm that this is the intended way of counting.
        
        elif ref_responses_df[ref_idcs['std']['email_idx']].value_counts()[edited_std_responses_df.iloc[row_index, std_idcs['email_idx']]] == 1:
            
            edited_std_responses_df.iloc[row_index, std_idcs['flag_idx']] = "flagged"
            edited_std_responses_df = leave_note(edited_std_responses_df, row_index, "Got only one recommendation letter")
            
            for ref_index in range(len(ref_responses_df)):
                
                if ref_responses_df.iloc[ref_index, ref_idcs['std']['email_idx']] == edited_std_responses_df.iloc[row_index, std_idcs['email_idx']]:
                    edited_std_responses_df.iloc[row_index, std_idcs['first_recomm_letter_idx']] = ref_responses_df.iloc[ref_index, ref_idcs['letter_idx']]
                    ref_responses_df.iloc[ref_index, ref_idcs['matched_idx']] = "matched"
        
        # This, from here below, would look much prettier with a while loop!
    
        # If exactly TWO reference rows carry the student's email,
        # assign the two letters to the specific student
        # NOTE(review): this branch does not look at the reference 'Flag' column, so two letters
        # from the same referee would both be assigned - confirm.
        
        elif ref_responses_df[ref_idcs['std']['email_idx']].value_counts()[edited_std_responses_df.iloc[row_index, std_idcs['email_idx']]] == 2:
            
            # First pass: the first matching row becomes letter 1.
            for ref_index in range(len(ref_responses_df)):
                
                if ref_responses_df.iloc[ref_index, ref_idcs['std']['email_idx']] == edited_std_responses_df.iloc[row_index, std_idcs['email_idx']]:
                    edited_std_responses_df.iloc[row_index, std_idcs['first_recomm_letter_idx']] = ref_responses_df.iloc[ref_index, ref_idcs['letter_idx']]
                    ref_responses_df.iloc[ref_index, ref_idcs['matched_idx']] = "matched"
                    
                    break
                
            # Second pass: there is no break, so every matching row overwrites letter 2
            # and the last matching row is the one that remains.
            for ref_index in range(len(ref_responses_df)):
                
                if ref_responses_df.iloc[ref_index, ref_idcs['std']['email_idx']] == edited_std_responses_df.iloc[row_index, std_idcs['email_idx']] and edited_std_responses_df.iloc[row_index, std_idcs['first_recomm_letter_idx']] is not None:
                    edited_std_responses_df.iloc[row_index, std_idcs['second_recomm_letter_idx']] = ref_responses_df.iloc[ref_index, ref_idcs['letter_idx']]
                    ref_responses_df.iloc[ref_index, ref_idcs['matched_idx']] = "matched"
        
        # Leave a note if more than two reference rows carry the student's email
        # (flagging the student is currently disabled, see the commented-out line below),
        # and assign letters taken from the unflagged reference rows only
        
        elif ref_responses_df[ref_idcs['std']['email_idx']].value_counts()[edited_std_responses_df.iloc[row_index, std_idcs['email_idx']]] > 2:
            
            #edited_std_responses_df.iloc[row_index, std_idcs['flag_idx']] = "flagged"
            edited_std_responses_df = leave_note(edited_std_responses_df, row_index,
                                                 "Some reference/s submitted more than two letters (The last was taken)")
            
            # First pass: the first unflagged matching row becomes letter 1.
            for ref_index in range(len(ref_responses_df)):
                
                if ref_responses_df.iloc[ref_index, ref_idcs['std']['email_idx']] == edited_std_responses_df.iloc[row_index, std_idcs['email_idx']] and ref_responses_df.iloc[ref_index, ref_idcs['flag_idx']] is None:
                    
                    edited_std_responses_df.iloc[row_index, std_idcs['first_recomm_letter_idx']] = ref_responses_df.iloc[ref_index, ref_idcs['letter_idx']]
                    ref_responses_df.iloc[ref_index, ref_idcs['matched_idx']] = "matched"
                    
                    break
                    
            # Second pass: the last unflagged matching row remains as letter 2.
            # NOTE(review): if only one unflagged row exists, both passes pick that same row,
            # so letter 1 and letter 2 would be identical - confirm whether this needs handling.
            for ref_index in range(len(ref_responses_df)):
                
                if ref_responses_df.iloc[ref_index, ref_idcs['std']['email_idx']] == edited_std_responses_df.iloc[row_index, std_idcs['email_idx']] and edited_std_responses_df.iloc[row_index, std_idcs['first_recomm_letter_idx']] is not None and ref_responses_df.iloc[ref_index, ref_idcs['flag_idx']] is None:
                    
                    edited_std_responses_df.iloc[row_index, std_idcs['second_recomm_letter_idx']] = ref_responses_df.iloc[ref_index, ref_idcs['letter_idx']]
                    ref_responses_df.iloc[ref_index, ref_idcs['matched_idx']] = "matched"
                    
    return edited_std_responses_df, ref_responses_df


def match_refs_based_on_stdn_name(std_responses_df, ref_responses_df):
    """
    Matches recommendation letters to students by comparing the student's (first name, last name)
    with the student name given by the referee, flags the student response if it got fewer than two
    letters, and leaves a note.

    The number of letters of a student is the number of reference rows carrying the student's
    name pair. Every letter that is copied to a student is marked "matched".

    params :
        std_responses_df : students responses data (DataFrame)
        ref_responses_df : references responses data (DataFrame)
    returns:
        edited_std_responses_df: An edited copy of std_responses_df with unsatisfied conditions for recommendation letters flagged,
        ref_responses_df: The references data (after flag_duplicate_refs) with the "Matched" column marked for letters that were successfully matched.
    """
     
    # Mark earlier letters of the same referee for the same student as flagged (rows are kept, not removed).
    ref_responses_df = flag_duplicate_refs(ref_responses_df)
    edited_std_responses_df = std_responses_df.copy()
    
    for row_index in range(len(edited_std_responses_df)):
        
        # Skip students that already have both letter cells filled.
        if edited_std_responses_df.iloc[row_index, std_idcs['first_recomm_letter_idx']] is not None and edited_std_responses_df.iloc[row_index, std_idcs['second_recomm_letter_idx']] is not None:
            continue
            
        # Count the reference rows per (first name, last name) pair and look up this student's pair.
        # The positional slice firstname_idx:lastname_idx + 1 relies on the two name columns being adjacent.
        # A KeyError means the pair does not occur in the references data at all.
        try:
            got_one_letter = ref_responses_df[[ref_idcs['std']['firstname_idx'], ref_idcs['std']['lastname_idx']]].value_counts()[tuple(edited_std_responses_df.iloc[row_index, std_idcs['firstname_idx']:std_idcs['lastname_idx'] + 1])] == 1
        
        except KeyError:
            got_one_letter = False
            
        try:
            got_two_letters = ref_responses_df[[ref_idcs['std']['firstname_idx'], ref_idcs['std']['lastname_idx']]].value_counts()[tuple(edited_std_responses_df.iloc[row_index, std_idcs['firstname_idx']:std_idcs['lastname_idx'] + 1])] == 2

        except KeyError:
            got_two_letters = False 
            
        try:
            got_more_than_two_letters = ref_responses_df[[ref_idcs['std']['firstname_idx'], ref_idcs['std']['lastname_idx']]].value_counts()[tuple(edited_std_responses_df.iloc[row_index, std_idcs['firstname_idx']:std_idcs['lastname_idx'] + 1])] > 2
       
        except KeyError:
            got_more_than_two_letters = False 
        
        # None of the three cases applies -> no reference row carries this name pair.
        if not got_one_letter and not got_two_letters and not got_more_than_two_letters:
            got_no_letters = True
            
        else:
            got_no_letters = False
    
        # Flag student response if BOTH of their references didn't submit any letter
          
        if got_no_letters:
            
            edited_std_responses_df.iloc[row_index, std_idcs['flag_idx']] = "flagged"
            edited_std_responses_df = leave_note(edited_std_responses_df, row_index, "Got no recommendation letters")
        
        # Flag student response if ANY of their references didn't submit any letter
        # Assign the one submitted letter to that student
            
        if got_one_letter:
            
            edited_std_responses_df.iloc[row_index, std_idcs['flag_idx']] = "flagged"
            edited_std_responses_df = leave_note(edited_std_responses_df, row_index, "Got only one recommendation letter")
            
            for ref_index in range(len(ref_responses_df)):
                
                if ref_responses_df.iloc[ref_index, [ref_idcs['std']['firstname_idx'], ref_idcs['std']['lastname_idx']]].tolist() == edited_std_responses_df.iloc[row_index, std_idcs['firstname_idx']:std_idcs['lastname_idx'] + 1].tolist():
                    
                    edited_std_responses_df.iloc[row_index, std_idcs['first_recomm_letter_idx']] = ref_responses_df.iloc[ref_index, ref_idcs['letter_idx']]
                    ref_responses_df.iloc[ref_index, ref_idcs['matched_idx']] = "matched"
                    
                    break
        
        # This, from here below, would look much prettier with a while loop!
    
        # If exactly TWO reference rows carry the student's name pair,
        # assign the two letters to the specific student
        # NOTE(review): this branch does not look at the reference 'Flag' column - confirm.
        
        if got_two_letters:
            
            # First pass: the first matching row becomes letter 1.
            for ref_index in range(len(ref_responses_df)):
                
                if ref_responses_df.iloc[ref_index, [ref_idcs['std']['firstname_idx'], ref_idcs['std']['lastname_idx']]].tolist() == edited_std_responses_df.iloc[row_index, std_idcs['firstname_idx']:std_idcs['lastname_idx'] + 1].tolist():
                    
                    edited_std_responses_df.iloc[row_index, std_idcs['first_recomm_letter_idx']] = ref_responses_df.iloc[ref_index, ref_idcs['letter_idx']]
                    ref_responses_df.iloc[ref_index, ref_idcs['matched_idx']] = "matched"
                    
                    break
                
            # Second pass: there is no break, so the last matching row remains as letter 2.
            for ref_index in range(len(ref_responses_df)):
                
                if ref_responses_df.iloc[ref_index, [ref_idcs['std']['firstname_idx'], ref_idcs['std']['lastname_idx']]].tolist() == edited_std_responses_df.iloc[row_index, std_idcs['firstname_idx']:std_idcs['lastname_idx'] + 1].tolist() and edited_std_responses_df.iloc[row_index, std_idcs['first_recomm_letter_idx']] is not None:
                    
                    edited_std_responses_df.iloc[row_index, std_idcs['second_recomm_letter_idx']] = ref_responses_df.iloc[ref_index, ref_idcs['letter_idx']]
                    ref_responses_df.iloc[ref_index, ref_idcs['matched_idx']] = "matched"
        
        # Leave a note if more than two reference rows carry the student's name pair,
        # and assign letters taken from the unflagged reference rows only
        
        if got_more_than_two_letters:
            
            edited_std_responses_df = leave_note(edited_std_responses_df, row_index,
                                                 "Some reference/s submitted more than two letters (The last was taken)")

            # First pass: the first unflagged matching row becomes letter 1.
            for ref_index in range(len(ref_responses_df)):

                if ref_responses_df.iloc[ref_index, [ref_idcs['std']['firstname_idx'], ref_idcs['std']['lastname_idx']]].tolist() == edited_std_responses_df.iloc[row_index, std_idcs['firstname_idx']:std_idcs['lastname_idx'] + 1].tolist() and ref_responses_df.iloc[ref_index, ref_idcs['flag_idx']] is None:

                    edited_std_responses_df.iloc[row_index, std_idcs['first_recomm_letter_idx']] = ref_responses_df.iloc[ref_index, ref_idcs['letter_idx']]
                    ref_responses_df.iloc[ref_index, ref_idcs['matched_idx']] = "matched"

                    break

            # Second pass: the last unflagged matching row remains as letter 2.
            # NOTE(review): if only one unflagged row exists, both passes pick that same row,
            # so letter 1 and letter 2 would be identical - confirm whether this needs handling.
            for ref_index in range(len(ref_responses_df)):

                if ref_responses_df.iloc[ref_index, [ref_idcs['std']['firstname_idx'], ref_idcs['std']['lastname_idx']]].tolist() == edited_std_responses_df.iloc[row_index, std_idcs['firstname_idx']:std_idcs['lastname_idx'] + 1].tolist() and edited_std_responses_df.iloc[row_index, std_idcs['first_recomm_letter_idx']] is not None and ref_responses_df.iloc[ref_index, ref_idcs['flag_idx']] is None:

                    edited_std_responses_df.iloc[row_index, std_idcs['second_recomm_letter_idx']] = ref_responses_df.iloc[ref_index, ref_idcs['letter_idx']]
                    ref_responses_df.iloc[ref_index, ref_idcs['matched_idx']] = "matched"
                    
    return edited_std_responses_df, ref_responses_df


def match_references(std_responses_df, ref_responses_df):
    """
    Matches recommendation letters to students using the currently selected strategy
    (matching on the student's email address).

    params :
        std_responses_df : students responses data (DataFrame)
        ref_responses_df : references responses data (DataFrame)
    returns:
        the edited students responses and the edited references responses (two DataFrames)
    """
    # Alternative strategy (matching on the student's name instead of the email):
    # return match_refs_based_on_stdn_name(std_responses_df, ref_responses_df)
    return match_refs_based_on_stdn_email(std_responses_df, ref_responses_df)

In [None]:
def main(std_responses_df, ref_responses_df):
    """
    Runs the whole filtering pipeline and saves the results as Excel files.

    Steps: rename columns to indices, flag duplicated responses, flag too short / too long
    essay answers, normalize names and emails (remove spaces, lowercase), match recommendation
    letters to students, title-case the names again, and write three Excel sheets into the
    results folder.

    params :
        std_responses_df : raw students responses data (DataFrame)
        ref_responses_df : raw references responses data (DataFrame)
    returns:
        responses_df_final                 : students responses with flags and notes
        responses_df_final_flagged_removed : students responses without the flagged rows
        ref_responses_df_final             : references responses with flags, notes and match marks
        ref_responses_df_unmatched         : references responses that were not matched to any student
    """

    std_responses_df = column_names_to_indices(std_responses_df, std_questions_dict)
    ref_responses_df = column_names_to_indices(ref_responses_df, ref_questions_dict)

    responses_df_flagged_duplicates = flag_duplicates(std_responses_df)
    responses_df_flagged_short = flag_short(responses_df_flagged_duplicates, essay_qs)
    responses_df_flagged_long = flag_long(responses_df_flagged_short, essay_qs)

    # Normalize names and emails so that the string comparisons in the matching step are reliable.
    responses_df_spaces_removed, ref_responses_df_spaces_removed = remove_spaces(responses_df_flagged_long, std_str_qs, ref_responses_df, ref_str_qs)
    responses_df_lowercase, ref_responses_df_lowercase = to_lowercase(responses_df_spaces_removed, std_str_qs, ref_responses_df_spaces_removed, ref_str_qs)

    responses_df_matched, ref_responses_df_matched = match_references(responses_df_lowercase, ref_responses_df_lowercase)

    # Only the name columns are title-cased again; emails stay lowercase
    # (title-casing them would turn "a@b.com" into "A@B.Com").
    responses_df_final, ref_responses_df_final = to_uppercase(responses_df_matched, std_names, ref_responses_df_matched, ref_names)

    ref_responses_df_unmatched = get_unmatched_letters(ref_responses_df_final)

    # Putting back original column names, and saving the Excel file
    named_responses_df_final = indices_to_column_names(responses_df_final, std_questions_dict)
    named_responses_df_final.to_excel(os.path.join(RESULTS_DIR, "filtered_responses_with_flagged.xlsx"))

    responses_df_final_flagged_removed = remove_flagged(responses_df_final)

    # Putting back original column names, and saving the Excel file
    named_responses_df_final_flagged_removed = indices_to_column_names(responses_df_final_flagged_removed, std_questions_dict)
    named_responses_df_final_flagged_removed.to_excel(os.path.join(RESULTS_DIR, "filtered_responses_with_flagged_removed.xlsx"))

    # Putting back original column names, and saving the Excel file
    named_ref_responses_df_final = indices_to_column_names(ref_responses_df_final, ref_questions_dict)
    named_ref_responses_df_final.to_excel(os.path.join(RESULTS_DIR, "ref_responses_with_flagged.xlsx"))

    return responses_df_final, responses_df_final_flagged_removed, ref_responses_df_final, ref_responses_df_unmatched

In [None]:
# Run the whole pipeline on the raw students and references responses.
responses_df_final, responses_df_final_flagged_removed, ref_responses_df_final, ref_responses_df_unmatched = main(std_raw_responses_df, ref_raw_responses_df)

In [None]:
# Show all columns from the flag column onwards for all responses.
responses_df_final.iloc[:, std_idcs['flag_idx']:]

In [None]:
# Show only the email column of all responses.
responses_df_final.iloc[:, std_idcs['email_idx']:std_idcs['email_idx'] + 1]

In [None]:
# Show the first-name through last-name columns of all responses.
responses_df_final.iloc[:, std_idcs['firstname_idx']:std_idcs['lastname_idx'] + 1]

In [None]:
# Show the processed references responses.
ref_responses_df_final

In [None]:
# Show the reference responses still marked "unmatched".
ref_responses_df_unmatched

In [None]:
# Show all columns from the flag column onwards for the responses left after removing flagged ones.
responses_df_final_flagged_removed.iloc[:, std_idcs['flag_idx']:]

In [None]:
# Show only the email column of the responses left after removing flagged ones.
responses_df_final_flagged_removed.iloc[:, std_idcs['email_idx']:std_idcs['email_idx'] + 1]

In [None]:
# Show the first-name through last-name columns of the responses left after removing flagged ones.
responses_df_final_flagged_removed.iloc[:, std_idcs['firstname_idx']:std_idcs['lastname_idx'] + 1]

In [None]:
# Count the reference responses per student email.
ref_responses_df_final.groupby([ref_idcs['std']['email_idx']]).size().reset_index().rename(columns={0: 'count'})

In [None]:
# Count the reference responses per student (first name, last name) pair.
ref_responses_df_final.groupby([ref_idcs['std']['firstname_idx'], ref_idcs['std']['lastname_idx']]).size().reset_index().rename(columns={0: 'count'})