# Applications Processing Automation

<a target="_blank" href="https://colab.research.google.com/github/trendinafrica/student_selection_process_automation/blob/main/main.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

(*By: [@mahmoud-elmakki](https://github.com/mahmoud-elmakki)*)

The purpose of this code is to automate the first trivial filtering steps in the processing of the applications for the TReND in Africa Computational Neuroscience and Machine Learning Basics course.

This code is organized as a set of functions to be applied as a processing pipeline on the application responses data (See [documentation](https://docs.google.com/document/d/1n4pMEOgMuenuFpN6zXQtZlpYFXwPat2P4-SzZaN8mFg/edit?usp=drivesdk)).

Also see [weighted_grading_first_round.ipynb](https://github.com/trendinafrica/student_selection_process_automation/blob/main/weighted_grading_first_round.ipynb), and [weighted_grading_second_round.ipynb](https://github.com/trendinafrica/student_selection_process_automation/blob/main/weighted_grading_second_round.ipynb).

### **How to use (as a developer):**
Just clone the GitHub repository and get down to business!\
If you have Anaconda and Jupyter installed locally, you can clone the repository directly onto your machine. Otherwise, you can clone it into Google Colab.
(In either case, if you have any valuable contributions, don't hesitate to open a Pull Request).

### **How to use (as a reviewer):**
If you are on Github now, open this notebook in Google Colab, or clone the whole repo locally, so you can run the cells. In case of running it in Colab, don't forget to save and download the resulting Excel sheet of the processed responses into a local folder.

In [None]:
import numpy as np
import pandas as pd

import os

## Load Data

In [None]:
# NOTE: The responses Excel sheets must be downloaded from Google Drive and placed
# in the ./responses_data folder (relative to where this code runs) before running this cell.
# You don't have to do this if you cloned the GitHub repo (all will be organized in the repo).
 
# TODO: Load data directly from Google Drive.

# Loading students responses data.
# STD_DATA_DIR holds the path of the workbook file itself (not a directory, despite the name).
STD_DATA_DIR = './responses_data/TReND Comp Neuro application form Rwanda 2024 (Responses).xlsx'
std_raw_responses_df = pd.read_excel(STD_DATA_DIR)

# Loading references responses data.
# REF_DATA_DIR is likewise the path of the recommendation letters workbook file.
REF_DATA_DIR = './responses_data/Recommendation Letter Portal (Responses).xlsx'
ref_raw_responses_df = pd.read_excel(REF_DATA_DIR)

In [None]:
# Number of rows loaded from the students' responses sheet (shown as the cell output).
len(std_raw_responses_df)

In [None]:
# Number of rows loaded from the recommendation letters sheet (shown as the cell output).
len(ref_raw_responses_df)

In [None]:
# Just specify folder names - the code will create the directories
# (inside the current working directory) when they do not exist yet.
# os.makedirs(..., exist_ok=True) replaces the "check, then mkdir" pattern:
# it is safe to re-run and has no window between the check and the creation.

RESULTS_FOLDER_NAME = "filtered_responses"
RESULTS_DIR = os.path.join(os.getcwd(), RESULTS_FOLDER_NAME)
os.makedirs(RESULTS_DIR, exist_ok=True)


LETTERS_STATS_FOLDER_NAME = "letters_stats"
LETTERS_STATS_DIR = os.path.join(os.getcwd(), LETTERS_STATS_FOLDER_NAME)
os.makedirs(LETTERS_STATS_DIR, exist_ok=True)


FLAGGED_STATS_FOLDER_NAME = "flagged_stats"
FLAGGED_STATS_DIR = os.path.join(os.getcwd(), FLAGGED_STATS_FOLDER_NAME)
os.makedirs(FLAGGED_STATS_DIR, exist_ok=True)

In [None]:
# Columns added to the student responses DataFrame (initialized with None for all cells).
# They are appended after the existing columns, in this order.
std_raw_responses_df['Flag'] = None  # String ("flagged" or None)
std_raw_responses_df['Notes'] = None # String (text of notes == reasons for flagging)
std_raw_responses_df['Recommendation Letter 1'] = None  # Filled while matching letters to students
std_raw_responses_df['Recommendation Letter 2'] = None  # Filled while matching letters to students

# Columns added to the referee responses DataFrame.
ref_raw_responses_df['Flag'] = None
ref_raw_responses_df['Notes'] = None
# Every letter starts as "unmatched"; the matching step sets it to "matched".
ref_raw_responses_df['Matched'] = "unmatched"

In [None]:
# Keep the column labels of both DataFrames (including the columns added above).
std_columns = std_raw_responses_df.columns
ref_columns = ref_raw_responses_df.columns

## Configuration

In [None]:
# Specify the minimum and maximum number of words for answers to essay questions.
# Note: These parameters apply to all essay questions.

MIN_WORDS_NUM = 50
MAX_WORDS_NUM = 350

# Substrings used to check whether a referee email is institutional or not:
# an address containing any of them is treated as non-institutional.
UNOFFICIAL_EMAILS = ["gmail", "yahoo", "hotmail"]

# Flag, or not flag, students who got one recommendation letter.
FLAG_ONE_LETTER = False

# Flag students whose nationality is not in `african_countries`.
FLAG_NON_AFRICANS = False
# Flag students whose country of residence is not in `african_countries`.
FLAG_TRAVELLING_ABROAD = True

# Flag students who left a required field empty.
FLAG_NA = True

# Countries to accept people from.
# NOTE: entries are compared verbatim with the form answers, so the spelling must
# match the form options exactly. The Ivory Coast entry previously contained the
# mis-encoded text "CÃ´te"; it is restored to "Côte" (assumes the form uses the
# correctly encoded spelling - verify against the responses sheet).
african_countries = [
    'Algeria', 'Angola', 'Benin', 'Botswana', 'Burkina Faso', 'Burundi',
    'Cabo Verde', 'Cameroon', 'Central African Republic', 'Chad', 'Comoros',
    'Djibouti', 'Democratic Republic of the Congo', 'Egypt',
    'Equatorial Guinea', 'Eritrea', 'Eswatini', 'Ethiopia', 'Gabon', 'Gambia',
    'Ghana', 'Guinea', 'Guinea-Bissau', "Ivory Coast (Côte d'Ivoire)", 'Kenya',
    'Lesotho', 'Liberia', 'Libya', 'Madagascar', 'Malawi', 'Mali',
    'Mauritania', 'Mauritius', 'Morocco', 'Mozambique', 'Namibia', 'Niger',
    'Nigeria', 'Republic of the Congo', 'Rwanda', 'Sao Tome & Principe',
    'Senegal', 'Seychelles', 'Sierra Leone', 'Somalia', 'South Africa',
    'South Sudan', 'Sudan', 'Tanzania', 'Togo', 'Tunisia', 'Uganda', 'Zambia',
    'Zimbabwe',
]

# Note texts written into the 'Notes' columns, keyed by the DataFrame they
# belong to ('std' = student responses, 'ref' = referee responses).
notes_dict = {

    'std' : {
        'long_ans' : "Extremely long answer/s",
        'short_ans' : "Insufficiently short answer/s",
        'got_no_letters' : "Got no recommendation letters",
        'got_one_letter' : "Got only one recommendation letter",
        'duplicate' : "A duplicated response",
        'traveling_abroad' : "Traveling from outside Africa",
        'non_african' : "Non-african",
        'no_cv' : "Didn't upload a CV",
        'non_institutional_ref_email' : "Submitted a non-institutional email address for one of or both of their referees",
        'duplicate_letters' : "Some reference/s submitted more than one letter (The latest was taken)",
    },
    'ref' : {
        'no_student' : "Couldn't find a student with the specified email address",
        'duplicate' : "Submitted more than one letter for the same student",
    }
}

In [None]:
# Number of accepted countries (shown as the cell output).
len(african_countries)

In [None]:
# Lookup table {column position: column name} for the student responses sheet.
# Use it as a reference when choosing the numeric indices below.

std_questions_dict = dict(enumerate(std_raw_responses_df.columns))
std_questions_dict

In [None]:
# Lookup table {column position: column name} for the referee responses sheet.
# Use it as a reference when choosing the numeric indices below.

ref_questions_dict = dict(enumerate(ref_raw_responses_df.columns))
ref_questions_dict

In [None]:
# Used indices of the student responses DataFrame.
# The values are column positions; see `std_questions_dict` for the position -> name mapping.
# NOTE(review): 28-31 presumably point at the appended 'Flag', 'Notes' and
# 'Recommendation Letter 1/2' columns - confirm against the actual sheet layout.

std_idcs = {
    'email_idx' : 1,
    'firstname_idx' : 2,
    'lastname_idx' : 3,
    'nat_idx' : 5,
    'resid_idx' : 6,
    'cv_idx' : 23,
    'ref' : {
        'first_ref_email_idx' : 25,
        'second_ref_email_idx' : 27
          },
    'flag_idx' : 28,
    'notes_idx' : 29,
    'first_recomm_letter_idx' : 30,
    'second_recomm_letter_idx' : 31,
}

# Used indices of the reference responses DataFrame.
# The nested 'std' entry holds the columns in which the referee identifies the student.
# NOTE(review): 8-10 presumably point at the appended 'Flag', 'Notes' and 'Matched' columns - confirm.
ref_idcs = {
    'email_idx' : 1,
    'name_idx' : 2,
    'std' : {
        'firstname_idx' : 3,
        'lastname_idx' : 4,
        'email_idx' : 5
    },
    'letter_idx' : 7,
    'flag_idx' : 8,
    'notes_idx' : 9,
    'matched_idx' : 10
}

# Positions of the student columns that must not be empty (checked by `flag_na`).
required_fields = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 16, 17, 18, 19, 20, 21, 22, 24, 25, 26, 27]

# String columns that get normalized (lowercased / stripped of spaces) before comparisons.
std_str_qs = [std_idcs['email_idx'], std_idcs['firstname_idx'], std_idcs['lastname_idx'], std_idcs['ref']['first_ref_email_idx'], std_idcs['ref']['second_ref_email_idx']]
ref_str_qs = [ref_idcs['email_idx'], ref_idcs['std']['email_idx'], ref_idcs['std']['firstname_idx'], ref_idcs['std']['lastname_idx']]

# Student name columns (in each DataFrame) that get converted back to title case afterwards.
std_names = [std_idcs['firstname_idx'], std_idcs['lastname_idx']]
ref_names = [ref_idcs['std']['firstname_idx'], ref_idcs['std']['lastname_idx']]

# Carefully specify the positions of the columns to be processed (mostly responses for essay questions).
essay_qs = [20, 21, 22]

## Utility functions

In [None]:
def word_count(answer):
    """
    Counts the whitespace-separated words in one answer (a single cell) of an essay question.
    """
    words = answer.split()

    return len(words)


def to_lowercase(std_df, ref_df, std_str_qs=std_str_qs, ref_str_qs=ref_str_qs):
    """
    Lowercases the string columns used for matching so that comparisons become case-insensitive.

    Both DataFrames are modified in place and returned as a (std_df, ref_df) tuple.
    """
    for frame, string_columns in ((std_df, std_str_qs), (ref_df, ref_str_qs)):
        for column in string_columns:
            frame[column] = frame[column].str.lower()

    return std_df, ref_df


def to_uppercase(std_df, ref_df, std_names=std_names, ref_names=ref_names):
    """
    Restores a readable casing for the name columns after they were lowercased.

    Despite the function name, names are converted to Title Case (via `str.title`).
    Both DataFrames are modified in place and returned as a (std_df, ref_df) tuple.
    """
    for frame, name_columns in ((std_df, std_names), (ref_df, ref_names)):
        for column in name_columns:
            frame[column] = frame[column].str.title()

    return std_df, ref_df


def remove_spaces(std_df, ref_df, std_str_qs=std_str_qs, ref_str_qs=ref_str_qs):
    """
    Removes spaces from names and emails, and replaces commas with dots.

    Both replacements are substring replacements done through the `.str` accessor.
    (The comma replacement used to go through `Series.replace`, which only matches
    a cell whose whole value is ",", so commas inside a value were never replaced.)

    params :
        std_df     : students responses data (DataFrame), modified in place
        ref_df     : references responses data (DataFrame), modified in place
        std_str_qs : columns of std_df to clean (list)
        ref_str_qs : columns of ref_df to clean (list)
    returns:
        (std_df, ref_df): the same two DataFrames after cleaning
    """
    for frame, string_columns in ((std_df, std_str_qs), (ref_df, ref_str_qs)):
        for q in string_columns:
            # regex=False: both patterns are literal characters, not regular expressions.
            without_spaces = frame[q].str.replace(" ", "", regex=False)
            frame[q] = without_spaces.str.replace(",", ".", regex=False)

    return std_df, ref_df


def set_flag(responses_df, email):
    """
    Sets the 'flag' column value to "flagged" for every response with the given 'Email address'.

    params :
        responses_df: the student responses data (DataFrame), modified in place
        email       : the email address to look for
    returns:
        None (the DataFrame itself is changed)
    """
    # `.iloc` does not support a boolean Series as a row mask (pandas documents
    # this as an error), so the comparison result is converted to a NumPy array.
    email_matches = (responses_df[std_idcs['email_idx']] == email).to_numpy()

    # This modifies the DataFrame itself (i.e change in place)
    responses_df.iloc[email_matches, std_idcs['flag_idx']] = "flagged"

    
def leave_note(responses_df, response_index, note_text):
    """
    Appends a note to the 'Notes' column of one student response.

    An empty notes cell (None or NaN) simply receives the note; otherwise the note is
    appended after ". ". Nothing changes when the note text is already in the cell.

    params :
        responses_df  : the student responses data (DataFrame), left untouched
        response_index: position (not label) of the row to annotate
        note_text     : the note to add (String)
    returns:
        edited_responses_df: a copy of responses_df with the note added
    """
    edited_responses_df = responses_df.copy()
    notes_col = std_idcs['notes_idx']
    current_note = edited_responses_df.iloc[response_index, notes_col]

    # pd.isnull covers both None and NaN; checking only `is None` used to turn a
    # NaN cell into the text "nan. <note>".
    if pd.isnull(current_note):
        edited_responses_df.iloc[response_index, notes_col] = note_text
    elif note_text not in str(current_note):
        edited_responses_df.iloc[response_index, notes_col] = str(current_note) + ". " + note_text

    return edited_responses_df


def remove_note(responses_df, response_index, note_text):
    """
    Removes a note from the 'Notes' column of one student response.

    The note is removed wherever it sits in the ". "-separated list of notes. (Only a
    note followed by ". " used to be removed, so a last or only note stayed in place.)
    When the removed note was the only one, the cell is reset to None.

    params :
        responses_df  : the student responses data (DataFrame), left untouched
        response_index: position (not label) of the row to edit
        note_text     : the note to remove (String)
    returns:
        edited_responses_df: a copy of responses_df with the note removed
    """
    edited_responses_df = responses_df.copy()
    notes_col = std_idcs['notes_idx']
    current_note = edited_responses_df.iloc[response_index, notes_col]

    # Empty cells (None/NaN) and cells without the note are left as they are.
    if not isinstance(current_note, str) or note_text not in current_note:
        return edited_responses_df

    separator = ". "
    # Note followed by other notes: drop it together with its trailing separator.
    updated_note = current_note.replace(note_text + separator, "")

    if updated_note == note_text:
        # It was the only note left.
        updated_note = None
    elif updated_note.endswith(separator + note_text):
        # It was the last note: drop it together with its leading separator.
        updated_note = updated_note[:-len(separator + note_text)]

    edited_responses_df.iloc[response_index, notes_col] = updated_note

    return edited_responses_df


def leave_note_for_ref(ref_responses_df, response_index, note_text):
    """
    Appends a note to the 'Notes' column of one referee response.

    An empty notes cell (None or NaN) simply receives the note; otherwise the note is
    appended after ". ". Nothing changes when the note text is already in the cell.

    params :
        ref_responses_df: the references responses data (DataFrame), left untouched
        response_index  : position (not label) of the row to annotate
        note_text       : the note to add (String)
    returns:
        edited_ref_responses_df: a copy of ref_responses_df with the note added
    """
    edited_ref_responses_df = ref_responses_df.copy()
    notes_col = ref_idcs['notes_idx']
    current_note = edited_ref_responses_df.iloc[response_index, notes_col]

    # pd.isnull covers both None and NaN; checking only `is None` used to turn a
    # NaN cell into the text "nan. <note>".
    if pd.isnull(current_note):
        edited_ref_responses_df.iloc[response_index, notes_col] = note_text
    elif note_text not in str(current_note):
        edited_ref_responses_df.iloc[response_index, notes_col] = str(current_note) + ". " + note_text

    return edited_ref_responses_df

    
def column_names_to_indices(df, indices_dict):
    """
    Returns a copy of df whose columns are renamed from their names to numeric indices.

    The index given to each name is its position among the values of indices_dict.
    """
    column_names = indices_dict.values()
    name_to_index = dict(zip(column_names, range(len(indices_dict))))

    return df.rename(columns=name_to_index)


def indices_to_column_names(df, indices_dict):
    """
    Returns a copy of df whose columns are renamed from numeric indices back to names.

    Index i is replaced by the i-th value of indices_dict.
    """
    index_to_name = dict(enumerate(indices_dict.values()))

    return df.rename(columns=index_to_name)


def remove_flagged(df):
    """
    Returns a copy of df without the rows whose flag column equals 'flagged'.
    """
    is_flagged = df[std_idcs['flag_idx']] == 'flagged'
    flagged_labels = df[is_flagged].index

    return df.drop(flagged_labels)


def get_unmatched_letters(std_responses_df, ref_responses_df):
    """
    Returns the recommendation letters still marked as "unmatched".

    Note: std_responses_df is accepted but not used.
    """
    match_status = ref_responses_df[ref_idcs['matched_idx']]
    still_unmatched = match_status == "unmatched"

    return ref_responses_df.loc[still_unmatched]


def got_two_letters(std_responses_df):
    """
    Returns the students for whom both recommendation letter columns are filled.
    """
    first_letter = std_responses_df[std_idcs['first_recomm_letter_idx']]
    second_letter = std_responses_df[std_idcs['second_recomm_letter_idx']]
    has_both = first_letter.notnull() & second_letter.notnull()

    return std_responses_df.loc[has_both]


def got_one_letter(std_responses_df):
    """
    Returns the students for whom exactly one recommendation letter column is filled.
    """
    first_letter = std_responses_df[std_idcs['first_recomm_letter_idx']]
    second_letter = std_responses_df[std_idcs['second_recomm_letter_idx']]
    # XOR: true when exactly one of the two letters is present.
    has_exactly_one = first_letter.notnull() ^ second_letter.notnull()

    return std_responses_df.loc[has_exactly_one]


def got_no_letters(std_responses_df):
    """
    Returns the students for whom both recommendation letter columns are empty.
    """
    first_letter = std_responses_df[std_idcs['first_recomm_letter_idx']]
    second_letter = std_responses_df[std_idcs['second_recomm_letter_idx']]
    has_any = first_letter.notnull() | second_letter.notnull()

    return std_responses_df.loc[~has_any]


def get_letters_counts(ref_responses_df):
    """
    Counts the submitted letters per student email.

    Returns a DataFrame with one row per student email and the count in a '# letters' column.
    """
    by_student_email = ref_responses_df.groupby([ref_idcs['std']['email_idx']])
    counts = by_student_email.size().reset_index()

    return counts.rename(columns={0: '# letters'})


def save_letters_stats(std_responses_df, ref_responses_df, letters_stats_dir=LETTERS_STATS_DIR):
    """
    Writes the recommendation letters statistics as Excel files into letters_stats_dir:
    students with two / one / no letters (sorted by first name) and the letters count per student.
    """
    sort_columns = [std_idcs['firstname_idx']]
    student_exports = (
        (got_two_letters, "/got_two_letters.xlsx"),
        (got_one_letter, "/got_one_letter.xlsx"),
        (got_no_letters, "/got_no_letters.xlsx"),
    )

    for select_students, file_suffix in student_exports:
        selected = select_students(std_responses_df).sort_values(by=sort_columns)
        selected.to_excel(letters_stats_dir + file_suffix)

    letters_counts = get_letters_counts(ref_responses_df)
    letters_counts.to_excel(letters_stats_dir + "/letters_counts(1 or 2).xlsx")


def get_flagged_stats(responses_df, flag_note, filename, notes_dict=notes_dict):
    """
    Collects the flagged responses whose notes contain flag_note, and saves them as an Excel file.

    Rows are selected with a positional boolean mask, so the result is correct for any
    row index. (Index labels from `iterrows()` used to be passed to `.iloc` as positions,
    which is only valid for a default 0..n-1 index.)

    params :
        responses_df: the student responses data (DataFrame)
        flag_note   : the note text to look for (String)
        filename    : name of the Excel file (without extension), written into the flagged stats folder
        notes_dict  : unused, kept for backward compatibility
    returns:
        flagged_df: the selected rows, renumbered from 0
    """
    flags = responses_df.iloc[:, std_idcs['flag_idx']]
    notes = responses_df.iloc[:, std_idcs['notes_idx']]

    # Same test as before: the note text must appear in the string form of the notes cell.
    has_note = notes.map(lambda note: flag_note in str(note)).astype(bool)
    selected = ((flags == "flagged").to_numpy(dtype=bool)) & has_note.to_numpy(dtype=bool)

    flagged_df = responses_df.loc[selected].reset_index(drop=True)

    indices_to_column_names(flagged_df, std_questions_dict).to_excel(FLAGGED_STATS_FOLDER_NAME + "/" + filename + ".xlsx")

    return flagged_df


def get_all_flagged_stats(responses_df, notes_dict=notes_dict):
    """
    Runs `get_flagged_stats` once per flagging reason and returns the seven resulting DataFrames
    as a tuple: (long answers, short answers, no letters, duplicates, traveling abroad,
    non-african, no CV).
    """
    std_notes = notes_dict['std']

    # (key of the note text, name of the exported file) - in the order of the returned tuple
    exports = (
        ('long_ans', "long_answers"),
        ('short_ans', "short_answers"),
        ('got_no_letters', "got_no_letters"),
        ('duplicate', "duplicate_responses"),
        ('traveling_abroad', "traveling_abroad"),
        ('non_african', "non_african"),
        ('no_cv', "no_cv"),
    )

    return tuple(
        get_flagged_stats(responses_df, std_notes[note_key], filename)
        for note_key, filename in exports
    )


def get_std_by_email(std_responses_df, email):
    """
    Returns the student responses submitted with the given email address.
    """
    email_matches = std_responses_df[std_idcs['email_idx']] == email

    return std_responses_df.loc[email_matches]


def get_ref_by_email(ref_responses_df, email):
    """
    Returns the referee responses submitted with the given referee email address.
    """
    email_matches = ref_responses_df[ref_idcs['email_idx']] == email

    return ref_responses_df.loc[email_matches]


def get_ref_by_std_email(ref_responses_df, std_email):
    """
    Returns the referee responses that name the given student email address.
    """
    student_email_matches = ref_responses_df[ref_idcs['std']['email_idx']] == std_email

    return ref_responses_df.loc[student_email_matches]


def get_std_by_email_from_ref(ref_responses_df, std_email):
    """
    Returns the rows of the referee responses whose student email column equals std_email.
    """
    student_email_column = ref_responses_df[ref_idcs['std']['email_idx']]
    row_selector = student_email_column == std_email

    return ref_responses_df.loc[row_selector]


def get_std_by_firstname_from_ref(ref_responses_df, firstname):
    """
    Returns the referee responses that name a student with the given first name.
    """
    firstname_matches = ref_responses_df[ref_idcs['std']['firstname_idx']] == firstname

    return ref_responses_df.loc[firstname_matches]


def get_std_by_lastname_from_ref(ref_responses_df, lastname):
    """
    Returns the referee responses that name a student with the given last name.
    """
    lastname_matches = ref_responses_df[ref_idcs['std']['lastname_idx']] == lastname

    return ref_responses_df.loc[lastname_matches]


def get_std_by_firstname_and_lastname_from_ref(ref_responses_df, std_firstname, std_lastname):
    """
    Returns the referee responses that name a student with the given first AND last name.
    """
    student_cols = ref_idcs['std']
    firstname_matches = ref_responses_df[student_cols['firstname_idx']] == std_firstname
    lastname_matches = ref_responses_df[student_cols['lastname_idx']] == std_lastname

    return ref_responses_df.loc[firstname_matches & lastname_matches]


def get_std_by_firstname(std_responses_df, firstname):
    """
    Returns the student responses with the given first name.
    """
    firstname_matches = std_responses_df[std_idcs['firstname_idx']] == firstname

    return std_responses_df.loc[firstname_matches]


def get_std_by_lastname(std_responses_df, lastname):
    """
    Returns the student responses with the given last name.
    """
    lastname_matches = std_responses_df[std_idcs['lastname_idx']] == lastname

    return std_responses_df.loc[lastname_matches]


def get_std_by_firstname_and_lastname(std_responses_df, firstname, lastname):
    """
    Returns the student responses with the given first AND last name.

    Both names are looked up through `std_idcs`. (The last name used to be looked up
    with the referee sheet's index, `ref_idcs['std']['lastname_idx']`, i.e. in the wrong
    column of the student sheet.)

    params :
        std_responses_df: the student responses data (DataFrame)
        firstname       : first name to look for
        lastname        : last name to look for
    returns:
        the matching rows of std_responses_df
    """
    firstname_matches = std_responses_df[std_idcs['firstname_idx']] == firstname
    lastname_matches = std_responses_df[std_idcs['lastname_idx']] == lastname

    return std_responses_df.loc[firstname_matches & lastname_matches]


def get_std_by_ref_email(std_responses_df, ref_email):
    """
    Returns the student responses that list ref_email as their first or second referee email.
    """
    referee_cols = std_idcs['ref']
    is_first_referee = std_responses_df[referee_cols['first_ref_email_idx']] == ref_email
    is_second_referee = std_responses_df[referee_cols['second_ref_email_idx']] == ref_email

    return std_responses_df.loc[is_first_referee | is_second_referee]


def get_by_nat_country(std_responses_df, country):
    """
    Returns the student responses whose nationality column equals country.
    """
    nationality_matches = std_responses_df[std_idcs['nat_idx']] == country

    return std_responses_df.loc[nationality_matches]
    

def get_by_resid_country(std_responses_df, country):
    """
    Returns the student responses whose country of residence column equals country.
    """
    residence_matches = std_responses_df[std_idcs['resid_idx']] == country

    return std_responses_df.loc[residence_matches]
    
    
def get_std_emails(std_responses_df):
    """
    Returns the email column of the student responses as a one-column DataFrame.
    """
    email_pos = std_idcs['email_idx']

    return std_responses_df.iloc[:, slice(email_pos, email_pos + 1)]


def get_ref_emails(ref_responses_df):
    """
    Returns the referee email column of the referee responses as a one-column DataFrame.
    """
    email_pos = ref_idcs['email_idx']

    return ref_responses_df.iloc[:, slice(email_pos, email_pos + 1)]


def get_std_names(std_responses_df):
    """
    Returns the columns from the first name up to (and including) the last name
    of the student responses.
    """
    first_pos = std_idcs['firstname_idx']
    last_pos = std_idcs['lastname_idx']

    return std_responses_df.iloc[:, slice(first_pos, last_pos + 1)]


def get_ref_names(ref_responses_df):
    """
    Returns the referee name column of the referee responses as a one-column DataFrame.

    Both ends of the slice come from `ref_idcs['name_idx']`. (The end of the slice used to
    be read from `std_idcs['name_idx']`, a key that does not exist, which raised a KeyError.)
    """
    name_pos = ref_idcs['name_idx']

    return ref_responses_df.iloc[:, name_pos:name_pos + 1]


def get_std_summary(std_responses_df):
    """
    Returns a compact view of the student responses: the columns from the email up to
    one column past the first name, followed by every column from the flag column onwards.
    """
    identity_part = std_responses_df.iloc[:, std_idcs['email_idx']:std_idcs['firstname_idx'] + 2]
    bookkeeping_part = std_responses_df.iloc[:, std_idcs['flag_idx']:]

    return pd.concat([identity_part, bookkeeping_part], axis=1)

## Main Pipeline

In [None]:
def remove_duplicates(responses_df):
    """
    removes duplicated rows (responses) based on 'Email address' and keeps the last response submitted.
    Note: Some students may make changes to their responses and submit a new one,
    this's why this function keeps the last response submitted and removes preceding ones.

    (The result of `drop_duplicates` used to be discarded, so no row was ever removed.)

    TODO: Check with the organizers what else is an adequate action.

    params :
        response_df: the responses data (DataFrame), left untouched
    returns:
        edited_responses_df: An edited response_df with duplicates removed and rows renumbered from 0
    """

    edited_responses_df = responses_df.drop_duplicates(subset=[std_idcs['email_idx']], keep='last')

    # Renumber the rows: other steps of the pipeline address rows by position and
    # assume that index labels and positions coincide.
    return edited_responses_df.reset_index(drop=True)


def flag_duplicates(responses_df):
    """
    flags duplicated rows (responses) based on 'Email address' and keeps the last response submitted.
    Note: Some students may make changes to their responses and submit a new one,
    this's why this function keeps the last response submitted and flag preceding ones, and leaves a note.

    The email column is taken from `std_idcs['email_idx']`, the duplicates mask is computed
    once, and the note is appended to any note already present instead of overwriting it.

    TODO: Check with the organizers what else is an adequate action.

    params :
        response_df: the responses data (DataFrame), left untouched
    returns:
        edited_responses_df: An edited responses_df with 'flag' and 'notes' columns updated
    """
    note_text = "A duplicated response"

    def _with_note(existing_note):
        # Empty cell -> just the note; otherwise append it once after ". ".
        if pd.isnull(existing_note):
            return note_text
        if note_text in str(existing_note):
            return existing_note
        return str(existing_note) + ". " + note_text

    edited_responses_df = responses_df.copy()

    flag_col = std_idcs['flag_idx']
    notes_col = std_idcs['notes_idx']

    # True for every response that is followed by a later one with the same email.
    is_older_duplicate = edited_responses_df.duplicated(subset=[std_idcs['email_idx']], keep='last').to_numpy()

    if is_older_duplicate.any():
        edited_responses_df.loc[is_older_duplicate, flag_col] = "flagged"
        edited_responses_df.loc[is_older_duplicate, notes_col] = [
            _with_note(note) for note in edited_responses_df.loc[is_older_duplicate, notes_col]
        ]

    return edited_responses_df


def flag_no_cvs(responses_df):
    """
    Flags the students whose CV cell is empty, and leaves a note.

    params :
        response_df: the responses data (DataFrame)
    returns:
        edited_responses_df: a copy of responses_df with 'flag' and 'notes' columns updated
    """
    edited_responses_df = responses_df.copy()

    cv_col = std_idcs['cv_idx']
    flag_col = std_idcs['flag_idx']

    for row_index in range(len(edited_responses_df)):

        cv_answer = edited_responses_df.iloc[row_index, cv_col]

        # A CV was uploaded: nothing to flag for this response.
        if not pd.isnull(cv_answer):
            continue

        edited_responses_df.iloc[row_index, flag_col] = "flagged"
        edited_responses_df = leave_note(edited_responses_df, row_index, "Didn't upload a CV")

    return edited_responses_df


def flag_non_africans(responses_df):
    """
    Flags students by country, according to the configuration switches, and leaves a note:
      - FLAG_NON_AFRICANS      : nationality not in `african_countries`
      - FLAG_TRAVELLING_ABROAD : country of residence not in `african_countries`

    Rows are visited by position. (Index labels from `iterrows()` used to be passed to
    `.iloc` as positions, which is only valid for a default 0..n-1 index.)

    params :
        response_df: the responses data (DataFrame), left untouched
    returns:
        edited_responses_df: a copy of responses_df with 'flag' and 'notes' columns updated
    """
    edited_responses_df = responses_df.copy()

    # (column to check, note to leave) for every check that is switched on;
    # nationality is checked before residence.
    country_checks = []
    if FLAG_NON_AFRICANS:
        country_checks.append((std_idcs['nat_idx'], "Non-african"))
    if FLAG_TRAVELLING_ABROAD:
        country_checks.append((std_idcs['resid_idx'], "Traveling from outside Africa"))

    for row_index in range(len(edited_responses_df)):

        for country_col, note_text in country_checks:

            if edited_responses_df.iloc[row_index, country_col] not in african_countries:

                edited_responses_df.iloc[row_index, std_idcs['flag_idx']] = "flagged"
                edited_responses_df = leave_note(edited_responses_df, row_index, note_text)

    return edited_responses_df


def flag_duplicate_refs(ref_responses_df):
    """
    flags duplicated reference responses (i.e. the same referee submitting multiple letters
    for the same student), and keeps the last submitted one unflagged.

    The duplicates mask is computed once, and the note is appended to any note already
    present instead of overwriting it. Rows that are not duplicates are left untouched.

    TODO: Check with the organizers what else is an adequate action.

    params :
        ref_response_df: the responses data (DataFrame), left untouched
    returns:
        edited_ref_responses_df: An edited responses_df with 'flag' and 'notes' columns updated
    """
    note_text = "Submitted more than one letter for the same student"

    def _with_note(existing_note):
        # Empty cell -> just the note; otherwise append it once after ". ".
        if pd.isnull(existing_note):
            return note_text
        if note_text in str(existing_note):
            return existing_note
        return str(existing_note) + ". " + note_text

    edited_ref_responses_df = ref_responses_df.copy()

    flag_col = ref_idcs['flag_idx']
    notes_col = ref_idcs['notes_idx']

    # A letter is a duplicate when a later row has the same (referee email, student email) pair.
    is_older_duplicate = edited_ref_responses_df.duplicated(
        subset=[ref_idcs['email_idx'], ref_idcs['std']['email_idx']], keep='last'
    ).to_numpy()

    if is_older_duplicate.any():
        edited_ref_responses_df.loc[is_older_duplicate, flag_col] = "flagged"
        edited_ref_responses_df.loc[is_older_duplicate, notes_col] = [
            _with_note(note) for note in edited_ref_responses_df.loc[is_older_duplicate, notes_col]
        ]

    return edited_ref_responses_df


def flag_refs_with_no_students(std_responses_df, ref_responses_df):
    """
    Leaves a note on referee responses whose student didn't submit an application
    (i.e. the student email given by the referee is not among the applicants' emails).

    Only a note is left; the 'flag' column is not changed.

    params :
        std_responses_df : students responses data (DataFrame)
        ref_responses_df : references responses data (DataFrame), left untouched
    returns:
        edited_ref_responses_df: a copy of ref_responses_df with the 'notes' column updated
    """

    edited_ref_responses_df = ref_responses_df.copy()

    # Built once, outside the loop (it used to be rebuilt as a list for every referee row).
    applicant_emails = set(std_responses_df[std_idcs['email_idx']].tolist())

    for ref_idx in range(len(edited_ref_responses_df)):

        if edited_ref_responses_df.iloc[ref_idx, ref_idcs['std']['email_idx']] not in applicant_emails:

            edited_ref_responses_df = leave_note_for_ref(edited_ref_responses_df, ref_idx, "Couldn't find a student with the specified email address")

    return edited_ref_responses_df


def flag_short(responses_df, essay_qs):
    """
    Flags the responses having at least one essay answer shorter than MIN_WORDS_NUM words,
    and leaves a note.

    params :
        response_df: the responses data (DataFrame)
        essay_qs   : positions of the essay question columns (list)
    returns:
        edited_responses_df: a copy of responses_df with short answers flagged
    """
    edited_responses_df = responses_df.copy()
    flag_col = std_idcs['flag_idx']

    for row_index in range(len(edited_responses_df)):

        # Word count of every essay answer of this response (cells are converted to text first).
        answer_lengths = (
            word_count(str(edited_responses_df.iloc[row_index, question]))
            for question in essay_qs
        )

        if any(length < MIN_WORDS_NUM for length in answer_lengths):
            edited_responses_df.iloc[row_index, flag_col] = "flagged"
            edited_responses_df = leave_note(edited_responses_df, row_index, "Insufficiently short answer/s")

    return edited_responses_df
                    

def flag_long(responses_df, essay_qs):
    """
    Flags the responses having at least one essay answer longer than MAX_WORDS_NUM words,
    and leaves a note.

    params :
        response_df: the responses data (DataFrame)
        essay_qs   : positions of the essay question columns (list)
    returns:
        edited_response_df: a copy of responses_df with long answers flagged
    """
    edited_responses_df = responses_df.copy()
    flag_col = std_idcs['flag_idx']

    for row_index in range(len(edited_responses_df)):

        # Word count of every essay answer of this response (cells are converted to text first).
        answer_lengths = (
            word_count(str(edited_responses_df.iloc[row_index, question]))
            for question in essay_qs
        )

        if any(length > MAX_WORDS_NUM for length in answer_lengths):
            edited_responses_df.iloc[row_index, flag_col] = "flagged"
            edited_responses_df = leave_note(edited_responses_df, row_index, "Extremely long answer/s")

    return edited_responses_df


def flag_unofficial_emails(responses_df, unofficial_emails=UNOFFICIAL_EMAILS):
    """
    Flags the students who gave a non-institutional email address for at least one of
    their two referees, and leaves a note.

    An address counts as non-institutional when it contains any of the unofficial_emails
    substrings. A referee email cell that is not text (the membership test raises
    TypeError) is treated as non-institutional as well.

    params :
        responses_df     : the student responses data (DataFrame)
        unofficial_emails: substrings of non-institutional email providers (list)
    returns:
        edited_responses_df: a copy of responses_df with 'flag' and 'notes' columns updated
    """
    edited_responses_df = responses_df.copy()

    referee_email_cols = (
        std_idcs['ref']['first_ref_email_idx'],
        std_idcs['ref']['second_ref_email_idx'],
    )

    for row_index in range(len(edited_responses_df)):

        try:
            provider_hits = [
                unofficial_email in edited_responses_df.iloc[row_index, email_col]
                for email_col in referee_email_cols
                for unofficial_email in unofficial_emails
            ]
            is_unofficial = any(provider_hits)

        except TypeError:
            is_unofficial = True

        if not is_unofficial:
            continue

        edited_responses_df.iloc[row_index, std_idcs['flag_idx']] = "flagged"
        edited_responses_df = leave_note(edited_responses_df, row_index, "Submitted a non-institutional email address for one of or both of their referees")

    return edited_responses_df


def flag_na(responses_df, required_fields=required_fields):
    """
    Flags the students who left at least one required field empty (null), and leaves a note.
    Does nothing when FLAG_NA is switched off.

    Rows are visited by position. (Index labels from `iterrows()` used to be passed to
    `.iloc` as positions, which is only valid for a default 0..n-1 index.)

    params :
        responses_df   : the student responses data (DataFrame), left untouched
        required_fields: positions of the columns that must be filled (list)
    returns:
        edited_responses_df: a copy of responses_df with 'flag' and 'notes' columns updated
    """
    edited_responses_df = responses_df.copy()

    if not FLAG_NA:
        return edited_responses_df

    for row_index in range(len(edited_responses_df)):

        # One empty required field is enough; any() stops at the first one found.
        has_empty_required_field = any(
            pd.isnull(edited_responses_df.iloc[row_index, required_field])
            for required_field in required_fields
        )

        if has_empty_required_field:
            edited_responses_df.iloc[row_index, std_idcs['flag_idx']] = "flagged"
            edited_responses_df = leave_note(edited_responses_df, row_index, "Filled a required field with 'N/A'")

    return edited_responses_df


def match_refs_based_on_stdn_email(std_responses_df, ref_responses_df):
    """
    Matches recommendation letters with the student they support, using the student's email
    address given by the referee, and leaves notes/flags on student responses that did not
    get the expected two letters.

    For each student without any letter assigned yet, the number of referee rows carrying
    the student's email decides what happens:
      - none        : the student is flagged, with a "Got no recommendation letters" note
      - exactly one : a note is left (the student is flagged only if FLAG_ONE_LETTER) and the
                      letter is stored as the first recommendation letter
      - exactly two : the letters are stored as first and second recommendation letters
      - more than 2 : a note is left and unflagged letters are stored
    Only referee rows whose 'flag' cell is None are assigned, and every assigned row gets its
    'Matched' cell set to "matched".

    Note: the row counts come from `value_counts()` over all referee rows, so rows flagged as
    duplicates by `flag_duplicate_refs` are counted as well.

    params :
        std_responses_df : students responses data (DataFrame)
        ref_responses_df : references responses data (DataFrame)
    returns:
        edited_std_responses_df: An edited std_responses_df with answers with unsatisfied conditions for recommendation letters flagged
        ref_responses_df: The ref_responses_df (as returned by `flag_duplicate_refs`) with the "Matched" column marked for the letters successfully matched.
    """
     
    # From here on `ref_responses_df` is the copy returned by flag_duplicate_refs,
    # with duplicated letters flagged.
    ref_responses_df = flag_duplicate_refs(ref_responses_df)
    edited_std_responses_df = std_responses_df.copy()
    
    for row_index in range(len(edited_std_responses_df)):
        
        # Skip students who already have a letter assigned (either cell is not None).
        if edited_std_responses_df.iloc[row_index, std_idcs['first_recomm_letter_idx']] is not None or edited_std_responses_df.iloc[row_index, std_idcs['second_recomm_letter_idx']] is not None:
            continue
            
        # Flag student response if BOTH of their references didn't submit any letter
        
        if edited_std_responses_df.iloc[row_index, std_idcs['email_idx']] not in ref_responses_df[ref_idcs['std']['email_idx']].values:
            
            edited_std_responses_df.iloc[row_index, std_idcs['flag_idx']] = "flagged"
            edited_std_responses_df = leave_note(edited_std_responses_df, row_index, "Got no recommendation letters")
        
        # Flag student response if ANY of their references didn't submit any letter
        # Assign the one submitted letters to that student
        
        elif ref_responses_df[ref_idcs['std']['email_idx']].value_counts()[edited_std_responses_df.iloc[row_index, std_idcs['email_idx']]] == 1:
            
            # The flag is optional (configuration), the note is always left.
            if FLAG_ONE_LETTER:
                edited_std_responses_df.iloc[row_index, std_idcs['flag_idx']] = "flagged"
            
            edited_std_responses_df = leave_note(edited_std_responses_df, row_index, "Got only one recommendation letter")
            
            for ref_index in range(len(ref_responses_df)):
                
                # First unflagged letter addressed to this student's email.
                if ref_responses_df.iloc[ref_index, ref_idcs['std']['email_idx']] == edited_std_responses_df.iloc[row_index, std_idcs['email_idx']] and ref_responses_df.iloc[ref_index, ref_idcs['flag_idx']] is None:
                    edited_std_responses_df.iloc[row_index, std_idcs['first_recomm_letter_idx']] = ref_responses_df.iloc[ref_index, ref_idcs['letter_idx']]
                    ref_responses_df.iloc[ref_index, ref_idcs['matched_idx']] = "matched"
                    
                    break
        
        # This, from here below, would look much prettier with a while loop!
    
        # If BOTH references submitted ONLY ONE letter,
        # Assign the right two letters to the specific student
        
        elif ref_responses_df[ref_idcs['std']['email_idx']].value_counts()[edited_std_responses_df.iloc[row_index, std_idcs['email_idx']]] == 2:
            
            # First pass: the first unmatched, unflagged letter becomes letter 1.
            for ref_index in range(len(ref_responses_df)):
                
                if ref_responses_df.iloc[ref_index, ref_idcs['matched_idx']] == "matched":
                    continue
                
                if ref_responses_df.iloc[ref_index, ref_idcs['std']['email_idx']] == edited_std_responses_df.iloc[row_index, std_idcs['email_idx']] and ref_responses_df.iloc[ref_index, ref_idcs['flag_idx']] is None:
                    edited_std_responses_df.iloc[row_index, std_idcs['first_recomm_letter_idx']] = ref_responses_df.iloc[ref_index, ref_idcs['letter_idx']]
                    ref_responses_df.iloc[ref_index, ref_idcs['matched_idx']] = "matched"
                    
                    break
                
            # Second pass: remaining unmatched, unflagged letters are written to letter 2
            # (no break, so the last such letter is the one that stays).
            for ref_index in range(len(ref_responses_df)):
                
                if ref_responses_df.iloc[ref_index, ref_idcs['matched_idx']] == "matched":
                    continue
                
                if ref_responses_df.iloc[ref_index, ref_idcs['std']['email_idx']] == edited_std_responses_df.iloc[row_index, std_idcs['email_idx']] and edited_std_responses_df.iloc[row_index, std_idcs['first_recomm_letter_idx']] is not None and ref_responses_df.iloc[ref_index, ref_idcs['flag_idx']] is None:
                    
                    edited_std_responses_df.iloc[row_index, std_idcs['second_recomm_letter_idx']] = ref_responses_df.iloc[ref_index, ref_idcs['letter_idx']]
                    ref_responses_df.iloc[ref_index, ref_idcs['matched_idx']] = "matched"
        
        # Leave a note on the student response if one of or both their references submitted MORE THAN ONE letter
        # (flagging is currently disabled), and assign the right two letters to that student
        
        elif ref_responses_df[ref_idcs['std']['email_idx']].value_counts()[edited_std_responses_df.iloc[row_index, std_idcs['email_idx']]] > 2:
            
            #edited_std_responses_df.iloc[row_index, std_idcs['flag_idx']] = "flagged"
            edited_std_responses_df = leave_note(edited_std_responses_df, row_index,
                                                 "Some reference/s submitted more than one letter (The latest was taken)")
            
            # First pass: the first unmatched, unflagged letter becomes letter 1.
            for ref_index in range(len(ref_responses_df)):
                
                if ref_responses_df.iloc[ref_index, ref_idcs['matched_idx']] == "matched":
                    continue
                
                if ref_responses_df.iloc[ref_index, ref_idcs['std']['email_idx']] == edited_std_responses_df.iloc[row_index, std_idcs['email_idx']] and ref_responses_df.iloc[ref_index, ref_idcs['flag_idx']] is None:
                    
                    edited_std_responses_df.iloc[row_index, std_idcs['first_recomm_letter_idx']] = ref_responses_df.iloc[ref_index, ref_idcs['letter_idx']]
                    ref_responses_df.iloc[ref_index, ref_idcs['matched_idx']] = "matched"
                    
                    break
                    
            # Second pass: remaining unmatched, unflagged letters are written to letter 2
            # (no break, so the last such letter is the one that stays).
            for ref_index in range(len(ref_responses_df)):
                
                if ref_responses_df.iloc[ref_index, ref_idcs['matched_idx']] == "matched":
                    continue
                
                if ref_responses_df.iloc[ref_index, ref_idcs['std']['email_idx']] == edited_std_responses_df.iloc[row_index, std_idcs['email_idx']] and edited_std_responses_df.iloc[row_index, std_idcs['first_recomm_letter_idx']] is not None and ref_responses_df.iloc[ref_index, ref_idcs['flag_idx']] is None:
                    
                    edited_std_responses_df.iloc[row_index, std_idcs['second_recomm_letter_idx']] = ref_responses_df.iloc[ref_index, ref_idcs['letter_idx']]
                    ref_responses_df.iloc[ref_index, ref_idcs['matched_idx']] = "matched"
                    
    return edited_std_responses_df, ref_responses_df


def _ref_names_student(ref_responses_df, ref_index, student_name):
    """
    Tells whether an unflagged reference response was written for the given student name.
    
    params :
        ref_responses_df : references responses data (DataFrame)
        ref_index : positional index of the reference response row
        student_name : the student's name cells (first name .. last name) as a list
    returns:
        True if the student name stated in the reference response equals student_name
        and the flag cell of the reference response is None, False otherwise.
    """
    
    ref_student_name = ref_responses_df.iloc[ref_index, [ref_idcs['std']['firstname_idx'], ref_idcs['std']['lastname_idx']]].tolist()
    
    return ref_student_name == student_name and ref_responses_df.iloc[ref_index, ref_idcs['flag_idx']] is None


def _assign_first_letter_by_name(edited_std_responses_df, ref_responses_df, row_index, student_name, skip_matched):
    """
    Puts the first unflagged letter written for student_name into the student's first letter cell.
    
    params :
        edited_std_responses_df : students responses data (DataFrame), written in place
        ref_responses_df : references responses data (DataFrame), its "matched" cell is written in place
        row_index : positional index of the student response row
        student_name : the student's name cells (first name .. last name) as a list
        skip_matched : if True, reference responses already marked "matched" are not considered
    returns:
        edited_std_responses_df as returned by remove_note (or unchanged if no letter was found).
    """
    
    for ref_index in range(len(ref_responses_df)):
        
        if skip_matched and ref_responses_df.iloc[ref_index, ref_idcs['matched_idx']] == "matched":
            continue
        
        if _ref_names_student(ref_responses_df, ref_index, student_name):
            
            edited_std_responses_df.iloc[row_index, std_idcs['first_recomm_letter_idx']] = ref_responses_df.iloc[ref_index, ref_idcs['letter_idx']]
            ref_responses_df.iloc[ref_index, ref_idcs['matched_idx']] = "matched"
            edited_std_responses_df = remove_note(edited_std_responses_df, row_index, "Got no recommendation letters")
            
            # Only one letter goes into the first slot
            break
    
    return edited_std_responses_df


def _assign_second_letter_by_name(edited_std_responses_df, ref_responses_df, row_index, student_name):
    """
    Puts the remaining unflagged, unmatched letters written for student_name into the student's second letter cell.
    
    params :
        edited_std_responses_df : students responses data (DataFrame), written in place
        ref_responses_df : references responses data (DataFrame), its "matched" cell is written in place
        row_index : positional index of the student response row
        student_name : the student's name cells (first name .. last name) as a list
    returns:
        edited_std_responses_df as returned by remove_note (or unchanged if no letter was found).
    """
    
    for ref_index in range(len(ref_responses_df)):
        
        if ref_responses_df.iloc[ref_index, ref_idcs['matched_idx']] == "matched":
            continue
        
        # The second slot is only filled once the first slot holds a letter
        if _ref_names_student(ref_responses_df, ref_index, student_name) and edited_std_responses_df.iloc[row_index, std_idcs['first_recomm_letter_idx']] is not None:
            
            edited_std_responses_df.iloc[row_index, std_idcs['second_recomm_letter_idx']] = ref_responses_df.iloc[ref_index, ref_idcs['letter_idx']]
            ref_responses_df.iloc[ref_index, ref_idcs['matched_idx']] = "matched"
            edited_std_responses_df = remove_note(edited_std_responses_df, row_index, "Got no recommendation letters")
            
            # NOTE(review): there is deliberately no break here (kept from the original code), so every
            # remaining letter for this name is marked "matched" and the last one in row order ends up
            # in the second slot. Confirm this is the intended "the latest was taken" behaviour.
    
    return edited_std_responses_df


def match_refs_based_on_stdn_name(std_responses_df, ref_responses_df):
    """
    Matches reference letters to students using the student's first and last name as stated by the reference.
    Only student responses that hold no recommendation letter yet are processed, and students for whom
    get_std_by_firstname_and_lastname returns more than one response are skipped.
    A student response with no letter for its name is flagged and gets a note; a note is also left when
    only one letter, or more than two letters, exist for the name.
    
    params :
        std_responses_df : students responses data (DataFrame)
        ref_responses_df : references responses data (DataFrame)
    returns:
        edited_std_responses_df: An edited copy of std_responses_df with the matched letters filled in,
        and with answers with unsatisfied conditions for recommendation letters flagged,
        ref_responses_df: The ref_responses_df (as returned by flag_duplicate_refs) but with marking
        the "Matched" column for letters those successfully matched.
    """
    
    ref_responses_df = flag_duplicate_refs(ref_responses_df)
    edited_std_responses_df = std_responses_df.copy()
    
    # Number of submitted letters per (student first name, student last name).
    # Matching below only writes the "matched" cell of the references, never the name columns,
    # so this table is computed once instead of three times per student.
    letters_per_name = ref_responses_df[[ref_idcs['std']['firstname_idx'], ref_idcs['std']['lastname_idx']]].value_counts()
    
    for row_index in range(len(edited_std_responses_df)):
        
        # Students who already got a letter from a previous matching step are left alone
        if edited_std_responses_df.iloc[row_index, std_idcs['first_recomm_letter_idx']] is not None or edited_std_responses_df.iloc[row_index, std_idcs['second_recomm_letter_idx']] is not None:
            continue
        
        # A name shared by several student responses can not identify a student
        if len(get_std_by_firstname_and_lastname(edited_std_responses_df, edited_std_responses_df.iloc[row_index, std_idcs['firstname_idx']], edited_std_responses_df.iloc[row_index, std_idcs['lastname_idx']])) > 1:
            continue
        
        name_cells = edited_std_responses_df.iloc[row_index, std_idcs['firstname_idx']:std_idcs['lastname_idx'] + 1]
        student_name = name_cells.tolist()
        
        try:
            letters_count = letters_per_name[tuple(name_cells)]
        
        except KeyError:
            # No reference response mentions this name
            letters_count = 0
        
        if letters_count == 0:
            
            # Flag student response if BOTH of their references didn't submit any letter
            edited_std_responses_df.iloc[row_index, std_idcs['flag_idx']] = "flagged"
            edited_std_responses_df = leave_note(edited_std_responses_df, row_index, "Got no recommendation letters")
        
        elif letters_count == 1:
            
            # Flag student response if ANY of their references didn't submit any letter
            # Assign the one submitted letter to that student
            if FLAG_ONE_LETTER:
                edited_std_responses_df.iloc[row_index, std_idcs['flag_idx']] = "flagged"
            
            edited_std_responses_df = leave_note(edited_std_responses_df, row_index, "Got only one recommendation letter")
            
            # NOTE(review): unlike the branches below, letters already marked "matched" are not skipped
            # here (kept from the original code) - confirm whether this is intended.
            edited_std_responses_df = _assign_first_letter_by_name(edited_std_responses_df, ref_responses_df,
                                                                   row_index, student_name, skip_matched=False)
        
        else:
            
            if letters_count > 2:
                
                # Some reference submitted MORE THAN ONE letter to the same student
                edited_std_responses_df = leave_note(edited_std_responses_df, row_index,
                                                     "Some reference/s submitted more than one letters (The latest was taken)")
            
            # Two letters or more: fill the first slot, then the second slot
            edited_std_responses_df = _assign_first_letter_by_name(edited_std_responses_df, ref_responses_df,
                                                                   row_index, student_name, skip_matched=True)
            edited_std_responses_df = _assign_second_letter_by_name(edited_std_responses_df, ref_responses_df,
                                                                    row_index, student_name)
    
    return edited_std_responses_df, ref_responses_df


def match_refs_based_on_ref_email(std_responses_df, ref_responses_df):
    """
    Matches still unmatched reference letters to students using the email of the reference:
    a letter is assigned when exactly one reference response has that reference email and
    get_std_by_ref_email returns exactly one student response for it.
    Afterwards, every student response is checked: it is flagged and gets a note if it got no letters,
    and gets a note (and is flagged if FLAG_ONE_LETTER is set) if it got only one letter.
    
    params :
        std_responses_df : students responses data (DataFrame)
        ref_responses_df : references responses data (DataFrame)
    returns:
        edited_std_responses_df: An edited copy of std_responses_df with the matched letters filled in,
        and with answers with unsatisfied conditions for recommendation letters flagged,
        ref_responses_df: The ref_responses_df (as returned by flag_duplicate_refs) but with marking
        the "Matched" column for letters those successfully matched.
    """
    
    ref_responses_df = flag_duplicate_refs(ref_responses_df)
    edited_std_responses_df = std_responses_df.copy()
    
    for ref_index in range(len(ref_responses_df)):
        
        if ref_responses_df.iloc[ref_index, ref_idcs['matched_idx']] == "matched":
            continue
        
        ref_email = ref_responses_df.iloc[ref_index, ref_idcs['email_idx']]
        
        # A reference email that appears in several reference responses is ambiguous
        if len(get_ref_by_email(ref_responses_df, ref_email)) > 1:
            continue
        
        # Looked up once per reference; the email must lead to exactly one student response
        supported_students = get_std_by_ref_email(edited_std_responses_df, ref_email)
        
        if len(supported_students) != 1:
            continue
        
        # Flagged reference responses are never assigned
        if ref_responses_df.iloc[ref_index, ref_idcs['flag_idx']] is not None:
            continue
        
        student_email = supported_students.iloc[0, std_idcs['email_idx']]
        students = get_std_by_email(edited_std_responses_df, student_email)
        
        # NOTE(review): the index labels of `students` are used as positions in .iloc below,
        # which assumes edited_std_responses_df has a default 0..n-1 index - confirm.
        for row_index in students.index:
            
            if edited_std_responses_df.iloc[row_index, std_idcs['first_recomm_letter_idx']] is None:
                
                edited_std_responses_df.iloc[row_index, std_idcs['first_recomm_letter_idx']] = ref_responses_df.iloc[ref_index, ref_idcs['letter_idx']]
                ref_responses_df.iloc[ref_index, ref_idcs['matched_idx']] = "matched"
                edited_std_responses_df = remove_note(edited_std_responses_df, row_index, "Got no recommendation letters")
                
                break
            
            elif edited_std_responses_df.iloc[row_index, std_idcs['second_recomm_letter_idx']] is None:
                
                # NOTE(review): no break here (kept from the original code), so the same letter can also
                # be assigned to a following response with the same student email - confirm.
                edited_std_responses_df.iloc[row_index, std_idcs['second_recomm_letter_idx']] = ref_responses_df.iloc[ref_index, ref_idcs['letter_idx']]
                ref_responses_df.iloc[ref_index, ref_idcs['matched_idx']] = "matched"
                edited_std_responses_df = remove_note(edited_std_responses_df, row_index, "Got no recommendation letters")
    
    # Leave notes / flags according to how many letters every student ended up with
    for row_index, row in edited_std_responses_df.iterrows():
        
        first_letter_missing = bool(pd.isnull(row[std_idcs['first_recomm_letter_idx']]))
        second_letter_missing = bool(pd.isnull(row[std_idcs['second_recomm_letter_idx']]))
        
        if first_letter_missing != second_letter_missing:
            
            # Exactly one of the two letter cells is filled
            if FLAG_ONE_LETTER:
                edited_std_responses_df.iloc[row_index, std_idcs['flag_idx']] = "flagged"
            
            edited_std_responses_df = leave_note(edited_std_responses_df, row_index, "Got only one recommendation letter")
        
        elif first_letter_missing and second_letter_missing:
            
            edited_std_responses_df.iloc[row_index, std_idcs['flag_idx']] = "flagged"
            edited_std_responses_df = leave_note(edited_std_responses_df, row_index, "Got no recommendation letters")
    
    return edited_std_responses_df, ref_responses_df
    

def match_references(std_responses_df, ref_responses_df):
    """
    Runs the three reference matching passes one after the other, every pass working on the result
    of the previous one: by student email, then by reference email, then by student name.
    
    params :
        std_responses_df : students responses data (DataFrame)
        ref_responses_df : references responses data (DataFrame)
    returns:
        The students responses and the references responses as returned by the last matching pass.
    """
    
    matching_passes = (
        match_refs_based_on_stdn_email,
        match_refs_based_on_ref_email,
        match_refs_based_on_stdn_name,
    )
    
    matched_std_df, matched_ref_df = std_responses_df, ref_responses_df
    
    for matching_pass in matching_passes:
        matched_std_df, matched_ref_df = matching_pass(matched_std_df, matched_ref_df)
    
    return matched_std_df, matched_ref_df

In [None]:
def main(std_responses_df, ref_responses_df):
    """
    Applies the whole processing pipeline on the students and references responses,
    and saves the resulting sheets as Excel files in the results folders.
    
    params :
        std_responses_df : students responses data (DataFrame)
        ref_responses_df : references responses data (DataFrame)
    returns:
        A tuple of: the processed students responses, the processed students responses after remove_flagged,
        the processed references responses, and the result of get_unmatched_letters.
    """
    
    # Work with column indices instead of the original column names
    std_df = column_names_to_indices(std_responses_df, std_questions_dict)
    ref_df = column_names_to_indices(ref_responses_df, ref_questions_dict)
    
    # Flagging steps, every step takes the output of the previous one
    std_df = flag_duplicates(std_df)
    std_df = flag_na(std_df)
    std_df = flag_short(std_df, essay_qs)
    std_df = flag_long(std_df, essay_qs)
    std_df = flag_no_cvs(std_df)
    std_df = flag_non_africans(std_df)
    
    # Normalize, match the letters, then flag references with no students
    std_df, ref_df = remove_spaces(std_df, ref_df)
    std_df, ref_df = to_lowercase(std_df, ref_df)
    std_df, ref_df = match_references(std_df, ref_df)
    ref_df = flag_refs_with_no_students(std_df, ref_df)
    
    std_final_df, ref_final_df = to_uppercase(std_df, ref_df)
    
    # Unmatched letters are saved (with original column names) in both folders
    unmatched_ref_df = get_unmatched_letters(std_final_df, ref_final_df)
    named_unmatched_ref_df = indices_to_column_names(unmatched_ref_df, ref_questions_dict)
    
    for folder_name in (RESULTS_FOLDER_NAME, LETTERS_STATS_FOLDER_NAME):
        named_unmatched_ref_df.to_excel(folder_name + "/unmatched_letters.xlsx")
    
    # Students responses including the flagged ones, with original column names
    named_std_final_df = indices_to_column_names(std_final_df, std_questions_dict)
    named_std_final_df.to_excel(RESULTS_FOLDER_NAME + "/filtered_responses_with_flagged.xlsx")
    
    save_letters_stats(std_final_df, ref_final_df)
    
    # Students responses after remove_flagged, with original column names
    std_unflagged_df = remove_flagged(std_final_df)
    named_std_unflagged_df = indices_to_column_names(std_unflagged_df, std_questions_dict)
    named_std_unflagged_df.to_excel(RESULTS_FOLDER_NAME + "/filtered_responses_with_flagged_removed.xlsx")
    
    # References responses, with original column names
    named_ref_final_df = indices_to_column_names(ref_final_df, ref_questions_dict)
    named_ref_final_df.to_excel(RESULTS_FOLDER_NAME + "/ref_responses_with_flagged.xlsx")
    
    return (std_final_df, std_unflagged_df,
            ref_final_df, unmatched_ref_df)

In [None]:
# Run the whole pipeline on the raw students / references responses.
# main() also writes the resulting Excel sheets into the results folders.
responses_df_final, responses_df_final_flagged_removed, ref_responses_df_final, ref_responses_df_unmatched = main(std_raw_responses_df, ref_raw_responses_df)

# Break the processed responses down per flagging reason
# (presumably one DataFrame per reason, judging by the names - see get_all_flagged_stats).
long_ans_df, short_ans_df, got_no_letters_df, duplicate_df, traveling_abroad_df, non_african_df, no_cv_df = get_all_flagged_stats(responses_df_final)

In [None]:
# Output of get_std_summary for the processed responses, before remove_flagged was applied.
get_std_summary(responses_df_final)

In [None]:
# Display the processed references responses (third value returned by main).
ref_responses_df_final

In [None]:
# Output of get_std_summary for the responses that remain after remove_flagged.
get_std_summary(responses_df_final_flagged_removed)

In [None]:
# Number of processed student responses, before remove_flagged was applied.
len(responses_df_final)

In [None]:
# Number of student responses that remain after remove_flagged.
len(responses_df_final_flagged_removed)

In [None]:
# Number of processed reference responses.
len(ref_responses_df_final)

In [None]:
# Number of reference responses returned by get_unmatched_letters inside main.
len(ref_responses_df_unmatched)

In [None]:
# Size of what got_two_letters (defined elsewhere in this notebook) returns -
# presumably the students who got both recommendation letters; verify against its definition.
len(got_two_letters(responses_df_final))

In [None]:
# Size of what got_one_letter (defined elsewhere in this notebook) returns -
# presumably the students who got only one recommendation letter; verify against its definition.
len(got_one_letter(responses_df_final)) 

In [None]:
# Size of what got_no_letters (defined elsewhere in this notebook) returns -
# presumably the students who got no recommendation letters; verify against its definition.
len(got_no_letters(responses_df_final))

In [None]:
# Size of the got_no_letters_df table returned by get_all_flagged_stats
# (presumably comparable with the got_no_letters count - TODO confirm).
len(got_no_letters_df)

In [None]:
# Size of the traveling_abroad_df table returned by get_all_flagged_stats.
len(traveling_abroad_df)

In [None]:
# Size of the long_ans_df table returned by get_all_flagged_stats.
len(long_ans_df)

In [None]:
# Size of the short_ans_df table returned by get_all_flagged_stats.
len(short_ans_df)

In [None]:
# Size of the no_cv_df table returned by get_all_flagged_stats.
len(no_cv_df)

In [None]:
# Output of get_by_nat_country for "Other" - presumably the responses whose
# nationality / country answer is "Other"; verify against get_by_nat_country.
get_by_nat_country(responses_df_final, "Other")