In [1]:
from pathlib import Path
import random
import os

from tqdm import tqdm
import pandas as pd
import numpy as np


from utils import anonymize_df

In [2]:

# Folder where the excel files for Conflicts of Interest grid and raw application data are stored.
# Such files are atm expected to be Excel exports from Google Sheets.
data_path = Path.cwd().parent / "source_data"
print(data_path, data_path.exists())

# Filename of the conflicts of interest grid file. In the file, there's columns marking the reviewers names and x's 
# for conflicts of interest report for pairs of reviewers and applications.
coi_filename = "conflicts_of_interest_grid.xlsx"

# Filename of the raw application data file downloaded from google:
raw_applications_filename = "raw_applications.xlsx"


/Users/vigji/code/applications-evaluation/source_data True


In [3]:
# Read and show the Conflicts of Interest grid file:
coi_df = pd.read_excel(data_path / coi_filename)
coi_df = anonymize_df(coi_df, drop_columns=["Name", "Surname", "E-mail", "Affiliation (institution, laboratory)", "Position "])
coi_df.head()

  df.index = df[seed_from].apply(_convert_to_word_sequence, dict(n_words=n_words))


Unnamed: 0_level_0,Ania,Luigi,Mateusz,Matilde,Natalia
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
spell-concepts,,,,,
uri-descriptions,,,,,
trail-tenant,,,,,
bathrooms-engineering,,,,,
denied-vc,,,,,


In [4]:
# Assign reviewers to applications using the conflicts of interest grid after anonymization.
# The assignment procedure is not necessarily converging, and can give 
# different numbers of reviews to each reviwer (I'm sure there's a better way to do this).
# We run it with different seeds until we find a valid assignment.

# Parameters:
# Number of reviewers for every application:
n_reviewers_per_appl = 3 
# Maximum number of applications per reviewer. Reviewers names have to match columns in the conflicts of interest grid:
max_applications = {"Luigi": 32, "Ania": 31, "Mateusz":31, "Matilde":32, "Natalia": 31}
# Maximum difference in number of reviews per reviewer:
max_difference = 1

assert set(max_applications.keys()).issubset(coi_df.columns), "Some reviewers are not in the conflicts of interest grid."

# Read conflict of interest to exclude reviewers:
possibile_revs = []
for idx, names in coi_df.iterrows():
    valid = names.apply(lambda x: not isinstance(x, str))
    names = names[valid].index

    possibile_revs.append(dict(code=idx, 
                          names=names.values, 
                          n_valid=len(names)))
    
possibile_revs = pd.DataFrame(possibile_revs)

# Make sure we assign the tough ones first:
possibile_revs = possibile_revs.set_index("code")
possibile_revs = possibile_revs.sort_values(by="n_valid")["names"]

print("Trying combinations...")
for i in tqdm(range(1000)):
    candidates_counts = {name: 0 for name in max_applications.keys()}
    assignments = []

    np.random.seed(i)
    assignements_df = []
    for n, (code, candidates) in enumerate(possibile_revs.items()):

        candidates = list(candidates)

        # Exclude reviewers already saturated:
        for name, count in candidates_counts.items():
            if count >= max_applications[name]:
                candidates.pop(candidates.index(name))            

        # shuffle and extract:
        np.random.shuffle(candidates)
        extracted = candidates[:n_reviewers_per_appl]

        # Make sure all reviews get correct number of reviews:
        if len(extracted) != n_reviewers_per_appl:
            continue
        
        for name in extracted:
            candidates_counts[name] += 1
            assignements_df.append(dict(code=code, reviewer=name))

    assignements_df = pd.DataFrame(assignements_df)

    # check for maximum allowed difference in numbers of reviews per evaluator:
    if np.max(list(candidates_counts.values())) - np.min(list(candidates_counts.values())) <= max_difference:
        break
    
print("First valid seed: ", i)
print("Applications per reviewer: ", candidates_counts)
print("Total applications assigned: ", sum(list(candidates_counts.values())))



Trying combinations...


  6%|▋         | 64/1000 [00:00<00:00, 3676.04it/s]

First valid seed:  64
Applications per reviewer:  {'Luigi': 30, 'Ania': 31, 'Mateusz': 31, 'Matilde': 31, 'Natalia': 30}
Total applications assigned:  153





In [5]:
raw_data_df = pd.read_excel(data_path / raw_applications_filename)
raw_data_df.head()

Unnamed: 0,Sygnatura czasowa,Name,Surname,E-mail,"Affiliation (institution, laboratory)","Position (e.g. PhD student, postdoc)",What are your pronouns?,How would you describe your research work?,"Do you self-identify as from an equity-deserving community? \nNote: The term “equity-deserving communities” refers to communities that experience significant collective biases and barriers to participating in society - including attitudinal, historic, social and environmental barriers based on race, ethnicity, disability, gender identity, sex, sexual orientation and other aspects of identity.",Do you need a fee waiver or / and a travel grant to be able to attend the workshop?,"If yes, please provide us with more information regarding your need for financial support (the extent of assistance needed, what are the other possible sources of funding).",Please describe the research project you are working on. In what way do you think our workshop can improve it? In what way will it improve your research career in general? (200-250 words),"What is your definition of ""naturalistic behavior""? (50 - 100 words)","Below there are examples of behavioral studies done on different animals. Please select one of them and write in what way the paradigm used in the study is naturalistic (3 arguments) - and in what way it is not (also 3 arguments) [150 - 200 words]\n\n1. Rats playing hide and seek game with an experimenter (Reinhold, 2019; DOI: 10.1126/science.aax470)\n\n2. Neuroimaging study of humans playing with a fidget spinner (Narukawa, 2023; DOI: 10.1038/s41598-023-43109-7)\n\n3. Fish navigating terrestial environment by driving a vehicle (Givon, 2021; DOI: 10.1016/j.bbr.2021.113711)\n\n4. Study on dogs using a special interface with buttons to comunicate with humans (Robinson, 2020; DOI: 10.1145/3357236.3395462)\n\n"
0,2023-09-26 09:46:24.932,Carolina,Duro,carol.duro@gmail.com,Ludwig Maximilian University of Munich,PhD Student,She / her,Neurobiology,No,I need fee waiver and a travel grant,My Graduate Schoold (GSN-LMU) could provide so...,The development of the human brain occurs earl...,Naturalistic relates to an unchanged and norma...,"I choose the paradigm 1. On one hand, this stu..."
1,2023-09-26 20:00:00.427,Aial,Sobeh,aealsobh123@gmail.com,University of Haifa,PhD student,He / him,Cognitive neuroscience,Yes,I need fee waiver,Participating in the school will require me to...,My research projects probe the neural mechanis...,Cognitive computations transform inputs into o...,Related to second study: \n\n\nUnaturalistic:\...
2,2023-09-30 21:18:32.281,Islam,Faress,islam.faress@dandrite.au.dk,"DANDRITE, Aarhus University Nabavi lab",Postdoc,He / him,Neurobiology,Yes,I need fee waiver and a travel grant,The funds available for travelling at our lab ...,I have completed two studies where I investiga...,Naturalistic behavior is an innate behavior wh...,"I chose study ""1""\n\nThe study is naturalistic..."
3,2023-10-02 03:23:47.761,Amir,Jafari,jafari.amir@posgraduacao.uerj.br,"Laboratório de Neurofisiologia, Instituto de B...",PhD student,He / him,behavior,Yes,I need travel grant,"Dear Nencki Open Lab committee, I am writing t...",I’m working on a Swiss mice with oral nicotine...,In 2021 when I`ve introduced old_classic metho...,"The chosen study, titled ""Rats playing hide an..."
4,2023-10-19 01:53:02.865,Federico Amadeo,Cavanna,f.cavanna@gmail.com,Computational Cognitive Neuroscience Lab,PdD candidate,He / him,Cognitive neuroscience,Yes,I need fee waiver and a travel grant,"Sadly, I am faced with a challenging situation...",My research project focuses on the study of th...,The term refers to behaviour that occurs 'in t...,In favour:\n- The researchers asked subjects t...


In [6]:
# Anonymize and drop a bunch of columns not needed for evaluation:
data_df = anonymize_df(raw_data_df, 
                    drop_columns=["Name", "Surname", "E-mail", 
                        "Affiliation (institution, laboratory)", 
                        "Position (e.g. PhD student, postdoc)",
                        "How would you describe your research work?",
                         "Sygnatura czasowa",
                         "What are your pronouns?",
                         "Do you need a fee waiver or / and a travel grant to be able to attend the workshop?",
                         "If yes, please provide us with more information regarding your need for financial support (the extent of assistance needed, what are the other possible sources of funding).",
                        "Do you self-identify as from an equity-deserving community? \nNote: The term “equity-deserving communities” refers to communities that experience significant collective biases and barriers to participating in society - including attitudinal, historic, social and environmental barriers based on race, ethnicity, disability, gender identity, sex, sexual orientation and other aspects of identity." 
                        ])

# The columns remaining in the dataframe are the ones to be evaluated (student project, def of nat behavior, study commentary).
# We keep the question asked to print in the evaluation sheet (with some cropping), and we give aliases to the columns.

COL_DESCRIPTION_MAX_LEN = 247  # we want to crop out some text
new_columns = ["current-project", "definition", "study-commentary"]  # new names for the columns

cols_description = {k: val[:COL_DESCRIPTION_MAX_LEN] for k, val in zip(new_columns, data_df.columns)}
data_df.columns = new_columns

data_df.head()  # dataframe of data to be evaluated


  df.index = df[seed_from].apply(_convert_to_word_sequence, dict(n_words=n_words))


Unnamed: 0_level_0,current-project,definition,study-commentary
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
spell-concepts,The development of the human brain occurs earl...,Naturalistic relates to an unchanged and norma...,"I choose the paradigm 1. On one hand, this stu..."
uri-descriptions,My research projects probe the neural mechanis...,Cognitive computations transform inputs into o...,Related to second study: \n\n\nUnaturalistic:\...
trail-tenant,I have completed two studies where I investiga...,Naturalistic behavior is an innate behavior wh...,"I chose study ""1""\n\nThe study is naturalistic..."
bathrooms-engineering,I’m working on a Swiss mice with oral nicotine...,In 2021 when I`ve introduced old_classic metho...,"The chosen study, titled ""Rats playing hide an..."
denied-vc,My research project focuses on the study of th...,The term refers to behaviour that occurs 'in t...,In favour:\n- The researchers asked subjects t...


In [7]:
# Save the final markdown/pdfs to review, and the score sheet. 

# First, do a random shuffle of the applications for each reviewer
# to break orders and avoid first to last bias.

# Then, export csv scoresheet, and markdown file with the answers to the evaluation questions.

# Optionally, the markdown is parsed to create a nice pdf. This requires pandoc to be installed.
# Modify the css file to change the style of the pdf.

export_pdf = True
np.random.seed(42)  # seed for reproducibility
destination_dir = data_path / "reviewers_exports"
destination_dir.mkdir(exist_ok=True)

for reviewer in assignements_df.reviewer.unique():
    # Reviewer subfolder:
    reviewer_dest_dir = destination_dir / reviewer
    reviewer_dest_dir.mkdir(exist_ok=True)

    # Filter reviewer applications:
    appl_idxs = assignements_df.loc[assignements_df["reviewer"] == reviewer, "code"]
    data_selected_df = data_df.loc[appl_idxs.values, :]

    # Randomize rows
    data_selected_df = data_selected_df.sample(frac=1)

    # Create score sheet:
    score_sheet = pd.DataFrame(index=data_selected_df.index, columns={"score (1-5)": np.nan})
    score_sheet.to_csv(reviewer_dest_dir / f"{reviewer}_score-sheet.csv")

    #Produce evaluation markdown:
    txt = ""
    for key, vals in data_selected_df.iterrows():
        txt += f"## candidate: {key}\n\n"
        for section in new_columns:
            txt += f"#### {section.replace('-', ' ')}\n\n_{cols_description[section]}_\n\n{vals[section]}\n\n"

    md_filename = reviewer_dest_dir / f"{reviewer}_applications.md"
    with open(md_filename, "w") as f:
        f.write(txt)

    # Export to pdf:
    if export_pdf:
        os.system(f"pandoc {md_filename} -f gfm -o {md_filename.parent / md_filename.stem}.pdf --css={data_path / 'pandoc'}.css -t html5 --metadata title='{reviewer}'")

Loading pages (1/6)
Counting pages (2/6)                                               
Resolving links (4/6)                                                       
Loading headers and footers (5/6)                                           
Printing pages (6/6)
Done                                                                        
Loading pages (1/6)
Counting pages (2/6)                                               
Resolving links (4/6)                                                       
Loading headers and footers (5/6)                                           
Printing pages (6/6)
Done                                                                        
Loading pages (1/6)
Counting pages (2/6)                                               
Resolving links (4/6)                                                       
Loading headers and footers (5/6)                                           
Printing pages (6/6)
Done                                                     