Now we merge the data from the different semesters. Between 2017 and 2019, the test was administered in class.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the answer key; later cells read its 'Q' (question number) and 'G' (correct answer) columns.
answer_key = pd.read_csv(filepath_or_buffer='answer_key.csv')

In [3]:
# Import the data from every semester and merge it into a single table.
data_2017_2019 = pd.read_csv('data_2017_2019.csv')
data_2020_1 = pd.read_csv('data_2020_1.csv')
data_2020_2 = pd.read_csv('data_2020_2.csv')
data_2021_1 = pd.read_csv('data_2021_1.csv')
data_2021_2 = pd.read_csv('data_2021_2.csv')

# ignore_index=True renumbers the merged rows 0..n-1. Without it each file keeps its own
# 0-based labels, so the merged frame carries duplicate index values and any label-based
# lookup or index-aligned assignment becomes ambiguous.
data = pd.concat(
    [data_2017_2019, data_2020_1, data_2020_2, data_2021_1, data_2021_2],
    ignore_index=True,
)

In [4]:
# Attribute a gender to each student from their first name. The gender by name is collected
# from the census; the column gender_probability holds the probability of a person with a
# certain name being of a certain gender (the `ratio` column of the names file).

# Take the first whitespace-delimited token of the full name. Splitting on any whitespace
# (instead of a single literal space) ignores leading or repeated blanks, which would
# otherwise produce an empty first name, and needs no temporary name/last_name columns.
first_name = data['student'].str.split(n=1).str[0]

names_csv = pd.read_csv("nomes.csv.gz")
gender_dict = pd.Series(names_csv["classification"].values, index=names_csv["first_name"])
gender_prob_dict = pd.Series(names_csv["ratio"].values, index=names_csv["first_name"])

# Strip accents and upper-case the first names before looking them up
# (presumably the form used by `first_name` in the names file; verify against the file).
names = (
    first_name
    .str.normalize("NFKD")
    .str.encode("ascii", errors='ignore')
    .str.decode("ascii")
    .str.upper()
)

# reindex() returns NaN for first names that are absent from the names file.
data.insert(2, 'gender', gender_dict.reindex(names).values)
data.insert(3, 'gender_probability', gender_prob_dict.reindex(names).values)

In [5]:
# Anonymise the table: give every row a sequential integer id as the first column,
# then remove the column holding the students' names.
row_ids = np.arange(len(data))
data.insert(loc=0, column='id', value=row_ids)
data.drop(columns='student', inplace=True)

In [6]:
# Create a variable with the student's shift based on the code of their class
# ('D' = day shift, 'N' = night shift).
# NOTE(review): classes N1-N3 are mapped to the day shift here, exactly as in the original
# condition list - confirm that this is intended.
day_classes = ['A1', 'A2', 'A3', 'B1', 'B2', 'B3', 'M1', 'M2', 'M3', 'N1', 'N2', 'N3']
night_classes = ['E1', 'F1', 'F']

shift_condition = [
    data['class'].isin(day_classes),
    data['class'].isin(night_classes),
]
shift = ['D', 'N']

# np.select's implicit default is the integer 0, which cannot be combined with string
# choices on newer NumPy releases (TypeError: no common dtype). Older releases silently
# turned it into the string '0'; passing that string explicitly keeps the same output for
# class codes found in neither list while running on every NumPy version.
data['shift'] = np.select(shift_condition, shift, default='0')

In [7]:
# Create a variable with the student's status at the end of the course:
#   'FF' - fail with final grade below 40
#   'F'  - fail (final grade from 40 up to, but not including, 60)
#   'P'  - pass (final grade of 60 or more)
#   'D'  - drop (no final grade recorded)
final_grade = data['final_grade']
grade_condition = [
    final_grade < 40.0,
    (final_grade >= 40.0) & (final_grade < 60.0),
    final_grade >= 60.0,
    final_grade.isna(),
]
status = ['FF', 'F', 'P', 'D']

# The four conditions cover every numeric or missing grade, so the default is never used
# for such data. It is still passed as a string because np.select's implicit integer
# default 0 cannot be combined with string choices on newer NumPy releases (TypeError);
# '0' is the value older releases produced.
data['status'] = np.select(grade_condition, status, default='0')

In [8]:
# Attribute a score for the pre-test (T1) based on the answer key.
data = data.assign(T1_score=0)

# Questions 28 and 29 are graded as a pair further down, so they are left out of the
# per-question tally. Iterating over the 'Q' and 'G' columns directly (instead of iterrows)
# keeps the question number in the dtype of its own column; iterrows builds one Series per
# row and can upcast an integer Q to float, which would yield names such as "T1Q5.0".
independent_key = answer_key[~answer_key['Q'].isin([28, 29])]
for question, correct in zip(independent_key['Q'], independent_key['G']):
    data['T1_score'] += (data[f"T1Q{question}"] == correct).astype(int)

# A point is awarded for 28 & 29 only if both are correctly answered.
answer_28 = answer_key.loc[answer_key['Q'] == 28, 'G'].iloc[0]
answer_29 = answer_key.loc[answer_key['Q'] == 29, 'G'].iloc[0]
pair_correct = (data["T1Q28"] == answer_28) & (data["T1Q29"] == answer_29)
data['T1_score'] += pair_correct.astype(int)

# Assign NaN scores to people who answered none of the questions, i.e., didn't take the test.
# Series.mask upcasts the integer score column to float when it inserts NaN, avoiding the
# implicit int -> float upcast of a .loc assignment that recent pandas versions warn about.
t1_columns = [f"T1Q{x}" for x in range(1, 31 + 1)]
did_not_take_t1 = data[t1_columns].isna().all(axis='columns')
data['T1_score'] = data['T1_score'].mask(did_not_take_t1)

In [9]:
# Attribute a score for the post-test (T2) based on the answer key.
data = data.assign(T2_score=0)

# Questions 28 and 29 are graded as a pair further down, so they are left out of the
# per-question tally. Iterating over the 'Q' and 'G' columns directly (instead of iterrows)
# keeps the question number in the dtype of its own column; iterrows builds one Series per
# row and can upcast an integer Q to float, which would yield names such as "T2Q5.0".
independent_key = answer_key[~answer_key['Q'].isin([28, 29])]
for question, correct in zip(independent_key['Q'], independent_key['G']):
    data['T2_score'] += (data[f"T2Q{question}"] == correct).astype(int)

# A point is awarded for 28 & 29 only if both are correctly answered.
answer_28 = answer_key.loc[answer_key['Q'] == 28, 'G'].iloc[0]
answer_29 = answer_key.loc[answer_key['Q'] == 29, 'G'].iloc[0]
pair_correct = (data["T2Q28"] == answer_28) & (data["T2Q29"] == answer_29)
data['T2_score'] += pair_correct.astype(int)

# Assign NaN scores to people who answered none of the questions, i.e., didn't take the test.
# Series.mask upcasts the integer score column to float when it inserts NaN, avoiding the
# implicit int -> float upcast of a .loc assignment that recent pandas versions warn about.
t2_columns = [f"T2Q{x}" for x in range(1, 31 + 1)]
did_not_take_t2 = data[t2_columns].isna().all(axis='columns')
data['T2_score'] = data['T2_score'].mask(did_not_take_t2)

In [10]:
# Calculate the normalized gain, a popular way to quantify learning in Physics Education
# Research: (post-test score - pre-test score) / (maximum score - pre-test score).
# 30 is the maximum score used by the formula (presumably 31 questions with 28 and 29
# graded jointly as one point - confirm against the answer key).
MAX_SCORE = 30

# Room left for improvement after the pre-test. It is zero for a perfect pre-test, where the
# gain is undefined; blanking those entries yields NaN instead of a division by zero that
# would write -inf into the data set.
room_to_improve = MAX_SCORE - data['T1_score']
room_to_improve = room_to_improve.where(room_to_improve != 0)

data = data.assign(
    normalized_gain=(data['T2_score'] - data['T1_score']) / room_to_improve
)

In [11]:
# Save the merged, anonymised and scored table; the row index is not written to the file.
merged_csv_path = "data_2017_2021.csv"
data.to_csv(merged_csv_path, index=False)