In [1]:
import pandas as pd
import utils
import preprocessing
import factors
from copy import deepcopy

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Chiara\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Chiara\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
### EDIT ONLY THIS CELL TO CONFIGURE THE ANALYSIS

# ---------------------------------------------------------------------------
# IMPORT PARAMETERS
# ---------------------------------------------------------------------------

# Year the analysis refers to.
year = 2018
# Path of the Excel workbook the analysis results are written to.
file_path = f"analysis{year}.xlsx"
# Whether a Qualtrics questionnaire exists for this year.
qualtrics_data = True

# ---------------------------------------------------------------------------
# PREPROCESSING PARAMETERS
# ---------------------------------------------------------------------------

# Value used to replace a salary range. Possible values: "mean", "min", "max".
method_range = "mean"

# Satisfaction: value assigned to a "Yes" answer when the evaluation is not
# given on the scale from -2 (Really Unsatisfied) to +2 (Really Satisfied).
value_binary = 2
# Forwarded as `recommendations` to factors.factors_df.
# NOTE(review): presumably toggles the use of recommendation answers in the
# satisfaction factor -- confirm against factors.factors_df.
use_recommendations = True

# How many years back to go in order to find a valid salary.
years = 0

# ---------------------------------------------------------------------------
# SCORE PARAMETERS
# ---------------------------------------------------------------------------

# Weight given to each factor when computing the score.
weights = {
    "is_woman": 5,
    "is_int": 5,
    "career_jump": 5,
    "satisfaction": 5,
    "career_service": 5,
    "mobility": 8,
    "salary": 20,
    "salary_increase_perc": 5,
    "salary_increase_abs": 5,
}

# How null values are handled. Possible values:
#   "ignore"  - leave null values out of the averages
#   "general" - replace each missing value with the general average of that variable
#   "group"   - replace each missing value with the average of its subgroup
na_method = "ignore"

# Number of decimals kept in the final table.
decimals = 2

In [3]:
# With no Qualtrics questionnaire and no look-back configured, fall back to a
# one-year look-back.
# NOTE(review): presumably because no salary would be found otherwise -- confirm.
if not qualtrics_data and years == 0:
    years = 1

# Load the BDD export for the selected year.
df_y1_y2 = utils.import_BDD(f"data/BDD{year}.csv")

# Load the Qualtrics export (replaced by an empty file when not available).
df_y3 = utils.import_qualtrics(f"data/qualtrics{year}.csv")

# Combine both sources on the student id, keeping rows present in either one.
df_all = pd.merge(df_y1_y2, df_y3, how="outer", on="bid")

# Load the admission-code table: discard incomplete rows first, then the
# description column.
admissions = (
    pd.read_csv("data/admission.csv")
    .dropna()
    .drop("STVATTS_DESC", axis=1)
)

# Attach the admission labels; year 2020 is handled as a special case.
if year == 2020:
    # For 2020 the labels are derived directly from "admission1".
    df = deepcopy(df_all)
    admission_codes = df["admission1"]
    df["Admission"] = admission_codes
    df["Admission AST"] = admission_codes.replace(["ASTF", "ASTI"], "AST")
else:
    # Otherwise the labels come from the admission-code table.
    df = (
        df_all
        .merge(admissions, how="left", left_on="admission1", right_on="STVATTS_CODE")
        .drop("STVATTS_CODE", axis=1)
    )


# Preprocess the merged data using the configured salary-range method and
# binary-answer value.
df_prep = preprocessing.preprocessing_df(df, method_range, value_binary)

# Compute all the individual factors.
df_factors = factors.factors_df(
    df_prep,
    grouping_criteria=["Admission", "Admission AST"],
    years_before=years,
    qualtrics=qualtrics_data,
    recommendations=use_recommendations,
)

# Add the chaires information when a chaires file exists for this year.
# `groups` lists the columns the scores are later computed for; "Chaires" is
# appended only when the chaires data could be loaded.
groups = ["is_woman", "is_int", "Admission", "Admission AST"]
try:
    chaires = pd.read_csv(f"data/chaires{year}.csv")
except (FileNotFoundError, pd.errors.EmptyDataError):
    # No (usable) chaires file for this year: keep the base groups only.
    # Any other error (e.g. a missing column) is deliberately not swallowed.
    pass
else:
    # When several comma-separated values are listed, keep only the first one;
    # non-string values (e.g. NaN) are left untouched.
    chaires["Chaires"] = chaires["Chaires"].apply(
        lambda x: x.split(",")[0] if isinstance(x, str) else x
    )
    df_factors = pd.merge(
        df_factors, chaires, how="left", left_on="BID", right_on="Ecole_BID"
    )
    # drop() returns a new frame, so the result must be assigned back to
    # actually remove the duplicated join key.
    df_factors = df_factors.drop("Ecole_BID", axis=1)
    groups.append("Chaires")




In [4]:
# Write the factors table and one score sheet per subgroup into a single
# Excel workbook.
with pd.ExcelWriter(file_path) as writer:
    # First sheet: the factors table itself.
    df_factors.to_excel(writer, sheet_name="factors", index=False)
    # One additional sheet per grouping column, named after that column.
    for group in groups:
        group_scores = factors.score(df_factors, group, weights, na_method, weighted=False)
        utils.round_all(group_scores, decimals).to_excel(
            writer, sheet_name=group, index=False
        )