In [32]:
import pandas as pd
import utils
import preprocessing
import factors
from factors import *
import numpy as np

In [101]:
# Notebook-local redefinition of the factor-building function: per the original
# note, this variant leaves out the scaling and outlier-removal steps.
# NOTE(review): presumably shadows a `factors_df` star-imported from `factors` -- confirm.
def factors_df(df, grouping_criteria=[], years_before = 0, qualtrics = True, recommendations = True):
    """Build a table with one column per factor, computed row by row from ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data. Must contain a ``"bid"`` column and every column named in
        ``grouping_criteria``; the row-wise factor helpers read further columns.
    grouping_criteria : list or single column label, default []
        Columns of ``df`` copied unchanged into the result. A non-list value is
        treated as a single column label.
    years_before : int, default 0
        Forwarded to ``salary``.
    qualtrics : bool, default True
        Forwarded to ``salary_increase`` and ``mobility``.
    recommendations : bool, default True
        Forwarded to ``satisfaction``.

    Returns
    -------
    pandas.DataFrame
        Columns ``BID``, ``is_woman``, ``is_int``, ``salary_increase_abs``,
        ``salary_increase_perc``, ``salary``, ``satisfaction``,
        ``career_service``, ``mobility``, ``career_jump`` followed by the
        grouping columns, in that order.
    """
    # The default list is only read, never mutated, so sharing it across calls is safe.
    if not isinstance(grouping_criteria, list):
        grouping_criteria = [grouping_criteria]

    new_df = pd.DataFrame()

    # One column per factor.
    new_df["BID"] = df["bid"]
    new_df["is_woman"] = df.apply(gender, axis = 1)
    new_df["is_int"] = df.apply(international, axis = 1)

    # salary_increase yields an indexable pair per row: element 0 is stored as
    # the absolute increase and element 1 as the percentage increase. Run the
    # row-wise pass once and split the result instead of computing it twice.
    increases = df.apply(lambda x: salary_increase(x, qualtrics), axis = 1)
    new_df["salary_increase_abs"] = increases.apply(lambda x: x[0])
    new_df["salary_increase_perc"] = increases.apply(lambda x: x[1])

    new_df["salary"] = df.apply(lambda x: salary(x, years_before), axis = 1)
    new_df["satisfaction"] = df.apply(lambda x: satisfaction(x, recommendations), axis = 1)
    new_df["career_service"] = df.apply(career_services, axis = 1)
    new_df["mobility"] = df.apply(lambda x: mobility(x, qualtrics), axis = 1)
    new_df["career_jump"] = df.apply(career_jump, axis = 1)

    # Carry the grouping columns over unchanged.
    for column in grouping_criteria:
        new_df[column] = df[column]

    return new_df

In [4]:
def load_data(year, data_dir="data"):
    """Load and merge the raw data sources for one year.

    Parameters
    ----------
    year : int or str
        Interpolated into the file names ``BDD{year}.csv`` and
        ``qualtrics{year}.csv``.
    data_dir : str, default "data"
        Directory holding the BDD, Qualtrics and ``admission.csv`` files.

    Returns
    -------
    pandas.DataFrame
        Outer merge of the BDD and Qualtrics data on ``"bid"``, left-joined
        with the admission codes (``admission1`` == ``STVATTS_CODE``). The
        ``STVATTS_CODE`` join key is dropped from the result.
    """
    # BDD export for the requested year.
    df_y1_y2 = utils.import_BDD(f"{data_dir}/BDD{year}.csv")
    # Qualtrics export. Per the original comment an empty file stands in when
    # no export is available; that fallback is not visible in this notebook.
    df_y3 = utils.import_qualtrics(f"{data_dir}/qualtrics{year}.csv")
    # Keep every bid that appears in either source.
    df_all = pd.merge(df_y1_y2, df_y3, how="outer", on="bid")

    # Admission-code lookup table. Rows with any missing value are discarded
    # *before* the description column is removed, so a missing description
    # also removes the row.
    admissions = (
        pd.read_csv(f"{data_dir}/admission.csv")
        .dropna()
        .drop("STVATTS_DESC", axis = 1)
    )

    # Attach the admission information, then drop the redundant join key.
    df = df_all.merge(admissions, how = "left", left_on = "admission1", right_on = "STVATTS_CODE")
    return df.drop("STVATTS_CODE", axis = 1)

In [5]:
# Load the merged raw data (see load_data) for each of the three years.
df20 = load_data(2020)
df19 = load_data(2019)
df18 = load_data(2018)

In [6]:
# Run every yearly frame through preprocessing.preprocessing_df with the same
# settings ("mean", 2).
# NOTE(review): the meaning of these two arguments is defined in the
# `preprocessing` module and is not visible here -- confirm.
# Each result is assigned back to the name it was read from, so re-running this
# cell on its own feeds already-preprocessed data back in; re-run the loading
# cell first.
df20 = preprocessing.preprocessing_df(df20, "mean",2)
df19 = preprocessing.preprocessing_df(df19, "mean", 2)
df18 = preprocessing.preprocessing_df(df18, "mean", 2)

In [12]:
# 2020: there is no Qualtrics data for this year, so the Qualtrics-based inputs
# are switched off -- including recommendations, which live in the Qualtrics
# data. The most recent year is also missing, hence years_before = 1.
factors20 = factors_df(
    df20,
    grouping_criteria=["Admission", "Admission AST"],
    years_before=1,
    qualtrics=False,
    recommendations=False,
)

# 2019: Qualtrics data and recommendations are used, no year offset.
factors19 = factors_df(
    df19,
    grouping_criteria=["Admission", "Admission AST"],
    years_before=0,
    qualtrics=True,
    recommendations=True,
)

# 2018: same settings as 2019.
factors18 = factors_df(
    df18,
    grouping_criteria=["Admission", "Admission AST"],
    years_before=0,
    qualtrics=True,
    recommendations=True,
)

In [18]:
# pandas summary statistics for five numeric factor columns of the 2018 table.
factors18[["salary_increase_abs", "salary_increase_perc", "salary", "satisfaction", "career_service"]].describe()

Unnamed: 0,salary_increase_abs,salary_increase_perc,salary,satisfaction,career_service
count,320.0,320.0,366.0,314.0,265.0
mean,25751.622441,0.587753,68344.262295,1.085191,1.671698
std,25910.407585,1.382106,22585.219009,0.86839,0.892812
min,-40000.0,-0.418848,50000.0,-2.0,0.0
25%,10008.196721,0.22,50000.0,1.0,1.0
50%,20000.0,0.44,55500.0,1.0,2.0
75%,31365.0,0.663355,75500.0,2.0,2.0
max,163830.0,24.0,125500.0,2.0,3.0


In [43]:
# Largest absolute salary increase in the 2018 table.
factors18["salary_increase_abs"].max()

163830.0

In [49]:
def describe(variable, frames=None):
    """Summarise one factor column across several yearly factor tables.

    Parameters
    ----------
    variable : str
        Column to summarise; it must exist in every table.
    frames : mapping of label -> pandas.DataFrame, optional
        Tables to summarise, in output-column order. Each label is converted
        with ``str`` and used as the column name. When omitted, the notebook's
        ``factors18``, ``factors19`` and ``factors20`` tables are used under
        the labels 2018, 2019 and 2020.

    Returns
    -------
    pandas.DataFrame
        One column per table and one row per statistic. The rows labelled
        "0.01 percentile" ... "0.99 percentile" hold the quantiles at levels
        0.01, 0.05, 0.95 and 0.99 (i.e. the 1st, 5th, 95th and 99th
        percentiles). Only the mean and the standard deviation are rounded
        (to two decimals).
    """
    if frames is None:
        # Resolved at call time so the function can be defined before the tables exist.
        frames = {2018: factors18, 2019: factors19, 2020: factors20}

    description = pd.DataFrame(index = ["mean", "std", "min", "0.01 percentile", "0.05 percentile", "0.95 percentile", "0.99 percentile", "max"])

    for label, frame in frames.items():
        series = frame[variable]
        # Order must match the index labels above.
        description[str(label)] = [
            round(series.mean(), 2),
            round(series.std(), 2),
            series.min(),
            series.quantile(0.01),
            series.quantile(0.05),
            series.quantile(0.95),
            series.quantile(0.99),
            series.max(),
        ]

    return description


In [58]:
# Print the per-year summary table of each numeric factor, one section per
# column, with a ruler line between sections.
section_ruler = "_____________________________________________________ \n"
for factor_name in ("salary_increase_abs", "salary_increase_perc", "salary", "satisfaction", "career_service"):
    print(factor_name)
    yearly_summary = describe(factor_name).round(2)
    print(yearly_summary)
    print(section_ruler)

salary_increase_abs
                      2018       2019       2020
mean              25751.62    9615.08   17049.57
std               25910.41   21951.69   52502.15
min              -40000.00 -150995.84  -46000.00
0.01 percentile   -5025.00  -60544.13  -17929.78
0.05 percentile       0.00   -6581.45   -3325.50
0.95 percentile   79858.92   30175.00   62813.85
0.99 percentile  120602.50   45145.00  164667.41
max              163830.00   64500.00  582623.93
_____________________________________________________ 

salary_increase_perc
                  2018  2019  2020
mean              0.59  0.33  0.26
std               1.38  0.59  0.44
min              -0.42 -0.73 -0.41
0.01 percentile  -0.09 -0.44 -0.19
0.05 percentile   0.00 -0.12 -0.06
0.95 percentile   1.27  0.73  0.88
0.99 percentile   2.47  3.27  2.42
max              24.00  4.00  2.85
_____________________________________________________ 

salary
                      2018      2019        2020
mean              68344.26  57982.9

In [61]:
def describe_sat(variable, frames=None):
    """Compact per-table summary (mean, std, min, max) of one factor column.

    Parameters
    ----------
    variable : str
        Column to summarise; it must exist in every table.
    frames : mapping of label -> pandas.DataFrame, optional
        Tables to summarise, in output-column order. Each label is converted
        with ``str`` and used as the column name. When omitted, the notebook's
        ``factors18``, ``factors19`` and ``factors20`` tables are used under
        the labels 2018, 2019 and 2020.

    Returns
    -------
    pandas.DataFrame
        Rows ``mean``, ``std``, ``min``, ``max``; one column per table. The
        mean and standard deviation are rounded to two decimals, min and max
        are reported as-is.
    """
    if frames is None:
        # Resolved at call time so the function can be defined before the tables exist.
        frames = {2018: factors18, 2019: factors19, 2020: factors20}

    description = pd.DataFrame(index = ["mean", "std", "min", "max"])

    for label, frame in frames.items():
        series = frame[variable]
        # Order must match the index labels above.
        description[str(label)] = [
            round(series.mean(), 2),
            round(series.std(), 2),
            series.min(),
            series.max(),
        ]

    return description

In [87]:
# Show the column labels of the 2018 factor table.
factors18.columns

Index(['BID', 'is_woman', 'is_int', 'salary_increase_abs',
       'salary_increase_perc', 'salary', 'satisfaction', 'career_service',
       'mobility', 'career_jump', 'Admission', 'Admission AST'],
      dtype='object')

In [94]:
# First row of DataFrame.mode() for four indicator columns of the 2018 table:
# the most frequent value of each column, returned as a plain list.
factors18[["is_woman","is_int","mobility", "career_jump"]].mode().values.tolist()[0]

[0.0, 0.0, 0.0, 1.0]

In [95]:
def mode(frames=None):
    """Most frequent value of the four indicator factors in each yearly table.

    Parameters
    ----------
    frames : mapping of label -> pandas.DataFrame, optional
        Tables to inspect, in output-column order. Each label is converted
        with ``str`` and used as the column name. When omitted, the notebook's
        ``factors18``, ``factors19`` and ``factors20`` tables are used under
        the labels 2018, 2019 and 2020.

    Returns
    -------
    pandas.DataFrame
        Rows ``is_woman``, ``is_int``, ``mobility``, ``career_jump``; one
        column per table holding that table's mode for each factor.
    """
    factor_columns = ["is_woman","is_int","mobility", "career_jump"]

    if frames is None:
        # Resolved at call time so the function can be defined before the tables exist.
        frames = {2018: factors18, 2019: factors19, 2020: factors20}

    modes = pd.DataFrame(index = factor_columns)

    for label, frame in frames.items():
        # DataFrame.mode() may return several rows when values tie; only the
        # first row is kept.
        modes[str(label)] = frame[factor_columns].mode().values.tolist()[0]

    return modes

In [100]:
# Per-year mean/std/min/max of the satisfaction factor.
describe_sat("satisfaction")

Unnamed: 0,2018,2019,2020
mean,1.09,1.01,0.93
std,0.87,0.95,0.9
min,-2.0,-2.0,-2.0
max,2.0,2.0,2.0


In [64]:
# Per-year mean/std/min/max of the career_service factor.
describe_sat("career_service")

Unnamed: 0,2018,2019,2020
mean,1.67,1.63,1.47
std,0.89,0.89,0.89
min,0.0,0.0,0.0
max,3.0,3.0,3.0


In [99]:
# Per-year most frequent value of is_woman, is_int, mobility and career_jump.
mode()

Unnamed: 0,2018,2019,2020
is_woman,0.0,1.0,0.0
is_int,0.0,0.0,0.0
mobility,0.0,0.0,1.0
career_jump,1.0,0.0,0.0
