In [2]:
import pandas as pd
import numpy as np


import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

import seaborn as sns

In [None]:
## Load the input table; later cells read intervention_index, date, wage_per_month and id from it
data = pd.read_csv("Output/merge_stud_batla.csv")

In [5]:
def year_before(x):
    """Return 1 when the row's intervention_index lies in [-24, 0], else 0.

    x is a row-like mapping that has an "intervention_index" entry.
    A NaN index fails both comparisons and therefore yields 0.
    """
    in_window = -24 <= x["intervention_index"] <= 0
    return 1 if in_window else 0
## Flag rows inside the pre-intervention window [-24, 0] (1 = inside, 0 = outside).
## Vectorized equivalent of applying year_before row by row: same bounds, NaN -> 0,
## but without a Python-level call per row.
## NOTE(review): month 0 counts as "before" here, while is_before further down uses < 0 - confirm which is intended.
data["year_before"] = data["intervention_index"].between(-24, 0).astype(int)

In [6]:
## Keep only rows whose intervention_index is within 24 months on either side of the intervention (bounds inclusive)
in_window = data["intervention_index"].between(-24, 24)
data = data.loc[in_window]

In [7]:
## Work on an independent copy so the column assignments below do not write into the filtered `data` slice
df = data.copy()

In [8]:
## Parse the ISO-formatted date strings and keep the calendar year alongside
parsed_date = pd.to_datetime(df["date"], format="%Y-%m-%d")
df["date"] = parsed_date
df["year"] = parsed_date.dt.year

In [9]:
## Price index per calendar year, used to express wages in base-year prices
cpi = {2014:99.8,2015:99.2,2016:98.7,2017:98.9,2018:99.7,2019:100.5,2020:100,2021:101.5,2022:105.9}

## Real wages are expressed in prices of this year
base_year = 2021
def real_wage(x):
    """Convert the row's wage_per_month into base_year prices.

    x is a row-like mapping with "year" and "wage_per_month" entries.
    Years missing from the cpi table fall back to the base-year index,
    so the wage is scaled by exactly 1.0 (i.e. left at its nominal value).
    """
    year_index = cpi.get(x["year"], cpi[base_year])
    scale = cpi[base_year] / year_index
    return x["wage_per_month"] * scale

## Real wage in base_year prices, computed column-wise instead of calling real_wage once per row.
## Years absent from the cpi table (including missing years) get the base-year index, i.e. a factor of
## exactly 1.0 - the same fallback real_wage applies.
year_index = df["year"].map(cpi).fillna(cpi[base_year])
df["real_wage"] = df["wage_per_month"] * (cpi[base_year] / year_index)

In [10]:
def help_col(x):
    """Return the row's real_wage when year_before equals 1, otherwise NaN.

    x is a row-like mapping with "year_before" and "real_wage" entries.
    """
    return x["real_wage"] if x["year_before"] == 1 else np.nan

## Per-id mean real wage over the rows flagged year_before == 1, broadcast to every row of that id.
## Rows outside the window are masked to NaN so the group mean skips them; this replaces the
## row-wise apply and the temporary help_col column with a single vectorized mask.
## An id with no flagged rows ends up with mean_before = NaN.
pre_window_wage = df["real_wage"].where(df["year_before"] == 1)
df["mean_before"] = pre_window_wage.groupby(df["id"]).transform("mean")

In [11]:
## Z-score of real_wage relative to the mean and standard deviation of the whole column
wage_center = df["real_wage"].mean()
wage_spread = df["real_wage"].std()
df["wage_std"] = (df["real_wage"] - wage_center) / wage_spread

In [None]:
## Summary statistics of the wage z-score column
print(df["wage_std"].describe())

In [12]:
## Drop ids whose largest wage z-score is above 5; their rows are kept aside in df_big_std.
## Rows whose max_std is NaN satisfy neither condition and land in neither frame.
## .copy() gives both results their own data, so the column assignments made on df in later cells
## do not raise SettingWithCopyWarning.
df["max_std"] = df.groupby("id")["wage_std"].transform("max")
df_big_std = df.loc[(df["max_std"] > 5)].copy()
df = df.loc[(df["max_std"] <= 5)].copy()

In [13]:
## 1 for months strictly before the intervention month, 0 from the intervention month onward
df["is_before"] = (df["intervention_index"] < 0).astype(int)

## 1 when the pre-intervention mean real wage is at most 2500 (NaN means compare False -> 0)
df["is_low_before"] = (df["mean_before"] <= 2500).astype(int)

## Split at 5400: a mean of exactly 5400 counts as "below" only. Previously it was flagged in both
## columns (<= 5400 and >= 5400), so the two indicators were not complementary.
df["is_below_mean_before"] = (df["mean_before"] <= 5400).astype(int)

df["is_above_mean_before"] = (df["mean_before"] > 5400).astype(int)

In [None]:
## Write one CSV per distinct eshcol_name into the working directory; rows with a missing name are skipped
eshcolot = df["eshcol_name"].dropna().unique().tolist()

## The loop variable is deliberately not called `eshcol`: later plot cells read a global of that name
## as the highlighted category, and the loop would otherwise overwrite it.
for eshcol_name in eshcolot:
    df_e = df.loc[df["eshcol_name"] == eshcol_name]
    ## A path separator inside the name would be read as a directory by to_csv, so replace it.
    ## Names without a separator keep exactly the same file name as before.
    file_stem = str(eshcol_name).replace("/", "_").replace("\\", "_")
    df_e.to_csv(file_stem + '.csv', index=False)

In [16]:
## Share of rows at each value of the three pre-intervention wage indicators
for flag in ("is_low_before", "is_below_mean_before", "is_above_mean_before"):
    print(df[flag].value_counts(normalize=True))

0    0.605607
1    0.394393
Name: is_low_before, dtype: float64
1    0.599362
0    0.400638
Name: is_below_mean_before, dtype: float64
0    0.599362
1    0.400638
Name: is_above_mean_before, dtype: float64


In [14]:
## Split the rows by the is_before flag: 0 -> after, 1 -> before
period_flag = df["is_before"]
after = df.loc[period_flag == 0]
before = df.loc[period_flag == 1]

In [None]:
## Summary statistics of real wages in the pre-intervention rows
print(before["real_wage"].describe())


In [15]:

## Summary statistics of real_wage: all rows, then pre-intervention rows, then the remaining rows
for wages in (df, before, after):
    print(wages["real_wage"].describe())


count    5.502210e+06
mean     7.466071e+03
std      9.305340e+03
min      0.000000e+00
25%      0.000000e+00
50%      5.339698e+03
75%      1.075404e+04
max      7.413470e+04
Name: real_wage, dtype: float64
count    2.878056e+06
mean     6.254076e+03
std      8.821655e+03
min      0.000000e+00
25%      0.000000e+00
50%      3.424102e+03
75%      9.263957e+03
max      7.405686e+04
Name: real_wage, dtype: float64
count    2.624154e+06
mean     8.795333e+03
std      9.634702e+03
min      0.000000e+00
25%      8.445000e+02
50%      6.757739e+03
75%      1.218195e+04
max      7.413470e+04
Name: real_wage, dtype: float64


In [None]:
## Basic EDA

## How many distinct ids remain
id_count = df["id"].nunique()
print("number of id's", id_count)

## Earliest and latest value of the wage dates and of the course end dates
wage_dates = df["date"]
course_end_dates = df["end_date"]
print("Wage years range", wage_dates.min(), wage_dates.max())
print("Course years range", course_end_dates.min(), course_end_dates.max())

In [None]:
## Mean real wage per calendar year of the wage date
df["date"] = pd.to_datetime(df["date"])
wage_year = df["date"].dt.year
by_year = df["real_wage"].groupby(wage_year).mean().reset_index()

In [None]:

## Shared plot styling: font passed to axis labels, and the two curve colours used by the distribution plots
hfont = {'fontname':'Heebo'}
color = "#F84040"
color2 = "#F8C440"

In [None]:
## KDE of the per-id mean real wage, one curve for pre-intervention months and one for the rest
pre_rows = df.loc[df["is_before"] == 1]
post_rows = df.loc[df["is_before"] == 0]
dist_1 = pre_rows.groupby(["id"])["real_wage"].mean().reset_index()
dist_2 = post_rows.groupby(["id"])["real_wage"].mean().reset_index()
for per_id_mean, curve_label in ((dist_1, "before"), (dist_2, "after")):
    ax = sns.kdeplot(per_id_mean["real_wage"], label=curve_label)
xtitle = "Wage Per Month"
ax.set_xlabel(xtitle, **hfont)
ax.set_xlim(0, 40000)
## Recolour the two curves in drawing order before the legend is built
for line_idx, line_color in enumerate((color, color2)):
    ax.lines[line_idx].set_color(line_color)
ax.legend()

plt.show()


In [None]:
## Mean real wage per (id, is_before) pair, plus the raw rows of each period
wage_by_id_period = df.groupby(["id","is_before"])["real_wage"]
df_b_a = wage_by_id_period.mean().reset_index()
before_flag = df["is_before"]
df_b = df.loc[before_flag == 1]
df_a = df.loc[before_flag == 0]


In [None]:
## Summary statistics of real wages in the rows with is_before == 0
df_a["real_wage"].describe()

In [None]:
## Mean real wage: pre-intervention rows first, then the rest
for period_rows in (df_b, df_a):
    print(period_rows["real_wage"].mean())


In [None]:
## Median real wage: pre-intervention rows first, then the rest
for period_rows in (df_b, df_a):
    print(period_rows["real_wage"].median())

In [None]:
## CDF plot: cumulative KDE of monthly real wages, before vs. after the course
ax = sns.kdeplot(df_b["real_wage"], cumulative=True, label="Before Course")
ax = sns.kdeplot(df_a["real_wage"], cumulative=True, label="After Course")
## Hebrew strings are reversed before being handed to matplotlib
## (presumably because it lays the text out left-to-right - confirm)
xtitle = ("שכר חודשי ממוצע")[::-1]
ax.set_xlabel(xtitle)
ytitle = ("שכיחות")[::-1]
ax.set_ylabel(ytitle)
## TODO: add a marked and titled reference line at 5400
ax.set_xlim(0, 20000)
ax.lines[0].set_color(color)
ax.lines[1].set_color(color2)
## Horizontal line at cumulative share 0.5; each curve crosses it at its median
ax.axhline(y=0.5, color="blue", linestyle="--")
## The curves carry labels but no legend was ever drawn, so they could not be told apart.
## Built after recolouring so the legend swatches match the curves.
ax.legend()

plt.show()

In [None]:
## Number of rows per end_year value
df["end_year"].value_counts()

In [None]:
#   Plot A
### Mean real wage per month relative to the intervention (24 months on each side)
### Only rows with end_year before 2020
### Ids with a max wage z-score above 5 were removed earlier

## keep rows whose end_year is before 2020, then average per intervention month
ended_before_2020 = df.loc[df["end_year"] < 2020]
plot_a = ended_before_2020.groupby("intervention_index")["real_wage"].agg(["mean", "std"]).reset_index()


## Label texts (reversed Hebrew strings) and the font used for them
hfont = {'fontname':'Heebo'}
ylabel = ("שכר חודשי ממוצע")[::-1]
xlabel = ("חודשים לפני ואחרי סיום ההכשרה")[::-1]
line_title = ("חודש סיום")[::-1]

# Line plot of mean wage
plt.figure(figsize=(10, 5))
ax = sns.lineplot(x="intervention_index", y="mean", data=plot_a)
ax.lines[0].set_color("#006BA2")
ax.set_xlabel(xlabel, **hfont)
ax.set_ylabel(ylabel, **hfont)
ax.set_yticks(np.arange(2000, 11000, 1000))
ax.axvline(x=0, color="black", linestyle=(0, (1, 5))) # intervention month
ax.text(-0.4, 4500, line_title, rotation=90, backgroundcolor="white", **hfont)
for side in ('right', 'top', 'left'):
    ax.spines[side].set_visible(False)
## Remove ticks
ax.tick_params(axis='y', which='both', length=0)

## Thousands separators on the y axis, light horizontal grid
ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda x, pos: f"{x:,.0f}"))
ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey', alpha=0.5)





In [None]:
#   Plot A_2
### Mean real wage per month relative to the intervention (24 months on each side)
### Only ids flagged is_low_before == 1
### Ids with a max wage z-score above 5 were removed earlier

## keep wage rows dated before 2023 that belong to the is_low_before group
low_before_rows = df.loc[(df["date"] < "2023-01-01") & (df["is_low_before"] == 1)]
plot_a2 = low_before_rows.groupby("intervention_index")["real_wage"].agg(["mean", "std"]).reset_index()


## Label texts (reversed Hebrew strings) and the font used for them
hfont = {'fontname':'Heebo'}
ylabel = ("שכר חודשי ממוצע")[::-1]
xlabel = ("חודשים לפני ואחרי סיום ההכשרה")[::-1]
line_title = ("חודש סיום")[::-1]

# Line plot of mean wage
plt.figure(figsize=(10, 5))
ax = sns.lineplot(x="intervention_index", y="mean", data=plot_a2)
ax.lines[0].set_color("#006BA2")
ax.set_xlabel(xlabel, **hfont)
ax.set_ylabel(ylabel, **hfont)
ax.set_yticks(np.arange(0,7500, 1000))
ax.axvline(x=0, color="black", linestyle=(0, (1, 5))) # intervention month
ax.text(-0.4, 4500, line_title, rotation=90, backgroundcolor="white", **hfont)
for side in ('right', 'top', 'left'):
    ax.spines[side].set_visible(False)
## Remove ticks
ax.tick_params(axis='y', which='both', length=0)

## Thousands separators on the y axis, light horizontal grid
ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda x, pos: f"{x:,.0f}"))
ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey', alpha=0.5)

In [None]:
## Plot B
### Mean real wage per month relative to the intervention (24 months on each side)
### One line per is_low_before group

## keep wage rows dated before 2023, then average per (month, group)
dated_rows = df.loc[df["date"] < "2023-01-01"]
plot_b = dated_rows.groupby(["intervention_index","is_low_before"])["real_wage"].agg(["mean", "std"]).reset_index()
## Swap the 0/1 codes for the (reversed) Hebrew legend texts
plot_b["is_low_before"] = plot_b["is_low_before"].replace({0: "ןכ", 1: "אל"})

# Line plot of mean wage; xlabel, ylabel, hfont and line_title come from the Plot A cell
plt.figure(figsize=(10, 5))
ax = sns.lineplot(x="intervention_index", y="mean", data=plot_b, hue="is_low_before", palette=["#006BA2", "#DB444B"])
ax.set_xlabel(xlabel, **hfont)
ax.set_ylabel(ylabel, **hfont)
ax.set_yticks(np.arange(0, 12500, 2000))
ax.axvline(x=0, color="black", linestyle=(0, (1, 5))) # intervention month
ax.text(-0.4, 4000, line_title, rotation=90, backgroundcolor="white", **hfont)
for side in ('right', 'top', 'left'):
    ax.spines[side].set_visible(False)

## Thousands separators on the y axis, light horizontal grid
ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda x, pos: f"{x:,.0f}"))
ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey', alpha=0.5)

ax.tick_params(axis='y', which='both', length=0)

## Rebuild the legend below the axes with its entries side by side
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles=handles, labels=labels, loc='upper center', bbox_to_anchor=(0.5, -0.15), ncol=2)



In [None]:
## Plot C
### Mean real wage per month relative to the intervention (24 months on each side)
### By gender

## keep wage rows dated before 2023, then average per (month, gender)
plot_c = df.loc[(df["date"] < "2023-01-01")]
plot_c = plot_c.groupby(["intervention_index","gender"])["real_wage"].agg(["mean", "std"]).reset_index()

# Line plot of mean wage; xlabel, ylabel, hfont and line_title come from the Plot A cell
plt.figure(figsize=(10, 5))
ax = sns.lineplot(x="intervention_index", y="mean", data=plot_c, hue="gender", palette=["#006BA2", "#DB444B"])
ax.set_xlabel(xlabel, **hfont)
ax.set_ylabel(ylabel, **hfont)
ax.set_yticks(np.arange(0, 13000, 2000))
ax.axvline(x=0, color="black", linestyle=(0, (1, 5))) # intervention month
ax.text(-0.4, 600, line_title, rotation=90, backgroundcolor="white", **hfont)
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.spines['left'].set_visible(False)

ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda x, pos: '{:,.0f}'.format(x)))
ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey', alpha=0.5) ## Add horizontal grid

ax.tick_params(axis='y', which='both', length=0)

## Legend: built once, with the Hebrew title and entry texts, below the axes and side by side.
## Previously the title and translated texts were set on the automatic legend and then thrown away,
## because the legend was recreated from the raw handles/labels (and `labels` was overwritten).
## NOTE(review): the texts are matched to the hue levels by position - confirm that the first
## gender level corresponds to the first text.
legend_title = ("מגדר")[::-1]
labels = ["םירבג", "םישנ"]
handles, _ = ax.get_legend_handles_labels()
ax.legend(handles=handles, labels=labels, title=legend_title, loc='upper center', bbox_to_anchor=(0.5, -0.15), ncol=2)


In [None]:
## Distinct eshcol values, most frequent first (value_counts order)
eshcolot = df["eshcol"].value_counts().index.tolist()

In [None]:
## Category to highlight in the eshcol plots below (compared against the "eshcol" column)
eshcol ="ניהול משרד"

נהיגה
ניהול משרד
חינוך בגיל הרך
חשמלאים
פיתוח תוכנה
קוסמטיקה
תשתיות תקשורת- הדרכה, הטמעה ניהול ויישום
חשב שכר
בודקי תוכנה ו QA
טבחות וקונדיטאות
עיצוב שיער
מנהלי עבודה בבניין
מיזוג אוויר
עיבוד שבבי
דפוס/עיצוב 
ניהול לוגיסטיקה שרשרת אספקה, מלאי ורכש
הפעלת עגורנים
מכונאות רכב/אוטוטרוניקה
מסגרות וריתוך
ניהול קמפיינים ומדיה חברתית
בניה ותחזוקה

In [None]:
## Plot D
### Mean real wage per month relative to the intervention (24 months on each side)
### One line per eshcol, with the category stored in `eshcol` highlighted

## keep wage rows dated before 2023, then average per (month, eshcol)
dated_rows = df.loc[df["date"] < "2023-01-01"]
plot_d = dated_rows.groupby(["intervention_index","eshcol"])["real_wage"].agg(["mean", "std"]).reset_index()


# Line plot of mean wage
## Every category is drawn in light grey first; the highlighted one is drawn last, in red, so it sits on top


plt.figure(figsize=(10, 5))

## grey background lines, one per category
for track in plot_d["eshcol"].unique():
    sns.lineplot(x="intervention_index", y="mean", data=plot_d.loc[plot_d["eshcol"] == track], color="#E0e0e0", alpha=0.5)

## highlighted category in red

plot_d_temp = plot_d.loc[plot_d["eshcol"] == eshcol]
ax = sns.lineplot(x="intervention_index", y="mean", data=plot_d_temp, color="#F84040")


## xlabel, ylabel and hfont come from the Plot A cell
ax.set_xlabel(xlabel, **hfont)
ax.set_ylabel(ylabel, **hfont)
ax.set_yticks(np.arange(0, 30000, 4000))
ax.axvline(x=0, color="black", linestyle=(0, (1, 10))) # intervention month
for side in ('right', 'top'):
    ax.spines[side].set_visible(False)
## Thousands separators on the y axis, light horizontal grid
ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda x, pos: f"{x:,.0f}"))
ax.yaxis.grid(True, linestyle='-', which='major', color="lightgrey", alpha=0.5)


In [None]:
## NOTE(review): this import sits mid-notebook; consider moving it to the import cell at the top
import plotly.express as px

## Interactive line plot of the Plot D aggregates, one coloured line per eshcol
fig = px.line(plot_d, x="intervention_index", y="mean", color="eshcol")
fig.show()
## Save the interactive plot as an HTML file in the working directory
fig.write_html("eshcolot.html")

In [None]:
## Plot E
### Mean real wage per month relative to the intervention (24 months on each side)
### For the category stored in `eshcol`, split by is_low_before

## keep wage rows dated before 2023 with end_year from 2016 on, then aggregate per (month, eshcol, group)
plot_e = df.loc[(df["date"] < "2023-01-01")]
plot_e = plot_e.loc[(plot_e["end_year"] >= 2016)]
plot_e = plot_e.groupby(["intervention_index","eshcol", "is_low_before"])["real_wage"].agg(["mean", "std","count"]).reset_index()

# Line plot of mean wage

## .copy() so the column overwrite below writes to an independent frame instead of a slice of plot_e
## (this used to trigger SettingWithCopyWarning)
plot_e_temp = plot_e.loc[(plot_e["eshcol"] == eshcol)].copy()
## Replace the 0/1 codes of is_low_before with the (reversed) Hebrew legend texts
plot_e_temp["is_low_before"] = plot_e_temp["is_low_before"].replace({1: "אל", 0: "ןכ"})

## xlabel, ylabel and hfont come from the Plot A cell
plt.figure(figsize=(10, 5))
sns.lineplot(x="intervention_index", y="mean", data=plot_e_temp, hue="is_low_before", palette=["#006BA2", "#DB444B"])
plt.xlabel(xlabel, **hfont)
plt.ylabel(ylabel, **hfont)
plt.yticks(np.arange(0, 9500, 2500))
plt.axvline(x=0, color="black", linestyle=(0, (1, 10))) # intervention month
#plt.text(-0.3, 4500, line_title, rotation=90, backgroundcolor="white", **hfont)
plt.gca().spines['right'].set_visible(False)
plt.gca().spines['top'].set_visible(False)

plt.gca().tick_params(axis='y', which='both', length=0)
plt.gca().spines['left'].set_visible(False)

plt.gca().yaxis.set_major_formatter(ticker.FuncFormatter(lambda x, pos: '{:,.0f}'.format(x)))
plt.gca().yaxis.grid(True, linestyle='-', which='major', color="lightgrey", alpha=0.5) ## Add horizontal grid
## Rebuild the legend below the axes with its entries side by side
handles, labels = plt.gca().get_legend_handles_labels()
plt.gca().legend(handles=handles[0:], labels=labels[0:], loc='upper center', bbox_to_anchor=(0.5, -0.15), ncol=2)
