In [624]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.linear_model import LassoCV
from sklearn.feature_selection import SelectFromModel

In [None]:
np.random.seed(12345)

In [625]:
# Download the latest SO dataset here: https://drive.google.com/open?id=1QOmVDpd8hcVYqqUXDXf68UMDWQZP0wQV
data_2019 = pd.read_csv("developer_survey_2019/survey_results_public.csv", low_memory=False)

In [626]:
to_drop = ["Respondent", "OpenSource", "CareerSat", "JobSat", "JobSeek", "ResumeUpdate", "SurveyLength", "SurveyEase", "WelcomeChange", "EntTeams", "ScreenName", "LastIn", "SO", "Blockchain", "WorkChallenge", "BetterLife", "OffOn", "Currency", "CompTotal", "CompFreq", "MainBranch", "PlatformDesireNextYear", "LanguageDesireNextYear", "DatabaseDesireNextYear", "MiscTechDesireNextYear", "WebFrameDesireNextYear", "MgrMoney", "ITperson"]

In [627]:
def col_drop(df, to_drop):

    df_dropped = df.copy()

    for flag in to_drop:
        try:
            df_dropped.drop([x for x in df_dropped.columns if flag in x], axis=1, inplace=True)
        except:
            pass

    return df_dropped

In [628]:
def string_replace(s):
    try:
        s = float(s)
    except:
        s = -1000

    return s

In [629]:
def text_clean(text):
    text = str(text).replace(" ", "_").replace("-", "_").replace(
        ",", "_").replace(".", "").replace("+", "p").replace("#", "s").replace(
            "/", "_").replace("'", "").replace("ʼ", "").replace(
                "(", "_").replace(")", "_").replace("’", "").replace(
                    "__", "_").replace("__", "_").replace("“", "").replace(
                        "”", "").replace(":", "_").replace("&", "_").lower()

    return text

In [630]:
def create_controls(df, exclude):

    controls = {}

    for col in df.columns:
        if col != exclude:
            controls[col] = {"omitted": text_clean(pd.Series([x for sub in list(data_2019[col].apply(text_clean).apply(lambda x: str(x).split(";"))) for x in sub]).value_counts().idxmax()), "controls": list(set([x for sub in list(data_2019[col].apply(text_clean).apply(lambda x: str(x).split(";"))) for x in sub]))}
    return controls

In [631]:
def design_matrix(df, controls):
    dm = df.copy()
    
    for control in controls.keys():
        dm[control] = dm[control].apply(text_clean)

        if control in num_columns:
            for c in controls[control]["controls"]:
                dm[control+"_"+c] = (dm[control] == c) * 1

        else:
            for c in controls[control]["controls"]:
                dm[control+"_"+c] = dm[control].apply(lambda x: c in str(x).split(";")) * 1

        dm.drop(control, axis=1, inplace=True)
        dm.drop(control+"_"+controls[control]["omitted"], axis=1, inplace=True)
    
    return dm

In [632]:
data_2019 = col_drop(data_2019, to_drop)
print("Success!")
print(data_2019.head().T.shape)

Success!
(46, 5)


In [633]:
# Only consider those with income between $10,000 and $250,000
data_2019 = data_2019[(data_2019["ConvertedComp"] >= 10000) & (data_2019["ConvertedComp"] <= 250000)]
data_2019["ConvertedComp"] = np.log(data_2019["ConvertedComp"])
data_2019 = data_2019.rename(columns = {"ConvertedComp": "Income"})

# Only consider US respondents
data_2019 = data_2019[data_2019["Country"] == "United States"]

# Only consider 18+ respondents
data_2019 = data_2019[data_2019["Age"] >= 18]

# Only consider respondents in the workforce
data_2019 = data_2019[data_2019["Employment"] != "Retired"]
data_2019 = data_2019[data_2019["Employment"] != "Not employed, and not looking for work"]

data_2019 = data_2019[data_2019["WorkWeekHrs"] >= 5]

# Only consider those with at least some education
data_2019 = data_2019[data_2019["EdLevel"] != "I never completed any formal education"]

data_2019 = data_2019.fillna("no_answer")

num_columns = ["Age", "Age1stCode", "YearsCode", "YearsCodePro", "WorkWeekHrs", "CodeRevHrs"]

# Convert numeric columns to int
for col in num_columns:
    data_2019[col] = data_2019[col].astype("int32", errors="ignore")

data_2019["Age1stCode"].replace("Younger than 5 years", "4", inplace=True)
data_2019["YearsCode"].replace("Less than 1 year", "0", inplace=True)
data_2019["YearsCode"].replace("More than 50 years", "51", inplace=True)

for col in num_columns:
    data_2019[col] = data_2019[col].apply(string_replace)

# Exclude respondents who selected multiple gender, race, or sexual orientation
# options
data_2019 = data_2019[~data_2019["Gender"].str.contains(";")]
data_2019 = data_2019[~data_2019["Ethnicity"].str.contains(";")]
data_2019 = data_2019[~data_2019["Sexuality"].str.contains(";")]

# Reset index
data_2019 = data_2019.reset_index(drop=True)

In [634]:
# TODO: Bucketing for numeric responses (Age, Age1stCode, YearsCode, WorkWeekHrs, CodeRevHrs)

age_labels = ["no_answer","-20", "21-25", "26-30", "31-35", "35-40", "41-45", "45-50", "51-55", "55-60", "61-65", "66-"]
year_labels = ["no_answer","0-5", "6-10", "11-15", "16-20", "21-25", "26-30", "31-35", "35-40", "41-"]
work_week_hour_labels = ["no_answer","-10", "11-20", "21-30", "31-40", "41-50", "51-60", "61-70", "71-80", "81-"]
code_rev_hour_labels = ["no_answer","1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11-15", "16-20", "21-"]

age_buckets = np.array([-1001,0,20,25,30,35,40,45,50,55,60,65,100])
year_buckets = np.array([-1001,0,5,10,15,20,25,30,35,40,100])
work_week_hour_buckets = np.array([-1001,0,10,20,30,40,50,60,70,80,200])
code_rev_hour_buckets = np.array([-1001,0,1,2,3,4,5,6,7,8,9,10,15,20,200])

In [635]:
data_2019["Age"] = pd.cut(data_2019["Age"], age_buckets, labels=age_labels).astype("str")
data_2019["Age1stCode"] = pd.cut(data_2019["Age1stCode"], age_buckets, labels=age_labels).astype("str")
data_2019["YearsCode"] = pd.cut(data_2019["YearsCode"], year_buckets, labels=year_labels).astype("str")
data_2019["YearsCodePro"] = pd.cut(data_2019["YearsCodePro"], year_buckets, labels=year_labels).astype("str")
data_2019["WorkWeekHrs"] = pd.cut(data_2019["WorkWeekHrs"], work_week_hour_buckets, labels=work_week_hour_labels).astype("str")
data_2019["CodeRevHrs"] = pd.cut(data_2019["CodeRevHrs"], code_rev_hour_buckets, labels=code_rev_hour_labels).astype("str")

In [636]:
controls = create_controls(data_2019, "Income")

In [637]:
data_2019 = design_matrix(data_2019, controls)

In [646]:
test = "Ethnicity"

for c in controls[test]["controls"]:
    if c != controls[test]["omitted"]:
        X, Y = data_2019.drop("Income", axis=1).astype(float).copy(), data_2019["Income"].copy()

        t = test + "_" + c

        T = X[t]
        X = X.drop(t, axis=1)
        clf = LassoCV(cv=5, max_iter=10000, selection="random", n_jobs=-1)

        sfm = SelectFromModel(clf)

        H, K = sfm.fit(X.drop([x for x in X.columns if test in x], axis=1), Y).get_support(), sfm.fit(X.drop([x for x in X.columns if test in x], axis=1), T).get_support()

        U = H | K

        X_U = X[np.unique(np.append(X.drop([x for x in X.columns if test in x], axis=1).columns[U], [x for x in X.columns if test in x]))].copy()
 
        X_U.loc[:, t] = T.copy()

        X_U = sm.add_constant(X_U)
        est = sm.OLS(endog=Y, exog=X_U).fit()
        print(t, est.params[-1], est.bse[-1]*1.96)

Ethnicity_no_answer 0.021331200354 0.0324552304204
Ethnicity_black_or_of_african_descent -0.00978179275523 0.0480047508143
Ethnicity_middle_eastern 0.0608030054587 0.071859298904
Ethnicity_east_asian 0.137347445753 0.0372890260586
Ethnicity_hispanic_or_latino_latina 0.035890486528 0.0350195176524
Ethnicity_south_asian 0.0741743466812 0.0339694024594
Ethnicity_biracial 0.0769879564861 0.102485379758
Ethnicity_multiracial 0.0135744360972 0.0775224023095
Ethnicity_native_american_pacific_islander_or_indigenous_australian -0.0281193170315 0.128985689709


In [642]:
est = sm.OLS(endog=data_2019["Income"], exog=sm.add_constant(data_2019.drop("Income", axis=1))).fit()

In [643]:
est.summary()

0,1,2,3
Dep. Variable:,Income,R-squared:,0.55
Model:,OLS,Adj. R-squared:,0.536
Method:,Least Squares,F-statistic:,38.91
Date:,"Mon, 21 Oct 2019",Prob (F-statistic):,0.0
Time:,00:00:18,Log-Likelihood:,-2948.0
No. Observations:,11376,AIC:,6590.0
Df Residuals:,11029,BIC:,9137.0
Df Model:,346,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,11.4424,0.035,322.655,0.000,11.373,11.512
Hobbyist_no,0.0257,0.008,3.190,0.001,0.010,0.042
Employment_employed_part_time,-0.4518,0.031,-14.384,0.000,-0.513,-0.390
Employment_independent_contractor_freelancer_or_self_employed,-0.0567,0.051,-1.108,0.268,-0.157,0.044
Employment_no_answer,-0.4125,0.108,-3.826,0.000,-0.624,-0.201
Student_yes_full_time,-0.1812,0.019,-9.648,0.000,-0.218,-0.144
Student_no_answer,0.0414,0.084,0.492,0.623,-0.123,0.206
Student_yes_part_time,0.0188,0.018,1.055,0.291,-0.016,0.054
EdLevel_secondary_school_eg_american_high_school_german_realschule_or_gymnasium_etc_,-0.0080,0.029,-0.276,0.782,-0.065,0.049

0,1,2,3
Omnibus:,2015.586,Durbin-Watson:,2.006
Prob(Omnibus):,0.0,Jarque-Bera (JB):,15371.505
Skew:,-0.642,Prob(JB):,0.0
Kurtosis:,8.548,Cond. No.,2250000000000000.0


In [561]:
controls

{'Age': {'controls': ['35_40',
   '45_50',
   '51_55',
   '61_65',
   '55_60',
   '31_35',
   '66_',
   '21_25',
   '41_45',
   '26_30',
   '_20'],
  'omitted': '26_30'},
 'Age1stCode': {'controls': ['no_answer',
   '45_50',
   '41_45',
   '51_55',
   '31_35',
   '21_25',
   '35_40',
   '26_30',
   '_20'],
  'omitted': '_20'},
 'CodeRev': {'controls': ['yes_because_i_see_value_in_code_review',
   'no',
   'yes_because_i_was_told_to_do_so',
   'no_answer'],
  'omitted': 'yes_because_i_see_value_in_code_review'},
 'CodeRevHrs': {'controls': ['16_20',
   'no_answer',
   '5',
   '11_15',
   '9',
   '6',
   '21_',
   '2',
   '7',
   '4',
   '1',
   '8',
   '3',
   '10'],
  'omitted': 'no_answer'},
 'Containers': {'controls': ['no_answer',
   'testing',
   'i_do_not_use_containers',
   'development',
   'outside_of_work_for_personal_projects',
   'production'],
  'omitted': 'i_do_not_use_containers'},
 'Country': {'controls': ['united_states'], 'omitted': 'united_states'},
 'DatabaseDesireNe