# Representation

## Imports

In [1]:
import analyze
import prep
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
from prep import text_clean
import altair as alt
from altair_saver import save
%load_ext rpy2.ipython

%load_ext autoreload
%autoreload 2

  from pandas.core.index import Index as PandasIndex


## Load and prepare data

In [2]:
# Outcome variable handed to prep.prep for both survey years.
outcome = "Wage"

# Load and prepare each survey year in turn (2020 first, then 2019).
# DATA_PATH, data_raw and the metadata names (keep, groups, categorical,
# numeric, base) are overwritten on every pass, so after the loop they
# hold the values from the 2019 call.
prepared_by_year = {}
for survey_year in (2020, 2019):
    DATA_PATH = f"../data/{survey_year}/survey_results_public.csv"
    data_raw = pd.read_csv(DATA_PATH)
    (
        prepared_by_year[survey_year],
        keep,
        groups,
        categorical,
        numeric,
        base,
    ) = prep.prep(data_raw, outcome, survey_year)

data_2020 = prepared_by_year[2020]
data_2019 = prepared_by_year[2019]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


Removing 0 respondents with missing Wage
6284 developers left in the sample after cleaning
151 or 2.4% black developers in the sample with multiracial replacement
Removing 0 respondents with missing Wage
12502 developers left in the sample after cleaning
291 or 2.3% black developers in the sample with multiracial replacement


In [3]:
# Stack the two prepared survey years row-wise, 2020 rows first.
# Row labels are carried over unchanged from each input frame.
survey_frames = [data_2020, data_2019]
data = pd.concat(survey_frames, axis="index")

In [4]:
# Build the model inputs from the combined data.
# NOTE(review): presumably X is the design matrix and Y the outcome vector;
# confirm against prep.design_matrix.
design_parts = prep.design_matrix(data, categorical, numeric, base, outcome)
X, Y = design_parts

Design matrix complete with 184 variables/columns


In [5]:
# Boolean indicator: True where the respondent's Ethnicity value is exactly
# "Black or of African descent".
black_ethnicity_value = "Black or of African descent"
is_black = data["Ethnicity"] == black_ethnicity_value
data["Ethnicity_Black_or_of_African_descent"] = is_black

In [6]:
# Survey columns along which representation is broken down.
breakdown = [
    "DevType",
    "OrgSize",
    "Gender",
]

In [7]:
# Expand each ";"-separated multi-select column into one boolean indicator
# column per option (named "<column>_<cleaned option>"), then drop the
# original column.
# NOTE: this cell is not idempotent - re-running it fails because the
# source columns have already been dropped from `data`.
for column in breakdown:
    # Split once per column and reuse the result; the split does not depend
    # on the option being processed.
    selections = data[column].str.split(";")
    options = sorted({option for picked in selections for option in picked})

    for option in options:
        # Create control columns: True where the respondent selected `option`.
        data[column + "_" + text_clean(option)] = selections.map(lambda picked: option in picked)

    data = data.drop(columns=[column])

In [8]:
# For every breakdown column, compute the share of Black respondents among
# those who selected each option, sorted from highest to lowest share.
# Each entry of `representation` is a DataFrame with columns
# "labels" (cleaned option name) and "data" (share as a fraction).
representation = {}
black_flag = "Ethnicity_Black_or_of_African_descent"

for column in breakdown:
    # Match indicator columns by their "<column>_" prefix. The label below is
    # obtained by stripping exactly that prefix, so a plain substring match
    # would produce wrong labels for any column that merely contains the name.
    prefix = column + "_"
    indicator_cols = [col for col in data.columns if col.startswith(prefix)]

    shares = pd.Series(
        [data.loc[data[col] == 1, black_flag].mean() for col in indicator_cols],
        index=[col[len(prefix):] for col in indicator_cols],
    )

    representation[column] = (
        shares.sort_values(ascending=False)
        # Non-responses are not a category of interest; tolerate breakdowns
        # that have no "No_answer" option instead of raising KeyError.
        .drop("No_answer", errors="ignore")
        .reset_index()
        .rename(columns={"index": "labels", 0: "data"})
    )

In [9]:
# Drop roles that should not appear in the role chart, taking an explicit
# copy so the label assignment below does not write to a slice of the
# original frame (SettingWithCopyWarning).
excluded_roles = ["Marketing_or_sales_professional", "Student"]
representation["DevType"] = representation["DevType"].loc[
    ~representation["DevType"]["labels"].isin(excluded_roles)
].copy()

# Human-readable display names.
# NOTE(review): these are assigned BY POSITION, i.e. they rely on the rows
# being in the current descending-share order. If the underlying data (and
# therefore the sort order) changes, this list must be re-checked by hand;
# a length mismatch raises, but a reordering would silently mislabel bars.
representation["DevType"]["labels"] = [
    "Educator",
    "Mobile",
    "Designer",
    "Front-end",
    "Full-stack",
    "QA / Test",
    "Data engineer",
    "Desktop / Enterprise apps",
    "Executive",
    "Back-end",
    "Embedded apps / devices",
    "Data / Business analyst",
    "Site reliability",
    "Engineering manager",
    "Games / Graphics",
    "System administrator",
    "Scientist",
    "Database administrator",
    "Data scientist",
    "Product manager",
    "DevOps",
    "Academic researcher",
]

In [10]:
# Display names for the gender chart, assigned by row position.
# NOTE(review): relies on the current row order of representation["Gender"];
# re-check if the underlying data changes.
gender_display_names = ["Women", "Non-binary", "Men"]
representation["Gender"]["labels"] = gender_display_names

In [11]:
# Overall split of the sample: share of Black respondents and its complement.
black_share = data["Ethnicity_Black_or_of_African_descent"].mean()
representation["Total"] = pd.DataFrame(
    {
        "labels": ["Non-Black", "Black"],
        "data": [1 - black_share, black_share],
    }
)

## Results

In [12]:
# Category -> bar color; `labels` and `colors` are the parallel lists
# consumed by the color scale of the overall chart.
palette = {
    "Non-Black": "#fa7026",
    "Black": "#612b8a",
}
labels = list(palette.keys())
colors = list(palette.values())

In [13]:
# Overall representation chart: one horizontal bar per category with the
# percentage printed next to it, saved as a PNG and displayed inline.

# Fixed category -> color mapping built from `labels` and `colors`.
color_scale = alt.Scale(domain=labels, range=colors)

# Axis definitions for the bar layer.
share_axis = alt.Axis(format=".1%", title=None, tickCount=5, tickSize=0, labelFlush=False, labelFontSize=15)
category_axis = alt.Axis(title=None, labelFontSize=15)

# Bar layer, sorted by descending share.
bar_base = alt.Chart().mark_bar(
    size=180,
    cornerRadiusBottomRight=18,
    cornerRadiusTopRight=18,
    clip=True,
    color="#612b8a",
)
points = bar_base.encode(
    x=alt.X("data", axis=share_axis),
    y=alt.Y("labels", axis=category_axis, sort="-x"),
    color=alt.Color("labels", legend=None, scale=color_scale),
)

# Value labels, offset 10px to the right of each bar end.
text_base = alt.Chart().mark_text(align="left", baseline="middle", dx=10, dy=0, fontSize=15)
text = text_base.encode(
    text=alt.Text("data", format=".1%"),
    x="data",
    y=alt.Y("labels", sort="-x"),
)

title_spec = {
    "text": "Representation",
    "subtitle": "% of Software Developers",
    "subtitleColor": "gray",
}
layered = alt.layer(points, text, data=representation["Total"]).properties(
    width=600, height=500, title=title_spec
)
chart = (
    layered.configure_view(strokeWidth=0)
    .configure_title(anchor="start", offset=5, fontSize=20)
    .configure_axis(domain=False)
)

chart.save("../images/representation.png", method="selenium", webdriver="chrome", scale_factor=3.0)

chart

In [14]:
# Representation by gender: horizontal bars colored by share, with the
# percentage printed next to each bar; saved as a PNG and displayed inline.

# Axis definitions for the bar layer.
share_axis = alt.Axis(format=".1%", title=None, tickCount=5, tickSize=0, labelFlush=False, labelFontSize=15)
category_axis = alt.Axis(title=None, labelFontSize=15)

# Continuous color scale over the share value, with a fixed domain.
share_colors = alt.Scale(scheme="inferno", domain=[0.005, 0.08])

# Bar layer, sorted by descending share.
bar_base = alt.Chart().mark_bar(
    cornerRadiusBottomRight=8,
    cornerRadiusTopRight=8,
    clip=True,
    color="#612b8a",
)
points = bar_base.encode(
    x=alt.X("data", axis=share_axis),
    y=alt.Y("labels", axis=category_axis, sort="-x"),
    color=alt.Color("data", scale=share_colors, legend=None),
)

# Value labels, offset 10px to the right of each bar end.
text_base = alt.Chart().mark_text(align="left", baseline="middle", dx=10, dy=0, fontSize=15)
text = text_base.encode(
    text=alt.Text("data", format=".1%"),
    x="data",
    y=alt.Y("labels", sort="-x"),
)

title_spec = {
    "text": "Representation by Gender",
    "subtitle": "% Black",
    "subtitleColor": "gray",
}
layered = alt.layer(points, text, data=representation["Gender"]).properties(
    width=400, height=500, title=title_spec
)
chart = (
    layered.configure_view(strokeWidth=0)
    .configure_title(anchor="start", offset=5, fontSize=20)
    .configure_axis(domain=False)
)

chart.save("../images/representation_gender.png", method="selenium", webdriver="chrome", scale_factor=3.0)

chart

In [15]:
# Representation by role: horizontal bars colored by share, with the
# percentage printed next to each bar; saved as a PNG and displayed inline.

# Axis definitions for the bar layer.
share_axis = alt.Axis(format=".1%", title=None, tickCount=5, tickSize=0, labelFlush=False, labelFontSize=15)
category_axis = alt.Axis(title=None, labelFontSize=15)

# Continuous color scale over the share value, with a fixed domain.
share_colors = alt.Scale(scheme="inferno", domain=[0.005, 0.04])

# Bar layer, sorted by descending share.
bar_base = alt.Chart().mark_bar(
    cornerRadiusBottomRight=8,
    cornerRadiusTopRight=8,
    clip=True,
    color="#612b8a",
)
points = bar_base.encode(
    x=alt.X("data", axis=share_axis),
    y=alt.Y("labels", axis=category_axis, sort="-x"),
    color=alt.Color("data", scale=share_colors, legend=None),
)

# Value labels, offset 10px to the right of each bar end.
text_base = alt.Chart().mark_text(align="left", baseline="middle", dx=10, dy=0, fontSize=15)
text = text_base.encode(
    text=alt.Text("data", format=".1%"),
    x="data",
    y=alt.Y("labels", sort="-x"),
)

title_spec = {
    "text": "Representation by Role",
    "subtitle": "% Black, Respondents can select multiple roles",
    "subtitleColor": "gray",
}
layered = alt.layer(points, text, data=representation["DevType"]).properties(
    width=400, height=500, title=title_spec
)
chart = (
    layered.configure_view(strokeWidth=0)
    .configure_title(anchor="start", offset=5, fontSize=20)
    .configure_axis(domain=False)
)

chart.save("../images/representation_role.png", method="selenium", webdriver="chrome", scale_factor=3.0)

chart