# StackOverflow Survey

## Imports


In [1]:
import os
import dash
import numpy as np
import plotly
import plotly.express as px
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff
from dash import html
from dash import dcc
from dash.dependencies import Input, Output, State
from plotly.subplots import make_subplots
from sklearn.manifold import TSNE
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.cluster import AgglomerativeClustering

from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram



---

## Constants

In [4]:
# File name of the survey export, resolved relative to DATA_PATH.
FILE = "survey_results_public.csv"
# Directory that holds the input CSV files.
DATA_PATH = "./"
# External stylesheet passed to the Dash application.
CSS_URI = "https://codepen.io/chriddyp/pen/bWLwgP.css"


---

## Preprocessing


In [None]:
# Read the full survey export. low_memory=False makes pandas parse the file
# in one pass instead of in chunks, avoiding mixed-type column inference.
df = pd.read_csv(os.path.join(DATA_PATH, FILE), low_memory=False)

# Preview the first rows.
df.head()

In [6]:
# Textual range answers and the numeric stand-ins they are recoded to, keyed
# by column. Both "years of coding" columns use the same two labels.
REPLACE_DICT = {
    **{
        years_col: {"Less than 1 year": 0, "More than 50 years": 51}
        for years_col in ("YearsCodePro", "YearsCode")
    },
    "Age1stCode": {"Older than 85": 86, "Younger than 5 years": 4},
}


In [7]:
def split_answers(data_series, delimiter=";"):
    """
    Split multi-answer strings into lists of single answers.

    Parameters:
    * data_series (pd.Series): String series with answers; missing values
                               are allowed
    * delimiter (string): Separator between answers, matched literally
                          (never as a regular expression).
                          Defaults to ";"

    Returns: (pd.Series): The original series, unchanged, if no value
    contains the delimiter. Otherwise a series with the same index holding
    one list of answers per row; missing (or non-string) values become
    empty lists.
    """

    def split_answer(answer):
        """Split one answer string; anything that is not a string -> []"""
        return answer.split(delimiter) if isinstance(answer, str) else []

    # regex=False: delimiters such as "." or "(" must not be read as patterns.
    # na=False: missing values count as "no delimiter", keeping the mask boolean.
    splittable_values = data_series.str.contains(delimiter, regex=False, na=False)

    # Nothing to split - hand back the original object
    if not splittable_values.any():
        return data_series

    # Split every value; missing answers are turned into empty lists
    return data_series.apply(split_answer)


In [8]:
# Swap the textual range answers for their numeric stand-ins, then store
# each affected column as float32.
for col in REPLACE_DICT:
    recoded = df[col].replace(REPLACE_DICT[col])
    df[col] = recoded.astype(np.float32)


In [9]:
# Run every text column through split_answers, which returns a column
# unchanged when none of its values contains the delimiter.
object_cols = list(df.select_dtypes(include="object").columns)
for col in object_cols:
    df[col] = split_answers(df[col])


In [10]:
# Survey columns grouped by what they describe.
ROLE_COLS = ["DevType"]
TECH_COLS = [
    "LanguageWorkedWith",
    "DatabaseWorkedWith",
    "WebframeWorkedWith",
    "MiscTechWorkedWith",
]
TECH_NEXT_COLS = [
    "LanguageDesireNextYear",
    "DatabaseDesireNextYear",
    "WebframeDesireNextYear",
    "MiscTechDesireNextYear",
]

# File (inside DATA_PATH) with the role names table.
ROLES = "roles_short_names.csv"

# Display helpers.
NA_STRING = "Not Specified"
TRANSPARENT_STRING = "rgba(0, 0, 0, 0)"


In [11]:
# Load the role names table (";"-separated). Its "Original name" and
# "Short name " columns are later used to label roles with short names.
roles_names = pd.read_csv(os.path.join(DATA_PATH, ROLES), sep=';')


In [12]:
# Turn each multi-select column into a 0/1 indicator frame: one column per
# distinct answer, rows aligned with the survey frame's index.
encoded_dfs = {}
for col in ROLE_COLS + TECH_COLS:
    binarizer = MultiLabelBinarizer()
    indicators = binarizer.fit_transform(df[col])
    encoded_dfs[col] = pd.DataFrame(
        indicators, index=df[col].index, columns=binarizer.classes_
    )


In [13]:
# Place the indicator frames side by side; the dict keys (source column names)
# become the outer level of a two-level column index.
# NOTE: this rebinds `df`, so the raw survey frame is no longer reachable
# under that name after this cell has run.
df = pd.concat(encoded_dfs, axis="columns")


---

## Building graphs


### Tree-map for skill frequency


In [14]:
# Count how many respondents selected each skill (roles excluded), as a
# long table with one row per (technology group, skill).
skill_counts = df.drop(columns="DevType").sum()
skills_freq = skill_counts.reset_index()
skills_freq.columns = ["group", "skill", "freq"]


In [15]:
# Treemap nested as technology group -> skill; tile size and colour both
# encode the selection count.
fig_treemap = px.treemap(
    skills_freq,
    path=["group", "skill"],
    values="freq",
    color="freq",
    color_continuous_scale="deep",
)

# To inspect the figure directly in the notebook:
# fig_treemap.update_layout(width=1400, height=700)
# fig_treemap.show()

--- 

### Dendrogram for job skills


In [16]:
# Roles ordered by ascending number of respondents.
sorted_roles = list(df["DevType"].sum().sort_values().index)

# Skill names ordered by descending number of respondents; the technology
# group level is dropped so only the skill name remains.
sorted_skills = list(
    df.drop(columns="DevType")
    .sum()
    .sort_values(ascending=False)
    .droplevel(level=0)
    .index
)


In [17]:
# For every role: the percentage of respondents with that role who selected
# each skill, computed per technology group and stacked into one Series.
role_profiles = {}
for role in sorted_roles:
    has_role = df[("DevType", role)] == 1
    role_profiles[role] = pd.concat(
        {tech_col: df.loc[has_role, tech_col].mean() * 100 for tech_col in TECH_COLS}
    )

# Roles as rows, skills as columns (technology-group level removed), with the
# skill columns in the order given by sorted_skills.
skills = (
    pd.concat(role_profiles, axis=1)
    .reset_index(level=0, drop=True)
    .loc[sorted_skills]
    .T
)


In [18]:
# Map each full role name to its short display name.
# NOTE(review): the "Short name " header carries a trailing space —
# presumably it mirrors the CSV header; confirm against the file.
roles_short_dict = dict(
    zip(roles_names["Original name"], roles_names["Short name "])
)
# Short labels in the same order as sorted_roles.
short_labels = [roles_short_dict[full_name] for full_name in sorted_roles]


In [19]:
# Dendrogram over the role rows of `skills`, labelled with the short role
# names and drawn with its leaves on the left-oriented axis.
fig_dendrogram = ff.create_dendrogram(
    skills,
    orientation="left",
    labels=short_labels,
    color_threshold=0,
)

# To inspect the figure directly in the notebook:
# fig_dendrogram.update_layout(height=600, width=600, showlegend=False)
# fig_dendrogram.show()


In [20]:
# Standardise every skill column across roles (zero mean, unit variance),
# keeping the role/skill labels of `skills`.
std_skills = pd.DataFrame(
    StandardScaler().fit_transform(skills),
    index=skills.index,
    columns=skills.columns,
)


---

### Job/Skill heatmap

In [21]:
# Heatmap of the standardised scores: skills along x, roles along y.
fig_heatmap = go.Figure(
    go.Heatmap(
        x=skills.columns,
        y=skills.index,
        z=std_skills,
        colorscale="magma",
        ygap=1,
    )
)

# To inspect the figure directly in the notebook:
# fig_heatmap.update_layout(width=1600, height=700)
# fig_heatmap.show()


---

## Dashboard for job profiles


In [22]:
# Create the Dash application, styled with the external stylesheet at CSS_URI.
app = dash.Dash(external_stylesheets=[CSS_URI])


In [23]:
# Page layout, top to bottom: title, skill treemap with the survey-year
# slider, role dendrogram next to the role/skill heatmap, and a role dropdown
# that drives the per-role bar chart.
app.layout = html.Div(
    [
        html.H1(
            "Analysis of Job Profiles in Software Development",
            style={"textAlign": "center", "color": "blue"},
        ),
        html.Br(),
        html.Hr(),
        # TODO implement slider for years of survey
        dcc.Graph(figure=fig_treemap, id="figure-treemap"),
        dcc.Slider(
            min=2017,
            max=2021,
            value=2020,
            # step=None restricts the handle to the marked values.
            step=None,
            id="year-slider",
            # One mark per selectable year; marks must stay within min..max.
            marks={str(y): str(y) for y in range(2017, 2022)},
        ),
        # /TODO
        html.Div(
            [
                dcc.Graph(figure=fig_dendrogram, id="figure-dendrogram"),
            ],
            style={"display": "inline-block"},
        ),
        html.Div(
            [
                dcc.Graph(figure=fig_heatmap, id="figure-heatmap"),
            ],
            style={"display": "inline-block"},
        ),
        dcc.Dropdown(
            id="role-drop",
            options=[{"label": r, "value": r} for r in sorted_roles],
            # The initially selected role is drawn at random on every run.
            value=np.random.choice(sorted_roles),
        ),
        # Starts without a figure; the "role-drop" callback supplies it.
        dcc.Graph(figure=None, id="role-graph"),
    ]
)


In [24]:
@app.callback(
    Output(component_id="role-graph", component_property="figure"),
    Input(component_id="role-drop", component_property="value"),
)
def role_skills(selected_role):
    """
    Build the horizontal bar chart of skills for a single role.

    Parameters:
    * selected_role (string): Value of the "role-drop" dropdown; falsy
                              (None or "") when nothing is selected

    Returns: (plotly figure): One bar per skill whose usage percentage for
    the role exceeds the threshold, sorted by percentage and coloured by the
    role's standardised score for that skill. An empty figure is returned
    when no role is selected.
    """
    # Always hand the graph a figure, never the falsy dropdown value itself.
    if not selected_role:
        return go.Figure()

    # Minimum usage percentage for a skill to be displayed
    threshold = 10

    single_role_skills = pd.concat(
        [skills.loc[selected_role], std_skills.loc[selected_role]], axis=1
    )
    single_role_skills.columns = ["percentage", "specificity"]
    single_role_skills = single_role_skills.sort_values("percentage")
    single_role_skills = single_role_skills[
        single_role_skills["percentage"] > threshold
    ]

    # No data_frame argument: x, y and color are passed as complete
    # array-likes, and none of them is a column of the respondent-level `df`.
    return px.bar(
        y=single_role_skills.index,
        x=single_role_skills["percentage"],
        color=single_role_skills["specificity"],
        color_continuous_scale="orrd",
        # Fixed colour range so colours are comparable between roles
        range_color=[std_skills.values.min(), std_skills.values.max()],
        orientation="h",
    )


In [25]:
# TODO implement callback for slider
@app.callback(
    # Output ids must match the dcc.Graph ids declared in the layout
    # ("figure-treemap", "figure-dendrogram", "figure-heatmap").
    Output(component_id="figure-treemap", component_property="figure"),
    Output(component_id="figure-dendrogram", component_property="figure"),
    Output(component_id="figure-heatmap", component_property="figure"),
    Input(component_id="year-slider", component_property="value"),
)
def update_figures_by_year(yr):
    """
    Return the treemap, dendrogram and heatmap figures for a survey year.

    Parameters:
    * yr (int): Year selected on the "year-slider" slider; currently unused

    Returns: (tuple): (fig_treemap, fig_dendrogram, fig_heatmap) - for now
    the prebuilt figures, regardless of the selected year.
    """
    # TODO implement the logic for updating
    return fig_treemap, fig_dendrogram, fig_heatmap


In [None]:
# Start the Dash server with its default settings. The call keeps running
# (serving requests) until it is interrupted.
app.run_server()

Dash is running on http://127.0.0.1:8050/

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


 * Running on http://127.0.0.1:8050/ (Press CTRL+C to quit)
127.0.0.1 - - [28/Oct/2021 11:56:43] "[37mGET / HTTP/1.1[0m" 200 -
127.0.0.1 - - [28/Oct/2021 11:56:44] "[37mGET /_dash-dependencies HTTP/1.1[0m" 200 -
127.0.0.1 - - [28/Oct/2021 11:56:44] "[37mGET /_dash-layout HTTP/1.1[0m" 200 -
127.0.0.1 - - [28/Oct/2021 11:56:44] "[37mGET /_dash-component-suites/dash/dcc/async-graph.js HTTP/1.1[0m" 200 -
127.0.0.1 - - [28/Oct/2021 11:56:44] "[37mGET /_dash-component-suites/dash/dcc/async-slider.js HTTP/1.1[0m" 200 -
127.0.0.1 - - [28/Oct/2021 11:56:44] "[37mGET /_dash-component-suites/dash/dcc/async-plotlyjs.js HTTP/1.1[0m" 200 -
127.0.0.1 - - [28/Oct/2021 11:56:44] "[37mGET /_dash-component-suites/dash/dcc/async-dropdown.js HTTP/1.1[0m" 200 -
127.0.0.1 - - [28/Oct/2021 11:56:44] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [28/Oct/2021 11:58:55] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [28/Oct/2021 11:59:18] "[37mPOST /_da