# Stack Overflow Survey 2020 analysis

In [226]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', None)
import plotly.express as px

## Project motivation

I'm interested in using the 2020 Stack Overflow survey data to answer the following questions:
1. What are the top priorities of working as a developer professional?
1. How are coding languages associated with salary?
1. How does salary correlate with developer type and years of experience?

## Prepare data
Data can be downloaded from [Stack Overflow](https://insights.stackoverflow.com/survey). The data used here comes from the 2020 survey, which covers 64,461 respondents and 61 columns.

In [227]:
# Read the raw 2020 survey export into a DataFrame
df = pd.read_csv('./data/survey_results_public.csv')

# Structural overview: dimensions first, then one "column: dtype" line per column
print(f"Shape:\n{df.shape}")

print("Data Type:\n")
for column_name, column_dtype in df.dtypes.items():
    print(column_name, column_dtype, sep=": ")

# Last expression of the cell: preview of the first rows
df.head()

Shape:
(64461, 61)
Data Type:

Respondent: int64
MainBranch: object
Hobbyist: object
Age: float64
Age1stCode: object
CompFreq: object
CompTotal: float64
ConvertedComp: float64
Country: object
CurrencyDesc: object
CurrencySymbol: object
DatabaseDesireNextYear: object
DatabaseWorkedWith: object
DevType: object
EdLevel: object
Employment: object
Ethnicity: object
Gender: object
JobFactors: object
JobSat: object
JobSeek: object
LanguageDesireNextYear: object
LanguageWorkedWith: object
MiscTechDesireNextYear: object
MiscTechWorkedWith: object
NEWCollabToolsDesireNextYear: object
NEWCollabToolsWorkedWith: object
NEWDevOps: object
NEWDevOpsImpt: object
NEWEdImpt: object
NEWJobHunt: object
NEWJobHuntResearch: object
NEWLearn: object
NEWOffTopic: object
NEWOnboardGood: object
NEWOtherComms: object
NEWOvertime: object
NEWPurchaseResearch: object
NEWPurpleLink: object
NEWSOSites: object
NEWStuck: object
OpSys: object
OrgSize: object
PlatformDesireNextYear: object
PlatformWorkedWith: object
Purcha

Unnamed: 0,Respondent,MainBranch,Hobbyist,Age,Age1stCode,CompFreq,CompTotal,ConvertedComp,Country,CurrencyDesc,CurrencySymbol,DatabaseDesireNextYear,DatabaseWorkedWith,DevType,EdLevel,Employment,Ethnicity,Gender,JobFactors,JobSat,JobSeek,LanguageDesireNextYear,LanguageWorkedWith,MiscTechDesireNextYear,MiscTechWorkedWith,NEWCollabToolsDesireNextYear,NEWCollabToolsWorkedWith,NEWDevOps,NEWDevOpsImpt,NEWEdImpt,NEWJobHunt,NEWJobHuntResearch,NEWLearn,NEWOffTopic,NEWOnboardGood,NEWOtherComms,NEWOvertime,NEWPurchaseResearch,NEWPurpleLink,NEWSOSites,NEWStuck,OpSys,OrgSize,PlatformDesireNextYear,PlatformWorkedWith,PurchaseWhat,Sexuality,SOAccount,SOComm,SOPartFreq,SOVisitFreq,SurveyEase,SurveyLength,Trans,UndergradMajor,WebframeDesireNextYear,WebframeWorkedWith,WelcomeChange,WorkWeekHrs,YearsCode,YearsCodePro
0,1,I am a developer by profession,Yes,,13,Monthly,,,Germany,European Euro,EUR,Microsoft SQL Server,Elasticsearch;Microsoft SQL Server;Oracle,"Developer, desktop or enterprise applications;...","Master’s degree (M.A., M.S., M.Eng., MBA, etc.)","Independent contractor, freelancer, or self-em...",White or of European descent,Man,"Languages, frameworks, and other technologies ...",Slightly satisfied,I am not interested in new job opportunities,C#;HTML/CSS;JavaScript,C#;HTML/CSS;JavaScript,.NET Core;Xamarin,.NET;.NET Core,Microsoft Teams;Microsoft Azure;Trello,Confluence;Jira;Slack;Microsoft Azure;Trello,No,Somewhat important,Fairly important,,,Once a year,Not sure,,No,Often: 1-2 days per week or more,Start a free trial;Ask developers I know/work ...,Amused,Stack Overflow (public Q&A for anyone who codes),Visit Stack Overflow;Go for a walk or other ph...,Windows,2 to 9 employees,Android;iOS;Kubernetes;Microsoft Azure;Windows,Windows,,Straight / Heterosexual,No,"No, not at all",,Multiple times per day,Neither easy nor difficult,Appropriate in length,No,"Computer science, computer engineering, or sof...",ASP.NET Core,ASP.NET;ASP.NET Core,Just as welcome now as I felt last year,50.0,36,27.0
1,2,I am a developer by profession,No,,19,,,,United Kingdom,Pound sterling,GBP,,,"Developer, full-stack;Developer, mobile","Bachelor’s degree (B.A., B.S., B.Eng., etc.)",Employed full-time,,,,Very dissatisfied,I am not interested in new job opportunities,Python;Swift,JavaScript;Swift,React Native;TensorFlow;Unity 3D,React Native,Github;Slack,Confluence;Jira;Github;Gitlab;Slack,,,Fairly important,,,Once a year,Not sure,,No,,,Amused,Stack Overflow (public Q&A for anyone who code...,Visit Stack Overflow;Go for a walk or other ph...,MacOS,"1,000 to 4,999 employees",iOS;Kubernetes;Linux;MacOS,iOS,I have little or no influence,,Yes,"Yes, definitely",Less than once per month or monthly,Multiple times per day,,,,"Computer science, computer engineering, or sof...",,,Somewhat more welcome now than last year,,7,4.0
2,3,I code primarily as a hobby,Yes,,15,,,,Russian Federation,,,,,,,,,,,,,Objective-C;Python;Swift,Objective-C;Python;Swift,,,,,,,,,,Once a decade,,,No,,,,Stack Overflow (public Q&A for anyone who codes),,Linux-based,,,,,,Yes,"Yes, somewhat",A few times per month or weekly,Daily or almost daily,Neither easy nor difficult,Appropriate in length,,,,,Somewhat more welcome now than last year,,4,
3,4,I am a developer by profession,Yes,25.0,18,,,,Albania,Albanian lek,ALL,,,,"Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",,White or of European descent,Man,Flex time or a flexible schedule;Office enviro...,Slightly dissatisfied,"I’m not actively looking, but I am open to new...",,,,,,,No,,Not at all important/not necessary,Curious about other opportunities;Wanting to w...,,Once a year,Not sure,Yes,Yes,Occasionally: 1-2 days per quarter but less th...,,,Stack Overflow (public Q&A for anyone who code...,,Linux-based,20 to 99 employees,,,I have a great deal of influence,Straight / Heterosexual,Yes,"Yes, definitely",A few times per month or weekly,Multiple times per day,,,No,"Computer science, computer engineering, or sof...",,,Somewhat less welcome now than last year,40.0,7,4.0
4,5,"I used to be a developer by profession, but no...",Yes,31.0,16,,,,United States,,,MySQL;PostgreSQL,MySQL;PostgreSQL;Redis;SQLite,,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",Employed full-time,White or of European descent,Man,,,,Java;Ruby;Scala,HTML/CSS;Ruby;SQL,Ansible;Chef,Ansible,"Github;Google Suite (Docs, Meet, etc)",Confluence;Jira;Github;Slack;Google Suite (Doc...,,,Very important,,,Once a year,No,,Yes,,Start a free trial;Ask developers I know/work ...,"Hello, old friend",Stack Overflow (public Q&A for anyone who code...,Call a coworker or friend;Visit Stack Overflow...,Windows,,Docker;Google Cloud Platform;Heroku;Linux;Windows,AWS;Docker;Linux;MacOS;Windows,,Straight / Heterosexual,Yes,"Yes, somewhat",Less than once per month or monthly,A few times per month or weekly,Easy,Too short,No,"Computer science, computer engineering, or sof...",Django;Ruby on Rails,Ruby on Rails,Just as welcome now as I felt last year,,15,8.0


In [228]:
# Summary statistics of the numeric columns, displayed with two decimals.
# The fully qualified option key is used on purpose: the bare "precision"
# pattern also matches "styler.format.precision" on recent pandas versions,
# which makes set_option raise an OptionError for the ambiguous key.
pd.set_option("display.precision", 2)
df.describe()

Unnamed: 0,Respondent,Age,CompTotal,ConvertedComp,WorkWeekHrs
count,64461.0,45446.0,34800.0,34800.0,41151.0
mean,32554.08,30.83,3.19e+242,104000.0,40.78
std,18967.44,9.59,inf,227000.0,17.82
min,1.0,1.0,0.0,0.0,1.0
25%,16116.0,24.0,20000.0,24600.0,40.0
50%,32231.0,29.0,63000.0,54000.0,40.0
75%,49142.0,35.0,125000.0,95000.0,44.0
max,65639.0,279.0,1.11e+247,2000000.0,475.0


### 1. What are the top priorities of working as a developer professional?
Apart from compensation and location, we would like to know which factors matter most to developers when they choose one offer over another. The data of interest is taken from the `JobFactors` column.

In [229]:
# Share of respondents who selected each job factor.
# JobFactors stores several ";"-separated choices per respondent, so every answer
# is split into one row per choice before counting.
factor_counts = (
    df["JobFactors"]
    .str.split(";")
    .explode()
    .value_counts()
    .sort_values(ascending=True)
)
# Percentages are relative to the respondents who answered the question (non-null rows)
factor_share = factor_counts / df["JobFactors"].count() * 100
job_prior_df = pd.DataFrame({"Job Priority": factor_share.index, "Percentage": factor_share.values})
job_prior_df["Label"] = job_prior_df["Percentage"].map(lambda pct: str(round(pct, 1)) + "%")
# The x/y arguments must match the DataFrame column names exactly (case-sensitive)
fig = px.bar(job_prior_df, x="Percentage", y="Job Priority", orientation='h', title="Job priorities", text="Label")
fig.show()

Overall, developers care most about the technology stack (languages & frameworks) they would be working with, which was selected by more than half of respondents (51.3%). The next three factors follow with similar shares of responses (41.4%, 43.9%, and 44.5%): company culture, flexible schedule, and professional development opportunities. The least important factors are the financial performance of the organization (11.9%), the specific team they would be working on (11.8%), and the diversity of the organization (6.9%).

### 2. How are coding languages associated with salary?

In [359]:
# Median salary per programming language, professional developers only.
# NOTE: human_format is not defined in this cell; the cell that defines it has to
# be executed before this one.
is_professional = df["MainBranch"] == "I am a developer by profession"
salary_by_language = (
    df.loc[is_professional, ["LanguageWorkedWith", "ConvertedComp"]]
    # One row per (respondent, language) pair
    .assign(LanguageWorkedWith=lambda frame: frame["LanguageWorkedWith"].str.split(";"))
    .explode("LanguageWorkedWith")
    .dropna(how="any", axis=0)
)
median_by_language = salary_by_language.groupby(["LanguageWorkedWith"]).median()["ConvertedComp"].sort_values()
lang_sal_df = pd.DataFrame({"LanguageWorkedWith": median_by_language.index,  "Salary": median_by_language.values})
# Compact "$54.5K"-style bar labels
lang_sal_df["Label"] = ["$" + human_format(round(salary, 0)) for salary in lang_sal_df["Salary"]]

fig = px.bar(
    lang_sal_df,
    height=800,
    x="Salary",
    y="LanguageWorkedWith",
    orientation='h',
    title="Language vs Salary",
    text="Label",
)
fig.show()

### 3. What is the correlation between salary and developer type?

We restrict the analysis to professional developers to get more accurate salary data.

In [230]:
def human_format(num):
    """Format a number compactly with three significant digits and a magnitude suffix.

    The value is first rounded to three significant digits, then scaled down by
    factors of 1000 and tagged with '', 'K', 'M', 'B' or 'T'. Trailing zeros and a
    dangling decimal point are stripped, e.g. 54480 -> '54.5K', 1500000 -> '1.5M'.

    Args:
        num: Any value accepted by ``'{:.3g}'.format`` (int or float).

    Returns:
        The formatted string. Values of 1e15 and above keep the largest suffix
        ('T') with a mantissa of 1000 or more; NaN and infinities are returned
        as 'nan', 'inf' or '-inf' without a suffix.
    """
    suffixes = ['', 'K', 'M', 'B', 'T']
    num = float('{:.3g}'.format(num))
    # NaN/inf have no magnitude; dividing inf by 1000 would also never end the loop
    if num != num or num in (float('inf'), float('-inf')):
        return '{:f}'.format(num)
    magnitude = 0
    # Stop at the last suffix so very large values cannot index past the list
    while abs(num) >= 1000 and magnitude < len(suffixes) - 1:
        magnitude += 1
        num /= 1000.0
    return '{}{}'.format('{:f}'.format(num).rstrip('0').rstrip('.'), suffixes[magnitude])

In [231]:
# Median salary per developer type, professional developers only.
is_professional = df["MainBranch"] == "I am a developer by profession"
salary_by_dev_type = (
    df.loc[is_professional, ["DevType", "ConvertedComp"]]
    .dropna(how="any", axis=0)
    # DevType holds ";"-separated roles; count a respondent once for every role
    .assign(DevType=lambda frame: frame["DevType"].str.split(";"))
    .explode("DevType")
)
median_by_dev_type = salary_by_dev_type.groupby(["DevType"]).median()["ConvertedComp"].sort_values()
dev_sal_df = pd.DataFrame({"DevType": median_by_dev_type.index,  "Salary": median_by_dev_type.values})
# Compact "$54.5K"-style bar labels
dev_sal_df["label"] = dev_sal_df["Salary"].map(lambda salary: "$" + human_format(round(salary, 0)))

# The title describes this chart; the previous "Job priorities" title was carried
# over from the job-factor chart
fig = px.bar(dev_sal_df, height=800, x="Salary", y="DevType", orientation='h', title="Developer type vs Salary", text="label")
fig.show()

### How are salary and years of professional experience associated across developer types?


In [338]:
# Per developer type: number of respondents, mean years of professional coding
# experience and median salary (professional developers only).
is_professional = df["MainBranch"] == "I am a developer by profession"
exp_sal_df = (
    df.loc[is_professional, ["YearsCodePro", "LanguageWorkedWith", "DevType", "ConvertedComp"]]
    # Vectorised conversion; answers that are not numeric become NaN and are
    # removed by the dropna below
    .assign(YearsCodePro=lambda frame: pd.to_numeric(frame["YearsCodePro"], errors="coerce"))
    # NOTE(review): LanguageWorkedWith is not used after this point, but because it
    # is part of the selection, rows missing it are dropped as well. Kept so the
    # results stay unchanged; confirm this filter is intended.
    .dropna(how="any")
    # DevType holds ";"-separated roles; count a respondent once for every role
    .assign(DevType=lambda frame: frame["DevType"].str.split(";"))
    .explode("DevType")
)

exp_sal_df = (
    exp_sal_df.groupby(["DevType"])
    .agg(
        {
            "DevType": "size",
            "YearsCodePro": "mean",
            "ConvertedComp": "median",
        }
    )
    .rename(columns={"DevType": "Respondent", "ConvertedComp": "Salary", "YearsCodePro": "Avg Experience"})
    .reset_index()
)
exp_sal_df

Unnamed: 0,DevType,Respondent,Avg Experience,Salary
0,Academic researcher,1455,8.2,39093.0
1,Data or business analyst,1737,10.88,56211.0
2,Data scientist or machine learning specialist,1895,8.18,59454.0
3,Database administrator,3330,11.01,51888.0
4,Designer,2664,10.02,49020.0
5,DevOps specialist,3963,10.54,69581.0
6,"Developer, QA or test",2326,9.34,55731.5
7,"Developer, back-end",17503,9.26,54480.0
8,"Developer, desktop or enterprise applications",7466,11.04,57129.0
9,"Developer, embedded applications or devices",2818,11.31,58368.0


In [358]:
# Developer types that get a text label next to their bubble; every other point
# is drawn without text.
show_labels = [
    "Academic researcher",
    "Engineering manager",
    "Senior executive/VP",
    "Engineer, site reliability",
    "DevOps specialist",
    "Engineer, data",
    "Data scientist or machine learning specialist",
    "Developer, full-stack",
    "Developer, back-end",
    "Developer, front-end",
    "Developer, mobile",
    "Database administrator",
    "Developer, desktop or enterprise applications",
]
# Keep the developer type as label where it is in the list, otherwise an empty string
dev_types = exp_sal_df["DevType"]
exp_sal_df["Label"] = dev_types.where(dev_types.isin(show_labels), "")

# Bubble chart: x = mean experience, y = median salary, bubble size = respondents
fig = px.scatter(
    exp_sal_df,
    x="Avg Experience",
    y="Salary",
    text="Label",
    color="DevType",
    size="Respondent",
    height=600,
    width=1100,
)
fig.update_traces(textposition="middle right")
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')
fig.show()