In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load only the survey columns that the exercises below rely on.
survey_columns = [
    "LanguageHaveWorkedWith",
    "LanguageWantToWorkWith",
    "Country",
    "CompTotal",
    "ConvertedCompYearly",
]
df = pd.read_csv("../data/so_2021_survey_results.csv", usecols=survey_columns)
df.head()

Unnamed: 0,Country,CompTotal,LanguageHaveWorkedWith,LanguageWantToWorkWith,ConvertedCompYearly
0,Slovakia,4800.0,C++;HTML/CSS;JavaScript;Objective-C;PHP;Swift,Swift,62268.0
1,Netherlands,,JavaScript;Python,,
2,Russian Federation,,Assembly;C;Python;R;Rust,Julia;Python;Rust,
3,Austria,,JavaScript;TypeScript,JavaScript;TypeScript,
4,United Kingdom of Great Britain and Northern I...,,Bash/Shell;HTML/CSS;Python;SQL,Bash/Shell;HTML/CSS;Python;SQL,


# Questions
1. What are the different programming languages that developers use?
2. What are the 10 most common programming languages used today?
3. What are the 10 most common programming languages that people want to use?
4. What languages are in both top 10 lists?
5. What languages are in the top 10 most used languages that people don't want to work with in the future?
6. What is the current most popular language used in each country?
7. What is the mean number of languages used in the last year?
8. What is the greatest number of languages people listed as having used in the past year?
9. How many people chose that largest number?
10. How many people in the survey claim a salary of $2 million or more?


In [3]:
# 1. different programming languages developers use
# Each response is a ";"-separated string: split it, then give every language
# its own row so languages can be counted individually.
have_worked_with = df["LanguageHaveWorkedWith"].str.split(";")
langs = have_worked_with.explode()
distinct_langs = langs.drop_duplicates()
# count() skips missing values, so respondents with no answer are not counted
distinct_langs.count()

38

In [7]:
# 2. 10 most common programming languages used
# value_counts() sorts from most to least frequent by default, so slicing the
# first ten rows and calling head(10) are two spellings of the same query.
own_top_10 = langs.value_counts(ascending=False).iloc[:10]
# book version
book_top_10 = langs.value_counts().head(10)
# Only the last expression of a cell is rendered, so verify the hidden variant
# against the displayed one instead of silently discarding its result.
assert own_top_10.equals(book_top_10)
book_top_10

LanguageHaveWorkedWith
JavaScript    53587
HTML/CSS      46259
Python        39792
SQL           38835
Java          29162
Node.js       27975
TypeScript    24909
C#            22984
Bash/Shell    22385
C++           20057
Name: count, dtype: int64

In [5]:
# 3. 10 most common languages people want to use
# Split the ";"-separated answers and put each language on its own row.
want_to_work_with = df["LanguageWantToWorkWith"].str.split(";")
future_langs = want_to_work_with.explode()
# descending order is the value_counts default
future_langs.value_counts().head(10)

LanguageWantToWorkWith
JavaScript    37008
Python        34929
HTML/CSS      29353
TypeScript    26905
SQL           26631
Node.js       24100
C#            17999
Java          17222
Rust          15865
Go            15788
Name: count, dtype: int64

In [8]:
# 4. which languages are in both top 10 lists?
now_10 = langs.value_counts(ascending=False).head(10).index
future_10 = future_langs.value_counts(ascending=False).head(10).index
# own version: keep the entries of future_10 that also appear in now_10
own_both = future_10[future_10.isin(now_10)]
# book version:
book_both = now_10.intersection(future_10)
# Only the last expression of a cell is rendered. The two variants may differ
# in ordering and index name, so check that they agree as sets rather than
# silently discarding the first one.
assert set(own_both) == set(book_both)
book_both

Index(['JavaScript', 'HTML/CSS', 'Python', 'SQL', 'Java', 'Node.js',
       'TypeScript', 'C#'],
      dtype='object')

In [11]:
# 5. which languages have people worked with but don't want to work with
own_only_now = now_10.difference(future_10)
# book version uses ~isin, which is weird, because they pointed out that
# intersection was a nicer way of doing the previous query
book_only_now = now_10[~now_10.isin(future_10)]
# Only the last expression of a cell is rendered; confirm both variants pick
# the same languages instead of silently discarding the first result.
assert set(own_only_now) == set(book_only_now)
book_only_now

Index(['Bash/Shell', 'C++'], dtype='object', name='LanguageHaveWorkedWith')

In [None]:
# 6. which languages are most popular in each country?
# interesting - .explode() keeps the index of the parent row, so every exploded
# language can be joined back to its respondent's country on the index
all_languages = df["LanguageHaveWorkedWith"].str.split(";").explode()
# joining all of df would raise an error about duplicate column labels
# (avoidable with lsuffix/rsuffix); joining only the Country column sidesteps it
country_languages = df[["Country"]].join(all_languages)
# most frequent language(s) per country - evaluated but not displayed, since
# only the cell's last expression is rendered
country_languages.groupby("Country").agg(pd.Series.mode)
# this one doesn't really do what the book wants, but is an interesting variation on it regardless
country_languages.groupby("Country", as_index=False).value_counts()

Unnamed: 0,Country,LanguageHaveWorkedWith,count
0,Afghanistan,JavaScript,32
1,Afghanistan,HTML/CSS,28
2,Afghanistan,C++,23
3,Afghanistan,SQL,22
4,Afghanistan,Java,21
...,...,...,...
4954,Zimbabwe,Objective-C,1
4955,Zimbabwe,Perl,1
4956,Zimbabwe,R,1
4957,Zimbabwe,Scala,1


In [36]:
# 7. what is the mean number of languages used in the last year?
# Length of each respondent's split list = number of languages they listed.
langs_per_person = df["LanguageHaveWorkedWith"].str.split(";").str.len()
langs_per_person.mean()

5.373678011583714

In [37]:
# 8. total highest number of languages used by a person in the last year
langs_listed = df["LanguageHaveWorkedWith"].str.split(";")
langs_listed.str.len().max()

38.0

In [39]:
# 9. how many people chose the largest number of languages?
# Number of languages listed by each respondent (missing answers stay NaN).
lang_counts = df["LanguageHaveWorkedWith"].str.split(";").str.len()
highest_langs = lang_counts.max()
# True counts as 1, so summing the comparison counts the matching respondents
(lang_counts == highest_langs).sum()

32

In [57]:
# 10. How many people claim a salary of $2m or more?
# NOTE(review): CompTotal is compared as-is. The first row of the df.head()
# preview shows CompTotal 4800.0 next to ConvertedCompYearly 62268.0, so
# CompTotal is presumably not a USD annual figure - confirm before reading
# this count as "$2m".
claims_2m = df["CompTotal"] >= 2_000_000
len(df.loc[claims_2m])

2369

In [59]:
# what about country breakdown for those people?
claims_2m = df["CompTotal"] >= 2_000_000
high_earners = df[claims_2m]
# one row per country with the number of respondents at or above the threshold
high_earners.groupby("Country", as_index=False)["Country"].value_counts()


Unnamed: 0,Country,count
0,Afghanistan,1
1,Albania,3
2,Argentina,12
3,Armenia,10
4,Australia,2
...,...,...
64,United States of America,10
65,Uruguay,2
66,Uzbekistan,17
67,"Venezuela, Bolivarian Republic of...",5


In [61]:
# remove the high reported salaries from the data set
is_high_salary = df["CompTotal"] >= 2_000_000
# Negating the ">=" test (instead of filtering on "<") keeps respondents whose
# CompTotal is missing: NaN fails the comparison, so its negation is True.
df = df.loc[~is_high_salary]

In [None]:
# add columns for the 1/0 dummies for the results of each language response after being split on ";"
lang_dummies = df["LanguageHaveWorkedWith"].str.get_dummies(sep=";")
# concat with columns to widen the dataset
# Drop any dummy columns left over from a previous run of this cell first:
# concatenating them a second time would duplicate every language label, and
# lookups such as df["Python"] would then return a DataFrame, not a Series.
# On a first run none of these labels exist yet, so the drop is a no-op.
df = pd.concat(
    [df.drop(columns=lang_dummies.columns, errors="ignore"), lang_dummies],
    axis="columns",
)
df.head()

Unnamed: 0,Country,CompTotal,LanguageHaveWorkedWith,LanguageWantToWorkWith,ConvertedCompYearly,APL,Assembly,Bash/Shell,C,C#,...,PowerShell,Python,R,Ruby,Rust,SQL,Scala,Swift,TypeScript,VBA
0,Slovakia,4800.0,C++;HTML/CSS;JavaScript;Objective-C;PHP;Swift,Swift,62268.0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,Netherlands,,JavaScript;Python,,,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,Russian Federation,,Assembly;C;Python;R;Rust,Julia;Python;Rust,,0,1,0,1,0,...,0,1,1,0,1,0,0,0,0,0
3,Austria,,JavaScript;TypeScript,JavaScript;TypeScript,,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,United Kingdom of Great Britain and Northern I...,,Bash/Shell;HTML/CSS;Python;SQL,Bash/Shell;HTML/CSS;Python;SQL,,0,0,1,0,0,...,0,1,0,0,0,1,0,0,0,0


In [64]:
# average salary of someone who knows Python and Java but not JavaScript
# The dummy columns hold 1 when the language was listed and 0 otherwise.
python_java_not_js = df["Python"].eq(1) & df["Java"].eq(1) & df["JavaScript"].eq(0)
df.loc[python_java_not_js, "CompTotal"].mean()

162737.10379596677

In [66]:
# average salary of someone who knows Java and JavaScript but not Python
# The dummy columns hold 1 when the language was listed and 0 otherwise.
java_js_not_python = df["Java"].eq(1) & df["JavaScript"].eq(1) & df["Python"].eq(0)
df.loc[java_js_not_python, "CompTotal"].mean()

140867.65981559738

In [67]:
# what about front end devs? so JS + HTML/CSS
front_end = df["JavaScript"].eq(1) & df["HTML/CSS"].eq(1)
df.loc[front_end, "CompTotal"].mean()

125525.97576994763

# Extension questions
1. What are the three things developers are most likely to do when stuck (`NEWStuck`)?
2. What proportion of the respondents reported their gender as `Man`?
3. On average, what proportion of the years spent coding (`YearsCode`) have been done professionally (`YearsCodePro`)?

In [93]:
# Re-read the survey with just the columns the extension questions use.
extension_columns = ["NEWStuck", "Gender", "YearsCode", "YearsCodePro"]
dfx = pd.read_csv("../data/so_2021_survey_results.csv", usecols=extension_columns)

In [94]:
# 1. top three things developers do when stuck - ;-separated
# one row per individual action, then rank the actions by frequency
stuck_actions = dfx["NEWStuck"].str.split(";").explode()
stuck_actions.value_counts().head(3)

NEWStuck
Google it                            74491
Visit Stack Overflow                 66410
Do other work and come back later    39871
Name: count, dtype: int64

In [95]:
# 2. proportion of respondents giving each Gender answer (normalize=True turns
# counts into fractions); the `Man` row is the one the question asks about
gender_answers = dfx[["Gender"]]
gender_answers.value_counts(normalize=True)

Gender                                                                            
Man                                                                                   0.909231
Woman                                                                                 0.050069
Prefer not to say                                                                     0.017524
Non-binary, genderqueer, or gender non-conforming                                     0.008385
Or, in your own words:                                                                0.005019
Man;Or, in your own words:                                                            0.003257
Man;Non-binary, genderqueer, or gender non-conforming                                 0.003062
Woman;Non-binary, genderqueer, or gender non-conforming                               0.001786
Man;Woman                                                                             0.000498
Man;Woman;Non-binary, genderqueer, or gender non-conforming   

In [96]:
# 3. what proportion of years coding have been done professionally?
# The two range labels are mapped to int-able stand-ins so both columns can be
# cast to float. One shared mapping keeps the two columns consistent
# (previously one column used the int 0 and the other the string "0").
year_labels = {"Less than 1 year": "0", "More than 50 years": "51"}
# Building the cleaned frame in a single chain avoids assigning columns onto
# the result of dropna(), which can raise SettingWithCopyWarning in some
# pandas versions.
dfx = (
    dfx.dropna(subset=["YearsCode", "YearsCodePro"])  # no nulls
    .assign(
        YearsCode=lambda d: d["YearsCode"].replace(year_labels).astype(float),
        YearsCodePro=lambda d: d["YearsCodePro"].replace(year_labels).astype(float),
    )
    # get rid of any zeroes in the denominator
    .loc[lambda d: d["YearsCode"] != 0]
)


In [97]:
# per-respondent share of coding years that were professional, then the average
pro_share = dfx["YearsCodePro"].div(dfx["YearsCode"])
pro_share.mean()

0.5923711657118932