In [110]:
import pandas as pd
import numpy as np
from scipy import stats

In [19]:
py_df = pd.read_csv("../data/2020_sharing_data_outside.csv", low_memory=False)

In [None]:
oecd_df = pd.read_csv(
    "../data/oecd_locations.csv", header=None, names=["abbrev", "country"]
)

In [20]:
# columns categories are separated by '.' with 1, 2 or 3 field names
# general column names
generals = [
    "age",
    "are.you.datascientist",
    "is.python.main",
    "company.size",
    "country.live",
    "employment.status",
    "first.learn.about.main.ide",
    "how.often.use.main.ide",
    "is.python.main",
    "main.purposes",
    "missing.features.main.ide",
    "nps.main.ide",
    "python.version.most",
    "python.years",
    "python2.version.most",
    "python3.version.most",
    "several.projects",
    "team.size",
    "use.python.most",
    "years.of.coding",
]


def format_field(s):
    if s in generals:
        return ("general", s)
    else:
        return s.rsplit(".", 1)


py_df.columns = pd.MultiIndex.from_tuples(
    # [(x.rsplit(".", 1) if x.count(".") >= 2 else x.split(".")) for x in py_df.columns],
    [format_field(x) for x in py_df.columns]
)
py_df = py_df[sorted(py_df.columns)]

# Questions
1. What are the most popular 10 Python IDEs?
2. Which 10 other programming languages (`other.lang`) are most commonly used by Python devs?
3. What are the 10 most common countries respondents come from?
4. According to the Python survey, what proportion of devs have each level of experience? (`python.years`)
5. Which country has the greatest number of devs with 11+ years of experience?
6. Which country has the greatest proportion of devs with 11+ years of experience?

In [24]:
# 10 most common Python IDEs
py_df[("ide", "main")].value_counts().iloc[:10]

(ide, main)
VS Code                         8010
PyCharm Professional Edition    5144
PyCharm Community Edition       3815
Vim                             2176
Sublime Text                    1201
Jupyter Notebook                1167
Atom                             784
Other                            711
Emacs                            636
Spyder                           580
Name: count, dtype: int64

In [None]:
# 10 other programming languages are most commonly used
# each of the columns either has NaN or its programming language name, so just a count
# will figure out how many respondents answered with that language, given it skips NaN values
py_df["other.lang"].count().sort_values(ascending=False).iloc[:10]

JavaScript      16662
HTML/CSS        15469
Bash / Shell    13793
SQL             13391
C/C++           11623
Java             8109
C#               4460
PHP              4060
TypeScript       3717
Other            3592
dtype: int64

In [41]:
# 10 most common countries respondents come from
py_df[("general", "country.live")].value_counts(ascending=False).iloc[:10]

(general, country.live)
United States         3975
India                 2800
Germany               1807
China                 1155
United Kingdom        1110
France                1078
Russian Federation     935
Other country          880
Brazil                 812
Canada                 644
Name: count, dtype: int64

In [198]:
# what proportion of devs have each level of experience?
py_df[("general", "python.years")].value_counts() / py_df[
    ("general", "python.years")
].count() * 100
# The book reminds me that I can just use the normalize parameter to value_counts:
py_df[("general", "python.years")].value_counts(normalize=True)

(general, python.years)
3–5 years          0.28
Less than 1 year   0.24
1–2 years          0.22
6–10 years         0.15
11+ years          0.10
Name: proportion, dtype: float64

In [None]:
py_df[("general", "years.of.coding")].value_counts() / py_df[
    ("general", "years.of.coding")
].count() * 100

(general, years.of.coding)
Less than 1 year   33.74
1–2 years          19.41
3–5 years          19.01
11+ years          15.76
6–10 years         12.08
Name: count, dtype: float64

In [78]:
# which country has the greatest number of devs with 11+ years of experience?
devs11_by_country = (
    py_df.loc[py_df[("general", "python.years")] == "11+ years"]
    .groupby(("general", "country.live"))
    .count()[("general", "python.years")]
    .sort_values(ascending=False)
)
devs11_by_country

(general, country.live)
United States     691
Germany           308
United Kingdom    207
France            166
Australia          94
                 ... 
Tunisia             1
Egypt               1
Nigeria             1
Pakistan            1
Viet Nam            1
Name: (general, python.years), Length: 70, dtype: int64

In [None]:
# country with the greatest proportion of devs with 11+ years of experience
devs_by_country = py_df.groupby(("general", "country.live"))[
    [("general", "country.live")]
].count()

# devs11_by_country / devs_by_country
country_years = devs11_by_country.to_frame().join(devs_by_country)
(
    country_years[("general", "python.years")]
    / country_years[("general", "country.live")]
).sort_values(ascending=False)
# so Norway has the highest proportion of 11+ year Python devs with 26.5%

(general, country.live)
Norway        0.265432
Ireland       0.225490
Australia     0.225420
Belgium       0.225108
Slovenia      0.224490
                ...   
India         0.015714
Viet Nam      0.013158
Bangladesh    0.009174
Pakistan      0.006849
Nigeria       0.006173
Length: 70, dtype: float64

In [None]:
# let's look at the years of coding and the main purpose for the respondents
py_df.groupby([("general", "main.purposes")])[
    [("general", "years.of.coding")]
].value_counts(normalize=True) * 100

(general, main.purposes)                     (general, years.of.coding)
Both for work and personal                   3–5 years                    23.01
                                             Less than 1 year             21.35
                                             1–2 years                    20.96
                                             11+ years                    19.50
                                             6–10 years                   15.18
For personal, educational  or side projects  Less than 1 year             66.93
                                             1–2 years                    16.64
                                             3–5 years                     8.08
                                             11+ years                     5.14
                                             6–10 years                    3.21
For work                                     Less than 1 year             22.91
                                             3–5

# Stack Overflow survey data
1. Show the average salary for different types of employment (contractors, freelancers, full time)
2. Make a pivot table indexed by country, columns are education levels, and cells contain average salary for each education level per country
3. Create this pivot table again for only countries in the OECD subset. In which country does someone with only an Associate Degree earn the most? In which of them does someone with a doctorate earn the most?
4. Remove rows from `so_df` where `LanguageHaveWorkedWith` is `NaN`
5. Remove rows from `so_df` where Python is not a commonly used language (`LanguageHaveWorkedWith`)
6. Remove rows from `so_df` where `YearsCode` is `NaN`. How many rows remain?
7. Replace the string `Less than 1 year` in `YearsCode` with `0`. Replace the string `More than 50 years` with `51`
8. Turn `YearsCode` into an integer column.
9. Create a new column called `experience` to categorise the values in `YearsCode` to:
  - Less than 1 year
  - 1-2 years
  - 3-5 years
  - 6-10 years
  - 11+ years
10. According to the Stack Overflow survey what proportion of Python developers have each level of experience?

In [171]:
# load the SO survey data
so_df = pd.read_csv("../data/so_2021_survey_results.csv")
so_df.shape[0]

83439

In [172]:
pd.options.display.float_format = "{:,.2f}".format


In [173]:
# the EdLevel values are really long, let's just keep the bit not in brackets
so_df["EdLevel"] = so_df["EdLevel"].str.split(r" \(").str[0]

In [None]:
# show the average salary for different types of employment
(
    so_df.groupby("Employment")["ConvertedCompYearly"]
    .mean()
    .dropna()
    .sort_values(ascending=False)
    .apply("${:,.2f}".format)
)

Employment
I prefer not to say                                     $1,455,643.25
Employed full-time                                        $121,369.67
Independent contractor, freelancer, or self-employed      $107,433.97
Retired                                                    $69,533.25
Employed part-time                                         $41,136.12
Name: ConvertedCompYearly, dtype: object

In [175]:
so_df["CompZScore"] = so_df.groupby("Employment")["ConvertedCompYearly"].transform(
    lambda x: stats.zscore(x, nan_policy="omit")
)
(
    so_df[(so_df["CompZScore"] < 3) & (so_df["CompZScore"] > -3)]
    .groupby("Employment")["ConvertedCompYearly"]
    .describe()
)
# adjusting to remove values more than 3 standard deviations from the mean for each group, full timers still get
# more than other groups

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Employment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Employed full-time,40323.0,88346.7,138424.34,1.0,28308.0,57072.0,98321.0,1608000.0
Employed part-time,1390.0,32237.51,33368.53,12.0,9129.0,22056.0,46424.0,320688.0
I prefer not to say,31.0,43202.32,40039.49,68.0,8472.0,33972.0,66929.5,160932.0
"Independent contractor, freelancer, or self-employed",4724.0,84080.85,112732.12,1.0,24522.0,56875.0,108098.0,1395996.0
Retired,16.0,69533.25,73898.83,100.0,9796.5,32910.0,120252.5,220000.0


In [176]:
# Make a pivot table indexed by country, columns are education levels, and cells contain average salary for each education level per country
so_df.pivot_table(index="Country", columns="EdLevel", values="ConvertedCompYearly")

EdLevel,Associate degree,Bachelor’s degree,Master’s degree,Other doctoral degree,Primary/elementary school,Professional degree,Secondary school,Some college/university study without earning a degree,Something else
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Afghanistan,,30288.00,10176704.00,,,,100.00,,
Albania,,19152.86,80127.62,,,5298.00,19890.00,22884.00,128522.00
Algeria,,21770.67,15052.57,,,12912.00,,6288.00,
Andorra,,94045.50,22056.00,146981.00,,,,,
Angola,,31500.00,,,,,18678.00,6904.00,
...,...,...,...,...,...,...,...,...,...
"Venezuela, Bolivarian Republic of...",,30108.77,28680.00,,7200.00,14833.29,10200.00,17720.57,
Viet Nam,7827.00,18463.11,50599.80,2592.00,10479.00,30000.00,,18866.19,
Yemen,,5628.67,,,,,,,
Zambia,,40173.00,4908.00,,,,4482.00,12105.33,8184.00


In [None]:
# same pivot table but only include countries from the OECD data set
(
    so_df[so_df["Country"].isin(oecd_df["country"])]
    .pivot_table(index="Country", columns="EdLevel", values="ConvertedCompYearly")
    .T
)
# Finland and Israel  have their highest earners with Associate degrees
# Australia (ignoring the primary school), France, and Japan have their highest earners with doctoral degrees

Country,Australia,Austria,Belgium,Brazil,Canada,Denmark,Finland,France,Germany,Hungary,Israel,Italy,Japan
EdLevel,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Associate degree,117049.64,43623.38,35664.0,25347.42,87930.35,80217.33,282353.67,54394.89,98530.52,51041.0,146420.9,36427.94,143196.83
Bachelor’s degree,180794.16,66096.13,68474.7,47681.23,140668.24,80430.94,69381.34,65062.38,102751.23,48756.45,124973.4,73697.22,85821.47
Master’s degree,106794.32,77645.66,88580.41,42056.01,144733.35,115751.85,74360.98,94585.73,110611.98,46771.71,128492.71,79132.19,130143.19
Other doctoral degree,150234.96,74783.17,80832.44,43123.21,102989.35,102785.19,61508.25,140402.86,108718.46,52833.6,131812.62,93490.78,157239.4
Primary/elementary school,153327.5,86877.0,11342349.25,7880.0,73787.83,91056.57,77832.0,34181.0,74943.67,48100.0,100514.57,525992.17,44965.0
Professional degree,108725.0,38915.33,71000.2,25449.67,82953.4,114144.89,83016.0,78342.27,97330.14,9852.0,132336.0,117121.37,
Secondary school,158931.28,47438.83,43224.13,15072.89,180584.81,90411.74,59424.32,53981.81,82865.84,33440.8,97938.78,75114.4,57916.4
Some college/university study without earning a degree,127063.53,53906.05,88006.29,39978.02,155090.08,85131.35,89255.56,55119.45,95737.77,37102.89,130049.04,81531.72,91163.04
Something else,231987.17,45772.53,27035.5,20288.71,60795.17,95479.86,35024.0,42884.04,109333.99,28911.0,96142.5,32690.22,194514.0


In [None]:
# looking at the book solution, the question was actually where can you earn the most with an Associate degree or
# a doctoral degree - not in which countries were those the highest paid of the education levels
(
    so_df[so_df["Country"].isin(oecd_df["country"])]
    .pivot_table(index="Country", columns="EdLevel", values="ConvertedCompYearly")[
        "Associate degree"
    ]
    .sort_values(ascending=False)
    .iloc[:1]
)

Country
Finland   282,353.67
Name: Associate degree, dtype: float64

In [None]:
(
    so_df[so_df["Country"].isin(oecd_df["country"])]
    .pivot_table(index="Country", columns="EdLevel", values="ConvertedCompYearly")[
        "Other doctoral degree"
    ]
    .sort_values(ascending=False)
    .iloc[:1]
)


Country
Japan   157,239.40
Name: Other doctoral degree, dtype: float64

In [178]:
# remove rows LangageWorkedWith is NaN
so_python_df = so_df.dropna(subset=["LanguageHaveWorkedWith"])
so_python_df.shape[0]

82357

In [None]:
# remove rows in which Python is not in the LanguageHaveWorkedWith
so_python_df = so_python_df.loc[
    so_python_df["LanguageHaveWorkedWith"].str.contains("Python")
]
so_python_df.shape[0]

39792

In [184]:
# remove all rows in which YearsCode is NaN
so_python_df = so_python_df.dropna(subset=["YearsCode"])
so_python_df.shape[0]

38997

In [None]:
# turn the YearsCode column into an integer type
def yearsCodeTransformer(s):
    if s == "Less than 1 year":
        return "0"
    elif s == "More than 50 years":
        return "51"
    else:
        return s


so_python_df["YearsCode"] = (
    so_python_df["YearsCode"].transform(yearsCodeTransformer).astype(np.int32)
)
so_python_df.shape[0]

38997

In [None]:
so_python_df["experience"] = pd.cut(
    so_python_df["YearsCode"],
    bins=[0, 1, 2, 5, 10, float("inf")],
    labels=["Less than 1 year", "1-2 years", "3-5 years", "6-10 years", "11+ years"],
    include_lowest=True,
)
so_python_df["experience"].value_counts()

experience
11+ years           14561
6-10 years          12424
3-5 years            8678
1-2 years            1850
Less than 1 year     1484
Name: count, dtype: int64

In [None]:
# proportion of SO users are in each experience bin
(so_python_df["experience"].value_counts() / so_python_df.shape[0]) * 100

experience
11+ years          37.34
6-10 years         31.86
3-5 years          22.25
1-2 years           4.74
Less than 1 year    3.81
Name: count, dtype: float64