In [1]:
# Python adoption percentage per country
# This notebook analyses the percentage of respondents per country in the survey that report to use Python

In [2]:
import pandas as pd
df = pd.read_csv(r"C:\Users\YawOM\OneDrive\Desktop\Data Science\survey_results_public.csv")
# Load the 2018 Stack Overflow survey

In [3]:
df.dropna(subset = ["LanguageWorkedWith", "Country"], axis = "rows", how = "any")
# Rows with both country and coding languages are needed, any row without both of these values is dropped

Unnamed: 0,Respondent,MainBranch,Hobbyist,OpenSourcer,OpenSource,Employment,Country,Student,EdLevel,UndergradMajor,...,WelcomeChange,SONewContent,Age,Gender,Trans,Sexuality,Ethnicity,Dependents,SurveyLength,SurveyEase
0,1,I am a student who is learning to code,Yes,Never,The quality of OSS and closed source software ...,"Not employed, and not looking for work",United Kingdom,No,Primary/elementary school,,...,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,14.0,Man,No,Straight / Heterosexual,,No,Appropriate in length,Neither easy nor difficult
1,2,I am a student who is learning to code,No,Less than once per year,The quality of OSS and closed source software ...,"Not employed, but looking for work",Bosnia and Herzegovina,"Yes, full-time","Secondary school (e.g. American high school, G...",,...,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,19.0,Man,No,Straight / Heterosexual,,No,Appropriate in length,Neither easy nor difficult
2,3,"I am not primarily a developer, but I write co...",Yes,Never,The quality of OSS and closed source software ...,Employed full-time,Thailand,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)",Web development or web design,...,Just as welcome now as I felt last year,Tech meetups or events in your area;Courses on...,28.0,Man,No,Straight / Heterosexual,,Yes,Appropriate in length,Neither easy nor difficult
3,4,I am a developer by profession,No,Never,The quality of OSS and closed source software ...,Employed full-time,United States,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)","Computer science, computer engineering, or sof...",...,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,22.0,Man,No,Straight / Heterosexual,White or of European descent,No,Appropriate in length,Easy
4,5,I am a developer by profession,Yes,Once a month or more often,"OSS is, on average, of HIGHER quality than pro...",Employed full-time,Ukraine,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)","Computer science, computer engineering, or sof...",...,Just as welcome now as I felt last year,Tech meetups or events in your area;Courses on...,30.0,Man,No,Straight / Heterosexual,White or of European descent;Multiracial,No,Appropriate in length,Easy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88875,88182,,Yes,Once a month or more often,"OSS is, on average, of HIGHER quality than pro...",Employed part-time,Pakistan,,"Secondary school (e.g. American high school, G...",,...,Not applicable - I did not use Stack Overflow ...,Courses on technologies you're interested in,,Man,No,Straight / Heterosexual,,Yes,Too short,Neither easy nor difficult
88876,88212,,No,Less than once per year,"OSS is, on average, of HIGHER quality than pro...",Employed full-time,Spain,No,"Secondary school (e.g. American high school, G...",,...,,Tech articles written by other developers;Indu...,40.0,Man,No,Straight / Heterosexual,White or of European descent,No,Appropriate in length,Easy
88877,88282,,Yes,Once a month or more often,The quality of OSS and closed source software ...,"Not employed, but looking for work",United States,No,Some college/university study without earning ...,"Computer science, computer engineering, or sof...",...,Just as welcome now as I felt last year,,,Man,No,Straight / Heterosexual,,No,Too short,Neither easy nor difficult
88878,88377,,Yes,Less than once a month but more than once per ...,The quality of OSS and closed source software ...,"Not employed, and not looking for work",Canada,No,Primary/elementary school,,...,,Tech articles written by other developers;Tech...,,Man,No,,,No,Appropriate in length,Easy


In [4]:
country_group = df.groupby(["Country"],)
# Groupby country, used to make a dataframe with values for each country

In [5]:
respondents = country_group["Respondent"].count()
# Calculates amount of responders per country

In [6]:
pyt_num = country_group["LanguageWorkedWith"].apply(lambda x: x.str.contains("Python", na = False).sum())
# Calculates the amount of answers in column contains python, per country

In [7]:
df_pkp = pd.concat([respondents, pyt_num], axis = "columns", sort = False)

In [8]:
df_pkp
# Produced a table with the two variables as columns, and the countries as the index

Unnamed: 0_level_0,Respondent,LanguageWorkedWith
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
Afghanistan,44,8
Albania,86,23
Algeria,134,40
Andorra,7,0
Angola,5,2
...,...,...
"Venezuela, Bolivarian Republic of...",88,28
Viet Nam,231,78
Yemen,19,3
Zambia,12,4


In [9]:
df_pkp["Percentage"] = ((df_pkp["LanguageWorkedWith"] / df_pkp["Respondent"]) * 100)

In [10]:
df_pkp

Unnamed: 0_level_0,Respondent,LanguageWorkedWith,Percentage
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Afghanistan,44,8,18.181818
Albania,86,23,26.744186
Algeria,134,40,29.850746
Andorra,7,0,0.000000
Angola,5,2,40.000000
...,...,...,...
"Venezuela, Bolivarian Republic of...",88,28,31.818182
Viet Nam,231,78,33.766234
Yemen,19,3,15.789474
Zambia,12,4,33.333333


In [11]:
all_df = pd.read_csv(r"C:\Users\YawOM\OneDrive\Desktop\Data Science\all.csv")
# Loads in a file containing the regions that each country is specified to be in

In [25]:
df_pkp.describe()
# Shows the general stats of the datafram, including interquartile rang, the min/max values, and the standard deviation.

Unnamed: 0,Respondent,LanguageWorkedWith,Percentage
count,179.0,179.0,179.0
mean,495.815642,203.592179,35.750651
std,1836.29241,838.572554,18.289211
min,1.0,0.0,0.0
25%,7.0,3.0,29.111626
50%,54.0,20.0,34.693878
75%,318.0,118.0,43.290199
max,20949.0,10083.0,100.0


In [13]:
q1 = df_pkp["Percentage"].quantile(0.25)
q3 = df_pkp["Percentage"].quantile(0.75)

q1, q3
# Calculate the lower and upper interquartile percentage values, and therefore the range as well.

(np.float64(29.111625779150227), np.float64(43.29019889767554))

In [14]:
typical_countries = ((df_pkp["Percentage"] >= q1) & (df_pkp["Percentage"] <= q3))

In [15]:
df_pkp.loc[typical_countries]
# Produces a list of countries that sit within the interquartile ranges

Unnamed: 0_level_0,Respondent,LanguageWorkedWith,Percentage
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Algeria,134,40,29.850746
Angola,5,2,40.000000
Argentina,553,191,34.538879
Armenia,86,28,32.558140
Australia,1903,790,41.513400
...,...,...,...
Uzbekistan,39,14,35.897436
"Venezuela, Bolivarian Republic of...",88,28,31.818182
Viet Nam,231,78,33.766234
Zambia,12,4,33.333333


In [16]:
df_pkp.sort_values("Percentage", ascending = True).head(10)
# Produces the bottom 10 countries in developers that know python percentage

Unnamed: 0_level_0,Respondent,LanguageWorkedWith,Percentage
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Andorra,7,0,0.0
Antigua and Barbuda,9,0,0.0
Cape Verde,3,0,0.0
Burkina Faso,4,0,0.0
Brunei Darussalam,1,0,0.0
Gabon,2,0,0.0
Djibouti,2,0,0.0
Chad,1,0,0.0
Papua New Guinea,1,0,0.0
Lao People's Democratic Republic,3,0,0.0


In [17]:
df_pkp.sort_values("Percentage", ascending = False).head(10)
# Produces the top 10 countries in developers that know python percentage

Unnamed: 0_level_0,Respondent,LanguageWorkedWith,Percentage
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Dominica,1,1,100.0
Sao Tome and Principe,1,1,100.0
Niger,1,1,100.0
Timor-Leste,1,1,100.0
Turkmenistan,7,6,85.714286
Mauritania,7,5,71.428571
Guinea,3,2,66.666667
Bahamas,3,2,66.666667
Guyana,3,2,66.666667
Uganda,72,47,65.277778


In [18]:
country_to_region = (all_df.dropna(subset=["region"]).set_index("name")["region"])

In [19]:
df_pkp["Region"] = df_pkp.index.map(country_to_region)

In [20]:
df_pkp
# Added the regions of each country using both files, and mapping the indexes of both files

Unnamed: 0_level_0,Respondent,LanguageWorkedWith,Percentage,Region
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Afghanistan,44,8,18.181818,Asia
Albania,86,23,26.744186,Europe
Algeria,134,40,29.850746,Africa
Andorra,7,0,0.000000,Europe
Angola,5,2,40.000000,Africa
...,...,...,...,...
"Venezuela, Bolivarian Republic of...",88,28,31.818182,
Viet Nam,231,78,33.766234,Asia
Yemen,19,3,15.789474,Asia
Zambia,12,4,33.333333,Africa


In [21]:
df_pkp.dropna(subset = ["Region"], axis = "rows", how = "any")
Africa = (df_pkp["Region"] == "Africa")
df_pkp.loc[Africa].sum()
# Produces the total number of respondents in Africa.

Respondent                                                         2682
LanguageWorkedWith                                                  928
Percentage                                                  1467.313547
Region                AfricaAfricaAfricaAfricaAfricaAfricaAfricaAfri...
dtype: object

In [22]:
df_pkp["Respondent"].sum()

np.int64(88751)

In [24]:
africa_percentage = (2682 / 88751) * 100
africa_percentage
# Calculates the percentage of developers that answered this survey are in Africa

3.0219377809827495