In [1]:
# Python adoption percentage per country
# This notebook analyses the percentage of respondents per country in the survey that report to use Python

In [2]:
import pandas as pd
df = pd.read_csv(r"C:\Users\YawOM\OneDrive\Desktop\Data Science\survey_results_public.csv")
# Load the 2018 Stack Overflow survey

In [3]:
df.dropna(subset = ["LanguageWorkedWith", "Country"], axis = "rows", how = "any", inplace = True)
# Rows with both country and coding languages are needed, any row without both of these values is dropped

In [4]:
country_group = df.groupby(["Country"],)
# Groupby country, used to make a dataframe with values for each country

In [5]:
respondents = country_group["Respondent"].count()
# Calculates amount of responders per country

In [6]:
pyt_num = country_group["LanguageWorkedWith"].apply(lambda x: x.str.contains("Python", na = False).sum())
# Calculates the amount of answers in column contains python, per country

In [7]:
df_pkp = pd.concat([respondents, pyt_num], axis = "columns", sort = False)

In [8]:
df_pkp
# Produced a table with the two variables as columns, and the countries as the index

Unnamed: 0_level_0,Respondent,LanguageWorkedWith
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
Afghanistan,39,8
Albania,83,23
Algeria,126,40
Andorra,7,0
Angola,4,2
...,...,...
"Venezuela, Bolivarian Republic of...",87,28
Viet Nam,220,78
Yemen,17,3
Zambia,12,4


In [9]:
df_pkp["Percentage"] = ((df_pkp["LanguageWorkedWith"] / df_pkp["Respondent"]) * 100)

In [10]:
df_pkp

Unnamed: 0_level_0,Respondent,LanguageWorkedWith,Percentage
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Afghanistan,39,8,20.512821
Albania,83,23,27.710843
Algeria,126,40,31.746032
Andorra,7,0,0.000000
Angola,4,2,50.000000
...,...,...,...
"Venezuela, Bolivarian Republic of...",87,28,32.183908
Viet Nam,220,78,35.454545
Yemen,17,3,17.647059
Zambia,12,4,33.333333


In [11]:
all_df = pd.read_csv(r"C:\Users\YawOM\OneDrive\Desktop\Data Science\all.csv")
# Loads in a file containing the regions that each country is specified to be in

In [12]:
df_pkp.describe()
# Shows the general stats of the datafram, including interquartile rang, the min/max values, and the standard deviation.

Unnamed: 0,Respondent,LanguageWorkedWith,Percentage
count,178.0,178.0,178.0
mean,491.960674,204.735955,37.032138
std,1822.012587,840.798036,18.831299
min,1.0,0.0,0.0
25%,7.0,3.0,30.081967
50%,56.5,20.0,36.033824
75%,312.0,118.5,46.153846
max,20769.0,10083.0,100.0


In [13]:
q1 = df_pkp["Percentage"].quantile(0.25)
q3 = df_pkp["Percentage"].quantile(0.75)

q1, q3
# Calculate the lower and upper interquartile percentage values, and therefore the range as well.

(np.float64(30.081967213114755), np.float64(46.15384615384615))

In [14]:
typical_countries = ((df_pkp["Percentage"] >= q1) & (df_pkp["Percentage"] <= q3))

In [15]:
df_pkp.loc[typical_countries]
# Produces a list of countries that sit within the interquartile ranges

Unnamed: 0_level_0,Respondent,LanguageWorkedWith,Percentage
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Algeria,126,40,31.746032
Argentina,547,191,34.917733
Armenia,84,28,33.333333
Australia,1880,790,42.021277
Austria,832,338,40.625000
...,...,...,...
Uzbekistan,39,14,35.897436
"Venezuela, Bolivarian Republic of...",87,28,32.183908
Viet Nam,220,78,35.454545
Zambia,12,4,33.333333


In [16]:
df_pkp.sort_values("Percentage", ascending = True).head(10)
# Produces the bottom 10 countries in developers that know python percentage

Unnamed: 0_level_0,Respondent,LanguageWorkedWith,Percentage
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Andorra,7,0,0.0
Antigua and Barbuda,7,0,0.0
Cape Verde,2,0,0.0
Burkina Faso,4,0,0.0
Brunei Darussalam,1,0,0.0
Gabon,2,0,0.0
Djibouti,1,0,0.0
Chad,1,0,0.0
Papua New Guinea,1,0,0.0
North Korea,1,0,0.0


In [17]:
df_pkp.sort_values("Percentage", ascending = False).head(10)
# Produces the top 10 countries in developers that know python percentage

Unnamed: 0_level_0,Respondent,LanguageWorkedWith,Percentage
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Dominica,1,1,100.0
Niger,1,1,100.0
Sierra Leone,1,1,100.0
Sao Tome and Principe,1,1,100.0
Timor-Leste,1,1,100.0
Turkmenistan,7,6,85.714286
Mauritania,7,5,71.428571
Guinea,3,2,66.666667
Guyana,3,2,66.666667
Bahamas,3,2,66.666667


In [18]:
country_to_region = (all_df.dropna(subset=["region"]).set_index("name")["region"])

In [19]:
df_pkp["Region"] = df_pkp.index.map(country_to_region)

In [20]:
df_pkp
# Added the regions of each country using both files, and mapping the indexes of both files

Unnamed: 0_level_0,Respondent,LanguageWorkedWith,Percentage,Region
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Afghanistan,39,8,20.512821,Asia
Albania,83,23,27.710843,Europe
Algeria,126,40,31.746032,Africa
Andorra,7,0,0.000000,Europe
Angola,4,2,50.000000,Africa
...,...,...,...,...
"Venezuela, Bolivarian Republic of...",87,28,32.183908,
Viet Nam,220,78,35.454545,Asia
Yemen,17,3,17.647059,Asia
Zambia,12,4,33.333333,Africa


In [21]:
df_pkp.dropna(subset = ["Region"], axis = "rows", how = "any")
Africa = (df_pkp["Region"] == "Africa")
df_pkp.loc[Africa].sum()
# Produces the total number of respondents in Africa.

Respondent                                                         2586
LanguageWorkedWith                                                  928
Percentage                                                   1550.96169
Region                AfricaAfricaAfricaAfricaAfricaAfricaAfricaAfri...
dtype: object

In [22]:
df_pkp["Respondent"].sum()

np.int64(87569)

In [23]:
africa_percentage = (2586 / 87569) * 100
africa_percentage
# Calculates the percentage of developers that answered this survey are in Africa

2.9530998412680285

In [24]:
df_pkp.to_csv("C:/Users/YawOM/OneDrive/Desktop/Data Science/Python_percentage")
# Save to specific file