In [1]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
pd.set_option('display.max_columns', 100)


def _load_survey(csv_path, encoding='UTF-8'):
    """Read one survey CSV and split off its first data row.

    Parameters
    ----------
    csv_path : str
        Location of the responses CSV.
    encoding : str, default 'UTF-8'
        Text encoding handed to ``pd.read_csv``.

    Returns
    -------
    tuple of (pd.Series, pd.DataFrame)
        Row 0 of the file (stored below as ``questions_<year>``) and all
        rows after it (stored below as ``data_<year>``).
    """
    raw = pd.read_csv(csv_path, low_memory=False, encoding=encoding)
    return raw.iloc[0, :], raw.iloc[1:, :]


questions_2021, data_2021 = _load_survey('../input/kaggle-survey-2021/kaggle_survey_2021_responses.csv')
questions_2020, data_2020 = _load_survey('../input/kaggle-survey-2020/kaggle_survey_2020_responses.csv')
questions_2019, data_2019 = _load_survey('../input/kaggle-survey-2019/multiple_choice_responses.csv')
questions_2018, data_2018 = _load_survey('../input/kaggle-survey-2018/multipleChoiceResponses.csv')
# The 2017 file is read as ISO-8859-1 instead of UTF-8.
questions_2017, data_2017 = _load_survey('../input/kaggle-survey-2017/multipleChoiceResponses.csv', encoding='ISO-8859-1')

# 2021 Yes. 2020 Yes. 2019 Yes. About TPUs

How have data science fields evolved from 2018 to 2021?

In [2]:
# Exhibit 1. Data Science Professionals distribution by industry 2018 vs. 2021

# 2021: share of respondents per industry (Q20); unanswered rows are dropped.
industry_2021 = data_2021[data_2021['Q20'].notna()]
c = industry_2021['Q20'].value_counts(normalize=True).rename_axis('industry').reset_index(name='counts')

# 2018: share of respondents per industry (Q7), without students or blanks.
industry_2018 = data_2018[data_2018['Q7'] != 'I am a student']
industry_2018 = industry_2018[industry_2018['Q7'].notna()]
d = industry_2018['Q7'].value_counts(normalize=True).rename_axis('industry').reset_index(name='counts')

# Inner merge: only industries that appear in both years are compared.
k = pd.merge(left=d, right=c, on='industry')
k = k.rename(columns={'counts_x': '2018', 'counts_y': '2021'})
k = k.sort_values(by=['2021'], ascending=False)

# Year-over-year change in share per industry.
diff_industry = k.copy()
diff_industry['dff'] = k['2021'] - k['2018']

# Create both panels in a single call. Calling plt.subplots() and then
# plt.subplot(gs[i]) builds a second pair of axes on top of the first one,
# and the first pair is not reliably removed by matplotlib.
fig, (ax1, ax2) = plt.subplots(
    ncols=2,
    figsize=(16, 8),
    gridspec_kw={'width_ratios': [3, 1]},
)

k.plot.barh(x="industry", ax=ax1)
ax1.set(title="Data Science professionals distribution by Industry. 2018 vs 2021",
        xlabel="Percentage",
        ylabel="Industry")
ax1.invert_yaxis()

# Bars are drawn in row order, so they line up with the left panel;
# green marks a gain in share, red a loss (or no change).
diff_industry['dff'].plot(kind='barh', ax=ax2,
                          color=(diff_industry['dff'] > 0).map({True: 'g',
                                                                False: 'r'}))
ax2.set(title="Change",
        xlabel="Percentage",
        ylabel="Industry")
ax2.set_yticks([])
ax2.invert_yaxis()

* Students do not fill in the Industry column.

* The 2017, 2019, and 2020 surveys do not have industry field data.

In [3]:
def get_professionals(data, column):
    """Filter ``data`` down to rows with an employed, non-student answer.

    A row is removed when its ``column`` value is missing or is one of
    'Student', 'Currently not employed' or 'Not employed'.

    Parameters
    ----------
    data : pd.DataFrame
        Survey responses.
    column : str
        Name of the column that is checked.

    Returns
    -------
    pd.DataFrame
        The remaining rows, with their original index and order.
    """
    excluded_answers = ['Student', 'Currently not employed', 'Not employed']
    answers = data[column]
    keep = answers.notna() & ~answers.isin(excluded_answers)
    return data.loc[keep]

In [4]:
# Exhibit 1.1 Data Science distribution by company size 2021.

# Professionals who answered the company-size question (Q21).
test = get_professionals(data_2021, 'Q5')
test = test[test['Q21'].notna()]

# Respondent count per (Q20, Q21) pair, pivoted so that Q20 values are rows
# and Q21 values are columns.
category_test = test.groupby(['Q20', 'Q21']).size()
new_df = category_test.to_frame(name='size').reset_index()
new_df_2 = pd.pivot(
    data=new_df,
    index='Q20',
    columns='Q21',
    values='size')
new_df_2.index.names = ['Industry']
new_df_2.columns.names = ['Company Size']

columns_order = ['0-49 employees', '50-249 employees', '250-999 employees', '1000-9,999 employees', '10,000 or more employees']

# Fix the column order, then order the rows by their total head count
# (largest first). The helper column is dropped again right away.
new_df_2 = new_df_2.reindex(columns=columns_order)
new_df_2['total'] = new_df_2[columns_order].sum(axis=1)
new_df_2 = new_df_2.sort_values(by='total', ascending=False)
new_df_2 = new_df_2.drop(columns='total')

# Each row divided by its own total: company-size mix within an industry.
res = new_df_2.div(new_df_2.sum(axis=1), axis=0)

# Create both panels in a single call. Calling plt.subplots() and then
# plt.subplot(gs[i]) builds a second pair of axes on top of the first one,
# and the first pair is not reliably removed by matplotlib.
fig, (ax1, ax2) = plt.subplots(
    ncols=2,
    figsize=(16, 8),
    gridspec_kw={'width_ratios': [3, 1]},
)

new_df_2.plot(use_index=True,
              kind='barh',
              stacked=True,
              ax=ax1,
              )

ax1.set(title="DS professional distibution by company size across different industry 2021",
        xlabel="Counts",
        ylabel="Industry")

res.plot(use_index=True,
         kind='barh',
         stacked=True,
         ax=ax2,
         )

ax2.set(title='Company Size portion within industry',
        xlabel="Percentage",
        ylabel=" ")

# Legend of the right-hand panel is placed outside the axes.
ax2.legend(title="Company Size", bbox_to_anchor=(1.04, 1), loc="upper left")
ax2.set_yticks([])
plt.show()

In [5]:
# 2021, 2020, 2019 (#26 and #27) About Computer vision and NLP

In [143]:
# Q17 option counts among 2020 professionals.
test = get_professionals(data_2020, 'Q5')

q17_columns = [f'Q17_Part_{part}' for part in range(1, 12)]

# Any non-null cell is counted as a selected option (1), a null cell as 0.
algos = test[q17_columns].notna().astype(int)

# One-row frame: index 'counts', one column per Q17 part.
algo_df = algos.sum(axis=0).rename_axis('Algo').reset_index(name='counts')
algo_df = algo_df.set_index('Algo').T
stuff_2020 = algo_df
algo_df.plot(kind='bar')

In [144]:
# Q17 option counts among 2021 professionals.
test = get_professionals(data_2021, 'Q5')

q17_columns = [f'Q17_Part_{part}' for part in range(1, 12)]

# Any non-null cell is counted as a selected option (1), a null cell as 0.
algos = test[q17_columns].notna().astype(int)

# One-row frame: index 'counts', one column per Q17 part.
algo_df = algos.sum(axis=0).rename_axis('Algo').reset_index(name='counts')
algo_df = algo_df.set_index('Algo').T
stuff_2021 = algo_df
algo_df.plot(kind='bar')

In [145]:
# Q24 option counts among 2019 professionals, relabelled with the Q17 column
# names used by the 2020 and 2021 cells so the three years can be stacked.
old_colnames = [f'Q24_Part_{part}' for part in range(1, 12)]
new_colnames = [f'Q17_Part_{part}' for part in range(1, 12)]
col_rename_dict = dict(zip(old_colnames, new_colnames))

pd.set_option('display.max_columns', None)

# Restrict to professionals, as the 2020 and 2021 cells do; counting every
# respondent here would compare different populations across years.
test = get_professionals(data_2019, 'Q5')

# Any non-null cell is counted as a selected option (1), a null cell as 0.
algos = test[old_colnames].rename(columns=col_rename_dict).notna().astype(int)

# One-row frame: index 'counts', one column per (renamed) part.
algo_df = algos.sum(axis=0).rename_axis('Algo').reset_index(name='counts')
algo_df = algo_df.set_index('Algo').T
stuff_2019 = algo_df
algo_df.plot(kind='bar')

In [None]:
# Stack the three one-row count frames. DataFrame.append no longer exists in
# pandas 2.x, so pd.concat is used; `keys` labels each row with its survey
# year instead of three identical 'counts' labels.
pd.concat(
    [stuff_2021, stuff_2020, stuff_2019],
    keys=['2021', '2020', '2019'],
).droplevel(1).rename_axis('year')

In [6]:
# 2021: split respondents on Q5 == 'Student'.
is_student_2021 = data_2021['Q5'] == 'Student'
student_2021 = data_2021.loc[is_student_2021]
workforce_2021 = data_2021.loc[~is_student_2021]

# Non-students who gave an industry (Q20).
professional_2021 = workforce_2021[workforce_2021['Q20'].notna()]
industry_2021 = professional_2021['Q20'].unique()
df_2021 = professional_2021[['Q5', 'Q20']]

# {industry: {Q5 answer: respondent count}}
d_2021 = {
    industry: dict(df_2021.loc[df_2021['Q20'] == industry, 'Q5'].value_counts())
    for industry in industry_2021
}

def sorted_simple_dict(d):
    """Return a new dict holding the items of ``d`` in ascending key order."""
    return dict(sorted(d.items()))


def sorted_once_nested_dict(d):
    """Return a new dict-of-dicts with outer and inner keys both sorted."""
    ordered = {}
    for outer_key, inner in sorted(d.items()):
        ordered[outer_key] = sorted_simple_dict(inner)
    return ordered

# 2021 table: one row per industry, one column per Q5 answer; combinations
# nobody reported are filled with 0.
d_2021 = sorted_once_nested_dict(d_2021)
df_industry_2021 = pd.DataFrame.from_dict(d_2021, orient='index').fillna(0)

# 2018: same construction, with Q7 as the industry column and Q6 as the
# role column.
is_student_2018 = data_2018['Q7'] == 'I am a student'
student_2018 = data_2018.loc[is_student_2018]
workforce_2018 = data_2018.loc[~is_student_2018]

professional_2018 = workforce_2018[workforce_2018['Q7'].notna()]
industry_2018 = professional_2018['Q7'].unique()
df_2018 = professional_2018[['Q6', 'Q7']]

# {industry: {Q6 answer: respondent count}}
d_2018 = {
    industry: dict(df_2018.loc[df_2018['Q7'] == industry, 'Q6'].value_counts())
    for industry in industry_2018
}
d_2018 = sorted_once_nested_dict(d_2018)
df_industry_2018 = pd.DataFrame.from_dict(d_2018, orient='index').fillna(0)

# Side-by-side heatmaps: 2018 on the left, 2021 on the right.
fig, (ax_2018, ax_2021) = plt.subplots(nrows=1, ncols=2, figsize=(22, 10))
fig.subplots_adjust(wspace=0.4)

ax_2018.set_title('2018 heatmap')
sns.heatmap(df_industry_2018, annot=True, annot_kws={"size": 8}, linewidth=0.5,
            fmt='g', square=True, cmap='Reds_r', ax=ax_2018)

ax_2021.set_title('2021 heatmap')
sns.heatmap(df_industry_2021, annot=True, annot_kws={"size": 8}, linewidth=0.5,
            fmt='g', square=True, cmap='Blues_r', ax=ax_2021)

plt.show()

Heatmap of occupations across different industries. Data Scientist and Software Engineer counts are notably lower than in 2018.

In [None]:
import squarify

# Respondent count per Q5 answer among 2021 non-students.
role_counts = workforce_2021["Q5"].value_counts()
df = role_counts.rename_axis('Current role').reset_index(name='Counts')

# Treemap: one rectangle per role, area proportional to its count.
treemap_fig, treemap_ax = plt.subplots(figsize=(12, 8))
squarify.plot(sizes=df['Counts'], label=df['Current role'], alpha=.8, ax=treemap_ax)
treemap_ax.axis('off')

plt.show()


In [None]:
# get professionals only. It will drop student and not employed rows
# print(get_professionals(data_2021, 'Q5')['Q5'].unique())
# print(get_professionals(data_2020, 'Q5')['Q5'].unique())
# print(get_professionals(data_2019, 'Q5')['Q5'].unique())
# print(get_professionals(data_2018, 'Q6')['Q6'].unique())
# print(get_professionals(data_2017, 'CurrentJobTitleSelect')['CurrentJobTitleSelect'].unique())

def get_professionals(data, column):
    """Filter ``data`` down to rows with an employed, non-student answer.

    Same behaviour as the ``get_professionals`` defined in an earlier cell:
    a row is removed when its ``column`` value is missing or is one of
    'Student', 'Currently not employed' or 'Not employed'.

    Parameters
    ----------
    data : pd.DataFrame
        Survey responses.
    column : str
        Name of the column that is checked.

    Returns
    -------
    pd.DataFrame
        The remaining rows, with their original index and order.
    """
    excluded_answers = ['Student', 'Currently not employed', 'Not employed']
    answers = data[column]
    keep = answers.notna() & ~answers.isin(excluded_answers)
    return data.loc[keep]

# 2017 professionals: count and share per job title, shares rounded to 2 dp.
test = get_professionals(data_2017, 'CurrentJobTitleSelect')
job_titles_2017 = test["CurrentJobTitleSelect"]
c = job_titles_2017.value_counts()
d = job_titles_2017.value_counts(normalize=True)
df = pd.concat({'counts': c, '%': d}, axis=1)
df['%'].round(2)



In [None]:
# 2021 professionals: count and share per Q5 answer, shares rounded to 2 dp.
test = get_professionals(data_2021, 'Q5')
job_titles_2021 = test["Q5"]
c = job_titles_2021.value_counts()
d = job_titles_2021.value_counts(normalize=True)
df = pd.concat({'counts': c, '%': d}, axis=1)
df['%'].round(2)



In [None]:
# 2018 professionals: count and share per Q6 answer, shares rounded to 2 dp.
test = get_professionals(data_2018, 'Q6')
job_titles_2018 = test["Q6"]
c = job_titles_2018.value_counts()
d = job_titles_2018.value_counts(normalize=True)
df = pd.concat({'counts': c, '%': d}, axis=1)
df['%'].round(2)

In [None]:
# Print the distinct job-title answers of professionals, one array per
# survey, from 2021 back to 2017. Each survey stores them in its own column.
surveys_and_title_columns = [
    (data_2021, 'Q5'),
    (data_2020, 'Q5'),
    (data_2019, 'Q5'),
    (data_2018, 'Q6'),
    (data_2017, 'CurrentJobTitleSelect'),
]
for survey, title_column in surveys_and_title_columns:
    print(get_professionals(survey, title_column)[title_column].unique())

In [None]:
# Maps a raw job-title answer to the harmonised title used for comparison.
# Entries are grouped by target title; every target also maps to itself, so
# applying the mapping more than once leaves the result unchanged.
job_title = {
    # Other
    'Other': 'Other',
    'Engineer': 'Other',
    # Product/Project Manager
    'Product Manager': 'Product/Project Manager',
    'Program/Project Manager': 'Product/Project Manager',
    'Principal Investigator': 'Product/Project Manager',
    'Chief Officer': 'Product/Project Manager',
    'Manager': 'Product/Project Manager',
    'Product/Project Manager': 'Product/Project Manager',
    # Software Engineer
    'Software Developer/Software Engineer': 'Software Engineer',
    'Programmer': 'Software Engineer',
    'Software Engineer': 'Software Engineer',
    # Research Scientist
    'Operations Research Practitioner': 'Research Scientist',
    'Computer Scientist': 'Research Scientist',
    'Scientist/Researcher': 'Research Scientist',
    'Researcher': 'Research Scientist',
    'Predictive Modeler': 'Research Scientist',
    'Research Assistant': 'Research Scientist',
    'Research Scientist': 'Research Scientist',
    # Data Scientist
    'Data Scientist': 'Data Scientist',
    # Business Analyst
    'Business Analyst': 'Business Analyst',
    'Marketing Analyst': 'Business Analyst',
    # DBA/Database Engineer
    'DBA/Database Engineer': 'DBA/Database Engineer',
    # Data Analyst
    'Data Analyst': 'Data Analyst',
    'Data Journalist': 'Data Analyst',
    # Machine Learning Engineer
    'Machine Learning Engineer': 'Machine Learning Engineer',
    # Statistician
    'Statistician': 'Statistician',
    # Data Engineer
    'Data Miner': 'Data Engineer',
    'Data Engineer': 'Data Engineer',
    # Developer Relations/Advocacy
    'Consultant': 'Developer Relations/Advocacy',
    'Developer Advocate': 'Developer Relations/Advocacy',
    'Salesperson': 'Developer Relations/Advocacy',
    'Developer Relations/Advocacy': 'Developer Relations/Advocacy',
}

# Look up the harmonised title for each distinct 2018 (Q6) answer; answers
# missing from job_title come back as None. To check another survey, swap in
# its frame and title column (2017: 'CurrentJobTitleSelect', 2019/2021: 'Q5').
k = get_professionals(data_2018, 'Q6')['Q6'].unique()
lstval = [job_title.get(raw_title) for raw_title in k]
lstval
 

 

In [None]:
# Harmonise the 2018 job titles (Q6) using the job_title mapping.
jesus = data_2018.copy()
# get_professionals returns a .loc selection of its input; take an explicit
# copy before writing so the assignment cannot hit a view of `jesus` or
# raise SettingWithCopyWarning.
magic = get_professionals(jesus, 'Q6').copy()
# Answers that are not keys of job_title are left unchanged.
magic['Q6'] = magic['Q6'].replace(job_title)

In [None]:
# Respondent count per Q6 value in `magic` (2018 titles after the job_title replacement).
magic['Q6'].value_counts()

In [None]:
# 2021 answers to Q11 together with every Q12 column (parts 1-5 and OTHER).
q12_columns = [f'Q12_Part_{part}' for part in range(1, 6)] + ['Q12_OTHER']
Q11_Q12 = data_2021[['Q11'] + q12_columns]

In [None]:
# Frequency of each distinct Q11 answer in the 2021 responses.
Q11_Q12['Q11'].value_counts()

In [None]:
# Print every entry of questions_2021 (row 0 of the 2021 CSV), one per line.
for question_text in questions_2021:
    print(question_text)