In [101]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
pd.set_option('display.max_columns', 100)


def _read_survey(csv_path, encoding='UTF-8'):
    """Read one survey CSV and split off its first row.

    Returns a ``(first_row, remaining_rows)`` pair: the first data row of the
    file (stored by the callers below as ``questions_YYYY``) and every row after it.
    NOTE(review): the split is applied to every year alike; confirm that each
    file (2017 in particular) really carries the question text in its first row.
    """
    raw = pd.read_csv(csv_path, low_memory = False, encoding=encoding)
    return raw.iloc[0, :].T, raw.iloc[1:, :]


questions_2021, data_2021 = _read_survey('../input/kaggle-survey-2021/kaggle_survey_2021_responses.csv')
questions_2020, data_2020 = _read_survey('../input/kaggle-survey-2020/kaggle_survey_2020_responses.csv')
questions_2019, data_2019 = _read_survey('../input/kaggle-survey-2019/multiple_choice_responses.csv')
questions_2018, data_2018 = _read_survey('../input/kaggle-survey-2018/multipleChoiceResponses.csv')
# The 2017 export is not UTF-8 encoded.
questions_2017, data_2017 = _read_survey('../input/kaggle-survey-2017/multipleChoiceResponses.csv', encoding='ISO-8859-1')

# 2021 Yes. 2020 Yes. 2019 Yes. About TPUs

<div style="font-family:Helvetica Neue; font-size:16px; line-height:1.7; color:black;">
    
<div class="alert alert-warning">
  <strong>Note: </strong> For all charts in this module, I only selected working Professionals.
</div>
</div>

<br>
<div style="font-family:Helvetica Neue; font-size:16px; line-height:1.7; color:black;">
Non-professionals were defined as respondents whose Job Title answer was one of the following:
<ul>
<li>Student</li>
<li>Currently not employed (or Not employed)</li>
<li>No answer to the question (NaN)</li>
</ul>
</div>

In [2]:
def get_professionals(data, column):
    """Keep only the rows of ``data`` that belong to working professionals.

    A row is dropped when ``data[column]`` is missing or is one of the
    non-professional answers: 'Student', 'Currently not employed' or
    'Not employed'.

    Parameters
    ----------
    data : pandas.DataFrame
        Survey responses.
    column : str
        Name of the job-title column to inspect.

    Returns
    -------
    pandas.DataFrame
        The remaining rows, with their original index labels.
    """
    non_professional_answers = ['Student', 'Currently not employed', 'Not employed']
    titles = data[column]
    is_professional = titles.notna() & ~titles.isin(non_professional_answers)
    return data.loc[is_professional]

<div style="font-family:Helvetica Neue; font-size:16px; line-height:1.7; color:black;">
    
**Exhibit 1: Data Science Professionals distribution by industry 2018 vs. 2021**
</div>

In [3]:
# Exhibit 1. Data Science Professionals distribution by industry 2018 vs. 2021

# 2021: share of respondents per industry (Q20), ignoring unanswered rows.
industry_2021 = data_2021[data_2021['Q20'].notna()]
c = industry_2021['Q20'].value_counts(normalize=True).rename_axis('industry').reset_index(name='counts')

# 2018: same computation on Q7, after dropping the "I am a student" answer.
industry_2018 = data_2018[data_2018['Q7'] != 'I am a student']
industry_2018 = industry_2018[industry_2018['Q7'].notna()]
d = industry_2018['Q7'].value_counts(normalize=True).rename_axis('industry').reset_index(name='counts')

# Inner merge: only industries whose label appears in both years are kept.
# `k` is read again by the heatmap cell to fix the industry order.
k = pd.merge(left = d, right = c, on = 'industry')
k = k.rename(columns = {'counts_x': '2018', 'counts_y': '2021'})
k = k.sort_values(by=['2021'], ascending=False)

# Change in share between the two surveys.
diff_industry = k.copy()
diff_industry['dff'] = k['2021'] - k['2018']

# Create both axes once, with the 3:1 width ratio. The previous version built
# two axes with plt.subplots and then stacked two more on top of them with
# plt.subplot(gs[i]), which leaves stray empty axes on recent matplotlib.
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize = (16,8),
                               gridspec_kw={'width_ratios': [3, 1]})

k.plot.barh(x = "industry", ax= ax1)
ax1.set(title = "Data Science professionals distribution by Industry. 2018 vs 2021",
      xlabel = "Percentage",
      ylabel = "Industry")
ax1.invert_yaxis()

# Green bars for industries that gained share, red for those that lost it.
diff_industry['dff'].plot(kind='barh', ax = ax2,
                    color=(diff_industry['dff'] > 0).map({True: 'g',
                                                    False: 'r'}))
ax2.set(title = "Change",
      xlabel = "Percentage",
      ylabel = "Industry")
ax2.set_yticks([])
# Flip the right panel as well so its rows line up with the left panel.
ax2.invert_yaxis()

<div style="font-family:Helvetica Neue; font-size:16px; line-height:1.7; color:black;">

<ul>
<li> Computer/Technology field is no longer the only game in town. It declined <strong>7.5%</strong></li>

<li> Academic/Education industry gained <strong>3%</strong></li>
<li> Manufacturing industry gained <strong>2%</strong></li>

<li> A sign that Data Science is <strong>spreading across all industries</strong>. </li>
    </ul>
</div>

In [4]:
# ----------------
# Job title filter
# ----------------

# Raw survey job titles grouped under the harmonised role they are reported as.
_titles_by_role = {
    'Other': ['Other', 'Engineer', 'Consultant'],
    'Product/Project Manager': ['Product Manager', 'Program/Project Manager',
                                'Principal Investigator', 'Chief Officer',
                                'Manager', 'Product/Project Manager'],
    'Software Engineer': ['Software Developer/Software Engineer', 'Programmer',
                          'Software Engineer'],
    'Research Scientist': ['Operations Research Practitioner', 'Computer Scientist',
                           'Scientist/Researcher', 'Researcher', 'Predictive Modeler',
                           'Research Assistant', 'Research Scientist'],
    'Data Scientist': ['Data Scientist'],
    'Business Analyst': ['Business Analyst', 'Marketing Analyst'],
    'DBA/Database Engineer': ['DBA/Database Engineer'],
    'Data Analyst': ['Data Analyst', 'Data Journalist'],
    'Machine Learning Engineer': ['Machine Learning Engineer'],
    'Statistician': ['Statistician'],
    'Data Engineer': ['Data Miner', 'Data Engineer'],
    'Developer Relations/Advocacy': ['Developer Advocate', 'Salesperson',
                                     'Developer Relations/Advocacy'],
}

# Lookup from raw job title to harmonised role, as consumed by Series.map.
job_title = {
    raw_title: role
    for role, raw_titles in _titles_by_role.items()
    for raw_title in raw_titles
}

<div style="font-family:Helvetica Neue; font-size:16px; line-height:1.7; color:black;">
    
**Exhibit 1.1: Data Science Professional roles in different industry 2018 vs. 2021**
</div>

In [5]:
# Cells further down assign into filtered slices of the survey frames; this
# silences the resulting SettingWithCopyWarning for the whole notebook.
pd.options.mode.chained_assignment = None


def sorted_simple_dict(d):
    """Return a new dict holding the items of ``d`` ordered by key."""
    return dict(sorted(d.items()))


def sorted_once_nested_dict(d):
    """Return a dict of dicts with both the outer and the inner keys sorted."""
    return {outer: sorted_simple_dict(inner) for outer, inner in sorted(d.items())}


def job_title_counts_by_industry(responses, title_col, industry_col, industry_order):
    """Build an industry x job-title table of respondent counts.

    Parameters
    ----------
    responses : pandas.DataFrame
        Survey rows to count.
    title_col, industry_col : str
        Columns holding the raw job title and the industry.
    industry_order : list of str
        Industries to report, in the row order wanted for the result.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``industry_order`` (all zeros when the industry
        has no respondents) and one column per harmonised job title, with the
        columns ordered by descending count in the first row.

    Raw titles are harmonised through ``job_title``; titles missing from that
    mapping become NaN and are therefore not counted.
    """
    answered = responses.loc[responses[industry_col].notna(), [title_col, industry_col]].copy()
    answered[title_col] = answered[title_col].map(job_title)

    counts = sorted_once_nested_dict({
        industry: dict(answered.loc[answered[industry_col] == industry, title_col].value_counts())
        for industry in answered[industry_col].unique()
    })

    # .get(..., {}) tolerates industries that vanished after filtering, and the
    # reindex pins the row order so the 2018 and 2021 heatmaps line up row by row.
    ordered = {industry: counts.get(industry, {}) for industry in industry_order}
    table = pd.DataFrame.from_dict(ordered, orient='index').reindex(industry_order).fillna(0)
    return table.sort_values(by=table.index[0], ascending=False, axis=1)


# Industry order shared by both heatmaps: the Exhibit 1 table, sorted by 2021 share.
h_lst = list(k['industry'])

# -----------------------------------------
# Heatmap of job title within industry 2021
# -----------------------------------------

workforce_2021 = get_professionals(data_2021, 'Q5')
df_industry_2021 = job_title_counts_by_industry(workforce_2021, 'Q5', 'Q20', h_lst)

# -----------------------------------------
# Heatmap of job title within industry 2018
# -----------------------------------------

workforce_2018 = data_2018.loc[data_2018['Q7'] != 'I am a student']
df_industry_2018 = job_title_counts_by_industry(workforce_2018, 'Q6', 'Q7', h_lst)

# ---------------
#  SUBPLOTS - 1x2
# ---------------

fig, (ax_2018, ax_2021) = plt.subplots(ncols=2, figsize=(22,10))
fig.subplots_adjust(wspace=0.4)

heatmap_style = dict(annot=True, annot_kws={"size": 8}, linewidths=0.5, fmt='g', square=True, cmap='BuGn')

sns.heatmap(df_industry_2018, ax=ax_2018, **heatmap_style)
ax_2018.set_title('2018 heatmap')

sns.heatmap(df_industry_2021, ax=ax_2021, **heatmap_style)
ax_2021.set_title('2021 heatmap')

plt.show()

<div style="font-family:Helvetica Neue; font-size:16px; line-height:1.7; color:black;">
<ul>
    <li>The Machine Learning Engineer role was added in 2021. The heatmap suggests that a large portion of data scientists and software engineers moved to machine learning engineer roles.</li>
    <li>According to <a href="https://www.snowflake.com/trending/machine-learning-engineer-vs-data-scientist">Snowflake</a>, A machine learning engineer will focus on writing code and deploying machine learning products.</li>
    
<li>Decline in Research Scientist roles in Computer/Tech and Academic fields.</li>
<li>A sign that the data science industry is <strong>shifting from research oriented to profit oriented business.</strong> </li>
</ul>
</div>



In [6]:
# Exhibit 1.1 Data Science distribution by company size 2021.

# Professionals who answered the company-size question (Q21).
test = get_professionals(data_2021, 'Q5')
test = test[test['Q21'].notna()]

columns_order = ['0-49 employees', '50-249 employees', '250-999 employees', '1000-9,999 employees','10,000 or more employees']

# Respondent counts with one row per industry (Q20) and one column per company size.
category_test = test.groupby(['Q20', 'Q21']).size()
new_df_2 = (
    category_test
    .unstack('Q21')
    .rename_axis(index='Industry', columns='Company Size')
    .reindex(columns = columns_order)
)

# Order the industries by total number of respondents, largest first.
# (The previous version ran the identical total/sort/drop sequence twice.)
industry_totals = new_df_2.sum(axis = 1)
new_df_2 = new_df_2.loc[industry_totals.sort_values(ascending = False).index]

# Same table expressed as each company size's share within its industry.
res = new_df_2.div(new_df_2.sum(axis=1), axis = 0)

# Create both axes once with the 3:1 width ratio; stacking plt.subplot(gs[i])
# on top of plt.subplots() leaves stray empty axes on recent matplotlib.
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize = (16,8),
                               gridspec_kw={'width_ratios': [3, 1]})

new_df_2.plot(kind='barh', stacked=True, ax = ax1)
ax1.set(title = "DS professional distibution by company size across different industry 2021",
      xlabel = "Counts",
      ylabel = "Industry")

res.plot(kind='barh', stacked=True, ax = ax2)
ax2.set(title='Company Size portion within industry',
      xlabel = "Percentage",
      ylabel = " ")
ax2.legend(title = "Company Size", bbox_to_anchor=(1.04,1), loc="upper left")
# The rows match the left panel, so the tick labels are not repeated.
ax2.set_yticks([])
plt.show()

Companies with 0-49 employees account for the largest share of respondents in every industry except Insurance/Risk.


In [7]:
# ---------------------------------------------------------------
# Computer vision (Q18) and NLP (Q19) usage by industry, 2021 data
# ---------------------------------------------------------------

test = get_professionals(data_2021, 'Q5')

# Answer columns that name an actual method. Q18_Part_6 and Q19_Part_5 are
# deliberately left out: the original analysis treats them as the "no" answer.
vision_method_cols = ['Q18_Part_1','Q18_Part_2','Q18_Part_3','Q18_Part_4','Q18_Part_5','Q18_OTHER']
nlp_method_cols = ['Q19_Part_1','Q19_Part_2','Q19_Part_3','Q19_Part_4','Q19_OTHER']


def method_usage_by_industry(responses, method_cols, industry_col='Q20'):
    """Count, per industry, who does and does not use any of the given methods.

    A respondent counts as YES when at least one of ``method_cols`` is
    non-null, otherwise as NO. Rows without an industry are ignored.

    Returns a DataFrame indexed by industry (most respondents first) with the
    columns 'NO' and 'YES'. 'NO' now holds total minus YES; previously the
    column labelled 'NO' actually carried the industry total.
    """
    uses_any_method = responses[method_cols].notna().any(axis=1)
    industries = responses[industry_col]
    totals = industries.value_counts()
    yes = industries[uses_any_method].value_counts().reindex(totals.index, fill_value=0)
    return pd.DataFrame({'NO': totals - yes, 'YES': yes})


def annotate_yes_no_share(ax):
    """Label each NO/YES bar pair on ``ax`` with its share of the industry.

    Expects ``ax`` to hold exactly the two bar containers drawn by
    ``DataFrame.plot(kind='barh')`` for the columns NO and YES, in that order.
    """
    no_bars, yes_bars = ax.containers[0], ax.containers[1]
    for no_bar, yes_bar in zip(no_bars, yes_bars):
        respondents = no_bar.get_width() + yes_bar.get_width()
        if not respondents:
            continue
        yes_pct = round(100 * yes_bar.get_width() / respondents, 1)
        no_pct = round(100 - yes_pct, 1)
        for bar, pct in ((no_bar, no_pct), (yes_bar, yes_pct)):
            x, y = bar.get_xy()
            ax.annotate(f'{pct}%', (x + bar.get_width(), y + bar.get_height()*1.02),
                        ha="left", va="center")


final_boss = method_usage_by_industry(test, vision_method_cols)
final_boss2 = method_usage_by_industry(test, nlp_method_cols)

# ---------------
#  SUBPLOTS - 1x2
# ---------------

fig, (ax1,ax2) = plt.subplots(ncols=2, figsize=(18,11))
# Wide gap so the long industry labels of the right panel stay readable.
fig.subplots_adjust(wspace=1)

final_boss.plot(kind='barh', ax = ax1)
ax1.set(title = "Computer Vision Yes / No", xlabel = "Respondents", ylabel = "Industry")
annotate_yes_no_share(ax1)

final_boss2.plot(kind='barh', ax = ax2)
ax2.set(title = "NLP Yes / No", xlabel = "Respondents", ylabel = "Industry")
annotate_yes_no_share(ax2)

plt.show()

In [8]:
# ---------------------
# ML Algos 2019 to 2021
# ---------------------

# Display names for the algorithm answers, in the column order of the survey question.
new_colnames = ['Regression', 'Decision Trees', 'Gradient Boosting', 'Bayesian', 'Evolutionary', 'DNN', 'CNN', 'GAN', 'RNN', 'Transformer', 'None', 'Other']

# 2020 and 2021 ask the question as Q17 (parts 1-11 plus Q17_OTHER);
# 2019 asks it as Q24 (parts 1-12).
q17_cols = [f'Q17_Part_{i}' for i in range(1, 12)] + ['Q17_OTHER']
q24_cols = [f'Q24_Part_{i}' for i in range(1, 13)]


def count_algorithm_usage(responses, answer_cols):
    """Count how many respondents ticked each algorithm answer.

    ``answer_cols`` must list the answer columns in the order of
    ``new_colnames``. A non-null cell counts as a tick. Returns a one-row
    DataFrame (row label 'counts') with one column per algorithm name.
    """
    ticks = responses[answer_cols].notna().sum()
    ticks.index = pd.Index(new_colnames, name='Algo')
    return ticks.to_frame(name='counts').T


# Every year is restricted to working professionals. The 2019 data was
# previously used unfiltered, unlike 2020 and 2021.
stuff_2021 = count_algorithm_usage(get_professionals(data_2021, 'Q5'), q17_cols)
stuff_2020 = count_algorithm_usage(get_professionals(data_2020, 'Q5'), q17_cols)
stuff_2019 = count_algorithm_usage(get_professionals(data_2019, 'Q5'), q24_cols)

# ---------------
# Merge the frame
# ---------------

# DataFrame.append no longer exists in pandas 2.x, so stack with pd.concat.
# Each column is then named after its year so the legend can tell them apart.
algo_set = pd.concat([stuff_2021, stuff_2020, stuff_2019]).T
algo_set.columns = ['2021', '2020', '2019']

# --------------
# Plot the chart
# --------------

fig, ax1 = plt.subplots(figsize = (16,8))
algo_set.plot(kind = 'bar', ax = ax1)
ax1.set(title = "ML Algo", xlabel = "Algorithm", ylabel = "Respondents")

Heatmap of occupations across different industries. The numbers of data scientists and software engineers are notably down compared to 2018.

In [9]:
# Professionals who answered the spending question (Q26), counted per
# (spend bracket, industry) pair.
answered_q26 = get_professionals(data_2021, 'Q5')
answered_q26 = answered_q26[answered_q26['Q26'].notna()]
test = answered_q26.groupby(['Q26', 'Q20']).size()

# Reshape to one row per industry and one column per spend bracket.
df = test.unstack('Q26')
df.index.names = ['Industry']
df.columns.names = ['Money Spent']

# Brackets in ascending order; '$0 ($USD)' is not listed, so the reindex drops it.
c = ['$1-$99','$100-$999', '$1000-$9,999','$10,000-$99,999', '$100,000 or more ($USD)']
df = df.reindex(c, axis = 1)

fig, ax1 = plt.subplots(figsize = (16,8))
df.plot(kind = 'barh', ax = ax1)
ax1.set(title = "Money Spent on ML or Cloud computing service by industry")

In [10]:
# Restrict the 2019 responses to working professionals with the same helper
# used for the other survey years (drops missing, 'Student',
# 'Currently not employed' and 'Not employed' job titles).
data_2019 = get_professionals(data_2019, 'Q5')

In [105]:
# Harmonise the spend-bracket labels across survey years (the top bracket is
# worded differently in 2019).
spend_bucket_labels = {
    '$0 ($USD)': '$0 ($USD)',
    '$1-$99': '$1-$99',
    '$100-$999': '$100-$999',
    '$1000-$9,999': '$1000-$9,999',
    '$10,000-$99,999': '$10,000-$99,999',
    '$100,000 or more ($USD)': '$100,000 or +',
    '> $100,000 ($USD)': '$100,000 or +',
}

# Brackets shown on the chart, in ascending order; '$0 ($USD)' is left out on purpose.
row_order = ['$1-$99', '$100-$999', '$1000-$9,999', '$10,000-$99,999', '$100,000 or +']


def spend_bucket_counts(responses, column):
    """Count respondents per harmonised spend bracket.

    ``column`` is the year-specific spending question. The result is a Series
    indexed by ``row_order``; brackets nobody chose are NaN.
    """
    buckets = responses[column].dropna().map(spend_bucket_labels)
    return buckets.value_counts().reindex(row_order)


# Professionals only, for every year. The previous version computed
# temp_2021 / temp_2020 but then charted the unfiltered data_2021 / data_2020.
temp_2021 = get_professionals(data_2021, 'Q5')
temp_2020 = get_professionals(data_2020, 'Q5')
data_2019 = get_professionals(data_2019, 'Q5')

# Columns are named by year directly. Renaming by question id afterwards
# silently fails on pandas >= 2.0, where value_counts() names its result 'count'.
df_final = pd.DataFrame({
    '2021': spend_bucket_counts(temp_2021, 'Q26'),
    '2020': spend_bucket_counts(temp_2020, 'Q25'),
    '2019': spend_bucket_counts(data_2019, 'Q11'),
})

fig, ax = plt.subplots(figsize = (16,8))
df_final.plot(kind='bar', ax = ax)
ax.set(title = "Money spent on ML or cloud computing services, 2019 to 2021",
       xlabel = "Money Spent",
       ylabel = "Respondents")

In [15]:
# Lift the column limit so wide DataFrames are displayed without truncation.
pd.set_option('display.max_columns', None)
