# Kaggle survey

Info about the competition: https://www.kaggle.com/competitions/kaggle-survey-2022/overview


In [1]:
# Install ipyvizzu and ipyvizzu-story if you haven't already
!pip install -U ipyvizzu
#!pip install -U ipyvizzu-story



In [2]:
import pandas as pd
import numpy as np

from ipyvizzu import Chart, Data, Config, Style
from ipyvizzustory import Story, Slide, Step

#Create data object, read csv to data frame and add data frame to data object.
data = Data()

#Only use columns you'll need for your story - should be updated to match the story content
col_list = ['Q2','Q3','Q5','Q23','Q25','Q29','Q44_1']

df = pd.read_csv("../data/Kaggle/kaggle_survey_2022_responses.csv", usecols=col_list)

df.rename(columns={"Q2": "Age", "Q3": "Gender", "Q5": "Student_employee", "Q23": "Title", "Q25": "Company_size", "Q29": "Salary"}, inplace = True)
#Add count column
df['Count'] = 1
df['Company_size'] = df['Company_size'].str.replace('employees','')
df.head()

Unnamed: 0,Age,Gender,Student_employee,Title,Company_size,Salary,Q44_1,Count
0,What is your age (# years)?,What is your gender? - Selected Choice,"Are you currently a student? (high school, uni...",Select the title most similar to your current ...,What is the size of the company where you are ...,What is your current yearly compensation (appr...,Who/what are your favorite media sources that ...,1
1,30-34,Man,No,,,,,1
2,30-34,Man,No,,,,,1
3,18-21,Man,Yes,,,,Twitter (data science influencers),1
4,55-59,Man,No,Data Scientist,0-49,"25,000-29,999",Twitter (data science influencers),1


In [3]:
#Remove row with question texts
df = df.drop(0)

In [4]:
#Add new column categorizing respondents with ages above and below 30
df.insert(1,'age_30',np.where((df.Age == '18-21') | (df.Age == '22-24') | (df.Age == '25-29'), "below 30", "above 30"))
df['age_30'] = df['age_30'].astype(str)

In [5]:
df.Age.unique()

array(['30-34', '18-21', '55-59', '45-49', '70+', '22-24', '35-39',
       '40-44', '50-54', '25-29', '60-69'], dtype=object)

In [6]:
#sort the dataframe by company size
agesorter = ['18-21', '22-24', '25-29', '30-34', '35-39', '40-44', '45-49','50-54','55-59','60-69','70+']

# Create the dictionary that defines the order for sorting
sorterIndex4 = dict(zip(agesorter, range(len(agesorter))))

# Generate a rank column that will be used to sort
# the dataframe numerically
df['Age_Rank'] = df['Age'].map(sorterIndex4)

# the actual sorting is done together with salary, title below

In [7]:
#Add percentage value for gender categories for each age group
df['Gender_added'] = df['Gender'].notna()
df['Gender_added'] = df['Gender_added'].astype(str)
df['Gender_Pct[%]'] = 100 / df.groupby(['Age','Gender_added'])['Count'].transform('sum') 

In [8]:
#Replace values in Student_employee column for easier understanding when values shown on the legend
df.Student_employee.replace({'Yes': 'Student', 'No': 'Employee'}, inplace = True)
df.head()

Unnamed: 0,Age,age_30,Gender,Student_employee,Title,Company_size,Salary,Q44_1,Count,Age_Rank,Gender_added,Gender_Pct[%]
1,30-34,above 30,Man,Employee,,,,,1,3,True,0.033647
2,30-34,above 30,Man,Employee,,,,,1,3,True,0.033647
3,18-21,below 30,Man,Student,,,,Twitter (data science influencers),1,0,True,0.021935
4,55-59,above 30,Man,Employee,Data Scientist,0-49,"25,000-29,999",Twitter (data science influencers),1,8,True,0.163666
5,45-49,above 30,Man,Student,,,,,1,6,True,0.079808


In [9]:
#sort the dataframe by title
titlesorter = ['Data Scientist', 'Data Analyst', 'Software Engineer', 'Teacher / professor', 'Manager', 'Other', 'Research Scientist', 'ML / MLops Engineer', 'Engineer (non-software)', 'Data Engineer', 'Statistician', 'Data Architect', 'Data Administrator', 'Developer Advocate', 'Currently not employed',]

#replace long titles with shorter versions
df = df.replace("Data Analyst (Business, Marketing, Financial, Quantitative, etc)", "Data Analyst")
df = df.replace("Manager (Program, Project, Operations, Executive-level, etc)", "Manager")
df = df.replace("Machine Learning/ MLops Engineer", "ML / MLops Engineer")
df = df.replace("Currently not employed", "Unemployed")
df = df.replace("Engineer (non-software)", "Other Engineer")

# Create the dictionary that defines the order for sorting
sorterIndex = dict(zip(titlesorter, range(len(titlesorter))))

# Generate a rank column that will be used to sort
# the dataframe numerically
df['Title_Rank'] = df['Title'].map(sorterIndex)
# the actual sorting is done together with salary, title below

In [10]:
#Add percentage share value between age groups for each title
df['Employee_w_title'] = df['Title'].notna()
df['Employee_w_title'] = df['Employee_w_title'].astype(str)
df.Employee_w_title.replace({'True': 'Added title', 'False': 'No info'}, inplace = True)
df['Title_Pct[%]'] = 100 / df.groupby(['Title','Employee_w_title'])['Count'].transform('sum')


In [11]:
#sort the dataframe by company size
sizesorter = ['0-49', '50-249', '250-999', '1000-9,999', '10,000 or more', 'NaN']

# Create the dictionary that defines the order for sorting
sorterIndex2 = dict(zip(sizesorter, range(len(sizesorter))))

# Generate a rank column that will be used to sort
# the dataframe numerically
df['Size_Rank'] = df['Company_size'].map(sorterIndex2)

# the actual sorting is done together with salary, title below

In [12]:
#Add column to seperate those that did not add company size
df['Employee_w_size'] = df['Company_size'].notna()
df['Employee_w_size'] = df['Employee_w_size'].astype(str)
df.Employee_w_size.replace({'True': 'Added company size', 'False': 'No info'}, inplace = True)
df['Size_Pct[%]'] = 100 / df.groupby(['Age','Employee_w_size'])['Count'].transform('sum')

In [13]:
#Add new column to have fewer salary categories
salary_map = {'$0-999': '$0-1k',
                '1,000-1,999': '$1-10k',
                '2,000-2,999': '$1-10k',
                '3,000-3,999': '$1-10k',
                '4,000-4,999': '$1-10k',
                '5,000-7,499': '$1-10k',
                '7,500-9,999': '$1-10k',
                '10,000-14,999': '$10-20k',
                '15,000-19,999': '$10-20k',
                '20,000-24,999': '$20-50k',
                '25,000-29,999': '$20-50k',
                '30,000-39,999': '$20-50k',
                '40,000-49,999': '$20-50k',
                '50,000-59,999': '$50-100k',
                '60,000-69,999': '$50-100k',
                '70,000-79,999': '$50-100k',
                '80,000-89,999': '$50-100k',
                '90,000-99,999': '$50-100k',
                '100,000-124,999': '$100-200k',
                '125,000-149,999': '$100-200k',
                '150,000-199,999': '$100-200k',
                '200,000-249,999': '$200-500k',
                '250,000-299,999': '$200-500k',
                '300,000-499,999': '$200-500k',
                '$500,000-999,999': '$500k-1M',
                '>$1,000,000': '$1M+'}

df['salary_cat'] = df['Salary'].map(salary_map)

In [14]:
#prepare to sort the dataframe by salary list
salarysorter = ['$0-1k', '$1-10k', '$10-20k', '$20-50k', '$50-100k', '$100-200k', '$200-500k', '$500k-1M', '$1M+', 'NaN']

# Create the dictionary that defines the order for sorting
sorterIndex3 = dict(zip(salarysorter, range(len(salarysorter))))

# Generate a rank column that will be used to sort
# the dataframe numerically
df['Salary_Rank'] = df['salary_cat'].map(sorterIndex3)

#sorting based on all of the rankings
df.sort_values(['Title_Rank','Age_Rank','Salary_Rank','Size_Rank'], inplace = True)

In [15]:
#Add percentage value for salary categories for each age group
df['Employee_w_salary'] = df['salary_cat'].notna()
df['Employee_w_salary'] = df['Employee_w_salary'].astype(str)
df.Employee_w_salary.replace({'True': 'Added salary', 'False': 'No info'}, inplace = True)
df['Salary_Pct[%]'] = 100 / df.groupby(['Age','Employee_w_salary'])['Count'].transform('sum')

display(df)

Unnamed: 0,Age,age_30,Gender,Student_employee,Title,Company_size,Salary,Q44_1,Count,Age_Rank,...,Title_Rank,Employee_w_title,Title_Pct[%],Size_Rank,Employee_w_size,Size_Pct[%],salary_cat,Salary_Rank,Employee_w_salary,Salary_Pct[%]
1655,18-21,below 30,Man,Employee,Data Scientist,50-249,$0-999,,1,0,...,0.0,Added title,0.051840,,Added company size,1.052632,$0-1k,0.0,Added salary,1.204819
1890,18-21,below 30,Man,Employee,Data Scientist,0-49,$0-999,Twitter (data science influencers),1,0,...,0.0,Added title,0.051840,,Added company size,1.052632,$0-1k,0.0,Added salary,1.204819
5358,18-21,below 30,Man,Employee,Data Scientist,0-49,$0-999,Twitter (data science influencers),1,0,...,0.0,Added title,0.051840,,Added company size,1.052632,$0-1k,0.0,Added salary,1.204819
9880,18-21,below 30,Man,Employee,Data Scientist,0-49,$0-999,,1,0,...,0.0,Added title,0.051840,,Added company size,1.052632,$0-1k,0.0,Added salary,1.204819
11415,18-21,below 30,Man,Employee,Data Scientist,50-249,$0-999,,1,0,...,0.0,Added title,0.051840,,Added company size,1.052632,$0-1k,0.0,Added salary,1.204819
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21304,70+,above 30,Woman,Student,,,,Twitter (data science influencers),1,10,...,,No info,,,No info,2.040816,,,No info,1.587302
21993,70+,above 30,Man,Employee,Unemployed,,,,1,10,...,,Added title,0.069832,,No info,2.040816,,,No info,1.587302
22178,70+,above 30,Man,Employee,Unemployed,,,Twitter (data science influencers),1,10,...,,Added title,0.069832,,No info,2.040816,,,No info,1.587302
23753,70+,above 30,Man,Student,,,,,1,10,...,,No info,,,No info,2.040816,,,No info,1.587302


In [16]:
style = Style({
    'fontSize':'120%',
    'logo':{'width':'5em'},
        'legend':{'width': '14em'},
        'title' : {'fontSize' : '2em'},
        'plot' : {
            'marker' :{ 'label' :{ 'maxFractionDigits' : 1},'maxLightness' : 0,},
            'xAxis' :{ 'title' :{ 'color' : '#00000000'}},
            'yAxis' :{ 'title' :{ 'color' : '#00000000'}}
        }
})

In [17]:
data.add_data_frame(df)
story = Story(data=data, style=style)
story.set_size("100%", "400px")
story.set_feature("tooltip", True)

slide1 = Slide(
#Show number of respondents
    Step(
        Config({
            "x": "Count",
            "label":"Count",
         #   "reverse": True,
            "title": "Almost 24 thousand people filled the survey",

        })
    )
)
story.add_slide(slide1)

slide2 = Slide()
#Show number of respondents below and above 30
slide2.add_step(
    Step(
        Config({
            "x": ["Age","Count"],
            "label":"Age"
        })
))

slide2.add_step(
    Step(
        Config({
            "x": "Count",
            "y": "Age",
            "label":"Count",
            "title": "55% of respondents are younger than 30"
        })
    )
)
story.add_slide(slide2)

#Add gender
slide3 = Slide()

slide3.add_step(
    Step(
        Config({
            "x": ["Count","Gender"],
            "y": "Age",
        })
    )
)

slide3.add_step(
    Step(
        Config({
            "color":"Gender",
        }),
        Style({'plot' : {'marker' : { 'colorPalette' : '#03AE71FF #F4941BFF #F4C204FF #D49664FF #F25456FF'}}})
    )
)
    

#Show ratio of genders
slide3.add_step(
    Step(
        Config({
            'align':'stretch',
            'label':'Gender_Pct[%]',
            "title": "There are a bit more women in the younger generations"
        })
    )
)
story.add_slide(slide3)

In [18]:
#Add event to avoid small values written on the label scale as they overlap
label_handler_method = """let parts = event.data.text.split(' ');
if (parts[1] == '%') {
	if (parts[0] < 2) event.preventDefault();
} else if (parts[1] == undefined) {
	if (parts[0] < 200) event.preventDefault();
}"""
story.add_event("plot-marker-label-draw", label_handler_method)

In [19]:
#Remove gender
slide4 = Slide()

slide4.add_step(
    Step(
        Config({
            "align": "none",
            "label": "Count",
            "title": "Let's see what they do for a living"
        })
    )
)

slide4.add_step(
    Step(
        Config({
            "x":"Count",
            "color":None,
            "legend":"lightness"
        }),
        Style({'plot' : {'marker' : { 'colorPalette' : None}}})
    )
)

#Add student/employee status

slide4.add_step(
    Step(
        Config({
            "x": ["Count","Student_employee"],
            "y": "Age",
            "color":"Student_employee",
            "title": "70% below 30 vs. 25% above 30 are students",
            "legend":"color"
        }),
        Style({'legend':{'width': '12.5em'}, "plot": {"marker": {"colorPalette": "#00819CFF #C08030FF" }}})
    )
)

story.add_slide(slide4)

#Zoom to those who are not students anymore
slide5 = Slide()
slide5.add_step(
    Step(
        Config({
            "title": "Let's focus on the employees"
        })
    )
)
slide5.add_step(
    Step(
        Data.filter("record.Student_employee == 'Employee'"),
    )
)

slide5.add_step(
    Step(
        Config({
            "color": "Age",
            "legend":"color",
            "title": "All employees"
        }),
        Style({"plot": {
            "marker": {"colorPalette": "#26B6C3FF #453A90FF" },
            'yAxis' : { 'label' :{ 'color' : '#00000000'}}
        }})
))
            

story.add_slide(slide5)

In [20]:
slide6 = Slide()

slide6.add_step(
    Step(
        Config({
            "title": "Let's see the size of the companies they work for"
        })
    )
)

slide6.add_step(
    Step(
        Config({
            "x": ["Count","Employee_w_size"],
            "label":"Employee_w_size",
            "color":["Age","Employee_w_size"]
        }),
        Style({"plot": {"marker": {"colorPalette": "#26B6C3FF #DCD9EAFF #453A90FF #C8C4DFFF"}}})
    )
)

slide6.add_step(
    Step(
        Data.filter("record.Student_employee === 'Employee' && record.Employee_w_size === 'Added company size'" ),
    )
)

slide6.add_step(
    Step(
        Config({
            "x": ["Count","Company_size"],
            "label":"Size_Pct[%]",
        })
    )
)

slide6.add_step(
    Step(
        Config({
            "geometry":"circle",
            "x": {"set":"Company_size","range": {"max": "6"}},
            "label":"Size_Pct[%]",
            "size":"Size_Pct[%]",
            "title": "Most youngsters work at very big or very small companies"
        }),Style({ 'paddingBottom' : '2em',
        'plot' : {'xAxis' :{ 'label' :{ 'angle' : '-0.785','paddingTop' : '0'}}, 
                 'marker' : { 'label' :{ 'position' : 'center'},'circleMaxRadius' : '0.14', 'circleMinRadius' : '0.07'}, 
                 }})
    )
) 

story.add_slide(slide6)

slide7 = Slide()
slide7.add_step(
    Step(
        Config(({
            "size":None,
            "x": {"set":["Count","Company_size"],"range": {"max": "auto"}},
            "title": "",
            "geometry":"rectangle"
        })
    )
))

slide7.add_step(
    Step(
        Config(({
            "x": ["Count","Employee_w_size"],
            "label":"Employee_w_size",
            "color":["Age","Employee_w_size"]
        })
    )
))

slide7.add_step(
    Step(
        Data.filter("record.Student_employee === 'Employee'" ),
    )
)

slide7.add_step(
    Step(
        Config({
            "x":"Count",
            "label":"Count",
            "color": "Age",
            "legend":"color",
            "title": "All employees"
        }),
        Style({"plot": {"marker": {"colorPalette": "#26B6C3FF #453A90FF" }}})
    )
)

story.add_slide(slide7)

In [21]:
slide8 = Slide()

slide8.add_step(
    Step(
        Config({
            "title": "Let's check out the salaries"
        })
    )
)


slide8.add_step(
    Step(
        Config({
            "x": ["Count","Employee_w_salary"],
            "label":"Employee_w_salary",
            "color":["Age","Employee_w_salary"]
        }),
        Style({"plot": {"marker": {"colorPalette": "#26B6C3FF #DCD9EAFF #453A90FF #C8C4DFFF"}}})
    )
)

slide8.add_step(
    Step(
        Data.filter("record.Student_employee === 'Employee' && record.Employee_w_salary === 'Added salary'" ),
    )
)

slide8.add_step(
    Step(
        Config({
            "x": ["Count","salary_cat"],
            "label":None,
        }),
    )
)

slide8.add_step(
    Step(
        Config({
            "y":["Salary_Pct[%]","Age"],
            "x":"salary_cat",
            "split": True,
            "title":"Over 50% below 30 earns less than $10k per year",
        })
    )
)
slide8.add_step(
    Step(
        Config({
            "label":"Salary_Pct[%]"
        })
    )
)

story.add_slide(slide8)

In [22]:
slide9 = Slide()

slide9.add_step(
    Step(
        Config({
            "x": ["Count","salary_cat"],
            "y": "Age",
            "split": False
        }),
))

slide9.add_step(
    Step(
        Config({
            "label": "Employee_w_salary",
            "x": ["Count","Employee_w_salary"]
        })
    )
)

slide9.add_step(
    Step(
        Data.filter("record.Student_employee === 'Employee'"),
    )
)

slide9.add_step(
    Step(
        Config({
            "x":"Count",
            "label":"Count",
            "color": "Age",
            "legend":"color",
            "title": "All employees"
        }),
        Style({"plot": {"marker": {"colorPalette": "#26B6C3FF #453A90FF" }}})
    )
)

story.add_slide(slide9)

slide10 = Slide()

slide10.add_step(
    Step(
        Config({
            "title": "Finally, let's see the titles!"
        })
    )
)


slide10.add_step(
    Step(
        Config({
            "x": ["Count","Employee_w_title"],
            "label":"Employee_w_title",
            "color":["Age","Employee_w_title"]
        }),
        Style({"plot": {"marker": {"colorPalette": "#26B6C3FF #DCD9EAFF #453A90FF #C8C4DFFF"}}})
    )
)

slide10.add_step(
    Step(
        Data.filter("record.Student_employee === 'Employee' && record.Employee_w_title === 'Added title'" ),
    )
)

slide10.add_step(
    Step(
        Config({
            "x": ["Count","Title"],
            "label":None,
        }),
    )
)

slide10.add_step(
    Step(
        Config({
            "x": "Title",
            "y": ["Age","Count"],
            "label":"Count",
            "title":"39% below 30 vs. 30% above 30 works as Data Scientist or Data Analyst"
        }),
        Style({'plot' : {'paddingBottom' : '10em'}})
    )
)

story.add_slide(slide10)

In [23]:
slide11 = Slide()
slide11.add_step(
    Step(
        Config({
            "y": ["Age","Title_Pct[%]"],
            "label":"Title_Pct[%]",
            "title":"Many ML/MLOps engineers and very few Data Architects are below 30"
        }),
        Style({'plot' : {'marker' :{ 'label' :{ 'maxFractionDigits' : 0, 'fontSize': '90%'}}}}
        )
    )
)
story.add_slide(slide11)

# Play the created story - should be at the end of the cell
story.play()

In [24]:
# If you want to save the story as an interactive HTML
# (containing only the output of the previous cell),
# use the following command:
#story.export_to_html(filename="../html/kaggle.html")

### Learn more about [ipyvizzu-story](https://github.com/vizzuhq/ipyvizzu-story/)