<a href="https://colab.research.google.com/github/yslvm/cs65/blob/main/4_Plotting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

import io
import urllib.request as req
import zipfile

In [None]:
def tweak_kag(df):
    """Reduce the raw survey answers to eight cleaned analysis columns.

    Rows whose ``Q9`` answer is missing or starts with ``'I do not'`` are
    dropped first; every derived column is computed from the remaining rows
    and keeps their original index.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw responses containing the string columns ``Q1``-``Q6``, ``Q8``
        and ``Q9``.

    Returns
    -------
    pandas.DataFrame
        Columns, in order:

        * ``Gender``     - Q1 with the two opt-out answers merged into
          ``'Another'``.
        * ``Age``        - first two characters of Q2 as an integer.
        * ``Country``    - Q3 limited to three countries, else ``'Another'``.
        * ``Edu``        - Q4 mapped to a number, 11 when unknown.
        * ``Studies``    - Q5 as ``'cs'``/``'eng'``/``'stat'``, else
          ``'another'``.
        * ``Occupation`` - Q6 limited to five job titles, else ``'Another'``.
        * ``Experience`` - lower bound of the Q8 range, -1 when missing.
        * ``Salary``     - lower bound of the Q9 range multiplied by 1000.
    """
    # `na=False` yields a genuine boolean mask even when Q9 has missing
    # values, so `~` below is a logical NOT and never an integer bit-flip
    # applied to an object-dtype Series.
    answered = df.Q9.notna()
    undisclosed = df.Q9.str.startswith('I do not', na=False)
    df = df[answered & ~undisclosed]

    def keep_or(series, allowed, other):
        """Keep values present in *allowed*; replace the rest with *other*."""
        return series.where(series.isin(allowed), other)

    q1 = (df.Q1
          .replace({'Prefer not to say': 'Another',
                    'Prefer to self-describe': 'Another'})
          .rename('Gender'))

    q2 = df.Q2.str.slice(0, 2).astype(int).rename('Age')

    q3 = keep_or(df.Q3,
                 ['United States of America', 'India', 'China'],
                 'Another').rename('Country')

    q4 = (df.Q4
          .replace({'Master’s degree': 18,
                    'Bachelor’s degree': 16,
                    'Doctoral degree': 20,
                    'Some college/university study without earning a bachelor’s degree': 13,
                    'Professional degree': 19,
                    'I prefer not to answer': None,
                    'No formal education past high school': 12})
          .fillna(11)
          .rename('Edu'))

    studies = df.Q5.replace({
        'Computer science (software engineering, etc.)': 'cs',
        'Engineering (non-computer focused)': 'eng',
        'Mathematics or statistics': 'stat'})
    q5 = keep_or(studies, ['cs', 'eng', 'stat'], 'another').rename('Studies')

    q6 = keep_or(df.Q6,
                 ['Student', 'Data Scientist', 'Software Engineer',
                  'Not employed', 'Data Engineer'],
                 'Another').rename('Occupation')

    # `regex=False`: '+' on its own is not a valid regular expression, and
    # the default of `regex` differs between pandas versions.
    q8 = (df.Q8
          .str.replace('+', '', regex=False)
          .str.split('-', expand=True)
          .iloc[:, 0]
          .fillna(-1)
          .astype(int)
          .rename('Experience'))

    # The "I do not wish to disclose ..." answers were already removed by the
    # row filter above, so no extra clean-up for them is needed here.
    q9 = (df.Q9
          .str.replace('+', '', regex=False)
          .str.replace(',', '', regex=False)
          # the open-ended top bracket ("500,000+") has no "low-high" form;
          # shorten it so it lines up with the thousands-based lower bounds
          .str.replace('500000', '500', regex=False)
          .str.split('-', expand=True)
          .iloc[:, 0]
          .astype(int)
          .mul(1000)
          .rename('Salary'))

    return pd.concat([q1, q2, q3, q4, q5, q6, q8, q9], axis=1)

# Survey data originally published at
# https://www.kaggle.com/kaggle/kaggle-survey-2018
url = 'https://github.com/mattharrison/datasets/raw/master/data/kaggle-survey-2018.zip'

# Pull the whole archive into memory and release the HTTP connection
# immediately instead of leaving the response object open.
with req.urlopen(url) as fin:
    zip_bytes = io.BytesIO(fin.read())

with zipfile.ZipFile(zip_bytes) as z:
    print(z.namelist())
    with z.open('multipleChoiceResponses.csv') as csv_file:
        # low_memory=False makes pandas infer each column's dtype from the
        # whole file at once, which avoids the mixed-type DtypeWarning.
        kag = pd.read_csv(csv_file, low_memory=False)
    # The first data row is split off as `kag_questions`; presumably it
    # holds the question text for each column. The answers follow it.
    kag_questions = kag.iloc[0]
    df = kag.iloc[1:]

df = tweak_kag(df)

['multipleChoiceResponses.csv', 'freeFormResponses.csv', 'SurveySchema.csv']


  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
# Box plot of Salary for each Occupation on an Axes created by seaborn
ax = sns.boxplot(data=df, x='Occupation', y='Salary')

In [None]:
# seaborn is built on Matplotlib, so the plot can be enlarged by creating
# a bigger figure ourselves and handing its Axes to seaborn via `ax=`
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 8))
sns.boxplot(data=df, x='Occupation', y='Salary', ax=ax)

In [None]:
# seaborn is built on Matplotlib, so the usual Axes methods still apply:
# rotate the x-axis tick labels by 45 degrees
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 8))
sns.boxplot(data=df, x='Occupation', y='Salary', ax=ax)
ax.tick_params(labelrotation=45, axis='x')

In [None]:
# Display the axes style parameters currently in effect
sns.axes_style(style=None)

In [None]:
# seaborn is built on Matplotlib: change the look of the plot by
# temporarily switching to the 'dark' axes style
with sns.axes_style(style='dark'):
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 8))
    sns.boxplot(data=df, x='Occupation', y='Salary', ax=ax)
    ax.tick_params(labelrotation=45, axis='x')

In [None]:
# seaborn is built on Matplotlib: override a single style parameter
# (the axes face color) on top of the 'dark' style
with sns.axes_style(style='dark', rc={'axes.facecolor': 'pink'}):
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 8))
    sns.boxplot(data=df, x='Occupation', y='Salary', ax=ax)
    ax.tick_params(labelrotation=45, axis='x')

In [None]:
# seaborn is built on Matplotlib: change the size of the plot elements
# with the 'poster' plotting context, combined with the custom style
with sns.plotting_context(context='poster'), \
        sns.axes_style(style='dark', rc={'axes.facecolor': 'pink'}):
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 8))
    sns.boxplot(data=df, x='Occupation', y='Salary', ax=ax)
    ax.tick_params(labelrotation=45, axis='x')

In [None]:
# seaborn is built on Matplotlib: draw the styled plot, then write the
# figure to an image file at 300 dpi
with sns.plotting_context(context='poster'), \
        sns.axes_style(style='dark', rc={'axes.facecolor': 'pink'}):
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 8))
    sns.boxplot(data=df, x='Occupation', y='Salary', ax=ax)
    ax.tick_params(labelrotation=45, axis='x')
fig.savefig('/tmp/box.png', dpi=300)

In [None]:
# Show the colors of the palette currently in use
sns.palplot(pal=sns.color_palette(palette=None))

In [None]:
# Palette for categorical data: seven colors from 'husl'
sns.palplot(pal=sns.color_palette(palette='husl', n_colors=7))

In [None]:
# Palette for diverging data (e.g. a heatmap of correlations):
# thirty colors from 'RdBu'
sns.palplot(pal=sns.color_palette(palette='RdBu', n_colors=30))

In [None]:
# Palette for sequential (ordinal) data: 'Blues'
sns.palplot(pal=sns.color_palette(palette='Blues'))

In [None]:
# ColorBrewer palette chooser; the data type may be
# 'sequential', 'diverging' or 'qualitative'
sns.choose_colorbrewer_palette(data_type='diverging')

In [None]:
# Draw the box plot with the 'RdGy' color palette, the 'poster' context
# and a white axes background
with sns.color_palette(palette='RdGy'), \
        sns.plotting_context(context='poster'), \
        sns.axes_style(style='dark', rc={'axes.facecolor': 'white'}):
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 8))
    sns.boxplot(data=df, x='Occupation', y='Salary', ax=ax)
    ax.tick_params(labelrotation=45, axis='x')

In [None]:
# A palette can also be given as an explicit list of hex color codes
bad = [
    '#c07fef',
    '#deadbe',
    '#fef70c',
    '#112233',
    '#332211',
]
with sns.color_palette(palette=bad), \
        sns.plotting_context(context='poster'), \
        sns.axes_style(style='dark', rc={'axes.facecolor': 'white'}):
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 8))
    sns.boxplot(data=df, x='Occupation', y='Salary', ax=ax)
    ax.tick_params(labelrotation=45, axis='x')