# Debugging and Testing Pandas

## Code to Transform Data

### How to do it...

In [2]:
import pandas as pd
import numpy as np
import zipfile
url = 'datasets/kaggle-survey-2018.zip'

In [3]:
with zipfile.ZipFile(url) as z:
    print(z.namelist())
    kag = pd.read_csv(z.open('multipleChoiceResponses.csv'))
    df = kag.iloc[1:]

['multipleChoiceResponses.csv', 'freeFormResponses.csv', 'SurveySchema.csv']


  kag = pd.read_csv(z.open('multipleChoiceResponses.csv'))


In [4]:
df.T

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,23850,23851,23852,23853,23854,23855,23856,23857,23858,23859
Time from Start to Finish (seconds),710,434,718,621,731,1142,959,1758,641,751,...,820,683,57,122,348,575,131,370,36,502
Q1,Female,Male,Female,Male,Male,Male,Male,Male,Male,Male,...,Female,Male,Female,Female,Male,Male,Female,Male,Male,Male
Q1_OTHER_TEXT,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
Q2,45-49,30-34,30-34,35-39,22-24,25-29,35-39,18-21,25-29,30-34,...,18-21,22-24,18-21,30-34,30-34,45-49,25-29,22-24,25-29,25-29
Q3,United States of America,Indonesia,United States of America,United States of America,India,Colombia,Chile,India,Turkey,Hungary,...,India,Turkey,Turkey,Turkey,Turkey,France,Turkey,Turkey,United Kingdom of Great Britain and Northern I...,Spain
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q50_Part_5,,,,Not enough incentives to share my work,Not enough incentives to share my work,,,Not enough incentives to share my work,,,...,,,,,,,,,,
Q50_Part_6,,,,,,I had never considered making my work easier f...,I had never considered making my work easier f...,,,,...,,,,,,,,,,
Q50_Part_7,,,,,,,,,,,...,,,,,,,,,,
Q50_Part_8,,,,,,,,,,,...,,,,,,,,,,


In [5]:
df.dtypes

Time from Start to Finish (seconds)    object
Q1                                     object
Q1_OTHER_TEXT                          object
Q2                                     object
Q3                                     object
                                        ...  
Q50_Part_5                             object
Q50_Part_6                             object
Q50_Part_7                             object
Q50_Part_8                             object
Q50_OTHER_TEXT                         object
Length: 395, dtype: object

In [6]:
df.Q1.value_counts(dropna=False)

Male                       19430
Female                      4010
Prefer not to say            340
Prefer to self-describe       79
Name: Q1, dtype: int64

In [7]:
def tweak_kag(df):
    na_mask = df.Q9.isna()
    hide_mask = df.Q9.str.startswith('I do not').fillna(False)
    df = df[~na_mask & ~hide_mask]


    q1 = (df.Q1
      .replace({'Prefer not to say': 'Another',
               'Prefer to self-describe': 'Another'})
      .rename('Gender')
    )
    q2 = df.Q2.str.slice(0,2).astype(int).rename('Age')
    def limit_countries(val):
        if val in  {'United States of America', 'India', 'China'}:
            return val
        return 'Another'

    q3 = df.Q3.apply(limit_countries).rename('Country')


    q4 = (df.Q4
     .replace({'Master’s degree': 18,
     'Bachelor’s degree': 16,
     'Doctoral degree': 20,
     'Some college/university study without earning a bachelor’s degree': 13,
     'Professional degree': 19,
     'I prefer not to answer': None,
     'No formal education past high school': 12})
     .fillna(11)
     .rename('Edu')
    )


    def only_cs_stat_val(val):
        return 'another' if val not in {'cs', 'eng', 'stat'} else val

    q5 = (df.Q5
            .replace({
                'Computer science (software engineering, etc.)': 'cs',
                'Engineering (non-computer focused)': 'eng',
                'Mathematics or statistics': 'stat'})
             .apply(only_cs_stat_val)
             .rename('Studies'))
    def limit_occupation(val):
        if val in {'Student', 'Data Scientist', 'Software Engineer', 'Not employed',
                  'Data Engineer'}:
            return val
        return 'Another'


    q6 = df.Q6.apply(limit_occupation).rename('Occupation')


    q8 = (df.Q8
      .str.replace('+', '')
      .str.split('-', expand=True)
      .iloc[:,0]
      .fillna(-1)
      .astype(int)
      .rename('Experience')
    )


    q9 = (df.Q9
     .str.replace('+','')
     .str.replace(',','')
     .str.replace('500000', '500')
     .str.replace('I do not wish to disclose my approximate yearly compensation','')
     .str.split('-', expand=True)
     .iloc[:,0]
     .astype(int)
     .mul(1000)
     .rename('Salary'))
    return pd.concat([q1, q2, q3, q4, q5, q6, q8, q9], axis=1)

In [8]:
tweak_kag(df)

  .str.replace('+', '')
  .str.replace('+','')


Unnamed: 0,Gender,Age,Country,Edu,Studies,Occupation,Experience,Salary
2,Male,30,Another,16.0,eng,Another,5,10000
3,Female,30,United States of America,18.0,cs,Data Scientist,0,0
5,Male,22,India,18.0,stat,Another,0,0
7,Male,35,Another,20.0,another,Another,10,10000
8,Male,18,India,18.0,another,Another,0,0
...,...,...,...,...,...,...,...,...
23844,Male,30,Another,18.0,cs,Software Engineer,10,90000
23845,Male,22,Another,18.0,stat,Student,0,0
23854,Male,30,Another,20.0,cs,Another,5,10000
23855,Male,45,Another,20.0,cs,Another,5,250000


In [9]:
tweak_kag(df).dtypes

  .str.replace('+', '')
  .str.replace('+','')


Gender         object
Age             int32
Country        object
Edu           float64
Studies        object
Occupation     object
Experience      int32
Salary          int32
dtype: object

### How it works...

In [10]:
kag = tweak_kag(df)
(kag
    .groupby('Country')
    .apply(lambda g: g.Salary.corr(g.Experience))
)

  .str.replace('+', '')
  .str.replace('+','')


Country
Another                     0.289827
China                       0.252974
India                       0.167335
United States of America    0.354125
dtype: float64

## Apply Performance

### How to do it...

In [11]:
def limit_countries(val):
     if val in  {'United States of America', 'India', 'China'}:
         return val
     return 'Another'

In [12]:
%%timeit
q3 = df.Q3.apply(limit_countries).rename('Country')

4.3 ms ± 90.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [13]:
%%timeit
other_values = df.Q3.value_counts().iloc[3:].index
q3_2 = df.Q3.replace(other_values, 'Another')

43.6 ms ± 2.2 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [14]:
%%timeit
values = {'United States of America', 'India', 'China'}
q3_3 = df.Q3.where(df.Q3.isin(values), 'Another')

1.82 ms ± 74 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [15]:
%%timeit
values = {'United States of America', 'India', 'China'}
q3_4 = pd.Series(np.where(df.Q3.isin(values), df.Q3, 'Another'), 
     index=df.index)

1.58 ms ± 126 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [16]:
#q3_4.equals(q3_2)

In [17]:
#q3_4.equals(q3_3)

In [18]:
#q3.equals(q3_4)

### How it works...

### There's more...

In [19]:
def limit_countries(val):
     if val in  {'United States of America', 'India', 'China'}:
         return val
     return 'Another'

In [20]:
q3 = df.Q3.apply(limit_countries).rename('Country')

In [21]:
def debug(something):
    # what is something? A cell, series, dataframe?
    print(type(something), something)
    1/0

In [22]:
#q3.apply(debug)

In [23]:
the_item = None
def debug(something):
    global the_item
    the_item = something
    return something

In [24]:
_ = q3.apply(debug)

In [25]:
the_item

'Another'

## Improving Apply Performance with Dask, Pandarell, Swifter, and More

### How to do it...

In [26]:
from pandarallel import pandarallel
pandarallel.initialize()

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/


In [27]:
def limit_countries(val):
     if val in  {'United States of America', 'India', 'China'}:
         return val
     return 'Another'

In [28]:
%%timeit
res_p = df.Q3.parallel_apply(limit_countries).rename('Country')

2.71 s ± 161 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [29]:
import swifter

In [None]:
%%timeit
res_s = df.Q3.swifter.apply(limit_countries).rename('Country')

In [31]:
import dask

In [32]:
%%timeit
res_d = (dask.dataframe.from_pandas(
       df, npartitions=4)
   .map_partitions(lambda df: df.Q3.apply(limit_countries))
   .rename('Countries')
)

750 ms ± 24.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [33]:
np_fn = np.vectorize(limit_countries)

In [34]:
%%timeit
res_v = df.Q3.apply(np_fn).rename('Country')

359 ms ± 16.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [35]:
from numba import jit

In [36]:
@jit
def limit_countries2(val):
     if val in  ['United States of America', 'India', 'China']:
         return val
     return 'Another'

In [37]:
%%timeit
res_n = df.Q3.apply(limit_countries2).rename('Country')

80.7 ms ± 4.48 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### How it works...

## Inspecting Code 

### How to do it...

In [38]:
import zipfile
import pandas as pd
url = 'datasets/kaggle-survey-2018.zip'

In [39]:
with zipfile.ZipFile(url) as z:
    kag = pd.read_csv(z.open('multipleChoiceResponses.csv'))
    df = kag.iloc[1:]

  kag = pd.read_csv(z.open('multipleChoiceResponses.csv'))


In [40]:
df.Q3.apply?

[1;31mSignature:[0m
[0mdf[0m[1;33m.[0m[0mQ3[0m[1;33m.[0m[0mapply[0m[1;33m([0m[1;33m
[0m    [0mfunc[0m[1;33m:[0m [1;34m'AggFuncType'[0m[1;33m,[0m[1;33m
[0m    [0mconvert_dtype[0m[1;33m:[0m [1;34m'bool'[0m [1;33m=[0m [1;32mTrue[0m[1;33m,[0m[1;33m
[0m    [0margs[0m[1;33m:[0m [1;34m'tuple[Any, ...]'[0m [1;33m=[0m [1;33m([0m[1;33m)[0m[1;33m,[0m[1;33m
[0m    [1;33m**[0m[0mkwargs[0m[1;33m,[0m[1;33m
[0m[1;33m)[0m [1;33m->[0m [1;34m'DataFrame | Series'[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Invoke function on values of Series.

Can be ufunc (a NumPy function that applies to the entire Series)
or a Python function that only works on single values.

Parameters
----------
func : function
    Python function or NumPy ufunc to apply.
convert_dtype : bool, default True
    Try to find better dtype for elementwise function results. If
    False, leave as dtype=object. Note that the dtype is always
    preserved for some 

In [41]:
df.Q3.apply??

[1;31mSignature:[0m
[0mdf[0m[1;33m.[0m[0mQ3[0m[1;33m.[0m[0mapply[0m[1;33m([0m[1;33m
[0m    [0mfunc[0m[1;33m:[0m [1;34m'AggFuncType'[0m[1;33m,[0m[1;33m
[0m    [0mconvert_dtype[0m[1;33m:[0m [1;34m'bool'[0m [1;33m=[0m [1;32mTrue[0m[1;33m,[0m[1;33m
[0m    [0margs[0m[1;33m:[0m [1;34m'tuple[Any, ...]'[0m [1;33m=[0m [1;33m([0m[1;33m)[0m[1;33m,[0m[1;33m
[0m    [1;33m**[0m[0mkwargs[0m[1;33m,[0m[1;33m
[0m[1;33m)[0m [1;33m->[0m [1;34m'DataFrame | Series'[0m[1;33m[0m[1;33m[0m[0m
[1;31mSource:[0m   
    [1;32mdef[0m [0mapply[0m[1;33m([0m[1;33m
[0m        [0mself[0m[1;33m,[0m[1;33m
[0m        [0mfunc[0m[1;33m:[0m [0mAggFuncType[0m[1;33m,[0m[1;33m
[0m        [0mconvert_dtype[0m[1;33m:[0m [0mbool[0m [1;33m=[0m [1;32mTrue[0m[1;33m,[0m[1;33m
[0m        [0margs[0m[1;33m:[0m [0mtuple[0m[1;33m[[0m[0mAny[0m[1;33m,[0m [1;33m...[0m[1;33m][0m [1;33m=[0m [1;33m([0m[1;33m)[0m[

In [42]:
import pandas.core.series
pandas.core.series.lib

<module 'pandas._libs.lib' from 'C:\\Users\\Vadim\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\pandas\\_libs\\lib.cp310-win_amd64.pyd'>

In [43]:
pandas.core.series.lib.map_infer??

[1;31mDocstring:[0m
Substitute for np.vectorize with pandas-friendly dtype inference.

Parameters
----------
arr : ndarray
f : function
convert : bint
ignore_na : bint
    If True, NA values will not have f applied

Returns
-------
np.ndarray
[1;31mType:[0m      builtin_function_or_method


## Debugging in Jupyter

### How to do it...

In [44]:
import zipfile
import pandas as pd

url = 'datasets/kaggle-survey-2018.zip'

In [45]:
with zipfile.ZipFile(url) as z:
    kag = pd.read_csv(z.open('multipleChoiceResponses.csv'))
    df = kag.iloc[1:]

  kag = pd.read_csv(z.open('multipleChoiceResponses.csv'))


##  Managing data integrity with Great Expectations

### How to do it...

In [46]:
kag = tweak_kag(df)

In [47]:
import great_expectations as ge
kag_ge = ge.from_pandas(kag)

In [48]:
sorted([x for x in set(dir(kag_ge)) - set(dir(kag))
    if not x.startswith('_')])

['add_citation',
 'append_expectation',
 'attempt_allowing_relative_error',
 'autoinspect',
 'batch_id',
 'batch_kwargs',
 'batch_markers',
 'batch_parameters',
 'caching',
 'column_aggregate_expectation',
 'column_map_expectation',
 'column_pair_map_expectation',
 'default_expectation_args',
 'discard_failing_expectations',
 'discard_subset_failing_expectations',
 'edit_expectation_suite',
 'expect_column_bootstrapped_ks_test_p_value_to_be_greater_than',
 'expect_column_chisquare_test_p_value_to_be_greater_than',
 'expect_column_distinct_values_to_be_in_set',
 'expect_column_distinct_values_to_contain_set',
 'expect_column_distinct_values_to_equal_set',
 'expect_column_kl_divergence_to_be_less_than',
 'expect_column_max_to_be_between',
 'expect_column_mean_to_be_between',
 'expect_column_median_to_be_between',
 'expect_column_min_to_be_between',
 'expect_column_most_common_value_to_be_in_set',
 'expect_column_pair_cramers_phi_value_to_be_less_than',
 'expect_column_pair_values_A_to_be

In [49]:
kag_ge.expect_column_to_exist('Salary')

{
  "success": true,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {}
}

In [50]:
kag_ge.expect_column_mean_to_be_between(
   'Salary', min_value=10_000, max_value=100_000)

{
  "success": true,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "observed_value": 43869.66102793441,
    "element_count": 15429,
    "missing_count": null,
    "missing_percent": null
  }
}

In [51]:
kag_ge.expect_column_values_to_be_between(
   'Salary', min_value=0, max_value=500_000)

{
  "success": true,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "element_count": 15429,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  }
}

In [52]:
kag_ge.expect_column_values_to_not_be_null('Salary')

{
  "success": true,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "element_count": 15429,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "partial_unexpected_list": []
  }
}

In [53]:
kag_ge.expect_column_values_to_match_regex(
    'Country', r'America|India|Another|China')

{
  "success": true,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "element_count": 15429,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  }
}

In [54]:
kag_ge.expect_column_values_to_be_of_type(
   'Salary', type_='int')

{
  "success": true,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "observed_value": "int32"
  }
}

In [55]:
kag_ge.save_expectation_suite('kaggle_expectations.json')

In [56]:
kag_ge.to_csv('kag.csv')
import json

ge.validate(ge.read_csv('kag.csv'), 
    expectation_suite=json.load(
        open('kaggle_expectations.json')))

{
  "results": [
    {
      "success": true,
      "meta": {},
      "exception_info": {
        "raised_exception": false,
        "exception_message": null,
        "exception_traceback": null
      },
      "result": {},
      "expectation_config": {
        "meta": {},
        "kwargs": {
          "column": "Salary"
        },
        "expectation_type": "expect_column_to_exist"
      }
    },
    {
      "success": true,
      "meta": {},
      "exception_info": {
        "raised_exception": false,
        "exception_message": null,
        "exception_traceback": null
      },
      "result": {
        "observed_value": 43869.66102793441,
        "element_count": 15429,
        "missing_count": null,
        "missing_percent": null
      },
      "expectation_config": {
        "meta": {},
        "kwargs": {
          "column": "Salary",
          "max_value": 100000,
          "min_value": 10000
        },
        "expectation_type": "expect_column_mean_to_be_between"
      }
