In [13]:
import pandas as pd
import catboost as cb
import collections
import numpy as np

jb = pd.read_csv('datasets/2020-jetbrains-python-survey.csv', low_memory=False)

def get_unique_cols(jb):
    counter = collections.defaultdict(list)
    for col in sorted(jb.columns):
        period_count = col.count('.')
        if period_count >= 2:
            part_end = 2
        else:
            part_end = 1
        parts = col.split('.')[:part_end]
        counter['.'.join(parts)].append(col)
    unique_cols = []
    for cols in counter.values():
        if len(cols) == 1:
                unique_cols.extend(cols)
    return unique_cols

def prep_for_ml(df):
    # remove pandas types
    return (df
           .assign(**{col:df[col].astype(float)
                     for col in df.select_dtypes('number')},
                   **{col:df[col].astype('str').fillna('')
                     for col in df.select_dtypes(['object', 'category'])})
           )

def predict_col(df, col):
    df = prep_for_ml(df)
    missing = df.query(f'~{col}.isna()')
    cat_idx = []
    for i, typ in enumerate(df.drop(columns=[col]).dtypes):
        if str(typ) == 'object':
            cat_idx.append(i)
    X = missing.drop(columns=[col]).values
    y = missing[col]
    model = cb.CatBoostRegressor(iterations=20, cat_features=cat_idx)
    model.fit(X, y, cat_features=cat_idx)
    pred = model.predict(df.drop(columns=[col]))
    return df[col].where(~df[col].isna(), pred)

def tweak_jb(jb):
    unique_cols = get_unique_cols(jb)
    return (jb
            [unique_cols]
           .rename(columns=lambda c: c.replace('.', '_'))
           .assign(age=lambda df_: df_.age.str.slice(0, 2).astype(float)
                  .astype('Int64'),
                  are_you_datascientist=lambda df_: df_
                      .are_you_datascientist
                      .replace({'Yes': True, 'No': False, np.nan: False}),
                 # company_size=lambda df_: df_.company_size.replace({
                  #    'Just me': 1, 'Not sure': np.nan,  
                   #   'More than 5,000': 5000, '2-10': 2, '11-50': 11, ?!?!?!?!?!?!?!? what's wrong?
                    #  '51-500': 51, '501-1,000': 501,
                     # '1,001-5,001': 1001}).astype('Int64'),
                  country_live=lambda df_: df_.country_live
                      .astype('category'),
                  employment_status=lambda df_: df_.employment_status
                      .fillna('Other').astype('category'),
                  is_python_main=lambda df_: df_.is_python_main
                      .astype('category'),
                  team_size=lambda df_: df_.team_size
                      .str.split(r'-', n=1, expand=True)
                      .iloc[:, 0].replace('More than 40 people', 41)
                      .where(df_.company_size!=1, 1).astype(float),
                  years_of_coding=lambda df_: df_.years_of_coding
                      .replace('Less than 1 year', .5)
                      .str.extract(r'(\d+)').astype(float),
                  python_years=lambda df_: df_.python_years
                      .replace('Less than 1 year', .5)
                      .str.extract(r'(\d+)').astype(float),
                  python3_ver=lambda df_: df_.python3_version_most
                      .str.replace('_', '.').str.extract(r'(\d\.\d)')
                      .astype(float),
                  use_python_most=lambda df_: df_.use_python_most
                      .fillna('Unknown')
                  )
           .assign(team_size=lambda df_:predict_col(df_, 'team_size')
                  .astype(int))
           .drop(columns=['python2_version_most'])
           .dropna()
        )

jb2 = tweak_jb(jb)
jb2

Learning rate set to 0.5
0:	learn: 6.3373204	total: 21.9ms	remaining: 415ms
1:	learn: 6.2416428	total: 44ms	remaining: 396ms
2:	learn: 6.2042819	total: 66.7ms	remaining: 378ms
3:	learn: 6.1713242	total: 88.4ms	remaining: 354ms
4:	learn: 6.1313697	total: 110ms	remaining: 331ms
5:	learn: 6.1163744	total: 132ms	remaining: 308ms
6:	learn: 6.1054688	total: 154ms	remaining: 286ms
7:	learn: 6.1040920	total: 173ms	remaining: 260ms
8:	learn: 6.0946976	total: 195ms	remaining: 238ms
9:	learn: 6.0906277	total: 211ms	remaining: 211ms
10:	learn: 6.0894286	total: 236ms	remaining: 193ms
11:	learn: 6.0757775	total: 258ms	remaining: 172ms
12:	learn: 6.0718265	total: 282ms	remaining: 152ms
13:	learn: 6.0696191	total: 307ms	remaining: 131ms
14:	learn: 6.0644258	total: 333ms	remaining: 111ms
15:	learn: 6.0638403	total: 345ms	remaining: 86.3ms
16:	learn: 6.0469462	total: 367ms	remaining: 64.8ms
17:	learn: 6.0442746	total: 389ms	remaining: 43.2ms
18:	learn: 6.0442742	total: 398ms	remaining: 20.9ms
19:	learn:

Unnamed: 0,age,are_you_datascientist,company_size,country_live,employment_status,first_learn_about_main_ide,how_often_use_main_ide,ide_main,is_python_main,job_team,main_purposes,missing_features_main_ide,nps_main_ide,python_years,python3_version_most,several_projects,team_size,use_python_most,years_of_coding,python3_ver
1,21,True,"More than 5,000",India,Fully employed by a company / organization,School / University,Daily,VS Code,Yes,Work in a team,Both for work and personal,"No, it has all the features I need",8.0,3.0,Python 3_6,"Yes, I work on one main and several side projects",2,Software prototyping,3.0,3.6
2,30,False,"More than 5,000",United States,Fully employed by a company / organization,Friend / Colleague,Daily,Vim,Yes,Work on your own project(s) independently,Both for work and personal,"No, it has all the features I need",10.0,3.0,Python 3_6,"Yes, I work on one main and several side projects",3,DevOps / System administration / Writing autom...,3.0,3.6
10,21,False,51–500,Other country,Fully employed by a company / organization,School / University,Daily,IntelliJ IDEA,Yes,Work in a team,Both for work and personal,"No, it has all the features I need",10.0,1.0,Python 3_8,"Yes, I work on one main and several side projects",2,Web development,1.0,3.8
11,21,True,51–500,United States,Fully employed by a company / organization,Online learning platform / Online course,Daily,PyCharm Community Edition,Yes,Work in a team,Both for work and personal,"No, it has all the features I need",9.0,3.0,Python 3_9,"Yes, I work on many different projects",2,Data analysis,3.0,3.9
13,30,True,"More than 5,000",Belgium,Fully employed by a company / organization,Social network,Daily,VS Code,Yes,Work in a team,Both for work and personal,"No, it has all the features I need",10.0,6.0,Python 3_7,"Yes, I work on many different projects",2,Data analysis,3.0,3.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54456,30,False,"1,001–5,000",Turkey,Fully employed by a company / organization,Friend / Colleague,Daily,PyCharm Community Edition,Yes,Work on your own project(s) independently,Both for work and personal,"No, it has all the features I need",9.0,1.0,Python 3_6,"Yes, I work on many different projects",3,Machine learning,6.0,3.6
54457,21,False,2–10,Russian Federation,Fully employed by a company / organization,School / University,Daily,Vim,Yes,Work on your own project(s) independently,Both for work and personal,"No, it has all the features I need",10.0,6.0,Python 3_6,"Yes, I work on many different projects",1,Data analysis,1.0,3.6
54459,21,False,Just me,Russian Federation,Self-employed (a person earning income directl...,Friend / Colleague,Daily,PyCharm Professional Edition,Yes,Work in a team,Both for work and personal,"No, it has all the features I need",10.0,3.0,Python 3_7,"Yes, I work on many different projects",2,Web development,6.0,3.7
54460,30,True,51–500,Spain,Fully employed by a company / organization,Search engines,Daily,Other,Yes,Work on your own project(s) independently,Both for work and personal,Yes – Please list:,3.0,6.0,Python 3_7,"Yes, I work on many different projects",3,Data analysis,3.0,3.7


1. Group by a categorical column and take the mean of the numeric columns.

In [7]:
jb2.groupby(by='age').agg('mean')

  jb2.groupby(by='age').agg('mean')


Unnamed: 0_level_0,nps_main_ide,python_years,team_size,years_of_coding,python3_ver
age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
18,8.948795,2.879518,2.942771,2.060241,3.761145
21,8.958763,3.33634,3.830817,2.631443,3.730799
30,8.828866,5.271173,4.036519,5.914724,3.731993
40,8.780107,6.279863,3.974159,8.602145,3.734715
50,8.590203,6.567503,4.156511,9.62724,3.742413
60,8.90393,6.969432,4.899563,10.052402,3.743231


2. Group by a categorical column and take the mean and max of the numeric columns.

In [8]:
jb2.groupby(by='age').agg(['mean', 'max'])

  jb2.groupby(by='age').agg(['mean', 'max'])


Unnamed: 0_level_0,nps_main_ide,nps_main_ide,python_years,python_years,team_size,team_size,years_of_coding,years_of_coding,python3_ver,python3_ver
Unnamed: 0_level_1,mean,max,mean,max,mean,max,mean,max,mean,max
age,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
18,8.948795,10.0,2.879518,11.0,2.942771,41,2.060241,11.0,3.761145,3.9
21,8.958763,10.0,3.33634,11.0,3.830817,41,2.631443,11.0,3.730799,3.9
30,8.828866,10.0,5.271173,11.0,4.036519,41,5.914724,11.0,3.731993,3.9
40,8.780107,10.0,6.279863,11.0,3.974159,41,8.602145,11.0,3.734715,3.9
50,8.590203,10.0,6.567503,11.0,4.156511,41,9.62724,11.0,3.742413,3.9
60,8.90393,10.0,6.969432,11.0,4.899563,41,10.052402,11.0,3.743231,3.9


3. Group by a categorical column and apply a custom aggregation function that calculates the
mode of the numeric columns.


In [20]:
def custom_agg_func(ser):
    return ser.mode()
jb2.groupby(by='age').agg(custom_agg_func)



Unnamed: 0_level_0,are_you_datascientist,company_size,country_live,employment_status,first_learn_about_main_ide,how_often_use_main_ide,ide_main,is_python_main,job_team,main_purposes,missing_features_main_ide,nps_main_ide,python_years,python3_version_most,several_projects,team_size,use_python_most,years_of_coding,python3_ver
age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
18,False,2–10,India,Working student,Friend / Colleague,Daily,VS Code,Yes,Work on your own project(s) independently,Both for work and personal,"No, it has all the features I need",10.0,3.0,Python 3_8,"Yes, I work on many different projects",2,Web development,1.0,3.8
21,False,51–500,United States,Fully employed by a company / organization,Friend / Colleague,Daily,VS Code,Yes,Work in a team,Both for work and personal,"No, it has all the features I need",10.0,3.0,Python 3_8,"Yes, I work on many different projects",2,Web development,3.0,3.8
30,False,51–500,United States,Fully employed by a company / organization,Friend / Colleague,Daily,VS Code,Yes,Work in a team,Both for work and personal,"No, it has all the features I need",10.0,6.0,Python 3_8,"Yes, I work on many different projects",2,Web development,6.0,3.8
40,False,51–500,United States,Fully employed by a company / organization,Friend / Colleague,Daily,VS Code,Yes,Work in a team,Both for work and personal,"No, it has all the features I need",10.0,11.0,Python 3_8,"Yes, I work on many different projects",2,Web development,11.0,3.8
50,False,"More than 5,000",United States,Fully employed by a company / organization,Friend / Colleague,Daily,PyCharm Professional Edition,Yes,Work on your own project(s) independently,Both for work and personal,"No, it has all the features I need",10.0,11.0,Python 3_8,"Yes, I work on many different projects",2,Data analysis,11.0,3.8
60,False,"More than 5,000",United States,Fully employed by a company / organization,I don't remember,Daily,PyCharm Professional Edition,Yes,Work on your own project(s) independently,Both for work and personal,"No, it has all the features I need",10.0,11.0,Python 3_8,"Yes, I work on many different projects",2,Data analysis,11.0,3.8


4. Group by two categorical columns and take the mean of the numeric columns.


In [21]:
jb2.groupby(by=['nps_main_ide', 'python_years']).agg('mean')

  jb2.groupby(by=['nps_main_ide', 'python_years']).agg('mean')


Unnamed: 0_level_0,Unnamed: 1_level_0,age,team_size,years_of_coding,python3_ver
nps_main_ide,python_years,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.0,1.0,19.5,6.0,1.0,3.65
0.0,3.0,27.625,3.75,5.25,3.775
0.0,6.0,31.833333,3.0,6.083333,3.691667
0.0,11.0,31.818182,3.727273,6.181818,3.672727
1.0,1.0,30.75,4.5,6.0,3.725
1.0,3.0,30.428571,8.285714,3.714286,3.714286
1.0,6.0,30.0,2.0,6.0,3.7
1.0,11.0,40.615385,3.461538,10.615385,3.715385
2.0,1.0,27.75,5.0,2.0,3.725
2.0,3.0,32.3,2.6,2.7,3.7


5. Group by binned numeric column and take the mean of the numeric columns.

In [25]:
jb2.groupby(['age', pd.cut(jb2.age, 5)]).mean()

  jb2.groupby(['age', pd.cut(jb2.age, 5)]).mean()


Unnamed: 0_level_0,Unnamed: 1_level_0,nps_main_ide,python_years,team_size,years_of_coding,python3_ver
age,age,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
18,"(17.958, 26.4]",8.948795,2.879518,2.942771,2.060241,3.761145
18,"(26.4, 34.8]",,,,,
18,"(34.8, 43.2]",,,,,
18,"(43.2, 51.6]",,,,,
18,"(51.6, 60.0]",,,,,
21,"(17.958, 26.4]",8.958763,3.33634,3.830817,2.631443,3.730799
21,"(26.4, 34.8]",,,,,
21,"(34.8, 43.2]",,,,,
21,"(43.2, 51.6]",,,,,
21,"(51.6, 60.0]",,,,,
