In [2]:
import pandas as pd
import catboost as cb
import collections
import numpy as np

jb = pd.read_csv('datasets/2020-jetbrains-python-survey.csv', low_memory=False)

def get_unique_cols(jb):
    counter = collections.defaultdict(list)
    for col in sorted(jb.columns):
        period_count = col.count('.')
        if period_count >= 2:
            part_end = 2
        else:
            part_end = 1
        parts = col.split('.')[:part_end]
        counter['.'.join(parts)].append(col)
    unique_cols = []
    for cols in counter.values():
        if len(cols) == 1:
                unique_cols.extend(cols)
    return unique_cols

def prep_for_ml(df):
    # remove pandas types
    return (df
           .assign(**{col:df[col].astype(float)
                     for col in df.select_dtypes('number')},
                   **{col:df[col].astype('str').fillna('')
                     for col in df.select_dtypes(['object', 'category'])})
           )

def predict_col(df, col):
    df = prep_for_ml(df)
    missing = df.query(f'~{col}.isna()')
    cat_idx = []
    for i, typ in enumerate(df.drop(columns=[col]).dtypes):
        if str(typ) == 'object':
            cat_idx.append(i)
    X = missing.drop(columns=[col]).values
    y = missing[col]
    model = cb.CatBoostRegressor(iterations=20, cat_features=cat_idx)
    model.fit(X, y, cat_features=cat_idx)
    pred = model.predict(df.drop(columns=[col]))
    return df[col].where(~df[col].isna(), pred)

def tweak_jb(jb):
    unique_cols = get_unique_cols(jb)
    return (jb
            [unique_cols]
           .rename(columns=lambda c: c.replace('.', '_'))
           .assign(age=lambda df_: df_.age.str.slice(0, 2).astype(float)
                  .astype('Int64'),
                  are_you_datascientist=lambda df_: df_
                      .are_you_datascientist
                      .replace({'Yes': True, 'No': False, np.nan: False}),
                 # company_size=lambda df_: df_.company_size.replace({
                  #    'Just me': 1, 'Not sure': np.nan,  
                   #   'More than 5,000': 5000, '2-10': 2, '11-50': 11, ?!?!?!?!?!?!?!? what's wrong?
                    #  '51-500': 51, '501-1,000': 501,
                     # '1,001-5,001': 1001}).astype('Int64'),
                  country_live=lambda df_: df_.country_live
                      .astype('category'),
                  employment_status=lambda df_: df_.employment_status
                      .fillna('Other').astype('category'),
                  is_python_main=lambda df_: df_.is_python_main
                      .astype('category'),
                  team_size=lambda df_: df_.team_size
                      .str.split(r'-', n=1, expand=True)
                      .iloc[:, 0].replace('More than 40 people', 41)
                      .where(df_.company_size!=1, 1).astype(float),
                  years_of_coding=lambda df_: df_.years_of_coding
                      .replace('Less than 1 year', .5)
                      .str.extract(r'(\d+)').astype(float),
                  python_years=lambda df_: df_.python_years
                      .replace('Less than 1 year', .5)
                      .str.extract(r'(\d+)').astype(float),
                  python3_ver=lambda df_: df_.python3_version_most
                      .str.replace('_', '.').str.extract(r'(\d\.\d)')
                      .astype(float),
                  use_python_most=lambda df_: df_.use_python_most
                      .fillna('Unknown')
                  )
           .assign(team_size=lambda df_:predict_col(df_, 'team_size')
                  .astype(int))
           .drop(columns=['python2_version_most'])
           .dropna()
        )

jb2 = tweak_jb(jb)
jb2


Learning rate set to 0.5
0:	learn: 6.3373204	total: 155ms	remaining: 2.94s
1:	learn: 6.2416428	total: 178ms	remaining: 1.6s
2:	learn: 6.2042819	total: 201ms	remaining: 1.14s
3:	learn: 6.1713242	total: 229ms	remaining: 914ms
4:	learn: 6.1313697	total: 254ms	remaining: 763ms
5:	learn: 6.1163744	total: 278ms	remaining: 649ms
6:	learn: 6.1054688	total: 302ms	remaining: 562ms
7:	learn: 6.1040920	total: 322ms	remaining: 483ms
8:	learn: 6.0946976	total: 345ms	remaining: 421ms
9:	learn: 6.0906277	total: 362ms	remaining: 362ms
10:	learn: 6.0894286	total: 384ms	remaining: 314ms
11:	learn: 6.0757775	total: 406ms	remaining: 271ms
12:	learn: 6.0718265	total: 428ms	remaining: 231ms
13:	learn: 6.0696191	total: 454ms	remaining: 194ms
14:	learn: 6.0644258	total: 480ms	remaining: 160ms
15:	learn: 6.0638403	total: 493ms	remaining: 123ms
16:	learn: 6.0469462	total: 517ms	remaining: 91.2ms
17:	learn: 6.0442746	total: 543ms	remaining: 60.3ms
18:	learn: 6.0442742	total: 552ms	remaining: 29.1ms
19:	learn: 6.0

Unnamed: 0,age,are_you_datascientist,company_size,country_live,employment_status,first_learn_about_main_ide,how_often_use_main_ide,ide_main,is_python_main,job_team,main_purposes,missing_features_main_ide,nps_main_ide,python_years,python3_version_most,several_projects,team_size,use_python_most,years_of_coding,python3_ver
1,21,True,"More than 5,000",India,Fully employed by a company / organization,School / University,Daily,VS Code,Yes,Work in a team,Both for work and personal,"No, it has all the features I need",8.0,3.0,Python 3_6,"Yes, I work on one main and several side projects",2,Software prototyping,3.0,3.6
2,30,False,"More than 5,000",United States,Fully employed by a company / organization,Friend / Colleague,Daily,Vim,Yes,Work on your own project(s) independently,Both for work and personal,"No, it has all the features I need",10.0,3.0,Python 3_6,"Yes, I work on one main and several side projects",3,DevOps / System administration / Writing autom...,3.0,3.6
10,21,False,51–500,Other country,Fully employed by a company / organization,School / University,Daily,IntelliJ IDEA,Yes,Work in a team,Both for work and personal,"No, it has all the features I need",10.0,1.0,Python 3_8,"Yes, I work on one main and several side projects",2,Web development,1.0,3.8
11,21,True,51–500,United States,Fully employed by a company / organization,Online learning platform / Online course,Daily,PyCharm Community Edition,Yes,Work in a team,Both for work and personal,"No, it has all the features I need",9.0,3.0,Python 3_9,"Yes, I work on many different projects",2,Data analysis,3.0,3.9
13,30,True,"More than 5,000",Belgium,Fully employed by a company / organization,Social network,Daily,VS Code,Yes,Work in a team,Both for work and personal,"No, it has all the features I need",10.0,6.0,Python 3_7,"Yes, I work on many different projects",2,Data analysis,3.0,3.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54456,30,False,"1,001–5,000",Turkey,Fully employed by a company / organization,Friend / Colleague,Daily,PyCharm Community Edition,Yes,Work on your own project(s) independently,Both for work and personal,"No, it has all the features I need",9.0,1.0,Python 3_6,"Yes, I work on many different projects",3,Machine learning,6.0,3.6
54457,21,False,2–10,Russian Federation,Fully employed by a company / organization,School / University,Daily,Vim,Yes,Work on your own project(s) independently,Both for work and personal,"No, it has all the features I need",10.0,6.0,Python 3_6,"Yes, I work on many different projects",1,Data analysis,1.0,3.6
54459,21,False,Just me,Russian Federation,Self-employed (a person earning income directl...,Friend / Colleague,Daily,PyCharm Professional Edition,Yes,Work in a team,Both for work and personal,"No, it has all the features I need",10.0,3.0,Python 3_7,"Yes, I work on many different projects",2,Web development,6.0,3.7
54460,30,True,51–500,Spain,Fully employed by a company / organization,Search engines,Daily,Other,Yes,Work on your own project(s) independently,Both for work and personal,Yes – Please list:,3.0,6.0,Python 3_7,"Yes, I work on many different projects",3,Data analysis,3.0,3.7


1. Add a new column that is the sum of a numeric column that was grouped by a string column.

In [5]:
jb2.assign(Sum=jb2.groupby('country_live').age.transform('sum'))

Unnamed: 0,age,are_you_datascientist,company_size,country_live,employment_status,first_learn_about_main_ide,how_often_use_main_ide,ide_main,is_python_main,job_team,...,missing_features_main_ide,nps_main_ide,python_years,python3_version_most,several_projects,team_size,use_python_most,years_of_coding,python3_ver,Sum
1,21,True,"More than 5,000",India,Fully employed by a company / organization,School / University,Daily,VS Code,Yes,Work in a team,...,"No, it has all the features I need",8.0,3.0,Python 3_6,"Yes, I work on one main and several side projects",2,Software prototyping,3.0,3.6,25404
2,30,False,"More than 5,000",United States,Fully employed by a company / organization,Friend / Colleague,Daily,Vim,Yes,Work on your own project(s) independently,...,"No, it has all the features I need",10.0,3.0,Python 3_6,"Yes, I work on one main and several side projects",3,DevOps / System administration / Writing autom...,3.0,3.6,89227
10,21,False,51–500,Other country,Fully employed by a company / organization,School / University,Daily,IntelliJ IDEA,Yes,Work in a team,...,"No, it has all the features I need",10.0,1.0,Python 3_8,"Yes, I work on one main and several side projects",2,Web development,1.0,3.8,9360
11,21,True,51–500,United States,Fully employed by a company / organization,Online learning platform / Online course,Daily,PyCharm Community Edition,Yes,Work in a team,...,"No, it has all the features I need",9.0,3.0,Python 3_9,"Yes, I work on many different projects",2,Data analysis,3.0,3.9,89227
13,30,True,"More than 5,000",Belgium,Fully employed by a company / organization,Social network,Daily,VS Code,Yes,Work in a team,...,"No, it has all the features I need",10.0,6.0,Python 3_7,"Yes, I work on many different projects",2,Data analysis,3.0,3.7,4397
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54456,30,False,"1,001–5,000",Turkey,Fully employed by a company / organization,Friend / Colleague,Daily,PyCharm Community Edition,Yes,Work on your own project(s) independently,...,"No, it has all the features I need",9.0,1.0,Python 3_6,"Yes, I work on many different projects",3,Machine learning,6.0,3.6,2860
54457,21,False,2–10,Russian Federation,Fully employed by a company / organization,School / University,Daily,Vim,Yes,Work on your own project(s) independently,...,"No, it has all the features I need",10.0,6.0,Python 3_6,"Yes, I work on many different projects",1,Data analysis,1.0,3.6,14011
54459,21,False,Just me,Russian Federation,Self-employed (a person earning income directl...,Friend / Colleague,Daily,PyCharm Professional Edition,Yes,Work in a team,...,"No, it has all the features I need",10.0,3.0,Python 3_7,"Yes, I work on many different projects",2,Web development,6.0,3.7,14011
54460,30,True,51–500,Spain,Fully employed by a company / organization,Search engines,Daily,Other,Yes,Work on your own project(s) independently,...,Yes – Please list:,3.0,6.0,Python 3_7,"Yes, I work on many different projects",3,Data analysis,3.0,3.7,9221


2. Filter out the rows that have less than 3 entries when grouped by a string column.


In [14]:
jb2.groupby('country_live').filter(lambda g: g.country_live.nunique() < 3)

Unnamed: 0,age,are_you_datascientist,company_size,country_live,employment_status,first_learn_about_main_ide,how_often_use_main_ide,ide_main,is_python_main,job_team,main_purposes,missing_features_main_ide,nps_main_ide,python_years,python3_version_most,several_projects,team_size,use_python_most,years_of_coding,python3_ver
1,21,True,"More than 5,000",India,Fully employed by a company / organization,School / University,Daily,VS Code,Yes,Work in a team,Both for work and personal,"No, it has all the features I need",8.0,3.0,Python 3_6,"Yes, I work on one main and several side projects",2,Software prototyping,3.0,3.6
2,30,False,"More than 5,000",United States,Fully employed by a company / organization,Friend / Colleague,Daily,Vim,Yes,Work on your own project(s) independently,Both for work and personal,"No, it has all the features I need",10.0,3.0,Python 3_6,"Yes, I work on one main and several side projects",3,DevOps / System administration / Writing autom...,3.0,3.6
10,21,False,51–500,Other country,Fully employed by a company / organization,School / University,Daily,IntelliJ IDEA,Yes,Work in a team,Both for work and personal,"No, it has all the features I need",10.0,1.0,Python 3_8,"Yes, I work on one main and several side projects",2,Web development,1.0,3.8
11,21,True,51–500,United States,Fully employed by a company / organization,Online learning platform / Online course,Daily,PyCharm Community Edition,Yes,Work in a team,Both for work and personal,"No, it has all the features I need",9.0,3.0,Python 3_9,"Yes, I work on many different projects",2,Data analysis,3.0,3.9
13,30,True,"More than 5,000",Belgium,Fully employed by a company / organization,Social network,Daily,VS Code,Yes,Work in a team,Both for work and personal,"No, it has all the features I need",10.0,6.0,Python 3_7,"Yes, I work on many different projects",2,Data analysis,3.0,3.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54456,30,False,"1,001–5,000",Turkey,Fully employed by a company / organization,Friend / Colleague,Daily,PyCharm Community Edition,Yes,Work on your own project(s) independently,Both for work and personal,"No, it has all the features I need",9.0,1.0,Python 3_6,"Yes, I work on many different projects",3,Machine learning,6.0,3.6
54457,21,False,2–10,Russian Federation,Fully employed by a company / organization,School / University,Daily,Vim,Yes,Work on your own project(s) independently,Both for work and personal,"No, it has all the features I need",10.0,6.0,Python 3_6,"Yes, I work on many different projects",1,Data analysis,1.0,3.6
54459,21,False,Just me,Russian Federation,Self-employed (a person earning income directl...,Friend / Colleague,Daily,PyCharm Professional Edition,Yes,Work in a team,Both for work and personal,"No, it has all the features I need",10.0,3.0,Python 3_7,"Yes, I work on many different projects",2,Web development,6.0,3.7
54460,30,True,51–500,Spain,Fully employed by a company / organization,Search engines,Daily,Other,Yes,Work on your own project(s) independently,Both for work and personal,Yes – Please list:,3.0,6.0,Python 3_7,"Yes, I work on many different projects",3,Data analysis,3.0,3.7
