In [1]:
import pathlib

import pandas
import seaborn

import sklearn.compose
import sklearn.ensemble
import sklearn.impute
import sklearn.metrics
import sklearn.pipeline
import sklearn.preprocessing

%matplotlib inline

In [2]:
# Training set: one row per record, keyed by the 'id' column.
# The frame still carries the target in its 'label' column.
data_train = (
    pandas
    .read_csv('./data/high_salary.train.csv')
    .set_index('id')
)
data_train

Unnamed: 0_level_0,social-security-number,house-number,age-group,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native-country-code,native-country,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
8616,216208066.0,6278.0,2.0,self-emp-inc,270079.0,bachelors,13.0,married-civ-spouse,exec-managerial,husband,white,male,0.0,0.0,3.0,USA,united-states,1.0
21982,404455249.0,1673.0,3.0,local-gov,146325.0,doctorate,16.0,married-civ-spouse,prof-specialty,husband,white,male,0.0,2.0,2.0,USA,united-states,1.0
11191,500057524.0,7417.0,0.0,private,240767.0,hs-grad,9.0,never-married,other-service,not-in-family,white,female,0.0,0.0,1.0,USA,united-states,0.0
22229,932067669.0,9740.0,2.0,private,118536.0,hs-grad,9.0,divorced,machine-op-inspct,other-relative,black,male,0.0,0.0,2.0,USA,united-states,0.0
20732,417132399.0,6941.0,3.0,private,160440.0,bachelors,13.0,married-civ-spouse,sales,husband,white,male,0.0,0.0,3.0,USA,united-states,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21575,448834224.0,3595.0,0.0,private,147253.0,assoc-acdm,12.0,never-married,sales,not-in-family,white,male,0.0,0.0,2.0,USA,united-states,0.0
5390,290486928.0,8844.0,3.0,private,330543.0,preschool,1.0,married-civ-spouse,other-service,husband,white,male,0.0,0.0,2.0,MEX,mexico,0.0
860,198116565.0,5120.0,3.0,private,41223.0,some-college,10.0,married-civ-spouse,adm-clerical,husband,white,male,0.0,0.0,2.0,USA,united-states,1.0
15795,481902134.0,6430.0,0.0,state-gov,275421.0,some-college,10.0,never-married,machine-op-inspct,own-child,white,female,0.0,0.0,2.0,USA,united-states,0.0


In [3]:
# Held-out set, stored as two files that share the 'id' key:
# one with the feature columns and one with the ground-truth labels.
data_test_features = (
    pandas
    .read_csv('./data/high_salary.test_features.csv')
    .set_index('id')
)
data_test_label = (
    pandas
    .read_csv('./data/high_salary.test_label.csv')
    .set_index('id')
)

In [4]:
# Feature preprocessing applied in front of the classifier.
#
# Only the columns listed below reach the model: ColumnTransformer drops
# everything else by default (remainder='drop'), which keeps identifier-like
# columns such as 'social-security-number' and 'house-number' out of the
# feature matrix.
data_processor = sklearn.compose.ColumnTransformer(transformers=[
    (
        'categorical_data_processor',
        (
            sklearn.pipeline.Pipeline(steps=[
                # Replace missing categories with the most common value seen
                # during fit.
                ('mode_imputer', sklearn.impute.SimpleImputer(strategy='most_frequent')),
                # handle_unknown='ignore': a category that did not occur in the
                # training data is encoded as an all-zero block instead of
                # raising at predict time (the default is 'error'). Encoding of
                # known categories is unchanged.
                # NOTE: `sparse` was renamed to `sparse_output` in
                # scikit-learn 1.2 and removed in 1.4; rename when upgrading.
                ('onehot_encoder', sklearn.preprocessing.OneHotEncoder(
                    sparse=False,
                    handle_unknown='ignore',
                )),
            ])
        ),
        [
            'age-group',
            'workclass',
            'marital-status',
            'occupation',
            'relationship',
            'race',
            'sex',
            'native-country',
            'native-country-code',
        ]
    ),
    (
        'numerical_data_processor',
        (
            sklearn.pipeline.Pipeline(steps=[
                # Numeric columns are only imputed (median); no scaling step
                # is applied before the tree-based classifier.
                ('median_imputer', sklearn.impute.SimpleImputer(strategy='median')),
            ])
        ),
        [
            'fnlwgt',
            'education-num',
            'capitalgain',
            'capitalloss',
            'hoursperweek',
        ]
    ),
])

In [5]:
# Full estimator: preprocessing followed by a random forest, so that fit()
# and predict() can be called directly on the raw data frames.
model = sklearn.pipeline.Pipeline(steps=[
    ('data_processor', data_processor),
    (
        'classifier',
        sklearn.ensemble.RandomForestClassifier(
            random_state=0,                      # fixed seed for repeatable runs
            n_estimators=100,
            criterion='entropy',
            max_features=0.5,                    # float: fraction of features tried per split
            min_samples_split=0.03,              # float: fraction of samples needed to split a node
            class_weight='balanced_subsample',   # reweight classes within each bootstrap sample
        ),
    ),
])


In [6]:
# Fit preprocessing and classifier together; the 'label' column is the target
# and is removed from the feature frame before fitting.
model.fit(
    data_train.drop(columns=['label']),
    data_train['label'],
)

Pipeline(steps=[('data_processor',
                 ColumnTransformer(transformers=[('categorical_data_processor',
                                                  Pipeline(steps=[('mode_imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehot_encoder',
                                                                   OneHotEncoder(sparse=False))]),
                                                  ['age-group', 'workclass',
                                                   'marital-status',
                                                   'occupation', 'relationship',
                                                   'race', 'sex',
                                                   'native-country',
                                                   'native-country-code']),
                                                 ('numerical_data_proce

In [7]:
# F1 on the rows the model was fitted on (in-sample score, useful only as a
# reference point for the held-out score).
sklearn.metrics.f1_score(
    data_train['label'],
    model.predict(data_train.drop(columns=['label'])),
)

0.79593147751606

In [8]:
# F1 on the held-out set.
# Labels and features are read from two separate files and f1_score compares
# them position by position, so the labels are first put into the same 'id'
# order as the feature rows. `.loc` raises a KeyError if an id present in the
# features has no label, instead of silently scoring misaligned rows.
sklearn.metrics.f1_score(
    y_true = data_test_label.loc[data_test_features.index],
    y_pred = model.predict(data_test_features),
)

0.7970826580226905

In [9]:
# Predicted labels for the held-out rows, keyed by the same 'id' index as the
# feature frame so each prediction can be traced back to its record.
predictions = pandas.DataFrame(
    {'prediction': model.predict(data_test_features)},
    index=data_test_features.index,
)
predictions

Unnamed: 0_level_0,prediction
id,Unnamed: 1_level_1
6111,0.0
11214,0.0
5554,1.0
25131,1.0
14324,1.0
...,...
25998,1.0
8375,0.0
13888,0.0
4159,1.0


In [10]:
# Persist the predictions. The output directory is created first so the cell
# also works on a fresh checkout where './results' does not exist yet.
predictions_path = pathlib.Path('./results/predictions.test.csv')
predictions_path.parent.mkdir(parents=True, exist_ok=True)
predictions.to_csv(predictions_path)