In [2]:
# Notebook-wide constants.
# SEED is passed as random_state to the ExtraTreesClassifier used for feature selection.
SEED = 123321
# MISSING is the sentinel written in place of NaN after the features are min-max scaled.
MISSING = -9999

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import ExtraTreesClassifier

In [4]:
# Load the interim train / test splits.
# low_memory=False makes pandas infer every column's dtype from the whole file
# rather than chunk by chunk, so a single file can no longer yield a column that
# mixes e.g. the int 2 and the string '2' (the situation pandas reports with a
# mixed-types DtypeWarning).
df_train = pd.read_csv("../data/interim/train.csv", low_memory=False)
df_test = pd.read_csv("../data/interim/test.csv", low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
# Column groups used by the preprocessing cells below.
# Each list is written as one whitespace-separated string and split into names.

# explicitly categorical features
categorical = 'v17 v20 v25 v78 v154 v155 v161 cntry'.split()

# implicitly categorical features
categorical2 = (
    'v70 v71 v72 v73 v102 v103 v158 v159 v160 '
    'v163 v164 v169 v170 v190 v191 v216 v231'
).split()

# yes-no features
yes_no_cols = (
    'v6 v7 v8 v9 v10 v11 v14 v18 v21 v22 v23 '
    'v24 v43 v53 v77 v105 v108 v152 v157 v162 '
    'v165 v172 v173 v187 v188 v221 v241 v242 v243 '
    'v248 v254 v256 v257'
).split()

# special features: two families of education columns and the occupation columns
education_encoding1 = 'v56 v58 v60 v62'.split()
education_encoding2 = 'v57 v59 v61 v63'.split()

occupation_encoding = 'v150 v151'.split()

In [6]:
# Tag every row with its origin (0 = train, 1 = test) so the two splits can be
# told apart again once they are stacked into a single frame.
for frame, is_test in ((df_train, 0), (df_test, 1)):
    frame['test'] = is_test

In [7]:
# Stack train on top of test with a fresh 0..n-1 row index, then promote the
# 'id' column to be the index of the combined frame.
df = pd.concat([df_train, df_test], ignore_index=True).set_index('id')

In [10]:
# create happiness ratio feature
# v98 (happiness) is coerced to numeric; anything unparsable becomes NaN
df['v98'] = pd.to_numeric(df['v98'], errors='coerce')
# Per-row happiness divided by the mean happiness of the row's country.
# The string alias 'mean' selects pandas' built-in group mean and avoids the
# FutureWarning recent pandas versions raise for transform(np.mean).
country_mean_happiness = df.groupby('cntry')['v98'].transform('mean')
df['happiness_ratio_cntry'] = df['v98'] / country_mean_happiness

In [12]:
# binarize yes-no features
# The codes may appear either as strings or as integers, so both spellings are
# mapped: 2 / '2' -> 0 first, then 1 / '1' -> 1.
yes_no_frame = df[yes_no_cols]
yes_no_frame = yes_no_frame.replace(['2', 2], 0)
yes_no_frame = yes_no_frame.replace(['1', 1], 1)
df[yes_no_cols] = yes_no_frame

In [13]:
# special missing values in education encoding
# Each column family has its own sentinel code that is turned into NaN after
# the column has been coerced to numeric.
for cols, sentinel in ((education_encoding1, 55), (education_encoding2, 5555)):
    for col in cols:
        as_number = pd.to_numeric(df[col], 'coerce')
        df[col] = as_number.replace({sentinel: np.nan})

In [14]:
# take first 2 digits of occupation encoding
for col in occupation_encoding:
    # Vectorised .str slicing returns the same two-character prefix for string
    # values, but propagates missing values as NaN instead of raising TypeError
    # the way x[:2] does on a float NaN.
    df[col] = df[col].str[:2]

In [15]:
# Columns that will be one-hot encoded, and the remaining feature columns.
one_hot = categorical + categorical2 + occupation_encoding
# names that must not be treated as numeric features (built once, not per column)
not_numeric = set(one_hot) | {'id', 'satisfied', 'test'}
numeric = [col for col in df.columns if col not in not_numeric]

In [17]:
# apply one-hot encoding
# Cast each selected column to pandas' category dtype, then expand it into
# indicator columns; drop_first=True omits the first level of every column.
for col in one_hot:
    df[col] = df[col].astype('category')
df_encoded = pd.get_dummies(data=df, columns=one_hot, drop_first=True)

In [18]:
# normalize and impute missing
# 1. Coerce every column to numeric (unparsable values become NaN) and cast to
#    float. The cast matters on pandas >= 2.0, where get_dummies yields bool
#    columns and bool - bool subtraction raises TypeError; on older versions the
#    division below produced floats anyway, so the result is unchanged.
df_encoded = df_encoded.apply(pd.to_numeric, errors='coerce').astype(float)

# 2. Drop columns with at most one distinct non-null value (nothing to learn
#    from them, and their min-max range would be zero).
n_distinct = df_encoded.nunique()
df_encoded = df_encoded.drop(columns=n_distinct[n_distinct <= 1].index)

# 3. Min-max scale every remaining column to [0, 1].
# NOTE(review): 'satisfied' and 'test' are scaled as well; that is a no-op only
# if both are already coded 0/1 — confirm.
col_min = df_encoded.min()
df_encoded = (df_encoded - col_min) / (df_encoded.max() - col_min)

# 4. Replace the remaining NaNs with the MISSING sentinel.
df_encoded = df_encoded.fillna(MISSING)

In [25]:
# feature selection using Extremely Randomized Tree
# Only rows flagged test == 0 are used for fitting; the target and the split
# flag are removed from the feature matrix.
train_rows = df_encoded.loc[df_encoded['test'] == 0]
X = train_rows.drop(columns=['satisfied', 'test'])
y = train_rows['satisfied']

et = ExtraTreesClassifier(n_estimators=800, random_state=SEED)
et.fit(X, y)

In [29]:
# Rank the features by the importance the fitted ExtraTrees model assigned them,
# most important first, with a fresh 0..n-1 index so row position equals rank.
# The frame is built from a dict so 'importance' stays a float column; stacking
# names and scores into one ndarray would force both into a shared object dtype.
feature_importance = (
    pd.DataFrame({'feature': list(X.columns),
                  'importance': et.feature_importances_})
    .sort_values('importance', ascending=False)
    .reset_index(drop=True)
)

In [31]:
# keep only top 350 important features
# feature_importance is sorted by descending importance, so everything from
# position n_keep onwards is dropped from the encoded frame.
n_keep = 350
non_important_cols = feature_importance['feature'].iloc[n_keep:]
df_encoded = df_encoded.drop(columns=non_important_cols)
df_encoded.shape

In [34]:
# save data to csv
# The index is written as the first column of the file.
output_path = "../data/processed/data_encoded_stacking_final.csv"
df_encoded.to_csv(output_path, index=True)