In [97]:
import numpy as np 
import pandas as pd 

# preprocessing/decomposition
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA, FastICA, FactorAnalysis, KernelPCA, TruncatedSVD

from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection

# model evaluation
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.metrics import r2_score, mean_squared_error

# supportive models
from sklearn.ensemble import GradientBoostingRegressor
# feature selection (from supportive model)
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import GridSearchCV

# Shared random seed; passed as `random_state` to the GradientBoostingRegressor
# used for feature-importance selection further down.
seed = 42 

import os 

from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt
import seaborn as sns

In [91]:
# Data locations. The defaults are the original absolute paths; set the
# MERCEDES_INPUT_DIR / MERCEDES_OUTPUT_DIR environment variables to run the
# notebook on another machine without editing this cell.
INPUT = os.environ.get('MERCEDES_INPUT_DIR', '/media/tin/DATA/Kaggle/Mercedes/Input/')
OUTPUT = os.environ.get('MERCEDES_OUTPUT_DIR', '/media/tin/DATA/Kaggle/Mercedes/Output/')

train = pd.read_csv(os.path.join(INPUT, 'train_section.csv'))
test = pd.read_csv(os.path.join(INPUT, 'test_section.csv'))

# Columns removed from both frames. Keeping the list in one place guarantees
# that train and test always lose exactly the same features.
dropped_columns = ['X11', 'X93', 'X107', 'X233', 'X235', 'X268']
train = train.drop(dropped_columns, axis=1)
test = test.drop(dropped_columns, axis=1)

# save IDs for submission
id_test = test['ID'].copy()

# glue datasets together (row-wise; every row keeps the label it had in its
# own frame, which later cells rely on when they split the data again)
total = pd.concat([train, test], axis=0)
print('initial shape: {}'.format(total.shape))

# boolean mask for the train/test split: rows with a non-null target `y`
# are treated as training rows, all others as test rows
is_train = ~total.y.isnull()

initial shape: (8418, 373)


In [92]:
# Force 'X0' and 'group' to object dtype so the dtype-based categorical
# lookup in the next cell picks them up as well.
total = total.astype({'X0': 'object', 'group': 'object'})
    

In [93]:


# names of every object-dtype (categorical) column in the combined frame
cf = total.select_dtypes(include='object').columns

In [78]:
#cf.append(pd.Index(['X0', 'group']))

Index(['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8', 'X0', 'group'], dtype='object')

In [95]:


# One-hot encode the categorical columns with pandas.get_dummies.
# The indicator dtype is pinned to uint8: pandas < 2.0 used uint8 by default
# while pandas >= 2.0 defaults to bool, which would change the dtypes of the
# engineered frames and write True/False instead of 1/0 to the exported CSVs.
dummies = pd.get_dummies(
    total[cf],
    drop_first=False,  # set to True to avoid multicollinearity (crucial for linear models)
    dtype=np.uint8
)

print('oh-encoded shape: {}'.format(dummies.shape))

# replace the raw categorical columns with their one-hot-encoded versions
total = pd.concat(
    [
        total.drop(cf, axis=1),  # everything except the raw categoricals
        dummies                  # the encoded indicator columns
    ],
    axis=1  # column-wise
)

print('appended-encoded shape: {}'.format(total.shape))

# recreate train/test: ID is dropped from both frames; the test rows also
# lose the `y` column, while train keeps it as the target
train = total[is_train].drop(['ID'], axis=1)
test = total[~is_train].drop(['ID', 'y'], axis=1)

# drop redundant objects
del total

# check shape
print('\nTrain shape: {}\nTest shape: {}'.format(train.shape, test.shape))

oh-encoded shape: (8418, 216)
appended-encoded shape: (8418, 580)

Train shape: (4209, 579)
Test shape: (4209, 578)


In [99]:
# Number of components requested from each decomposition / random projection
# fitted in the next cell (tSVD, PCA, ICA, GRP, SRP).
n_comp = 10

In [100]:
# Every transformer below is fitted on the training features only (target
# removed) and then applied to the test frame. Build that feature matrix once
# instead of re-dropping `y` for each transformer.
train_features = train.drop(["y"], axis=1)

# tSVD
tsvd = TruncatedSVD(n_components=n_comp, random_state=420)
tsvd_results_train = tsvd.fit_transform(train_features)
tsvd_results_test = tsvd.transform(test)

# PCA
pca = PCA(n_components=n_comp, random_state=420)
pca2_results_train = pca.fit_transform(train_features)
pca2_results_test = pca.transform(test)

# ICA
ica = FastICA(n_components=n_comp, random_state=420)
ica2_results_train = ica.fit_transform(train_features)
ica2_results_test = ica.transform(test)

# GRP
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)
grp_results_train = grp.fit_transform(train_features)
grp_results_test = grp.transform(test)

# SRP
srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=420)
srp_results_train = srp.fit_transform(train_features)
srp_results_test = srp.transform(test)

# save columns list before adding the decomposition components
usable_columns = list(set(train.columns) - set(['y']))

# (column prefix, train components, test components), listed in the order
# in which the columns are appended for each component number
component_sets = [
    ('pca', pca2_results_train, pca2_results_test),
    ('ica', ica2_results_train, ica2_results_test),
    ('tsvd', tsvd_results_train, tsvd_results_test),
    ('grp', grp_results_train, grp_results_test),
    ('srp', srp_results_train, srp_results_test),
]

# Collect the new columns first (pca_1, ica_1, tsvd_1, grp_1, srp_1, pca_2, ...)
# and append them with a single concat per frame; inserting 50 columns one at
# a time fragments the already wide frames.
train_components = {}
test_components = {}
for i in range(1, n_comp + 1):
    for prefix, train_values, test_values in component_sets:
        column_name = prefix + '_' + str(i)
        train_components[column_name] = train_values[:, i - 1]
        test_components[column_name] = test_values[:, i - 1]

# The component arrays are positional, so give them the frame's own index
# before the label-aligned column-wise concat.
train = pd.concat([train, pd.DataFrame(train_components, index=train.index)], axis=1)
test = pd.concat([test, pd.DataFrame(test_components, index=test.index)], axis=1)



In [101]:
# create augmentation by feature importances as additional features
t = train['y']
tr = train.drop(['y'], axis=1)

# Tree-based estimators can be used to compute feature importances
clf = GradientBoostingRegressor(
                max_depth=4, 
                learning_rate=0.005, 
                random_state=seed, 
                subsample=0.95, 
                n_estimators=200
)

# fit regressor
clf.fit(tr, t)

# feature importances, indexed by feature name and sorted ascending
features = (
    pd.DataFrame({'feature': tr.columns, 'importance': clf.feature_importances_})
    .sort_values(by=['importance'], ascending=True)
    .set_index('feature')
)

# select best features, reusing the already fitted regressor (prefit=True)
model = SelectFromModel(clf, prefit=True)
train_reduced = model.transform(tr)
test_reduced = model.transform(test)

# dataset augmentation: append the selected features as extra columns.
# The reduced matrices are positional, while a column-wise concat aligns on
# row labels, so they are given the target frame's index explicitly.
# Without it, any frame whose index is not exactly 0..n-1 would come out
# with misaligned, NaN-padded rows.
train = pd.concat([train, pd.DataFrame(train_reduced, index=train.index)], axis=1)
test = pd.concat([test, pd.DataFrame(test_reduced, index=test.index)], axis=1)

# check new shape
print('\nTrain shape: {}\nTest shape: {}'.format(train.shape, test.shape))


Train shape: (4209, 653)
Test shape: (4209, 652)


In [102]:
# Write the engineered train/test frames as CSV files (without the index
# column) into the INPUT directory.
for engineered_frame, file_name in ((train, 'train_engineer.csv'), (test, 'test_engineer.csv')):
    engineered_frame.to_csv(os.path.join(INPUT, file_name), index=False)