In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import QuantileTransformer, MinMaxScaler
from sklearn.model_selection import GridSearchCV

In [10]:
def get_score(true_values, predicted_values):
    mae_ = mean_absolute_error(true_values, predicted_values)
    mse_ = mean_squared_error(true_values, predicted_values)
    r2_ = r2_score(true_values, predicted_values)

    return mae_, mse_, r2_

# Loading data

In [3]:
dataset = pd.read_csv('dataset_all_features.csv')

In [4]:
dataset = dataset.loc[dataset['COND'] != '0']

data = dataset.iloc[:, :-3]
labels = dataset.iloc[:, -2:]

# Deleting categorical features

In [5]:
data.drop(columns=['FACEATTRIBUTES-BLUR-BLURLEVEL', 'FACEATTRIBUTES-EXPOSURE-EXPOSURELEVEL', 'FACEATTRIBUTES-GENDER',
                   'FACEATTRIBUTES-GLASSES', 'FACEATTRIBUTES-HAIR-INVISIBLE', 'FACEATTRIBUTES-MAKEUP-EYEMAKEUP',
                   'FACEATTRIBUTES-MAKEUP-LIPMAKEUP', 'FACEATTRIBUTES-NOISE-NOISELEVEL',
                   'FACEATTRIBUTES-ACCESSORIES', 'FACEID'],
          inplace=True)

# Splitting data

In [6]:
data_train_full, data_test, labels_train_full, labels_test = train_test_split(data, labels, test_size=0.2, random_state=123)

data_train, data_validation, labels_train, labels_validation = train_test_split(data_train_full, labels_train_full, test_size=0.2, random_state=123)

data_train = data_train.loc[labels_train['ANS_AROUSAL'] != 1]
data_train = data_train.loc[labels_train['ANS_AROUSAL'] != 5]
data_train = data_train.loc[labels_train['ANS_AROUSAL'] != 9]
labels_train = labels_train.loc[labels_train['ANS_AROUSAL'] != 1]
labels_train = labels_train.loc[labels_train['ANS_AROUSAL'] != 5]
labels_train = labels_train.loc[labels_train['ANS_AROUSAL'] != 9]

data_train = data_train.loc[labels_train['ANS_VALENCE'] != 1]
data_train = data_train.loc[labels_train['ANS_VALENCE'] != 5]
data_train = data_train.loc[labels_train['ANS_VALENCE'] != 9]
labels_train = labels_train.loc[labels_train['ANS_VALENCE'] != 1]
labels_train = labels_train.loc[labels_train['ANS_VALENCE'] != 5]
labels_train = labels_train.loc[labels_train['ANS_VALENCE'] != 9]

# Scaling values to 0 - 1

In [7]:
mms = MinMaxScaler()

data_train_mms = mms.fit_transform(data_train)
data_train_mms = pd.DataFrame(data_train_mms, columns=data_train.columns)
data_validation_mms = mms.fit_transform(data_validation)
data_validation_mms = pd.DataFrame(data_validation_mms, columns=data_validation.columns)

# Scaling input values to Gaussian distributions

In [8]:
quantile_transformer = QuantileTransformer(output_distribution='normal', random_state=0)
data_train_qt = quantile_transformer.fit_transform(data_train_mms)
data_train_qt = pd.DataFrame(data_train_qt, columns=data_train_mms.columns)
data_validation_qt = quantile_transformer.fit_transform(data_validation_mms)
data_validation_qt = pd.DataFrame(data_validation_qt, columns=data_validation_mms.columns)

# Training

In [9]:
etr = ExtraTreesRegressor(random_state=0, n_jobs=2)

etr.fit(data_train_qt, labels_train)
print('Training finished')

Training finished


In [10]:
predictions_etr = etr.predict(data_validation_qt)

mae_etr, mse_etr, r2_etr = get_score(labels_validation, predictions_etr)
print(f'''Values for validation set:\nMAE: {mae_etr}\nMSE: {mse_etr}\nRMSE: {mse_etr**.5}\nR2:  {r2_etr}''')

Values for validation set:
MAE: 1.7652871421275371
MSE: 5.057169301395082
RMSE: 2.2488150883065248
R2:  0.1153286055143053


In [11]:
etr.feature_importances_

array([0.00964215, 0.01236831, 0.00114631, 0.00148899, 0.00618445,
       0.01484759, 0.01462721, 0.00524179, 0.0139764 , 0.01420477,
       0.03423443, 0.0045812 , 0.00456687, 0.00342345, 0.01049285,
       0.01023275, 0.01002933, 0.00951984, 0.01036095, 0.00976055,
       0.01065344, 0.02090903, 0.02411804, 0.02083641, 0.01234031,
       0.00615665, 0.01167762, 0.01092877, 0.01109453, 0.01207841,
       0.01179602, 0.0107324 , 0.01113929, 0.01153588, 0.01087303,
       0.01107996, 0.01078311, 0.01018783, 0.01071105, 0.0112621 ,
       0.01130897, 0.01031863, 0.01106218, 0.01127518, 0.0110871 ,
       0.00991033, 0.01080677, 0.01107127, 0.01101143, 0.01031379,
       0.01284411, 0.01223743, 0.01267256, 0.01190657, 0.01186138,
       0.01062043, 0.0109687 , 0.01047948, 0.01220194, 0.01067505,
       0.01110687, 0.01038984, 0.01090634, 0.01061529, 0.01123026,
       0.01001254, 0.01134855, 0.01155887, 0.01100487, 0.01066327,
       0.01146587, 0.01038793, 0.01202868, 0.01009141, 0.01156

# Optimizing parameters

In [12]:
# param_grid = [{
#     'n_estimators': [300, 400, 500],
#     'min_samples_split': [2],
#     'min_samples_leaf': [1]
# }]

In [13]:
# etr = ExtraTreesRegressor()
# clf = GridSearchCV(estimator=etr, param_grid=param_grid, scoring='neg_root_mean_squared_error', cv=3, verbose=10)
# clf.fit(data_train, labels_train)
# print('Search finished')

In [14]:
# clf.best_params_

### Using best parameters

In [13]:
etr_best = ExtraTreesRegressor(n_estimators=500, min_samples_split=2, min_samples_leaf=1, random_state=0, n_jobs=2)

etr_best.fit(data_train_qt, labels_train)
print('Training finished')

Training finished


In [14]:
predictions_etr_best = etr_best.predict(data_validation_qt)

mae_etr_best, mse_etr_best, r2_etr_best = get_score(labels_validation, predictions_etr_best)
print(f'''Values for validation set:\nMAE: {mae_etr_best}\nMSE: {mse_etr_best}\nRMSE: {mse_etr_best**.5}\nR2:  {r2_etr_best}''')

Values for validation set:
MAE: 1.762043108711513
MSE: 5.026648619820291
RMSE: 2.2420188714237645
R2:  0.12105716642648334


In [15]:
np.max(predictions_etr_best, axis=0)

array([8.385836, 8.203664])

In [16]:
np.min(predictions_etr_best, axis=0)

array([1.765982, 1.676062])

# KFold

In [11]:
from sklearn.model_selection import cross_validate

model = ExtraTreesRegressor(n_estimators=500, min_samples_split=2, min_samples_leaf=1, random_state=0, n_jobs=2)

k = 5
cv = cross_validate(estimator=model, X=data_train_qt, y=labels_train, scoring='neg_mean_absolute_error', cv=k, verbose=10)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] START .....................................................................
[CV] END ......................................, score=-1.135 total time= 1.3min
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.3min remaining:    0.0s


[CV] END ......................................, score=-1.159 total time= 1.4min
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  2.7min remaining:    0.0s


[CV] END ......................................, score=-1.132 total time= 1.3min
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  4.0min remaining:    0.0s


[CV] END ......................................, score=-1.134 total time= 1.3min
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  5.3min remaining:    0.0s


[CV] END ......................................, score=-1.152 total time= 1.4min
Result for 0 fold: MAE: -1.1354090984370717
Result for 1 fold: MAE: -1.1592654428297213
Result for 2 fold: MAE: -1.1315027702221014
Result for 3 fold: MAE: -1.1344697921579352
Result for 4 fold: MAE: -1.1524388069646299
Average MAE: -1.142617182122292


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  6.8min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  6.8min finished


In [12]:
for f in range(k):
    print(f'Result for {f} fold: MAE: {cv["test_score"][f] * -1.0}')
print(f'Average MAE: {np.sum(cv["test_score"] / k * -1.0)}')

Result for 0 fold: MAE: 1.1354090984370717
Result for 1 fold: MAE: 1.1592654428297213
Result for 2 fold: MAE: 1.1315027702221014
Result for 3 fold: MAE: 1.1344697921579352
Result for 4 fold: MAE: 1.1524388069646299
Average MAE: 1.142617182122292


In [17]:
model = ExtraTreesRegressor(n_estimators=500, min_samples_split=2, min_samples_leaf=1, random_state=0, n_jobs=2)

k = 5
cv = cross_validate(estimator=model, X=data_validation_qt, y=labels_validation, scoring='neg_mean_absolute_error', cv=k, verbose=10)

[CV] START .....................................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END ......................................, score=-1.745 total time=  31.0s
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   31.0s remaining:    0.0s


[CV] END ......................................, score=-1.764 total time=  27.4s
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   58.4s remaining:    0.0s


[CV] END ......................................, score=-1.730 total time=  28.7s
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.5min remaining:    0.0s


[CV] END ......................................, score=-1.698 total time=  27.6s
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  1.9min remaining:    0.0s


[CV] END ......................................, score=-1.712 total time=  27.9s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.4min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.4min finished


In [18]:
for f in range(k):
    print(f'Result for {f} fold: MAE: {cv["test_score"][f] * -1.0}')
print(f'Average MAE: {np.sum(cv["test_score"] / k * -1.0)}')

Result for 0 fold: MAE: 1.744619967177242
Result for 1 fold: MAE: 1.7644575251641141
Result for 2 fold: MAE: 1.7297182100656436
Result for 3 fold: MAE: 1.698109462773723
Result for 4 fold: MAE: 1.7119964627737225
Average MAE: 1.729780325590889


In [19]:
data_mms = mms.fit_transform(data)
data_qt = quantile_transformer.fit_transform(data)

In [20]:
model = ExtraTreesRegressor(n_estimators=500, min_samples_split=2, min_samples_leaf=1, random_state=0, n_jobs=2)

k = 5
cv = cross_validate(estimator=model, X=data_qt, y=labels, scoring='neg_mean_absolute_error', cv=k, verbose=10)

[CV] START .....................................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END ......................................, score=-2.012 total time= 2.9min
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.9min remaining:    0.0s


[CV] END ......................................, score=-2.103 total time= 3.0min
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  5.9min remaining:    0.0s


[CV] END ......................................, score=-1.974 total time= 3.0min
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  8.9min remaining:    0.0s


[CV] END ......................................, score=-2.059 total time= 3.0min
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed: 11.8min remaining:    0.0s


[CV] END ......................................, score=-2.107 total time= 2.7min


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 14.6min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 14.6min finished


In [21]:
for f in range(k):
    print(f'Result for {f} fold: MAE: {cv["test_score"][f] * -1.0}')
print(f'Average MAE: {np.sum(cv["test_score"] / k * -1.0)}')

Result for 0 fold: MAE: 2.0123005963581146
Result for 1 fold: MAE: 2.103430674331743
Result for 2 fold: MAE: 1.973951768036418
Result for 3 fold: MAE: 2.0588075443614344
Result for 4 fold: MAE: 2.107128090590709
Average MAE: 2.051123734735684
