In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import QuantileTransformer, MinMaxScaler
from sklearn.model_selection import GridSearchCV

In [2]:
def get_score(true_values, predicted_values):
    mae_ = mean_absolute_error(true_values, predicted_values)
    mse_ = mean_squared_error(true_values, predicted_values)
    r2_ = r2_score(true_values, predicted_values)

    return mae_, mse_, r2_

# Loading data

### Using only values for FaceAPI without neutral gave the best result

In [3]:
dataset = pd.read_csv('dataset.csv')

In [4]:
dataset = dataset.loc[dataset['COND'] != '0']

data = dataset.iloc[:, :-3]
labels = dataset.iloc[:, -2:]

In [5]:
data_train_full, data_test, labels_train_full, labels_test = train_test_split(data, labels, test_size=0.2, random_state=123)

data_train, data_validation, labels_train, labels_validation = train_test_split(data_train_full, labels_train_full, test_size=0.2, random_state=123)

data_train = data_train.loc[labels_train['ANS_AROUSAL'] != 1]
data_train = data_train.loc[labels_train['ANS_AROUSAL'] != 5]
data_train = data_train.loc[labels_train['ANS_AROUSAL'] != 9]
labels_train = labels_train.loc[labels_train['ANS_AROUSAL'] != 1]
labels_train = labels_train.loc[labels_train['ANS_AROUSAL'] != 5]
labels_train = labels_train.loc[labels_train['ANS_AROUSAL'] != 9]

data_train = data_train.loc[labels_train['ANS_VALENCE'] != 1]
data_train = data_train.loc[labels_train['ANS_VALENCE'] != 5]
data_train = data_train.loc[labels_train['ANS_VALENCE'] != 9]
labels_train = labels_train.loc[labels_train['ANS_VALENCE'] != 1]
labels_train = labels_train.loc[labels_train['ANS_VALENCE'] != 5]
labels_train = labels_train.loc[labels_train['ANS_VALENCE'] != 9]

# Scaling personality values to range 0 - 1

In [6]:
mms = MinMaxScaler()

data_train_mms = mms.fit_transform(data_train)
data_train_mms = pd.DataFrame(data_train_mms, columns=data_train.columns)
data_validation_mms = mms.fit_transform(data_validation)
data_validation_mms = pd.DataFrame(data_validation_mms, columns=data_validation.columns)

# Scaling input values to Gaussian distributions

In [7]:
quantile_transformer = QuantileTransformer(output_distribution='normal', random_state=0)
data_train_qt = quantile_transformer.fit_transform(data_train_mms)
data_train_qt = pd.DataFrame(data_train_qt, columns=data_train_mms.columns)
data_validation_qt = quantile_transformer.fit_transform(data_validation_mms)
data_validation_qt = pd.DataFrame(data_validation_qt, columns=data_validation_mms.columns)

# Training

In [8]:
etr = ExtraTreesRegressor(random_state=0)

etr.fit(data_train_qt, labels_train)
print('Training finished')

Training finished


In [9]:
predictions_etr = etr.predict(data_validation_qt)

mae_etr, mse_etr, r2_etr = get_score(labels_validation, predictions_etr)
print(f'''Values for validation set:\nMAE: {mae_etr}\nMSE: {mse_etr}\nRMSE: {mse_etr**.5}\nR2:  {r2_etr}''')

Values for validation set:
MAE: 2.0382980835131095
MSE: 6.6463294245657885
RMSE: 2.5780475993599863
R2:  -0.163272549890205


# Optimizing parameters

In [10]:
# param_grid = [{
#     'n_estimators': [100, 200, 300, 400, 500],
#     'min_samples_split': [2, 4, 8, 16, 32],
#     'min_samples_leaf': [1, 2, 4, 8, 16]
# }]

In [11]:
# clf = GridSearchCV(estimator=etr, param_grid=param_grid, scoring='neg_root_mean_squared_error', n_jobs=2, cv=3, verbose=1)
# clf.fit(data_train, labels_train)
# print('Search finished')

In [12]:
# clf.best_params_

In [13]:
# predictions_clf = clf.predict(data_validation_qt)
#
# mae_clf, mse_clf, r2_clf = get_score(labels_validation, predictions_clf)
# print(f'''Values for validation set:\nMAE: {mae_clf}\nMSE: {mse_clf}\nRMSE: {mse_clf**.5}\nR2:  {r2_clf}''')

### Using best parameters

In [14]:
etr_best = ExtraTreesRegressor(n_estimators=500, min_samples_split=2, min_samples_leaf=16, random_state=0, n_jobs=2)

etr_best.fit(data_train_qt, labels_train)
print('Training finished')

Training finished


In [15]:
predictions_etr_best = etr_best.predict(data_validation_qt)

mae_etr_best, mse_etr_best, r2_etr_best = get_score(labels_validation, predictions_etr_best)
print(f'''Values for validation set:\nMAE: {mae_etr_best}\nMSE: {mse_etr_best}\nRMSE: {mse_etr_best**.5}\nR2:  {r2_etr_best}''')

Values for validation set:
MAE: 1.9541837585722288
MSE: 5.622815549551893
RMSE: 2.3712476778168687
R2:  0.02316915642577294


# Using full dataset

In [16]:
dataset = pd.read_csv('dataset_full.csv')
dataset = dataset.loc[dataset['COND'] != '0']

data = dataset.iloc[:, :-3]
labels = dataset.iloc[:, -2:]
data_train_full, data_test, labels_train_full, labels_test = train_test_split(data, labels, test_size=0.2,
                                                                              random_state=123)

data_train, data_validation, labels_train, labels_validation = train_test_split(data_train_full, labels_train_full,
                                                                                test_size=0.2, random_state=123)

data_train = data_train.loc[labels_train['ANS_AROUSAL'] != 1]
data_train = data_train.loc[labels_train['ANS_AROUSAL'] != 5]
data_train = data_train.loc[labels_train['ANS_AROUSAL'] != 9]
labels_train = labels_train.loc[labels_train['ANS_AROUSAL'] != 1]
labels_train = labels_train.loc[labels_train['ANS_AROUSAL'] != 5]
labels_train = labels_train.loc[labels_train['ANS_AROUSAL'] != 9]

data_train = data_train.loc[labels_train['ANS_VALENCE'] != 1]
data_train = data_train.loc[labels_train['ANS_VALENCE'] != 5]
data_train = data_train.loc[labels_train['ANS_VALENCE'] != 9]
labels_train = labels_train.loc[labels_train['ANS_VALENCE'] != 1]
labels_train = labels_train.loc[labels_train['ANS_VALENCE'] != 5]
labels_train = labels_train.loc[labels_train['ANS_VALENCE'] != 9]

In [17]:
data_train_mms = mms.fit_transform(data_train)
data_train_mms = pd.DataFrame(data_train_mms, columns=data_train.columns)
data_validation_mms = mms.fit_transform(data_validation)
data_validation_mms = pd.DataFrame(data_validation_mms, columns=data_validation.columns)

In [18]:
quantile_transformer = QuantileTransformer(output_distribution='normal', random_state=0)
data_train_qt = quantile_transformer.fit_transform(data_train_mms)
data_train_qt = pd.DataFrame(data_train_qt, columns=data_train_mms.columns)
data_validation_qt = quantile_transformer.fit_transform(data_validation_mms)
data_validation_qt = pd.DataFrame(data_validation_qt, columns=data_validation_mms.columns)

In [19]:
etr_best = ExtraTreesRegressor(n_estimators=500, min_samples_split=2, min_samples_leaf=16, random_state=0, n_jobs=2)

etr_best.fit(data_train_qt, labels_train)
print('Training finished')

Training finished


In [20]:
predictions_etr_best = etr_best.predict(data_validation_qt)

mae_etr_best, mse_etr_best, r2_etr_best = get_score(labels_validation, predictions_etr_best)
print(f'''Values for validation set:\nMAE: {mae_etr_best}\nMSE: {mse_etr_best}\nRMSE: {mse_etr_best**.5}\nR2:  {r2_etr_best}''')

Values for validation set:
MAE: 1.94933299852981
MSE: 5.616215378242968
RMSE: 2.369855560628742
R2:  0.037406496749699814
