In [27]:
import os
import pandas as pd
import numpy as np
import pickle
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import GridSearchCV

In [28]:
def get_weights_kulah(distances):
	"""Turn neighbour distances into weights that favour closer neighbours.

	For every query row with distances d_1..d_k and row sum S, the weight of
	neighbour i is ``(S - d_i) / ((k - 1) * S)``. The weights of one row
	therefore add up to 1, and a smaller distance yields a larger weight.

	Parameters
	----------
	distances : array-like of shape (n_queries, k)
		Distances from each query point to its k neighbours (the layout
		scikit-learn passes to a callable ``weights`` argument).

	Returns
	-------
	numpy.ndarray of shape (n_queries, k)
		One weight per neighbour, normalised per query row.

	Notes
	-----
	* The normalising sum is taken per row. Summing over the whole matrix
	  would make the weights of one query depend on every other query in
	  the same batch.
	* With a single neighbour (k == 1) the formula divides by zero, so the
	  only neighbour gets weight 1.
	* If all distances in a row are 0 the formula is 0/0; those rows fall
	  back to uniform weights 1/k.
	"""
	distances = np.asarray(distances, dtype=float)
	k = distances.shape[1]
	if k == 1:
		return np.ones_like(distances)

	row_sums = np.sum(distances, axis=1, keepdims=True)
	# Pre-fill with the uniform fallback; rows with a non-zero sum are overwritten below.
	weights = np.full_like(distances, 1.0 / k)
	np.divide(row_sums - distances, (k - 1) * row_sums, out=weights, where=row_sums != 0)
	return weights

def get_hyper_params(filename):
	"""Return the best RBF-SVR hyper-parameters for every column of ``y``.

	The result is cached as a pickle at ``dir_path + filename``. When that
	file exists it is loaded and returned as-is; otherwise a grid search over
	``C`` and ``gamma`` is run once per target column and the outcome is
	written to the cache before being returned.

	Reads the module-level names ``dir_path``, ``X`` and ``y``.

	Parameters
	----------
	filename : str
		Name of the cache file, appended to ``dir_path``.

	Returns
	-------
	dict
		Maps the positional column index of ``y`` to the ``best_params_``
		dict reported by the grid search for that column.
	"""
	cache_path = dir_path + filename

	# Fast path: reuse a previous search instead of repeating it.
	if os.path.exists(cache_path):
		with open(cache_path, 'rb') as cache_file:
			return pickle.load(cache_file)

	search = GridSearchCV(
		SVR(kernel='rbf'),
		param_grid={
			'gamma': np.logspace(-10, -1, 10),
			'C': np.logspace(0, 4, 5),
		},
		cv=ShuffleSplit(n_splits=5, test_size=0.2, random_state=42),
	)

	best_by_column = {}
	for column in range(y.shape[1]):
		# The same search object is refitted for each target column.
		search.fit(X, y.iloc[:, column])
		best_by_column[column] = search.best_params_

	with open(cache_path, 'wb') as cache_file:
		pickle.dump(best_by_column, cache_file, protocol=pickle.HIGHEST_PROTOCOL)
	return best_by_column

In [29]:
# All input files are read relative to this directory (note the trailing slash:
# paths are built by plain string concatenation).
dir_path = 'data/'

# Pickled pandas objects, loaded in the same order as they are named on the left.
fj_unlabeled_font_vectors, fj_labeled_font_vectors, common_attribute_labels = (
    pd.read_pickle(dir_path + pickle_name)
    for pickle_name in (
        'fj_ul_font_vectors.pkl',
        'fj_l_font_vectors.pkl',
        'common_attribute_labels.pkl',
    )
)

# Attribute names come from a text file; everything that is not listed as a
# typographic feature is treated as a semantic feature.
attribute_names = np.loadtxt(dir_path + 'attrNames.txt', dtype=str)
typographic_features = np.array(
    ['font_name', 'capitals', 'cursive', 'display', 'italic', 'monospace', 'serif']
)
semantic_features = attribute_names[
    np.isin(attribute_names, typographic_features, invert=True)
]

In [30]:
# Both frames are used without their first column.
# NOTE(review): presumably column 0 holds the font name/identifier - confirm.
_all_but_first_column = slice(1, None)
X = fj_labeled_font_vectors.iloc[:, _all_but_first_column]
y = common_attribute_labels.iloc[:, _all_but_first_column]

In [55]:
# Result accumulators, filled in by the model cells that follow:
#   error_by_attr_table - one row per semantic attribute, one column per model
#   error_table         - one row per model (mean error over the semantic attributes)
error_by_attr_table = pd.DataFrame([], index=semantic_features)
error_table = pd.DataFrame([])

In [56]:
# Kulahcioglu KNN
# Leave-one-out evaluation: every sample is predicted by a model fitted on all
# other samples, and the absolute error per attribute is averaged over samples.
model_label = 'Kulahcioglu KNN'
model = KNeighborsRegressor(n_neighbors=4, weights=get_weights_kulah, metric='cosine')

errors = []
all_positions = np.arange(len(X))
for held_out in all_positions:
    # Split by position (iloc) only, so the hold-out is correct for any index,
    # not just a default 0..n-1 RangeIndex.
    train_positions = np.delete(all_positions, held_out)
    model.fit(X.iloc[train_positions], y.iloc[train_positions])
    # Double brackets keep the held-out row two-dimensional, as predict() expects.
    f_p = model.predict(X.iloc[[held_out]]).flatten()
    f_t = y.iloc[held_out, :].values
    errors.append(np.abs(f_t - f_p))
errors = np.asarray(errors)
avg_errors = np.mean(errors, axis=0)
result = pd.DataFrame(data=avg_errors, index=attribute_names)
semantic_result = result[result.index.isin(semantic_features)]

print('Error by attribute (ordered by error, ascending):')
print(semantic_result.sort_values(by=0).round(3))

semantic_result.columns = [model_label]
# Remove any earlier entry for this model first, so re-running the cell
# replaces its results instead of appending duplicates.
error_by_attr_table = pd.concat(
    [error_by_attr_table.drop(columns=[model_label], errors='ignore'), semantic_result],
    axis=1,
)
error_table = pd.concat(
    [error_table.drop(index=[model_label], errors='ignore'), semantic_result.mean().round(3)],
    axis=0,
)

Error by attribute (ordered by error, ascending):
                        0
gentle              0.051
fresh               0.052
delicate            0.055
wide                0.058
charming            0.060
calm                0.064
friendly            0.069
soft                0.070
strong              0.073
attention-grabbing  0.075
thin                0.078
graceful            0.079
sloppy              0.080
happy               0.081
attractive          0.082
pretentious         0.085
warm                0.087
modern              0.088
boring              0.091
clumsy              0.095
disorderly          0.096
bad                 0.097
legible             0.098
artistic            0.101
sharp               0.101
playful             0.101
formal              0.102
dramatic            0.102
angular             0.106
complex             0.108
technical           0.108


In [57]:
# Best KNN
# Leave-one-out evaluation: every sample is predicted by a model fitted on all
# other samples, and the absolute error per attribute is averaged over samples.
model_label = 'Lowest Error KNN'
model = KNeighborsRegressor(n_neighbors=7, weights='distance', metric='cosine')

errors = []
all_positions = np.arange(len(X))
for held_out in all_positions:
    # Split by position (iloc) only, so the hold-out is correct for any index,
    # not just a default 0..n-1 RangeIndex.
    train_positions = np.delete(all_positions, held_out)
    model.fit(X.iloc[train_positions], y.iloc[train_positions])
    # Double brackets keep the held-out row two-dimensional, as predict() expects.
    f_p = model.predict(X.iloc[[held_out]]).flatten()
    f_t = y.iloc[held_out, :].values
    errors.append(np.abs(f_t - f_p))
errors = np.asarray(errors)
avg_errors = np.mean(errors, axis=0)
result = pd.DataFrame(data=avg_errors, index=attribute_names)
semantic_result = result[result.index.isin(semantic_features)]

print('Error by attribute (ordered by error, ascending):')
print(semantic_result.sort_values(by=0).round(3))

semantic_result.columns = [model_label]
# Remove any earlier entry for this model first, so re-running the cell
# replaces its results instead of appending duplicates.
error_by_attr_table = pd.concat(
    [error_by_attr_table.drop(columns=[model_label], errors='ignore'), semantic_result],
    axis=1,
)
error_table = pd.concat(
    [error_table.drop(index=[model_label], errors='ignore'), semantic_result.mean().round(3)],
    axis=0,
)

Error by attribute (ordered by error, ascending):
                        0
gentle              0.047
fresh               0.049
delicate            0.052
charming            0.055
wide                0.057
calm                0.062
friendly            0.064
soft                0.065
strong              0.072
attention-grabbing  0.073
sloppy              0.073
graceful            0.074
happy               0.076
attractive          0.079
thin                0.081
pretentious         0.082
warm                0.085
modern              0.085
boring              0.085
disorderly          0.091
clumsy              0.092
bad                 0.092
legible             0.094
sharp               0.095
playful             0.095
dramatic            0.095
artistic            0.096
formal              0.098
complex             0.104
angular             0.105
technical           0.108


In [58]:
# Best SVM
# One RBF-SVR per attribute, each evaluated leave-one-out with the
# hyper-parameters returned by get_hyper_params for that attribute.
# NOTE(review): get_hyper_params searches on the full X / y, so the held-out
# sample also took part in choosing C and gamma - the errors may be optimistic.
model_label = 'Lowest Error SVM'
hyper_params = get_hyper_params('hyper_params.pkl')
all_positions = np.arange(len(X))
result = []
for i in range(y.shape[1]):
    y_1d = y.iloc[:, i]
    model = SVR(kernel='rbf', C=hyper_params[i]['C'], gamma=hyper_params[i]['gamma'])
    errors = []
    for held_out in all_positions:
        # Split by position (iloc) only, so the hold-out is correct for any
        # index, not just a default 0..n-1 RangeIndex.
        train_positions = np.delete(all_positions, held_out)
        model.fit(X.iloc[train_positions], y_1d.iloc[train_positions])
        # Double brackets keep the held-out row two-dimensional, as predict() expects.
        f_p = model.predict(X.iloc[[held_out]]).flatten()
        f_t = y_1d.iloc[held_out]
        errors.append(np.abs(f_t - f_p))
    errors = np.asarray(errors)
    avg_error = np.mean(errors, axis=0)
    result.append(avg_error)
result = pd.DataFrame(data=result, index=attribute_names)
semantic_result = result[result.index.isin(semantic_features)]

print('Error by attribute (ordered by error, ascending):')
print(semantic_result.sort_values(by=0).round(3))

semantic_result.columns = [model_label]
# Remove any earlier entry for this model first, so re-running the cell
# replaces its results instead of appending duplicates.
error_by_attr_table = pd.concat(
    [error_by_attr_table.drop(columns=[model_label], errors='ignore'), semantic_result],
    axis=1,
)
error_table = pd.concat(
    [error_table.drop(index=[model_label], errors='ignore'), semantic_result.mean().round(3)],
    axis=0,
)

Error by attribute (ordered by error, ascending):
                        0
delicate            0.060
charming            0.060
soft                0.065
fresh               0.065
gentle              0.066
strong              0.066
wide                0.066
friendly            0.067
sloppy              0.067
calm                0.070
attention-grabbing  0.073
happy               0.073
thin                0.076
attractive          0.077
warm                0.079
modern              0.080
graceful            0.081
pretentious         0.082
clumsy              0.082
bad                 0.084
dramatic            0.087
artistic            0.088
legible             0.088
playful             0.089
disorderly          0.090
complex             0.090
boring              0.091
sharp               0.092
formal              0.092
angular             0.095
technical           0.107


In [64]:
# TODO: evaluate additional models here, following the same pattern as the cells above.

In [65]:
# Sort on the attribute name before printing so the table really is in the
# alphabetical order the heading announces, whatever order the rows were built in.
# sort_index() returns a sorted copy; error_by_attr_table itself is left untouched.
print('Error By Attribute Table (ordered alphabetically by semantic attribute name): ')
print(error_by_attr_table.sort_index())

Error By Attribute Table (ordered alphabetically by semantic attribute name): 
                    Kulahcioglu KNN  Lowest Error KNN  Lowest Error SVM
angular                    0.106042          0.105307          0.094690
artistic                   0.100523          0.096274          0.087652
attention-grabbing         0.074598          0.072855          0.073328
attractive                 0.082439          0.079318          0.076957
bad                        0.096762          0.091685          0.084426
boring                     0.090999          0.085246          0.090782
calm                       0.064256          0.062258          0.070130
charming                   0.060042          0.055430          0.059798
clumsy                     0.095413          0.091512          0.082222
complex                    0.107579          0.103535          0.090366
delicate                   0.055431          0.052378          0.059517
disorderly                 0.095505          0.090648    

In [66]:
# Give the single value column a readable name, then show the per-model summary.
error_table.columns = ['Error']
print('Error Table:', error_table, sep='\n')

Error Table:
                  Error
Kulahcioglu KNN   0.084
Lowest Error KNN  0.080
Lowest Error SVM  0.079
