In [1]:
import os
from collections import Counter
from glob import glob

import joblib
import numpy as np
import pandas as pds
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import r2_score
from tqdm import tqdm

### Figure 1. Model performance comparison across feature groups for predicting UGA valuation outcomes (appreciation and view counts)

In [2]:
# Score every saved prediction file: R^2, Pearson r and Spearman rho between
# the stored targets (`y`) and the stored predictions (`y_hat`).
paths = sorted(glob('../dataset/models/250814/*.values'))

infos = []
for _path in tqdm(paths):
    # The file stem is '+'-separated; its tokens fill, in order, the first four
    # columns of `results_df` (Model, Target, Variables, w/ control variable).
    # os.path.basename is used instead of splitting on '/' so the parsing also
    # works with the backslash separator glob produces on Windows.
    tokens = os.path.basename(_path).split('.')[0].split('+')
    _values = joblib.load(_path)
    _y, _y_hat = _values['y'], _values['y_hat']

    # Only the coefficients ([0]) are kept: the p-values are almost zero in
    # every case, so they are not reported.
    infos.append(
        tokens + [
            r2_score(_y, _y_hat),
            pearsonr(_y, _y_hat)[0],
            spearmanr(_y, _y_hat)[0],
        ]
    )

# NOTE: the '$r$' column holds sklearn's r2_score (coefficient of
# determination), not a correlation coefficient. The label is left unchanged
# in case other cells select the column by this name.
results_df = pds.DataFrame(
    infos,
    columns='Model,Target,Variables,w/ control variable,$r$,pearson r,spearman r'.split(','),
)
# The control-variable flag is not part of the comparison table.
results_df = results_df.drop(columns=['w/ control variable'])

100%|██████████| 24/24 [00:00<00:00, 270.15it/s]


In [3]:
# Show the metric table: one row per loaded `.values` file.
results_df

Unnamed: 0,Model,Target,Variables,$r$,pearson r,spearman r
0,EBM,appreciation,All,0.370881,0.617144,0.622597
1,EBM,appreciation,Artist,0.210116,0.490633,0.489834
2,EBM,appreciation,Artwork,0.231095,0.503055,0.518775
3,EBM,view,All,0.391543,0.645115,0.647893
4,EBM,view,Artist,0.225594,0.512043,0.513057
5,EBM,view,Artwork,0.274315,0.551027,0.548947
6,LR,appreciation,All,0.228453,0.482293,0.502342
7,LR,appreciation,Artist,0.111092,0.340358,0.358456
8,LR,appreciation,Artwork,0.143495,0.411443,0.438152
9,LR,view,All,0.247649,0.536397,0.548827


### Figure 2. Feature importance rankings derived from XGB and EBM for predicting appreciation and view counts of UGA projects on Behance.

In [4]:
# Feature-importance inputs: fitted model files (*.models) and their matching
# feature-name files (*.column) for the XGB and EBM runs whose file name ends
# in 'All+True'. Each list is sorted so that the model list and the column
# list of a family come out in the same order.
xgb_model_paths, xgb_varsn_paths = (
    sorted(glob('../dataset/models/250814/XGB*All+True.models')),
    sorted(glob('../dataset/models/250814/XGB*All+True.column')),
)
ebm_model_paths, ebm_varsn_paths = (
    sorted(glob('../dataset/models/250814/EBM*All+True.models')),
    sorted(glob('../dataset/models/250814/EBM*All+True.column')),
)

In [5]:
# XGB: for each fitted XGB model, print its 20 most important features.
# The model and column lists come from two independent globs; zip() would
# silently truncate (and mis-pair) them if the counts differed, so check first.
if len(xgb_model_paths) != len(xgb_varsn_paths):
    raise ValueError(
        f'{len(xgb_model_paths)} XGB model files but '
        f'{len(xgb_varsn_paths)} column files'
    )

for _xgb_model_path, _xgb_varsn_path in zip(xgb_model_paths, xgb_varsn_paths):
    # A model must be paired with the column file of the same run, i.e. the
    # two paths may differ only in their extension.
    if _xgb_model_path.rsplit('.', 1)[0] != _xgb_varsn_path.rsplit('.', 1)[0]:
        raise ValueError(
            f'model/column files do not match: {_xgb_model_path} vs {_xgb_varsn_path}'
        )
    print(_xgb_varsn_path)

    _model_info, _vars_info = joblib.load(_xgb_model_path), joblib.load(_xgb_varsn_path)

    # One row per feature: its name and the model's feature_importances_ value.
    xgb_new_df = pds.DataFrame({
        'Variables': _vars_info,
        'Importance': _model_info.feature_importances_,
    })
    print(
        xgb_new_df
        .sort_values('Importance', ascending=False)
        .reset_index(drop=True)
        .iloc[:20]
    )

./dataset/models/250814/XGB+appreciation+All+True.column
                                            Variables  Importance
0             Artist_Inter_project_publication_period    0.046004
1          Artist_Curation_status_of_the_last_project    0.027108
2                          Artwork_Image_sample_count    0.019642
3              Artwork_Creative_fields-(Architecture)    0.017055
4                           Artwork_Contributor_count    0.016244
5                  Artwork_Creative_fields-(Branding)    0.014863
6                    Artist_Residency-(Taiwan Region)    0.012823
7             Artist_Residency-(United Arab Emirates)    0.011131
8                  Artwork_Creative_fields-(Painting)    0.011105
9                 Artwork_Creative_fields-(Packaging)    0.009802
10                  Artwork_Creative_fields-(Fashion)    0.009627
11                    Artist_Residency-(South Africa)    0.008941
12  Artwork_Visual_Features_latent_PCA_embeddings_...    0.008775
13                 

In [6]:
# EBM: for each fitted EBM model, print the 20 most important single-feature
# terms and write the two-feature (interaction) terms, labelled with the
# feature-group combination they belong to, to
# ../dataset/group_tables/<file stem>.csv.

# The model and column lists come from two independent globs; zip() would
# silently truncate (and mis-pair) them if the counts differed, so check first.
if len(ebm_model_paths) != len(ebm_varsn_paths):
    raise ValueError(
        f'{len(ebm_model_paths)} EBM model files but '
        f'{len(ebm_varsn_paths)} column files'
    )

# Make sure the output directory exists before the first to_csv call.
os.makedirs('../dataset/group_tables', exist_ok=True)

for _ebm_model_path, _ebm_varsn_path in zip(ebm_model_paths, ebm_varsn_paths):
    # A model must be paired with the column file of the same run, i.e. the
    # two paths may differ only in their extension.
    if _ebm_model_path.rsplit('.', 1)[0] != _ebm_varsn_path.rsplit('.', 1)[0]:
        raise ValueError(
            f'model/column files do not match: {_ebm_model_path} vs {_ebm_varsn_path}'
        )
    print(_ebm_varsn_path)
    # os.path.basename instead of split('/') so the stem is also extracted
    # correctly from the backslash-separated paths glob produces on Windows.
    fname = os.path.basename(_ebm_model_path).split('.')[0]

    _model_info, _vars_info = joblib.load(_ebm_model_path), joblib.load(_ebm_varsn_path)
    idx2vars = dict(enumerate(_vars_info))

    # Translate every term's feature indices into feature names. Single-feature
    # terms get None as their second variable.
    alloc_names = []
    for _pair in _model_info.term_features_:
        if len(_pair) == 1:
            alloc_names.append([idx2vars[_pair[0]], None])
        elif len(_pair) == 2:
            alloc_names.append([idx2vars[_pair[0]], idx2vars[_pair[1]]])
        else:
            # Skipping such a term would leave fewer rows than importances and
            # only fail later with an opaque length-mismatch error.
            raise ValueError(
                f'unsupported term with {len(_pair)} features in {_ebm_model_path}'
            )

    ebm_new_df = pds.DataFrame(alloc_names, columns=['Variable 1', 'Variable 2'])
    ebm_new_df['Importance'] = _model_info.term_importances()

    # Split into single-feature terms and two-feature terms.
    _is_single = ebm_new_df['Variable 2'].isna()
    single_ebm_df = (
        ebm_new_df.loc[_is_single]
        .drop(columns='Variable 2')
        .sort_values('Variable 1')
        .reset_index(drop=True)
    )
    pair_ebm_df = (
        ebm_new_df.loc[~_is_single]
        .sort_values('Variable 1')
        .reset_index(drop=True)
    )

    print(
        single_ebm_df
        .sort_values('Importance', ascending=False)
        .reset_index(drop=True)
        .iloc[:20]
    )

    # Label each pair by the prefixes (text before the first '_') of its two
    # variables. Sorting the prefixes makes the label order-independent, so
    # Artwork/Artist and Artist/Artwork both become 'Artist & Artwork'.
    # Pairs involving any other prefix get no label (None).
    group_ids = []
    for _var1, _var2 in zip(pair_ebm_df['Variable 1'], pair_ebm_df['Variable 2']):
        _prefixes = sorted((_var1.split('_')[0], _var2.split('_')[0]))
        if all(_prefix in ('Artist', 'Artwork') for _prefix in _prefixes):
            group_ids.append(' & '.join(_prefixes))
        else:
            group_ids.append(None)
    pair_ebm_df['Group'] = group_ids
    pair_ebm_df.to_csv(f'../dataset/group_tables/{fname}.csv', index=False)

./dataset/models/250814/EBM+appreciation+All+True.column
                                           Variable 1  Importance
0             Artist_Inter_project_publication_period    0.108297
1                          Artwork_Image_sample_count    0.054717
2                  Artwork_Visual_Features_Complexity    0.042493
3   Artist_Visual_simillarity_latent_PCA_embedding...    0.040775
4                           Artwork_Contributor_count    0.040626
5          Artist_Curation_status_of_the_last_project    0.036023
6               Artwork_Assigned_creative_field_count    0.029161
7                              Artist_Career_timeline    0.028388
8      Artwork_Visual_Features_red_Standard_deviation    0.024741
9     Artwork_Visual_Features_blue_Standard_deviation    0.023887
10                          Artist_Collaborator_count    0.023426
11             Artwork_Creative_fields-(Illustration)    0.022398
12  Artwork_Visual_Features_latent_PCA_embeddings_...    0.022126
13  Artwork_Visual_

### Figure 3. Combined influence of artwork and artist features in predicting UGA valuation

In [7]:
# Load the grouped pairwise-term tables for the two targets
# (appreciation first, view second).
appr_df, view_df = (
    pds.read_csv(csv_path)
    for csv_path in (
        "../dataset/group_tables/EBM+appreciation+All+True.csv",
        "../dataset/group_tables/EBM+view+All+True.csv",
    )
)

In [8]:
# How many rows carry each 'Group' label, per target; rows without a label
# are counted under the NaN key.
appr_group_counter = dict(Counter(appr_df['Group']))
view_group_counter = dict(Counter(view_df['Group']))

# Index assigned to each feature group along both axes of a 2x2 matrix.
g2index = dict(Artist=1, Artwork=0)

In [9]:
# Sum the 'Importance' of all rows sharing a group label into one 2x2 matrix
# per target. For a label 'G1 & G2' the sum is stored at
# [g2index[G2]][g2index[G1]]; cells whose combination never occurs stay 0.
target2sum_val = dict()

for _target, _pair_df in (('appreciation', appr_df), ('view', view_df)):
    _sums = np.zeros((2, 2))
    # Iterate over the labels of the frame being summed (the 'view' matrix was
    # previously driven by the appreciation frame's labels). Rows without a
    # label (NaN) belong to no combination and are skipped explicitly rather
    # than through a catch-all except, so malformed labels are not hidden.
    for _group in _pair_df['Group'].dropna().unique():
        g1, g2 = _group.split(' & ')
        _sums[g2index[g2]][g2index[g1]] = _pair_df.loc[
            _pair_df['Group'] == _group, 'Importance'
        ].sum()
    target2sum_val[_target] = _sums

In [10]:
# Rows per group label in the appreciation table (NaN = no label).
Counter(appr_df['Group'])

Counter({'Artist & Artist': 106,
         'Artist & Artwork': 232,
         'Artwork & Artwork': 58,
         nan: 16})

In [11]:
# Rows per group label in the view table (NaN = no label).
Counter(view_df['Group'])

Counter({'Artist & Artwork': 225,
         'Artist & Artist': 105,
         'Artwork & Artwork': 62,
         nan: 20})