### For setup: change working directory to parent and load config for correct paths

In [1]:
import pandas as pd
import json
from pathlib import Path
import os
# Switch the working directory to the parent folder before importing the
# project configuration (presumably so that the `config` module and the paths
# it provides resolve from there — TODO confirm).
# NOTE(review): this is not idempotent — every re-run of this cell moves one
# more directory level up, so execute it only once per kernel session.
os.chdir('..')
from config import Config
# Configuration object; its `output_dir` attribute is used to locate result files.
cfg = Config.get()

First, the result directory is specified, the result files are read, and the name column is adjusted.

In [2]:
# Directory containing the classifier result files.
file_dir = cfg.output_dir.joinpath('classifier')
# os.listdir() returns entries in arbitrary order and also lists sub-directories
# (e.g. '.ipynb_checkpoints'). Keep regular files only and sort them so that the
# row order of the results table is reproducible across runs and machines.
file_list = sorted(
    entry
    for entry in os.listdir(file_dir)
    if os.path.isfile(file_dir.joinpath(entry))
)

In [3]:
# Column layout of the results table; the order must match the values in `record`.
result_columns = ['name', 'model', 'pred_topic', 'likelihood', 'best_score', 'accuracy', 'precision', 'recall', 'f1', 'f1_weighted', 'parameters']

# Collect one record per result file and build the DataFrame once at the end.
# Growing a frame row by row via `.loc[len(df)]` re-allocates on every insert
# and starts from object-dtype columns; a single constructor call is linear
# and lets pandas infer proper numeric/bool dtypes.
records = []
for file in file_list:
    # JSON is UTF-8 encoded, so do not depend on the platform's default encoding.
    with open(file_dir.joinpath(file), encoding='utf-8') as f:
        d = json.load(f)
    # Keep only runs whose dataset name contains at most two '+' characters.
    # NOTE(review): presumably '+' joins the feature sets of a run — confirm.
    if d['dataset'].count('+') > 2:
        continue
    metrics = d['eval_metrics']
    record = [d['dataset'], d['model'], d['pred_topic_id'], d['use_likelihood'], d['best_score_f1'], metrics['accuracy'], metrics['precision'], metrics['recall'], metrics['f1'], metrics['f1_weighted'], d['model_params']]
    records.append(record)

# Passing `columns` keeps the expected layout even when no file qualified.
results = pd.DataFrame(records, columns=result_columns)

In [4]:
# Reduce each dataset identifier to its second '_'-separated token and relabel
# 'touche' as 'google'.
# Values without a second token keep their current value instead of becoming
# NaN, which makes this cell safe to re-run: after the first run the names no
# longer contain '_', so a second run would otherwise wipe the whole column.
# Bracket assignment is used because attribute assignment only works for
# columns that already exist.
dataset_token = results['name'].str.split('_').str[1]
results['name'] = dataset_token.fillna(results['name']).str.replace('touche', 'google', regex=False)

General Information about the data:

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric result columns.
results.describe()

Unnamed: 0,pred_topic,best_score,accuracy,precision,recall,f1,f1_weighted
count,72.0,72.0,72.0,72.0,72.0,72.0,72.0
mean,70.666667,0.739971,0.839352,0.79469,0.688444,0.735698,0.836009
std,11.343547,0.058676,0.04462,0.09799,0.090016,0.087372,0.045651
min,55.0,0.653773,0.722222,0.541667,0.47619,0.555556,0.731424
25%,55.0,0.692318,0.813889,0.733966,0.635294,0.677662,0.807306
50%,76.0,0.72609,0.838889,0.78101,0.690476,0.726136,0.836105
75%,81.0,0.796596,0.87037,0.859539,0.762959,0.819118,0.869111
max,81.0,0.851415,0.925926,0.965116,0.841584,0.894737,0.924857


The five best models, ranked by best_score (best_score always refers to the F1 score obtained during the grid search):

In [6]:
# Columns shown in the ranking table.
leaderboard_columns = ['name', 'model', 'likelihood', 'best_score', 'precision', 'recall', 'f1', 'parameters']
# Five rows with the highest grid-search score, rounded to two decimals for display.
results.loc[:, leaderboard_columns].nlargest(n=5, columns='best_score').round(decimals=2)

Unnamed: 0,name,model,likelihood,best_score,precision,recall,f1,parameters
63,combined,SVC,True,0.85,0.96,0.84,0.89,"{'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}"
62,combined,SVC,False,0.85,0.97,0.82,0.89,"{'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}"
45,combined,SGDClassifier,True,0.84,0.96,0.84,0.89,"{'alpha': 0.001, 'learning_rate': 'optimal', '..."
44,combined,SGDClassifier,False,0.84,0.95,0.79,0.86,"{'alpha': 0.001, 'learning_rate': 'optimal', '..."
50,google,SGDClassifier,False,0.83,0.96,0.76,0.85,"{'alpha': 0.001, 'learning_rate': 'optimal', '..."


Mean scores by dataset:

In [7]:
# Average each score per dataset name, rounded to two decimals for display.
score_columns = ['best_score', 'precision', 'recall', 'f1']
results.groupby('name')[score_columns].mean().round(2)

Unnamed: 0_level_0,best_score,precision,recall,f1
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
clarifai,0.72,0.76,0.65,0.7
combined,0.76,0.81,0.72,0.77
google,0.74,0.81,0.69,0.74


Best results by combination of likelihood and predicted topic:

In [8]:
# Restrict the results to the columns needed for this comparison.
lt_columns = ['likelihood', 'pred_topic', 'best_score', 'precision', 'recall', 'f1']
results_temp = results[lt_columns]

# Distinct values that span the (likelihood, topic) combinations,
# in order of first appearance.
likelihood = results['likelihood'].unique()
topics = results['pred_topic'].unique()

# Pick the row with the highest best_score for every combination.
# The pieces are collected in a list and concatenated once: calling pd.concat
# inside the loop copies the accumulated frame on every iteration, and seeding
# it with an empty frame relies on deprecated dtype handling for empty entries.
best_rows = []
for use_likelihood in likelihood:
    for topic in topics:
        in_group = (results_temp['pred_topic'] == topic) & (results_temp['likelihood'] == use_likelihood)
        best = results_temp.loc[in_group].nlargest(1, 'best_score')
        # Combinations without any run contribute nothing.
        if not best.empty:
            best_rows.append(best)

# Fall back to an empty frame with the expected columns when nothing matched.
results_lt = pd.concat(best_rows) if best_rows else pd.DataFrame(columns=lt_columns)

# show the results
results_lt.round(2)

Unnamed: 0,likelihood,pred_topic,best_score,precision,recall,f1
60,False,55,0.77,0.77,0.75,0.76
62,False,76,0.85,0.97,0.82,0.89
64,False,81,0.72,0.73,0.68,0.71
61,True,55,0.77,0.76,0.74,0.75
63,True,76,0.85,0.96,0.84,0.89
65,True,81,0.73,0.72,0.68,0.7


Best combinations of models and topics:

In [9]:
# Restrict the results to the columns needed for this comparison.
mt_columns = ['model', 'pred_topic', 'best_score', 'precision', 'recall', 'f1']
results_temp = results[mt_columns]

# Distinct values that span the (model, topic) combinations,
# in order of first appearance.
models = results['model'].unique()
topics = results['pred_topic'].unique()

# Pick the row with the highest best_score for every combination.
# The pieces are collected in a list and concatenated once: calling pd.concat
# inside the loop copies the accumulated frame on every iteration, and seeding
# it with an empty frame relies on deprecated dtype handling for empty entries.
best_rows = []
for model_name in models:
    for topic in topics:
        in_group = (results_temp['pred_topic'] == topic) & (results_temp['model'] == model_name)
        best = results_temp.loc[in_group].nlargest(1, 'best_score')
        # Combinations without any run contribute nothing.
        if not best.empty:
            best_rows.append(best)

# Fall back to an empty frame with the expected columns when nothing matched.
results_mt = pd.concat(best_rows) if best_rows else pd.DataFrame(columns=mt_columns)

# show the results
results_mt.round(2)

Unnamed: 0,model,pred_topic,best_score,precision,recall,f1
6,GradientBoostingClassifier,55,0.74,0.78,0.7,0.74
8,GradientBoostingClassifier,76,0.82,0.96,0.76,0.85
10,GradientBoostingClassifier,81,0.7,0.76,0.6,0.67
25,PassiveAggressiveClassifier,55,0.71,0.73,0.69,0.71
26,PassiveAggressiveClassifier,76,0.82,0.9,0.81,0.85
29,PassiveAggressiveClassifier,81,0.67,0.7,0.66,0.68
43,SGDClassifier,55,0.76,0.79,0.7,0.74
45,SGDClassifier,76,0.84,0.96,0.84,0.89
47,SGDClassifier,81,0.72,0.8,0.65,0.71
61,SVC,55,0.77,0.76,0.74,0.75
