# Quantitative Experiment Evaluation of the Religion Dataset with Different Summary Sizes

For plotting inspiration, see e.g. seaborn, plotly and bokeh, which all look great: https://mode.com/blog/python-data-visualization-libraries/

---

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import os
import sys
import pickle

%matplotlib inline

---

#### Load data from experiment

In [4]:
# Directory containing the experiment result CSV files; passed to load_files() below.
DATA_PATH = "../../data/experiments/quantitative-v2-religios-longer/latest/csv/"
# NOTE(review): FACTOR is not referenced in the visible cells and the meaning of 962
# is not evident from here -- TODO document what it represents or remove it.
FACTOR = 962  # constant

In [13]:
def load_files(path_to_files):
    """
    Load the experiment CSV files from a directory, grouped by the number in their name.

    Only regular files named ``fm-<a>-<b>-<senlen>.csv`` are read. Files that share the
    same ``<senlen>`` are concatenated (in file-name order, original row index kept) into
    a single DataFrame.

    Parameters
    ----------
    path_to_files : str or path-like
        Directory to scan (not recursive).

    Returns
    -------
    dict
        ``{senlen (int): DataFrame}`` with keys in ascending order.
        ``senlen`` is presumably the summary length in sentences -- TODO confirm.

    Raises
    ------
    ValueError
        If a matching file name does not consist of exactly three dash-separated
        parts after the ``fm-`` prefix, or if the last part is not an integer.
    """
    prefix, suffix = 'fm-', '.csv'

    # The context manager closes the directory handle deterministically. Sorting by name
    # makes the row order of concatenated frames independent of the OS listing order.
    with os.scandir(path_to_files) as entries:
        matching = sorted(
            (entry.name, entry.path)
            for entry in entries
            if entry.is_file() and entry.name.startswith(prefix) and entry.name.endswith(suffix)
        )

    parts_by_senlen = {}
    for name, path in matching:
        # Strip only the leading prefix and the trailing extension, so that an 'fm-' or
        # '.csv' occurring in the middle of the name is left untouched.
        version = name[len(prefix):-len(suffix)]
        _, _, senlen = version.split('-')
        parts_by_senlen.setdefault(int(senlen), []).append(pd.read_csv(path))

    # Concatenate once per key instead of growing a frame inside the loop.
    frames = {
        senlen: parts[0] if len(parts) == 1 else pd.concat(parts)
        for senlen, parts in parts_by_senlen.items()
    }
    return dict(sorted(frames.items()))

In [14]:
# Load every matching CSV under DATA_PATH into a dict of DataFrames keyed by the
# integer parsed from the end of each file name.
dfs = load_files(DATA_PATH)

In [18]:
# Number of distinct keys (one DataFrame per key) that were loaded.
len(dfs)

6

In [19]:
# The loaded keys; load_files returns them in ascending order.
dfs.keys()

dict_keys([4, 6, 8, 10, 12, 14])

---

#### Describe

In [5]:
# `dfs` is a dict of DataFrames, so the index has to be reset on every frame rather
# than on the dict itself (a dict has no reset_index method). Frames built from several
# CSV files carry repeated index values; this gives each frame a clean 0..n-1 index.
dfs = {senlen: frame.reset_index(drop=True) for senlen, frame in dfs.items()}

In [44]:
# Column names of the frame stored under key 4.
dfs[4].columns

Index(['originalP', 'customSP', 'simpleSP', 'trueClass'], dtype='object')

Columns:
 - **originalP**: prediction of the examined model on the entire instance
 - **customSP**: prediction of the examined model on the custom (explanation) summaries
 - **simpleSP**: prediction of the examined model on the simple (TextRank) summaries without altering weights
 - **trueClass**: true class of the original instance, **(0 = atheism, 1 = christianity)**

---

#### Predictions

In [21]:
# Rescale values from an "actual" range into a "desired" range.
# Based on https://stackoverflow.com/a/48109733/7875594
def normalize(values, bounds):
    """
    Linearly map the absolute value of each element from the 'actual' bounds to the 'desired' bounds.

    `bounds` is a dict with the keys 'actual' and 'desired', each holding a dict with
    'lower' and 'upper'. When the actual range is empty (upper == lower) the input is
    returned unchanged; otherwise a new list is returned.
    """
    actual = bounds['actual']
    if actual['upper'] == actual['lower']:
        return values

    def rescale(value):
        desired = bounds['desired']
        offset = abs(value) - actual['lower']
        desired_span = desired['upper'] - desired['lower']
        actual_span = actual['upper'] - actual['lower']
        return desired['lower'] + offset * desired_span / actual_span

    return [rescale(value) for value in values]


def process_mean(df):
    """
    From input dataframe returns 2 mean signed differences to the original prediction:
    (1) originalP - customSP, (2) originalP - simpleSP
    """
    frame = df.reset_index(drop=True)
    frame = frame.assign(
        difSimple=frame.originalP - frame.simpleSP,
        difCustom=frame.originalP - frame.customSP,
    )

    mean_custom = frame.difCustom.mean()
    mean_simple = frame.difSimple.mean()
    return mean_custom, mean_simple


def process_mse(df):
    """
    From input dataframe returns 2 errors: (1) error from custom summaries, (2) error from simple summaries

    Note: despite the function name, the mean of the squared errors is square-rooted
    before it is returned, so both values are root-mean-squared errors (RMSE).

    The dataframe must provide the columns originalP, customSP and simpleSP.
    """

    # reset_index returns a new frame, so the helper columns below are not added to the caller's frame
    df = df.reset_index(drop=True)

    # 1. normalize to (-1, 1); predictions are assumed to lie in [0, 1]
    bounds = {'actual': {'lower':0.0, 'upper':1.0}, 'desired':{'lower':-1, 'upper':1}}
    df['normOriginal'] = normalize(df.originalP, bounds)
    df['normCustom'] = normalize(df.customSP, bounds)
    df['normTextRank'] = normalize(df.simpleSP, bounds)

    # 2. subtract textrank and custom predictions from model prediction on entire instance to get errors
    df['errorTextRank'] = df.normOriginal - df.normTextRank
    df['errorCustom'] = df.normOriginal - df.normCustom

    # 3. square errors
    df['errorTextRank'] = df.errorTextRank ** 2
    df['errorCustom'] = df.errorCustom ** 2

    # 4. find mean
    arr = np.array([df.errorCustom.mean(), df.errorTextRank.mean()])

    # 5. square root errors
    arr = np.sqrt(arr)

    return arr[0], arr[1]

def process_arr(dfs, error_func):
    """
    Compute one (custom, simple) error pair for every entry of `dfs`.

    Parameters
    ----------
    dfs : dict
        Mapping of key -> DataFrame; every key must be convertible with int().
    error_func : str
        'mse' to evaluate each frame with process_mse, 'mean' to use process_mean.

    Returns
    -------
    dict
        Mapping of int(key) -> tuple returned by the selected function.

    Raises
    ------
    ValueError
        If `error_func` is neither 'mse' nor 'mean'. Previously an unknown name
        silently produced an empty dict.
    """
    if error_func not in ('mse', 'mean'):
        raise ValueError(f"Unknown error_func {error_func!r}; expected 'mse' or 'mean'")

    # Resolve the worker once instead of re-checking the name for every frame.
    process = process_mse if error_func == 'mse' else process_mean
    return {int(key): process(frame) for key, frame in dfs.items()}

In [22]:
# Per-key (custom, simple) error pair, computed by process_mse for every loaded frame.
process_arr(dfs, 'mse')

{4: (0.6090496919571073, 0.5919774476770174),
 6: (0.571760307846971, 0.5595643488513099),
 8: (0.5495720095225056, 0.5484767412134677),
 10: (0.5313449705286112, 0.531335237946862),
 12: (0.5179546142591483, 0.5125810401104864),
 14: (0.5059011896272103, 0.5022583061727023)}

In [39]:
# Tabulate the errors: one row per key, with the key exposed as the 'senlen' column.
df = (
    pd.DataFrame(process_arr(dfs, 'mse'))
    .T
    .rename(columns={0: 'custom', 1: 'simple'})
    .reset_index()
    .rename(columns={'index': 'senlen'})
)
df

Unnamed: 0,senlen,custom,simple
0,4,0.60905,0.591977
1,6,0.57176,0.559564
2,8,0.549572,0.548477
3,10,0.531345,0.531335
4,12,0.517955,0.512581
5,14,0.505901,0.502258
