In [3]:
#Initialize, import, and format
import numpy as np
import pandas as pd
from numpy.random import normal, multivariate_normal

import itertools

from scipy import linalg
import matplotlib.pyplot as plt
import matplotlib as mpl

from sklearn.decomposition import PCA
from scipy import stats


# Root folder holding one sub-folder per lineage, each with one CSV per chain.
# NOTE(review): machine-specific absolute Windows path; adjust for your checkout.
DATA_DIRECTORY = "C:\\Users\\WDAmo\\GitHub\\font_analysis\\data"

# Short codes for the 16 parameters; they appear to abbreviate PARAM_NAMES
# position by position.
PARAM_LABELS = [
    "unit", "pen", "cap", "bar",
    "asc", "desc", "xht", "horz",
    "vert", "cont", "supr", "slnt",
    "aprt", "crnr", "over", "tapr",
]

# Human-readable names for the 16 parameters, indexed by parameter column.
PARAM_NAMES = [
    "Unit Width", "Pen Width", "Cap Height", "Bar Height",
    "Ascender Height", "Descender Height", "X-Height", "Horizontal Increase",
    "Vertical Increase", "Contrast", "Superness", "Slanting",
    "Aperture", "Corner", "Overshoot", "Taper",
]

def extract_data(lin, chain):
    """Load the "chosen.values" column of one chain file as a table of string fields.

    The CSV is located under DATA_DIRECTORY, in the sub-folder named after
    ``lin``, with the file name ``<chain>.csv``.  Every entry of the column
    has its first and last character removed (presumably enclosing brackets;
    confirm against the data files) and the remainder is split on commas.

    Returns a DataFrame with one row per CSV row and one column per
    comma-separated field; the fields are left as strings.
    """
    csv_path = "\\".join([DATA_DIRECTORY, str(lin), str(chain)]) + ".csv"
    chosen = pd.read_csv(csv_path)["chosen.values"]
    rows = []
    for entry in chosen:
        rows.append(entry[1:-1].split(','))
    return pd.DataFrame(rows)
def extract_all():
    """Read all 16 chain files into a 4x4 nested list.

    The result is indexed as ``result[lineage][chain]`` with both indices
    running 0-3; each entry is whatever ``extract_data(lineage, chain)``
    returns.
    """
    return [
        [extract_data(lineage, chain) for chain in range(4)]
        for lineage in range(4)
    ]
def split_df(df):
    """Split a DataFrame into its first and second halves by row position.

    When the number of rows is odd, the final row is left out so that both
    halves have the same length.

    The halves are taken with positional ``iloc`` slices rather than
    ``np.array_split``: applied to a DataFrame, ``np.array_split`` goes
    through ``DataFrame.swapaxes``, which is deprecated and emits a
    FutureWarning on recent pandas releases.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to split.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(first_half, second_half)``, each with ``len(df) // 2`` rows and
        the original index labels.
    """
    half = len(df) // 2
    # Stopping the second slice at 2 * half drops the trailing row of an
    # odd-length frame, as the explicit truncation used to do.
    return df.iloc[:half], df.iloc[half:2 * half]
def split_all(dfs):
    """Keep the second half of every chain as a float64 NumPy array.

    ``dfs`` is indexed ``dfs[lineage][chain]`` with both indices in 0-3.
    Each frame is halved with ``split_df``; the first half is discarded and
    the second half is cast to float64 and converted with ``to_numpy()``.

    Returns a 4x4 nested list of arrays with the same indexing as ``dfs``.
    """
    def second_half_as_floats(frame):
        # Only the second half is kept; the first half is thrown away.
        _, second = split_df(frame)
        return second.astype('float64').to_numpy()

    return [
        [second_half_as_floats(dfs[lineage][chain]) for chain in range(4)]
        for lineage in range(4)
    ]
def combine_all(chains):
    """Stack all 16 chains (4 lineages x 4 chains) into one array.

    Parameters
    ----------
    chains : nested sequence
        Indexed ``chains[lineage][chain]`` with both indices in 0-3; every
        entry must be a 2-D array with 16 columns.

    Returns
    -------
    numpy.ndarray
        The entries concatenated along axis 0 in lineage-major order
        (``chains[0][0]``, ``chains[0][1]``, ..., ``chains[3][3]``).
    """
    # The leading empty (0, 16) float64 block keeps the result at least
    # float64 with 16 columns even when every chain is empty.
    blocks = [np.empty((0, 16), 'float64')]
    blocks.extend(chains[i][j] for i in range(4) for j in range(4))
    # One concatenate call instead of one per chain: the accumulated array
    # is no longer re-copied on every iteration.
    return np.concatenate(blocks)
def combine_by_font(chains):
    """Stack the chains into two arrays according to lineage parity.

    Parameters
    ----------
    chains : nested sequence
        Indexed ``chains[lineage][chain]`` with both indices in 0-3; every
        entry must be a 2-D array with 16 columns.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        ``(arial, georgia)``: the first array stacks the odd lineages
        (1, then 3), the second the even lineages (0, then 2), each in chain
        order.  Which font a lineage belongs to is taken from the variable
        names only; confirm against the experiment setup.
    """
    def stack(lineages):
        # The leading empty (0, 16) float64 block keeps the result at least
        # float64 with 16 columns even when every chain is empty.
        blocks = [np.empty((0, 16), 'float64')]
        blocks.extend(chains[i][j] for i in lineages for j in range(4))
        # One concatenate call instead of one per chain, so the accumulated
        # array is not re-copied on every iteration.
        return np.concatenate(blocks)

    arial = stack((1, 3))
    georgia = stack((0, 2))
    return arial, georgia

# Load every chain and keep the second half of each as a float64 array,
# then pool the samples per font and across all chains.
raw_data = split_all(extract_all())
arial, georgia = combine_by_font(raw_data)
dataset = combine_all(raw_data)

In [5]:
# Write one 100-bin histogram per parameter column of `dataset` to a PDF
# inside the "boundary histograms" folder.
for p in range(16):
    param_name = PARAM_NAMES[p]
    hist_path = f"boundary histograms/{param_name}_hist.pdf"
    plt.hist(dataset[:, p], bins=100)
    plt.title(param_name)
    plt.savefig(hist_path)
    # Close the current figure so the next histogram starts on a fresh one.
    plt.close()

In [6]:
def auto_count():
    """Tally the "auto" flag over the second half of every chain file.

    For each of the 16 CSVs under DATA_DIRECTORY (lineage 0-3, chain 0-3)
    the rows are halved by position, leaving out the last row when the count
    is odd, and the "auto" column of the second half is counted.

    Returns
    -------
    tuple of (int, int)
        ``(autos, humans)``: the number of rows whose "auto" value is True
        and the number whose value is False, summed over all files.
    """
    autos = 0
    humans = 0
    for i in range(4):
        for j in range(4):
            df = pd.read_csv(DATA_DIRECTORY + "\\" + str(i) + "\\" + str(j) + ".csv")
            half = len(df) // 2
            # Positional slice instead of np.array_split(df, 2), which
            # relies on the deprecated DataFrame.swapaxes; stopping at
            # 2 * half drops the trailing row of an odd-length file.
            auto_flags = df["auto"].iloc[half:2 * half]
            counts = auto_flags.value_counts()
            # Default to 0 so a chain whose second half holds only one of
            # the two values does not raise KeyError.
            autos += int(counts.get(True, 0))
            humans += int(counts.get(False, 0))
    return autos, humans

In [7]:
# Report the automatic count, then the human count, one per line.
auto_rejected, human_choices = auto_count()
print(auto_rejected, human_choices, sep="\n")

24651
10259


In [20]:
# Acceptance rate over the second half of every chain: a row counts as
# "accepted" when its value in column 2 differs from the previous row's
# and as "rejected" when it is equal.
acc = 0
rej = 0
for i in range(4):
    for j in range(4):
        df = pd.read_csv(DATA_DIRECTORY + "\\" + str(i) + "\\" + str(j) + ".csv")
        half = len(df) // 2
        # Drop the trailing row of an odd-length file, then halve by
        # position (np.array_split on a DataFrame relies on the deprecated
        # DataFrame.swapaxes).
        df = df.iloc[:2 * half]
        df1, df2 = df.iloc[:half], df.iloc[half:]
        arr = df2.to_numpy()
        # Iterate over the real row count; the previous int(arr.size/4)
        # was only correct for files with exactly four columns.  The first
        # row has no predecessor, so the comparison starts at row 1.
        for k in range(1, len(arr)):
            if arr[k][2] == arr[k - 1][2]:
                rej += 1
            else:
                acc += 1
total = acc + rej
# With no comparable rows there is no rate to report; print nan instead of
# failing with ZeroDivisionError.
print(acc / total if total else float("nan"))

        

0.11331460996159798


(2547, 4)