In [1]:
import numpy as np
import pandas as pd
from numpy.random import normal, multivariate_normal

import itertools

from scipy import linalg
import matplotlib.pyplot as plt
import matplotlib as mpl

from sklearn import mixture

In [2]:
# Root folder of the chain CSV files; extract_data() reads
# <DATA_DIRECTORY>\<lineage>\<chain>.csv from it.
# NOTE(review): absolute, machine-specific Windows path -- must be edited before
# the notebook can run on another machine.
DATA_DIRECTORY = "C:\\Users\\WDAmo\\GitHub\\font_analysis\\data"
# Sixteen parameter labels. Presumably these name the 16 columns of each chain
# array in order -- TODO confirm; the list is not referenced in the visible cells.
PARAM_LABELS = ["unit", "pen", "cap", "bar", "asc", "desc", "xht", "horz",
                "vert", "cont", "supr", "slnt", "aprt", "crnr", "over", "tapr"]

In [9]:
def extract_data(lin, chain):
    """Load one chain's "chosen.values" column as a DataFrame of string fields.

    Parameters
    ----------
    lin : lineage identifier; used (via ``str``) as the sub-folder name.
    chain : chain identifier; used (via ``str``) as the CSV file stem.

    Returns
    -------
    pandas.DataFrame
        One row per CSV row, one column per comma-separated field. Values are
        left as strings; callers cast them (see ``split_all``).
    """
    import os  # local import so this cell does not depend on an extra top-level import

    # os.path.join picks the right separator for the host OS instead of the
    # hard-coded "\\"; on Windows the resulting path is unchanged.
    csv_path = os.path.join(DATA_DIRECTORY, str(lin), str(chain) + ".csv")
    chosen = pd.read_csv(csv_path)["chosen.values"]
    # Each entry is a string whose first and last characters are dropped before
    # splitting on commas (presumably a bracketed list such as "[1, 2, ...]").
    return pd.DataFrame([entry[1:-1].split(',') for entry in chosen])
def extract_all():
    """Load every chain of every lineage.

    Returns a nested list ``out[lineage][chain]`` (4 lineages x 4 chains) whose
    entries are the DataFrames produced by ``extract_data``.
    """
    return [
        [extract_data(lineage, chain) for chain in range(4)]  # chains within a lineage
        for lineage in range(4)                               # lineages
    ]
def split_df(df):
    """Split a DataFrame into two equally sized halves, by row position.

    If ``df`` has an odd number of rows, the final row is discarded so that both
    halves have the same length.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The first half and the second half, in that order.
    """
    half = len(df) // 2
    # Plain positional slicing; the upper bound 2 * half drops the trailing row
    # of an odd-length frame.
    return df.iloc[:half], df.iloc[half:2 * half]
def split_all(dfs): # Returns the second half of each dataframe, cast correctly
    """Keep only the second half of every chain, as float64 NumPy arrays.

    ``dfs`` is indexed as ``dfs[lineage][chain]`` (4 x 4). Each frame is halved
    with ``split_df``; the first half is discarded and the second half is cast
    to float64 and converted to an ndarray. The result has the same 4 x 4
    nesting as the input.
    """
    return [
        [
            split_df(dfs[lineage][chain])[1].astype('float64').to_numpy()
            for chain in range(4)
        ]
        for lineage in range(4)
    ]
def combine_all(chains):
    """Stack all 4 x 4 chains row-wise into one float64 array with 16 columns.

    Parameters
    ----------
    chains : nested sequence indexed as ``chains[lineage][chain]``; each entry
        is a 2-D array with 16 columns.

    Returns
    -------
    numpy.ndarray
        All chains concatenated along axis 0, ordered lineage-major
        (lineage 0 chains 0..3, then lineage 1, ...).
    """
    # The empty (0, 16) float64 seed keeps the result 16 columns wide and
    # promotes it to float64, exactly as the incremental version did.
    seed = np.empty((0, 16), 'float64')
    pieces = [chains[i][j] for i in range(4) for j in range(4)]
    # One concatenate call instead of 16 incremental ones (each of which copied
    # everything accumulated so far).
    return np.concatenate([seed] + pieces)
def combine_by_font(chains):
    """Stack the chains into two float64 arrays, grouped by lineage parity.

    Chains of odd-indexed lineages (1, 3) are collected into the ``arial``
    array and chains of even-indexed lineages (0, 2) into the ``georgia`` array.

    Parameters
    ----------
    chains : nested sequence indexed as ``chains[lineage][chain]`` (4 x 4); each
        entry is a 2-D array with 16 columns.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``(arial, georgia)``, each concatenated along axis 0 in lineage-major
        order.
    """
    # Empty (0, 16) float64 seed: fixes the column count and result dtype.
    seed = np.empty((0, 16), 'float64')
    arial_parts = [chains[i][j] for i in range(4) if i % 2 == 1 for j in range(4)]
    georgia_parts = [chains[i][j] for i in range(4) if i % 2 == 0 for j in range(4)]
    # A single concatenate per font avoids re-copying the growing array on
    # every chain.
    arial = np.concatenate([seed] + arial_parts)
    georgia = np.concatenate([seed] + georgia_parts)
    return arial, georgia
def add_noise(dfs, sigma, rng=None):
    """Return ``dfs`` plus independent zero-mean Gaussian noise.

    Parameters
    ----------
    dfs : array-like with a ``.shape`` attribute (e.g. a NumPy array).
    sigma : standard deviation of the noise.
    rng : optional random generator exposing ``.normal(loc, scale, size)``
        (e.g. ``np.random.default_rng(seed)``). When omitted, NumPy's global
        random state is used, as before, so ``np.random.seed`` still applies.

    Returns
    -------
    A new object ``dfs + noise``; ``dfs`` itself is not modified.
    """
    if rng is None:
        noise = np.random.normal(0, sigma, dfs.shape)
    else:
        noise = rng.normal(0, sigma, dfs.shape)
    return dfs + noise

In [10]:
# Load all chains and keep only the second half of each one, as float64 arrays
# in a 4 x 4 nested list indexed raw_data[lineage][chain].
raw_data = split_all(extract_all())
# Pool every chain into one array and add Gaussian noise with sigma = 1.
# The noise is drawn from NumPy's unseeded global RNG, so `dataset` differs
# between runs.
dataset = add_noise(combine_all(raw_data),1)

In [11]:
# Sanity check on the pooled dataset: (number of samples, number of columns).
dataset.shape

(34910, 16)

In [None]:
# Checking average length of each split lineage: for every lineage, print the
# mean number of rows over its four chains.
# NOTE(review): this cell originally read `dfs_split`, which is not defined in
# any visible cell; `raw_data` (the output of split_all) is assumed to be the
# intended input -- confirm.
for i in range(4):
    total_rows = 0  # named so that the builtin `sum` is not shadowed for later cells
    for j in range(4):
        total_rows += raw_data[i][j].shape[0]
    print(str(total_rows / 4))
            

In [None]:

def add_noise_split(dfs, sigma):
    """Apply ``add_noise`` to every chain of a 4 x 4 nested list.

    ``dfs`` is indexed as ``dfs[lineage][chain]``; the result has the same
    nesting, with each chain replaced by its noised copy (noise standard
    deviation ``sigma``).
    """
    return [
        [add_noise(dfs[lineage][chain], sigma) for chain in range(4)]
        for lineage in range(4)
    ]
            
# Pooled data with additional Gaussian noise at sigma = 1, 10 and 5.
# `dataset` already had sigma = 1 noise added when it was built, so these
# perturbations are stacked on top of that.
noisy_1 = add_noise(dataset, 1)
noisy_10 = add_noise(dataset, 10)
noisy_5 = add_noise(dataset, 5)
# Per-chain versions (4 x 4 nested lists), noised directly from the un-noised
# `raw_data` chains.
noisy_1_split = add_noise_split(raw_data, 1)
noisy_5_split = add_noise_split(raw_data, 5)
noisy_10_split = add_noise_split(raw_data, 10)


In [None]:
# BIC sweep: fit a full-covariance Gaussian mixture (EM) for 1..20 components
# on the sigma = 1 data and print "<n_components>: <BIC>" for each fit.
X = noisy_1
n_components_range = np.arange(1,21)
for n_components in n_components_range:
    gmm = mixture.GaussianMixture(covariance_type='full',
                                  n_components=n_components).fit(X)
    print("{}: {}".format(n_components, gmm.bic(X)))

# for i in range(4):
#     for j in range(4):
#         lowest_bic = np.infty
#         for n_components in n_components_range:
#             # Fit a Gaussian mixture with EM
#             gmm = mixture.GaussianMixture(n_components=n_components,
#                                           covariance_type='full').fit(X[i][j])
#             if gmm.bic(X[i][j]) < lowest_bic:
#                 lowest_bic = gmm.bic(X[i][j])
#                 best_gmm = n_components
#         print(best_gmm)

In [None]:
# Inspect the retained (second-half) samples of lineage 0, chain 0.
# NOTE(review): this displays the whole array; a slice such as [:5] would keep
# the output short.
raw_data[0][0]

In [12]:
# Fit ONE 12-component, full-covariance GMM on the pooled, noised `dataset`
# (all lineages and chains together) -- not a separate model per chain.
# `results` is not used in any visible cell.
results = []
n_comps = 12
# n_init=5: EM is run from 5 initialisations and the best fit is kept.
# No random_state is passed, so the fitted components vary between runs.
gmm = mixture.GaussianMixture(n_components=n_comps,
                              covariance_type='full', max_iter=5000, n_init=5)
gmm.fit(dataset)
weights = gmm.weights_
means = gmm.means_
# One row per mixture component: its weight and its mean vector (stored as an
# array inside an object column), sorted with the largest weight first.
chainframe = pd.DataFrame({'weight': weights, 'mean': list(means)}, columns=['weight', 'mean']).sort_values('weight', ascending=False)

chainframe
            
            

Unnamed: 0,weight,mean
6,0.224206,"[51.60090903084503, 47.82057699258017, 46.7859..."
8,0.140558,"[58.376097848856475, 59.388183427758484, 46.38..."
2,0.094838,"[37.779083449980035, 64.58117388787537, 51.592..."
0,0.088322,"[63.60463301835412, 68.05361418352214, 72.0545..."
7,0.070504,"[38.4597469624781, 74.29374473110146, 75.07627..."
5,0.063339,"[31.218423189717008, 48.50138624225242, 28.625..."
1,0.060382,"[21.44684307902486, 26.89743848337879, 33.7275..."
4,0.059706,"[35.71172876698902, 70.11041987588781, 43.6865..."
11,0.056122,"[35.77849602281612, 66.80588057631441, 37.6581..."
10,0.052945,"[53.539445308021534, 66.43943242045486, 61.394..."


In [14]:
# Convert the component table to a 2-D object array
# (column 0 = weight, column 1 = mean vector).
# np.asarray is used instead of DataFrame.to_numpy() so that re-running this
# cell is a harmless no-op once `chainframe` is already an array; .to_numpy()
# would raise AttributeError on the second run.
chainframe = np.asarray(chainframe)

In [16]:
# Mean vectors (column 1) of the six highest-weight components, each rounded
# element-wise to the nearest whole number.
top_6 = chainframe[:6, 1]
rounded_6 = [np.round(mean_vec) for mean_vec in top_6]
rounded_6

[array([52., 48., 47., 55., 53., 50., 59., 45., 51., 63., 42., 55., 41.,
        55., 51., 49.]),
 array([58., 59., 46., 48., 54., 58., 48., 50., 28., 24., 45., 50., 41.,
        42., 37., 63.]),
 array([38., 65., 52., 28., 51., 66., 56., 47., 65., 31., 34., 44., 57.,
        32., 51., 31.]),
 array([64., 68., 72., 60., 59., 39., 47., 74., 61., 31., 59., 72., 72.,
        35., 61., 35.]),
 array([38., 74., 75., 36., 46., 43., 31., 18., 63., 32., 33., 46., 29.,
        50., 52., 37.]),
 array([31., 49., 29., 32., 50., 63., 64., 64., 60., 48., 63., 53., 76.,
        77., 40., 50.])]

In [18]:
# Show the first rounded mean vector as a plain Python list.
list(rounded_6[0])

[52.0,
 48.0,
 47.0,
 55.0,
 53.0,
 50.0,
 59.0,
 45.0,
 51.0,
 63.0,
 42.0,
 55.0,
 41.0,
 55.0,
 51.0,
 49.0]

In [21]:
def load_font(arr, filename):#List of values -> URL
    """Build the font URL for a vector of parameter values.

    Each value is rounded to the nearest integer (``np.around``) and the
    integers are joined with "-".

    Parameters
    ----------
    arr : sequence or array of numeric values.
    filename : str placed in the URL path before the values.

    Returns
    -------
    str
        ``"localhost:1999/font/<filename>/<v1>-<v2>-.../"``; for an empty
        ``arr`` the result is ``"localhost:1999/font/<filename>/"``.
    """
    # Round first, then format as integers. The previous version ignored the
    # rounded values and cut str(value) to two characters, which truncated
    # instead of rounding and mangled values outside 10..99
    # (e.g. 100.0 -> "10", 5.0 -> "5.").
    rounded = np.around(arr)
    values = "-".join(str(int(v)) for v in np.ravel(rounded))
    base = "localhost:1999/font/" + filename + "/"
    if not values:
        return base
    return base + values + "/"
# Example: URL for the first rounded mean vector (the highest-weight component).
load_font(rounded_6[0], "test")

'localhost:1999/test/52-48-47-55-53-50-59-45-51-63-42-55-41-55-51-49/'