In [77]:
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib qt
import matplotlib.pyplot as plt
from tqdm import tqdm
from skimage import io, color
import cv2
import os
from sklearn.decomposition import PCA
from scipy import ndimage

# Function definitions

In [164]:
# Based on tutorial: https://jdhao.github.io/2017/11/06/resize-image-to-square-with-padding/
def make_square(img, desired_size=256, fill_color=[255, 255, 255]):
    if img.dtype != np.uint8:
        print(f'Converting to uint8...')
        img = (255*img).astype(np.uint8)
        
    scale_factor = desired_size/max(img.shape[0], img.shape[1])
    resized = cv2.resize(img, (int(scale_factor*img.shape[1]), int(scale_factor*img.shape[0])))
    new_size = resized.shape
    
    delta_w = desired_size - new_size[1]
    delta_h = desired_size - new_size[0]
    top, bottom = delta_h//2, delta_h-(delta_h//2)
    left, right = delta_w//2, delta_w-(delta_w//2)
    
    out = cv2.copyMakeBorder(resized, top, bottom, left, right, cv2.BORDER_CONSTANT, value=fill_color)
    return out

# use float for thresholding, but return uint8 image
def binarize(img, threshold=0.5, invert=True):
    if img.dtype == np.uint8:
        img = img/255.0 # convert to float64

    # Convert to grayscale
    if len(img.shape) >= 3:
        img = color.rgb2gray(img)
    
    # Threshold
    out = np.zeros_like(img)
    if invert: # detect dark characters
        mask = img < threshold
    else: # detect light characters
        mask = img > threshold

    out[mask] = 1
    return (255*out).astype(np.uint8)

def preprocess(img, desired_size=256, threshold=0.5, invert=True):
    if invert: # detect black character on white background
        fill_color = [255, 255, 255]
    else: # detect white character on black background
        fill_color = [0, 0, 0]

    img_square = make_square(img, desired_size=desired_size, fill_color=fill_color)
    img_bin = binarize(img_square, threshold=threshold, invert=invert)
    return img_bin

def contour_analysis(img, n=4, trim_points=10, bins=20, verbose=False):
    # Find all contours
    contours, hierarchy = cv2.findContours(img, cv2.RETR_TREE, cv2.CHAIN_APPROX_NONE)
    if verbose:
        print(f'# of contours: {len(contours)}')
        print('# of points in each contour:')
        for cnt in contours:
            print(f'\t{len(cnt)} points')

    # Remove contours with too few points
    contours_trimmed = [cnt for cnt in contours if len(cnt) > trim_points]
    if verbose:
        print(f'Trimming contours with fewer than {trim_points} points...')
        print(f'# of remaining contours: {len(contours_trimmed)}')
        for cnt in contours_trimmed:
            print(f'\t{len(cnt)} points')

    # Create dashed contours by keeping every nth point
    assert(n>=2)
    contours_dashed = [cnt[1::n] for cnt in contours_trimmed]
    if verbose:
        print(f'Taking every {n}th point to get dashed contour...')
        for cnt in contours_dashed:
            print(f'\t{len(cnt)} points')
            
    # Find angles between adjacent points in the contour
    thetaseq = []
    for i, cnt in enumerate(contours_dashed):
        for j, point in enumerate(cnt):
            if j == 0:
                prevx, prevy = point[0]
            else:
                x, y = point[0]
                thetaseq.append(np.arctan2(y-prevy, x-prevx))
                prevx = x
                prevy = y
    
    if verbose:
        print(f'# thetas: {len(thetaseq)}')
        print(f'max theta: {max(thetaseq)}')
        print(f'min theta: {min(thetaseq)}')

    hist =  np.histogram(thetaseq, bins=bins, range=(-np.pi, np.pi))
    return hist

def feature_analysis(img, n=4, trim_points=10, bins=20, verbose=False):
    # Find all contours
    contours, hierarchy = cv2.findContours(img, cv2.RETR_TREE, cv2.CHAIN_APPROX_NONE)
    if verbose:
        print(f'# of contours: {len(contours)}')
        print('# of points in each contour:')
        for cnt in contours:
            print(f'\t{len(cnt)} points')

    # Remove contours with too few points
    contours_trimmed = [cnt for cnt in contours if len(cnt) > trim_points]
    if verbose:
        print(f'Trimming contours with fewer than {trim_points} points...')
        print(f'# of remaining contours: {len(contours_trimmed)}')
        for cnt in contours_trimmed:
            print(f'\t{len(cnt)} points')

    # Create dashed contours by keeping every nth point
    assert(n>=2)
    contours_dashed = [cnt[1::n] for cnt in contours_trimmed]
    if verbose:
        print(f'Taking every {n}th point to get dashed contour...')
        for cnt in contours_dashed:
            print(f'\t{len(cnt)} points')
            
    # Find angles between adjacent points in the contour
    thetaseq = []
    for i, cnt in enumerate(contours_dashed):
        for j, point in enumerate(cnt):
            if j == 0:
                prevx, prevy = point[0]
            else:
                x, y = point[0]
                thetaseq.append(np.arctan2(y-prevy, x-prevx))
                prevx = x
                prevy = y
    
    dthetaseq = np.diff(thetaseq)
    hist_theta = np.histogram(thetaseq, bins=bins, range=(-np.pi, np.pi), density=True)
    hist_dtheta =  np.histogram(dthetaseq, bins=bins, range=(-np.pi, np.pi), density=True)
    density_theta, theta = hist_theta
    density_dtheta, dtheta = hist_dtheta
    
    if verbose:
        print(f'# thetas: {len(thetaseq)}')
        print(f'max theta: {max(thetaseq)}')
        print(f'min theta: {min(thetaseq)}')
        print(f'len(thetaseq): {len(thetaseq)}')
        print(f'dthetaseq.shape: {dthetaseq.shape}')
        print(f'density_theta.shape: {density_theta.shape}')
        print(f'density_dtheta.shape: {density_dtheta.shape}')

    rows, cols = img.shape
    X, Y = np.meshgrid(np.linspace(0, 1, cols), np.linspace(0, 1, rows))
    assert(X.shape == Y.shape == img.shape)
    M = np.sum(img)
    M_norm = M/(255*rows*cols)
    EX = np.sum(X*img)/M
    EY = np.sum(Y*img)/M
    DX = np.sum(X**2 * img)/M - EX**2
    DY = np.sum(Y**2 * img)/M - EY**2
    covXY = np.sum(X*Y*img)/M - EX*EY
    
    features = np.hstack((density_theta, density_dtheta, M_norm, EX, EY, DX, DY, covXY))
    #features = np.hstack((density_theta, density_dtheta, M_norm, DX, DY, covXY))
    if verbose:
        print(f'features.shape: {features.shape}')
        print(f'M = {M}')
        print(f'M_norm = {M_norm}')
        print(f'EX = {EX}')
        print(f'EY = {EY}')
        print(f'DX = {DX}')
        print(f'DY = {DY}')
        print(f'covXY = {covXY}')
    return features

def plot_angles(hist, ax=None, title='Distribution of angles', label=''):
    r, theta = hist
    bins = len(r)
    #theta += np.pi/bins
    r = np.append(r, r[0]) # append 0th element to end cuz -pi and pi are the same angle

    if ax is None:
        fig, ax = plt.subplots(subplot_kw={'projection': 'polar'}, figsize=(4, 4), dpi=300)

    ax.plot(theta, r, label=label)
    ax.grid(True)

    ax.set_title(title, va='bottom')
    plt.tight_layout()
    
def plot_mu_and_K(X, title='', filename='mu+K.png'):
    mu = np.mean(X, axis=1)
    K = np.cov(X)
    bins = len(mu)
    theta = np.linspace(-np.pi, np.pi, bins+1)
 
    fig = plt.figure(figsize=(8, 4))
    ax1 = plt.subplot(121, projection='polar')
    ax2 = plt.subplot(122)
    plot_angles([mu, theta], ax=ax1, title='Mean vector μ')
    fig.colorbar(ax2.imshow(K))
    ax2.set_title('Covariance matrix K')
    plt.suptitle(title, fontsize='xx-large')
    plt.tight_layout()
    plt.savefig(filename)
    
def plot_hists(X, title='', filename='hists.png'):
    bins = 20
    theta = np.linspace(-np.pi, np.pi, bins+1)
    X_theta = X[0:bins]
    X_dtheta = X[bins:2*bins]
    
    fig = plt.figure(figsize=(8, 4))
    ax1 = plt.subplot(121, projection='polar')
    ax2 = plt.subplot(122, projection='polar')
    plot_angles([X_theta, theta], ax=ax1, title='Contour Angles')
    plot_angles([X_dtheta, theta], ax=ax2, title='Contour Curvatures')
    plt.suptitle(title, fontsize='xx-large')
    plt.tight_layout()
    plt.savefig(filename)

def analyze_images_in_path(path='', bins=20, invert=True, verbose=False):
    filenames = os.listdir(path)
    filenames.sort()
    if verbose:
        print(filenames)
    
    X = np.zeros((bins, len(filenames)))
    for i, filename in enumerate(filenames):
        img = io.imread(path + filename)
        counts, theta = contour_analysis(preprocess(img, invert=invert), bins=bins)
        # divide counts by total counts to get Probability Mass Function (PMF)
        probs = counts / counts.sum()
        X[:, i] = probs

    return X

def feature_analysis_in_path(path='', bins=20, invert=True, verbose=False):
    filenames = os.listdir(path)
    filenames.sort()
    if verbose:
        print(filenames)
    
    num_features = 2*bins + 6
    #num_features = 2*bins + 4
    X = np.zeros((num_features, len(filenames)))
    for i, filename in enumerate(filenames):
        img = io.imread(path + filename)
        features = feature_analysis(preprocess(img, invert=invert), bins=bins)
        X[:, i] = features.T

    return X

# Mi Fu's *Poem Written In a Boat on the Wu River*

In [141]:
path = 'Extracted Characters/Mi Fu - Poem Written in a Boat on the Wu River/'
filenames = os.listdir(path)
filenames.sort()
print(filenames)

['001.png', '002.png', '003.png', '004.png', '005.png', '006.png', '007.png', '008.png', '009.png', '010.png', '011.png', '012.png', '013.png', '014.png', '015.png', '016.png', '017.png', '018.png', '019.png', '020.png', '021.png', '022.png', '023.png', '024.png', '025.png', '026.png', '027.png', '028.png', '029.png', '030.png', '031.png', '032.png', '033.png', '034.png', '035.png', '036.png', '037.png', '038.png', '039.png', '040.png', '041.png', '042.png', '043.png', '044.png', '045.png', '046.png', '047.png', '048.png', '049.png', '050.png', '051.png', '052.png', '053.png', '054.png', '055.png', '056.png', '057.png', '058.png', '059.png', '060.png', '061.png', '062.png', '063.png', '064.png', '065.png', '066.png', '067.png', '068.png', '069.png', '070.png', '071.png', '072.png', '073.png', '074.png', '075.png', '076.png', '077.png', '078.png', '079.png', '080.png', '081.png', '082.png', '083.png', '084.png', '085.png', '086.png', '087.png', '088.png', '089.png', '090.png', '091.png'

In [139]:
X = analyze_images_in_path(path)
plot_mu_and_K(X, title=f'Wu River ({X.shape[1]} images)', filename='Test/mu+K_WuRiver.png')

# PCA

In [5]:
pca = PCA(n_components=2)
pca.fit(np.transpose(X))
print(f'Explained variance ratio: {pca.explained_variance_ratio_}')
print(f'Singular values: {pca.singular_values_}')
components = pca.fit_transform(np.transpose(X))
print(f'# of datapoints: {len(components)}')
print(f'Principal Components of each datapoint:\n{components}')

plt.figure(figsize=(4, 4))
plt.scatter(components[:, 0], components[:, 1], label='Wu River')
plt.title('PCA')
plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.legend()
plt.savefig('Test/PCA_WuRiver.png')

Explained variance ratio: [0.46376558 0.18567444]
Singular values: [0.64595277 0.40872158]
# of datapoints: 91
Principal Components of each datapoint:
[[ 0.00036497 -0.04055316]
 [-0.04857066 -0.07094967]
 [ 0.06122655  0.04654032]
 [-0.01613446 -0.00603782]
 [ 0.02814406 -0.01462888]
 [ 0.00671194 -0.00175597]
 [ 0.01776733  0.03306314]
 [ 0.03402     0.00555434]
 [-0.03777442 -0.049431  ]
 [-0.05745886 -0.04579842]
 [-0.00853481 -0.04679862]
 [-0.01135655 -0.00698931]
 [-0.02071141  0.00842589]
 [ 0.01274043 -0.0583408 ]
 [ 0.08382157  0.03102172]
 [-0.02297599  0.01905669]
 [ 0.10739135  0.03128132]
 [-0.05256259  0.00947863]
 [ 0.00198506 -0.08034348]
 [-0.04629028  0.01712935]
 [-0.02251515 -0.0095294 ]
 [-0.00461233 -0.00882975]
 [ 0.01980275  0.02980875]
 [-0.02633659 -0.0312785 ]
 [ 0.00528308  0.02810656]
 [ 0.01261909 -0.02890035]
 [-0.02613037  0.00528459]
 [ 0.06926955 -0.00316414]
 [-0.04079143 -0.00750503]
 [-0.02019476 -0.04851468]
 [-0.02102922  0.00042207]
 [-0.0328246

# Emperor Huizong

In [6]:
path_huizong = 'Extracted Characters/Emperor Huizong - Finches and bamboo/'
X_huizong = analyze_images_in_path(path_huizong)
plot_mu_and_K(X_huizong, title=f'Huizong ({X_huizong.shape[1]} images)', filename='Test/mu+K_Huizong.png')

In [7]:
pca = PCA(n_components=2)
pca.fit(np.transpose(X_huizong))
print(f'Explained variance ratio: {pca.explained_variance_ratio_}')
print(f'Singular values: {pca.singular_values_}')
components = pca.fit_transform(np.transpose(X_huizong))
print(f'# of datapoints: {len(components)}')
print(f'Principal Components of each datapoint:\n{components}')

plt.figure(figsize=(4, 4))
plt.scatter(components[:, 0], components[:, 1], label='Huizong')
plt.title('PCA')
plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.legend()
plt.savefig('Test/PCA_Huizong.png')

Explained variance ratio: [0.53135269 0.15757212]
Singular values: [0.50312026 0.27398075]
# of datapoints: 63
Principal Components of each datapoint:
[[-2.16313251e-03 -4.25746873e-02]
 [ 8.57089448e-02  1.40339353e-02]
 [-2.77658039e-04 -3.65770779e-02]
 [-1.27794456e-01 -7.98409233e-03]
 [ 1.07393757e-01 -2.80251417e-02]
 [-2.24144291e-02  1.00944753e-02]
 [-2.48369017e-02 -3.93555482e-02]
 [-1.10309812e-01  1.60847233e-02]
 [-3.75527438e-02  5.48116811e-03]
 [ 3.65860470e-02 -2.08294503e-02]
 [ 8.33063050e-03  3.68827932e-03]
 [ 2.43327057e-02  4.01623480e-02]
 [-6.30493270e-02  2.49007855e-03]
 [-4.33027634e-02  3.31282796e-02]
 [ 2.29656044e-02  3.74667220e-02]
 [-4.41109453e-02 -1.26432794e-02]
 [ 1.89401046e-01 -6.69006215e-02]
 [-5.87827426e-02  7.85313554e-02]
 [ 1.01260225e-01  1.84081055e-02]
 [ 1.94304092e-02 -1.21097606e-02]
 [-5.06148221e-02 -3.15385971e-02]
 [ 5.08185361e-02  1.60215374e-02]
 [-1.51775337e-02 -2.87712629e-02]
 [-1.10392628e-02  4.48198762e-02]
 [ 5.1918

In [8]:
np.mean(X, axis=1) - np.mean(X_huizong, axis=1)

array([ 0.        , -0.01189713, -0.0015365 ,  0.00282308,  0.        ,
        0.01359497,  0.0182712 ,  0.00675491, -0.00955608,  0.        ,
       -0.02040361, -0.00902268, -0.00562586,  0.00024467,  0.        ,
        0.02281982,  0.02439142, -0.00315011, -0.01308333, -0.01462477])

# PCA Comparison

In [9]:
np.vstack((X.T, X_huizong.T)).shape

(154, 20)

In [11]:
pca = PCA(n_components=2)
components = pca.fit_transform(np.vstack((X.T, X_huizong.T)))
print(f'# of datapoints: {len(components)}')

cvec = [0]*X.shape[1] + [1]*X_huizong.shape[1]
print(f'len(cvec) = {len(cvec)}')


x = components[:, 0] # PC 1
y = components[:, 1] # PC 2

# https://matplotlib.org/stable/gallery/lines_bars_and_markers/scatter_with_legend.html
fig, ax = plt.subplots()
scatter = ax.scatter(x, y, c=cvec)
ax.set_title('PCA')
ax.set_xlabel('PC 1')
ax.set_ylabel('PC 2')

# produce a legend with the unique colors from the scatter
legend1 = ax.legend(*scatter.legend_elements(),
                    loc="upper right", title="Classes")
ax.add_artist(legend1)
plt.savefig('Test/PCA_WuRiver+Huizong.png')

# of datapoints: 154
len(cvec) = 154


That was a really hacky way of labeling the classes on the PCA plot. Let's try a more elegant solution with pandas and Plotly. See https://plotly.com/python/pca-visualization/

In [109]:
import plotly.express as px
from sklearn.decomposition import PCA

df = px.data.iris()
X = df[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']]

pca = PCA(n_components=2)
pca.fit(X)
print(pca.components_)
components = pca.fit_transform(X)

fig = px.scatter(components, x=0, y=1, color=df['species'])
fig.show()

[[ 0.36158968 -0.08226889  0.85657211  0.35884393]
 [ 0.65653988  0.72971237 -0.1757674  -0.07470647]]


In [13]:
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,species_id
0,5.1,3.5,1.4,0.2,setosa,1
1,4.9,3.0,1.4,0.2,setosa,1
2,4.7,3.2,1.3,0.2,setosa,1
3,4.6,3.1,1.5,0.2,setosa,1
4,5.0,3.6,1.4,0.2,setosa,1
...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica,3
146,6.3,2.5,5.0,1.9,virginica,3
147,6.5,3.0,5.2,2.0,virginica,3
148,6.2,3.4,5.4,2.3,virginica,3


Ok, now that I see how the dataframe in Plotly's tutorial is structured, I can construct one from my own data. Or I can skip the dataframe altogether and just provide a list to the 'color' parameter in px.scatter.

In [149]:
paths = ['Extracted Characters/Mi Fu - Poem Written in a Boat on the Wu River/',
         'Extracted Characters/Emperor Huizong - Finches and bamboo/',
         'Extracted Characters/Su Shi - Inscription of Hanshi/',
         'Extracted Characters/WangXiZhi - On the Seventeenth Day/',
         'Extracted Characters/Mi Fu - On Cursive Calligraphy/']
label_list = ['Mi Fu (Wu River)',
             'Huizong',
             'Su Shi',
             'WangXiZhi',
             'Mi Fu (On Cursive Calligraphy)']
invert_list = [True,
              True,
              True,
              False,
              True]

# Construct labeled dataset of contour angle histograms
X_list = []
labels = []
for i, path in enumerate(paths):
    label = label_list[i]
    invert = invert_list[i]
    if invert:
        X = analyze_images_in_path(path, invert=True)
    else:
        X = analyze_images_in_path(path, invert=False)
    plot_mu_and_K(X, title=f'{label} ({X.shape[1]} images)', filename=f'Test/mu+K_{label.replace(" ", "")}.png')
    X_list.append(X)
    labels += [label] * X.shape[1]
    
X_total =  np.vstack([X.T for X in X_list])

# Perform PCA
pca = PCA(n_components=2)
components = pca.fit_transform(X_total)
print(f'# of datapoints: {len(components)}')
print(f'len(labels) = {len(labels)}')
assert(len(labels) == len(components))
total_var = pca.explained_variance_ratio_.sum() * 100
print('Total explained variance = {:0.2f}%'.format(total_var))

# Plot results with Plotly
fig = px.scatter(components, x=0, y=1, color=labels)
fig.update_layout(
    title="PCA",
    xaxis_title="PC 1",
    yaxis_title="PC 2",
    legend_title="Class",
    font=dict(
        family="Courier New, monospace",
        size=18,
        color="RebeccaPurple"
    )
)
fig.show()
fig.write_image('Test/PCA_Plotly_5.png')

# of datapoints: 431
len(labels) = 431
Total explained variance = 61.16%


In [58]:
# 3D PCA
pca = PCA(n_components=3)
components = pca.fit_transform(np.vstack([X.T for X in X_list]))

total_var = pca.explained_variance_ratio_.sum() * 100

fig = px.scatter_3d(
    components, x=0, y=1, z=2, color=labels,
    size=[0.001]*len(labels),
    title=f'Total Explained Variance: {total_var:.2f}%',
    labels={'0': 'PC 1', '1': 'PC 2', '2': 'PC 3'}
)
fig.show()

In [53]:
# Double-check that the WangXiZhi characters look okay after post processing
path = 'Extracted Characters/WangXiZhi - On the Seventeenth Day/'
filenames = os.listdir(path)
filenames.sort()

bins = 20
X = np.zeros((bins, len(filenames)))
for i, filename in enumerate(filenames):
    img = io.imread(path + filename)
    preprocessed = preprocess(img, invert=False)
    io.imsave('Test/debug/preprocessed_{:02d}.png'.format(i), preprocessed)
    counts, theta = contour_analysis(preprocessed, bins=bins)
    # divide counts by total counts to get Probability Mass Function (PMF)
    probs = counts / counts.sum()
    X[:, i] = probs


# Using 46 features

In [166]:
paths = ['Extracted Characters/Mi Fu - Poem Written in a Boat on the Wu River/',
         'Extracted Characters/Emperor Huizong - Finches and bamboo/',
         'Extracted Characters/Su Shi - Inscription of Hanshi/',
         'Extracted Characters/WangXiZhi - On the Seventeenth Day/',
         'Extracted Characters/Mi Fu - On Cursive Calligraphy/']
label_list = ['Mi Fu (Wu River)',
             'Huizong',
             'Su Shi',
             'WangXiZhi',
             'Mi Fu (On Cursive Calligraphy)']
invert_list = [True,
              True,
              True,
              False,
              True]

# Construct labeled dataset of contour angle histograms
X_list = []
labels = []
for i, path in enumerate(paths):
    label = label_list[i]
    invert = invert_list[i]
    if invert:
        X = feature_analysis_in_path(path, invert=True)
    else:
        X = feature_analysis_in_path(path, invert=False)
    X_list.append(X)
    labels += [label] * X.shape[1]
    mu = np.mean(X, axis=1)
    plot_hists(mu, title=label, filename=f'Test/hists_{label.replace(" ", "")}.png')
    
X_total =  np.vstack([X.T for X in X_list])

# Perform PCA
pca = PCA(n_components=2)
components = pca.fit_transform(X_total)
print(f'# of datapoints: {len(components)}')
print(f'len(labels) = {len(labels)}')
assert(len(labels) == len(components))
total_var = pca.explained_variance_ratio_.sum() * 100
print('Total explained variance = {:0.2f}%'.format(total_var))

# Plot results with Plotly
fig = px.scatter(components, x=0, y=1, color=labels)
fig.update_layout(
    title="PCA",
    xaxis_title="PC 1",
    yaxis_title="PC 2",
    legend_title="Class",
    font=dict(
        family="Courier New, monospace",
        size=18,
        color="RebeccaPurple"
    )
)
fig.show()
fig.write_image('Test/PCA_Plotly_46features.png')

# of datapoints: 431
len(labels) = 431
Total explained variance = 56.90%


In [172]:
# Compare 5 histograms
fig = plt.figure(figsize=(20, 4))
ax1 = plt.subplot(151, projection='polar')
ax2 = plt.subplot(152, projection='polar')
ax3 = plt.subplot(153, projection='polar')
ax4 = plt.subplot(154, projection='polar')
ax5 = plt.subplot(155, projection='polar')
axes = [ax1, ax2, ax3, ax4, ax5]

bins = 20
theta = np.linspace(-np.pi, np.pi, bins+1)
for i, X in enumerate(X_list):
    label = label_list[i]
    mu = np.sum(X, axis=1)
    plot_angles([mu[0:bins], theta], ax=axes[i], title=label)
    #plot_angles([mu[bins:2*bins], theta], ax=axes[i], title=label) # curvature
plt.suptitle('Contour Angle Statistics', fontsize='xx-large')
plt.tight_layout()
plt.savefig('Test/ContourAngleStatistics.png')

In [165]:
# Show the Principal Components
pca.fit(X_total)
PC1, PC2 = pca.components_
print(f'PC 1 = {PC1}')
print(f'PC 2 = {PC2}')

plt.figure()
plt.plot(PC1, label='PC 1')
plt.plot(PC2, label='PC 2')
plt.legend()
plt.title('Principal Components')
plt.savefig('Test/PrincipalComponents.png')


bins = 20
theta = np.linspace(-np.pi, np.pi, bins+1)

fig = plt.figure(figsize=(8, 4))
ax1 = plt.subplot(121, projection='polar')
ax2 = plt.subplot(122, projection='polar')
plot_angles([PC1[0:bins], theta], ax=ax1, title='Contour Angles')
plot_angles([PC2[0:bins], theta], ax=ax1, title='Contour Angles')
plot_angles([PC1[bins:2*bins], theta], ax=ax2, title='Contour Curvatures')
plot_angles([PC2[bins:2*bins], theta], ax=ax2, title='Contour Curvatures')
plt.suptitle('Principal Components', fontsize='xx-large')
plt.legend()
plt.tight_layout()
plt.savefig('Test/PC_hists.png')


PC 1 = [-7.25767574e-03  2.36275509e-02  8.67508098e-03  4.09302599e-02
  7.95683404e-03 -1.05213301e-01  5.36869689e-03 -2.71217901e-02
  4.77057561e-02 -1.94217970e-02 -1.16822276e-02  3.24863164e-02
  4.02041480e-02  3.14566220e-02  1.22382389e-02 -1.06009212e-01
 -1.45449558e-02 -9.18060673e-03  5.41130087e-02 -4.33094654e-03
  2.01023805e-03  4.17500486e-03  7.73578600e-03  2.41875523e-02
  1.34309776e-02  4.27761630e-02  6.46863625e-02  1.45067264e-01
  2.23372777e-01 -2.25138774e-01 -8.64214509e-01  2.37648884e-01
  1.55996375e-01  6.51992135e-02  3.63392053e-02  3.23055081e-02
  1.90580454e-02  6.05412620e-03  4.32106553e-03  4.98873479e-03
 -3.07541951e-02  1.89393050e-02 -9.89687117e-03 -3.70624308e-03
 -3.22107451e-04  4.23934567e-03]
PC 2 = [ 1.05582947e-01  4.46742876e-02  2.17561244e-02 -3.67611964e-02
 -1.23902754e-01 -5.22935215e-01 -8.36530952e-02  7.67507075e-02
  1.28096270e-01  2.16530136e-01  2.90408446e-01  4.22493722e-02
  9.66834113e-03 -3.90228116e-02 -9.675065

No handles with labels found to put in legend.


In [144]:
# 3D PCA
pca = PCA(n_components=3)
components = pca.fit_transform(X_total)

total_var = pca.explained_variance_ratio_.sum() * 100

fig = px.scatter_3d(
    components, x=0, y=1, z=2, color=labels,
    size=[0.001]*len(labels),
    title=f'Total Explained Variance: {total_var:.2f}%',
    labels={'0': 'PC 1', '1': 'PC 2', '2': 'PC 3'}
)
fig.show()

# Classification
First, split labeled data into training and test sets

In [145]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_total, labels, test_size=0.33, random_state=42)
print(f'X_total.shape: {X_total.shape}')
print(f'X_train.shape: {X_train.shape}')
print(f'X_test.shape: {X_test.shape}')
print(f'len(y_train): {len(y_train)}')
print(f'len(y_test): {len(y_test)}')

X_total.shape: (431, 46)
X_train.shape: (288, 46)
X_test.shape: (143, 46)
len(y_train): 288
len(y_test): 143


Now let's run a bunch of classifiers based on this tutorial: https://www.kaggle.com/jeffd23/10-classifier-showdown-in-scikit-learn

In [146]:
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="rbf", C=0.025, probability=True),
    NuSVC(probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    #QuadraticDiscriminantAnalysis(),
]

# Logging for Visual Comparison
log_cols=["Classifier", "Accuracy", "Log Loss"]
log = pd.DataFrame(columns=log_cols)

for clf in classifiers:
    clf.fit(X_train, y_train)
    name = clf.__class__.__name__
    
    print("="*30)
    print(name)
    
    print('****Results****')
    train_predictions = clf.predict(X_test)
    acc = accuracy_score(y_test, train_predictions)
    print("Accuracy: {:.4%}".format(acc))
    
    train_predictions = clf.predict_proba(X_test)
    ll = log_loss(y_test, train_predictions)
    print("Log Loss: {}".format(ll))
    
    log_entry = pd.DataFrame([[name, acc*100, ll]], columns=log_cols)
    log = log.append(log_entry)
    
print("="*30)

KNeighborsClassifier
****Results****
Accuracy: 53.8462%
Log Loss: 7.033855174800827
SVC
****Results****
Accuracy: 22.3776%
Log Loss: 1.2497222835076418
NuSVC
****Results****
Accuracy: 61.5385%
Log Loss: 1.0265631883156578
DecisionTreeClassifier
****Results****
Accuracy: 48.9510%
Log Loss: 17.63168305474462
RandomForestClassifier
****Results****
Accuracy: 63.6364%
Log Loss: 1.0255808995640663
AdaBoostClassifier
****Results****
Accuracy: 32.1678%
Log Loss: 1.7390233506540531
GradientBoostingClassifier
****Results****
Accuracy: 60.1399%
Log Loss: 1.1589788072243492
GaussianNB
****Results****
Accuracy: 61.5385%
Log Loss: 2.5987213761799137
LinearDiscriminantAnalysis
****Results****
Accuracy: 68.5315%
Log Loss: 1.0550883887981337


In [147]:
plt.figure(figsize=(8, 4))
sns.set_color_codes("muted")
sns.barplot(x='Accuracy', y='Classifier', data=log, color="b")

plt.xlabel('Accuracy %')
plt.title('Classifier Accuracy')
plt.xlim(0, 100)
plt.tight_layout()
plt.savefig('Test/ClassifierAccuracy.png')

In [148]:
plt.figure(figsize=(8, 4))
sns.set_color_codes("muted")
sns.barplot(x='Log Loss', y='Classifier', data=log, color="g")

plt.xlabel('Log Loss')
plt.title('Classifier Log Loss')
plt.xlim(0, 100)
plt.tight_layout()
plt.savefig('Test/ClassifierLogLoss.png')