# *A Corpus-Based Approach to Colour, Shape and Typography in Logos*

This notebook contains the analyses performed for the chapter *A Corpus-Based Approach to Colour, Shape and Typography in Logos* by Christian Mosbæk Johannessen, Mads Lomholt Tvede, Kristoffer Claussen Boesen and Tuomo Hiippala.

In [None]:
# Import the necessary packages
from scipy.stats import normaltest, mannwhitneyu, ttest_ind
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from matplotlib.patches import Ellipse
from matplotlib.offsetbox import OffsetImage, AnnotationBbox
from PIL import Image
import matplotlib.transforms as transforms
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

# Render matplotlib images in the notebook
%matplotlib inline

## Which principal components do you want to compare?

Provide a list of tuples.

In [None]:
# Pairs of principal components to plot against each other, given as (x-axis, y-axis);
# each name must match a 'PCn' column/key created further down in the notebook
pcs = [("PC1", "PC2"), ("PC1", "PC3"), ("PC1", "PC4"), ("PC1", "PC5")]

## Preparing the data

Load the data and apply z-score normalisation to the measurements.

In [None]:
# Load the corpus: the CSV uses ';' as field separator and ',' as decimal mark
df = pd.read_csv("corpus_data.csv", sep=";", decimal=',')

# Everything from the fourth column onwards is treated as measurement data
measurement_columns = df.columns[3:]
input_data = df.loc[:, measurement_columns]

# Standardise every measurement column (zero mean, unit variance) with
# sklearn.preprocessing.scale and write the z-scores back into 'df'
df[measurement_columns] = input_data.apply(scale)

Filter the data for outliers by getting Boolean (True/False) values for each data cell.

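Z-scores express each value as a number of standard deviations from the mean. Assuming the measurements are approximately normally distributed, more than 99.9% of the data lie within four standard deviations on either side of the mean (between -4 and +4), so values outside this range are treated as outliers.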

In [None]:
# Current z-scores of the measurement columns
zscores = df[input_data.columns]

# Boolean masks: True where a value is NOT an outlier on the high / low side
over = ~(zscores >= 4)
under = ~(zscores <= -4)

# Blank out (set to NaN) every cell that fails either check
df[input_data.columns] = zscores.where(over & under)

# Discard every row that contains at least one missing value,
# which removes the logos with blanked-out outlier cells
df = df.dropna(axis=0, how='any')

This leaves us with 45 logos.

In [None]:
# Count how many logos remain for each organization type after outlier removal
df['Organization'].value_counts()

## Principal component analysis (PCA)

Initialize a PCA model with **five** principal components and fit it to the pre-processed data.

In [None]:
# Fit the PCA model on the measurement columns only. Selecting them by name
# (rather than "everything from column 3 on") keeps this cell safe to re-run
# after the PC1..PCn columns have been appended to 'df' further down.
pca = PCA(n_components=5).fit(df[input_data.columns])

# Total share of variance captured by the retained components. Formatting the
# percentage in the f-string avoids floating-point artefacts such as
# 84.39999999999999 that arise from rounding before multiplying by 100.
total_explained = np.sum(pca.explained_variance_ratio_)
print(f"Percentage of variance explained by principal components: {total_explained * 100:.1f}%")

# Map each component name ('PC1', 'PC2', ...) to its share of explained variance
vmap = {f"PC{k}": v for k, v in enumerate(pca.explained_variance_ratio_.round(2).tolist(), start=1)}

In [None]:
# Use the seaborn "whitegrid" theme
sns.set_style("whitegrid")

# One bar per principal component, numbered from 1
n_components = len(pca.explained_variance_ratio_)
component_numbers = list(range(1, n_components + 1))
explained_percent = pca.explained_variance_ratio_ * 100

# Draw the bar chart of explained variance per component
fig, ax = plt.subplots()
ax.bar(component_numbers, explained_percent, width=0.6)

# Label both axes
ax.set_xlabel("Principal component", fontsize=14)
ax.set_ylabel("Percentage of variation explained", fontsize=14)

# Write the figure to disk
fig.savefig(f"plot_{n_components}_principal_components.pdf", dpi=300, bbox_inches="tight")

Use the PCA model to transform the input data and add the result to the DataFrame.

In [None]:
# Project the measurement columns onto the principal components. The columns
# are selected by name so that re-running this cell does not feed previously
# added PC columns back into the model (which would fail with a feature-count
# mismatch).
X = pca.transform(df[input_data.columns])

# The second dimension of X is the number of principal components
n_pc = X.shape[1]

# Store each component in its own column: PC1, PC2, ...
for component_index in range(n_pc):
    df[f'PC{component_index + 1}'] = X[:, component_index]

Define supporting functions for drawing confidence ellipses and adding logo thumbnails to the plot.

In [None]:
def confidence_ellipse(x, y, ax, n_std=2.0, facecolor='none', **kwargs):
    """
    Draw the covariance confidence ellipse of *x* and *y* on *ax*.

    Parameters
    ----------
    x, y : array-like, shape (n, )
        Input data. Any array-like (list, NumPy array, pandas Series)
        is accepted.

    ax : matplotlib.axes.Axes
        The axes object to draw the ellipse into.

    n_std : float
        The number of standard deviations to determine the ellipse's radii.

    facecolor : colour
        Fill colour of the ellipse; defaults to 'none'.

    Returns
    -------
    matplotlib.patches.Ellipse
        The ellipse patch that was added to *ax*.

    Raises
    ------
    ValueError
        If *x* and *y* do not contain the same number of elements.

    Other parameters
    ----------------
    kwargs : `~matplotlib.patches.Patch` properties
        Forwarded to the Ellipse constructor. ``fill`` defaults to True and
        ``alpha`` to 0.2 unless the caller supplies them.
    """
    # Accept plain sequences as well as arrays / Series
    x = np.asarray(x)
    y = np.asarray(y)

    if x.size != y.size:
        raise ValueError("x and y must be the same size")

    # Apply the default appearance without clashing with caller-supplied
    # values (passing them twice would raise a TypeError).
    kwargs.setdefault("fill", True)
    kwargs.setdefault("alpha", 0.2)

    cov = np.cov(x, y)
    pearson = cov[0, 1] / np.sqrt(cov[0, 0] * cov[1, 1])

    # Using a special case to obtain the eigenvalues of this
    # two-dimensional dataset: the unit ellipse is defined from the
    # Pearson correlation and stretched to data units afterwards.
    ell_radius_x = np.sqrt(1 + pearson)
    ell_radius_y = np.sqrt(1 - pearson)
    ellipse = Ellipse((0, 0),
                      width=ell_radius_x * 2,
                      height=ell_radius_y * 2,
                      facecolor=facecolor, **kwargs)

    # Standard deviation of x (square root of the variance) multiplied
    # by the requested number of standard deviations.
    scale_x = np.sqrt(cov[0, 0]) * n_std
    mean_x = np.mean(x)

    # Same for y.
    scale_y = np.sqrt(cov[1, 1]) * n_std
    mean_y = np.mean(y)

    # Rotate the unit ellipse by 45 degrees, scale it to the data's spread
    # and move its centre to the data mean.
    transf = transforms.Affine2D() \
        .rotate_deg(45) \
        .scale(scale_x, scale_y) \
        .translate(mean_x, mean_y)

    ellipse.set_transform(transf + ax.transData)

    return ax.add_patch(ellipse)

# Define a function for processing logo images
def plot_image(path, size=(30, 30)):
    """
    Load an image and wrap a resized thumbnail in an OffsetImage.

    Parameters
    ----------
    path : str or path-like
        Location of the image file.

    size : tuple of int, optional
        Thumbnail size as (width, height) in pixels; defaults to (30, 30).

    Returns
    -------
    matplotlib.offsetbox.OffsetImage
        The thumbnail, ready to be placed in an AnnotationBbox.
    """
    # Image.ANTIALIAS was an alias of Image.LANCZOS and has been removed in
    # Pillow 10; Image.LANCZOS selects the same filter on old and new releases.
    # The context manager closes the underlying file once the resized copy
    # has been created.
    with Image.open(path) as img:
        thumbnail = img.resize(size, Image.LANCZOS)

    return OffsetImage(thumbnail)

Plot the principal components against each other.

In [None]:
# Use the seaborn "whitegrid" theme for all panels
sns.set_style("whitegrid")

# Choose what to plot: 'thumbnails' includes thumbnails of logos, 'ellipse' draws confidence ellipses and 'kde' plots kernel density estimation
thumbnails=False
ellipse=True
kde=False

# Work out the panel layout; this depends on the number of component pairs to plot
n_panels = len(pcs)

if n_panels == 0:
    raise ValueError("'pcs' must contain at least one pair of principal components")

if n_panels == 1:
    n_rows, n_cols, fig_size = 1, 1, (8, 8)
elif n_panels == 2:
    n_rows, n_cols, fig_size = 1, 2, (16, 8)
else:
    # Ceiling division: round() uses banker's rounding, so e.g. five pairs
    # would yield only two rows and the last pair would silently be dropped
    n_rows, n_cols, fig_size = -(-n_panels // 2), 2, (12, 12)

# squeeze=False always returns a 2D array of axes, so the same code path
# works for one panel as well as for many
fig, axs = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=fig_size, squeeze=False)
axs = axs.flatten()

if n_panels > 2:
    fig.tight_layout(pad=6.0)

# Hide the spare panel that appears when an odd number of pairs is plotted in two columns
for spare_ax in axs[n_panels:]:
    spare_ax.set_visible(False)

# Get unique organization types
organizations = df['Organization'].unique().tolist()

# Map each organization type to a colour; requesting exactly as many colours
# as there are organizations prevents zip() from truncating the mapping
color_dict = dict(zip(organizations, sns.color_palette(n_colors=len(organizations))))

# Loop over the principal component tuples and axes
for (pc_1, pc_2), ax in zip(pcs, axs):

    # Square panels with a fixed range on both axes
    ax.set_aspect(1)
    ax.set(xlim=(-4, 4), ylim=(-4, 4))

    # Loop over organizations and plot their principal components
    for org in organizations:

        # Rows belonging to this organization type
        org_rows = df.loc[df['Organization'] == org]

        # Plot confidence ellipse
        if ellipse:

            confidence_ellipse(x=org_rows[pc_1], y=org_rows[pc_2], n_std=2, ax=ax, facecolor=color_dict[org])

        # Plot kernel density estimation
        if kde:

            # NOTE(review): positional x/y and the 'shade', 'shade_lowest' and
            # 'n_levels' arguments appear to be deprecated in newer seaborn
            # releases -- confirm against the installed version before enabling 'kde'
            sns.kdeplot(org_rows[pc_1], org_rows[pc_2], ax=ax, shade=True, shade_lowest=False, n_levels=5,
                        alpha=0.5, cbar=True, cbar_kws={"shrink": 0.875})

        # Plot the data points in the same colour as the organization's ellipse
        ax.scatter(x=org_rows[pc_1], y=org_rows[pc_2], s=14, label=org, color=color_dict[org])

        # Check if thumbnail images should be added to the plot
        if thumbnails:

            # Loop over coordinates and image filenames
            for x_coord, y_coord, img_filename in zip(org_rows[pc_1], org_rows[pc_2], org_rows['File name']):

                # Create a bounding box for annotation
                ab = AnnotationBbox(plot_image(f"logos/{img_filename}"), (x_coord, y_coord), frameon=False)

                # Add the annotation box to the axis
                ax.add_artist(ab)

    # Set axis labels and title
    ax.set_xlabel(f"{pc_1} (variance explained: {np.round(vmap[pc_1] * 100, 2)}%)", fontsize=16)
    ax.set_ylabel(f"{pc_2} (variance explained: {np.round(vmap[pc_2] * 100, 2)}%)", fontsize=16)
    ax.set_title(f"{pc_1} vs. {pc_2}", fontsize=18)

    # Add legend to the plot
    ax.legend()

# Save figure on disk; note that the file name is built from the LAST pair in
# 'pcs', even though the figure contains one panel per pair
plt.savefig(f"{pc_1}_vs_{pc_2}_{'kde' if kde else 'pca'}_plot.pdf", dpi=300, bbox_inches="tight")

Calculate PCA loadings, that is, correlations between the original variables and the principal components derived.

In [None]:
# Calculate the PCA loadings matrix: each eigenvector (one column of
# components_.T) is scaled by the square root of its explained variance
loadings = pca.components_.T * np.sqrt(pca.explained_variance_)

# Label rows with the measurement columns the model was fitted on and columns
# with PC1..PCn. Deriving the labels from the model and from 'input_data'
# avoids relying on the position of the PC columns in 'df' or on a substring
# match that would also pick up any other column whose name contains 'PC'.
component_names = [f"PC{k}" for k in range(1, pca.components_.shape[0] + 1)]
loadings = pd.DataFrame(loadings.round(4), columns=component_names, index=input_data.columns)

# Display the PCA loadings
loadings

## A statistical comparison of principal components

Check if the principal components are normally distributed.

In [None]:
# Go through the organisation types one by one
for org in organizations:

    # Rows of 'df' that belong to the current organisation type
    org_rows = df.loc[df['Organization'] == org]

    # Go through the principal components; their names are the columns of 'loadings'
    for pc in loadings.columns:

        # normaltest returns (statistic, p-value); only the p-value is needed here
        _, p_value = normaltest(org_rows[pc].to_numpy())

        # A p-value above 0.05 is reported as normal, anything else as not normal
        verdict = "NORMAL" if p_value > 0.05 else "NOT NORMAL"

        print(f"{org}, {pc}, {verdict}.")

Use the Mann-Whitney U test to compare the principal components between the two organization types.

In [None]:
# Select the two groups once instead of re-filtering 'df' for every component
oil_rows = df.loc[df['Organization'] == 'OIL']
ngo_rows = df.loc[df['Organization'] == 'NGO']

# Loop over the principal components
for pc in loadings.columns:

    # Perform a two-sided Mann-Whitney U-test. The alternative is stated
    # explicitly because the default differs between SciPy versions, which
    # would otherwise change the reported p-values.
    mw_test = mannwhitneyu(oil_rows[pc].to_numpy(), ngo_rows[pc].to_numpy(), alternative='two-sided')

    # Rounded U statistic and p-value for reporting
    u_stat, p_value = np.round(mw_test[0], 3), np.round(mw_test[1], 3)

    # Print the result; if/else guarantees that every component is reported,
    # including the boundary case p == 0.05 (treated as not significant)
    if mw_test[1] < 0.05:

        print(f"{pc} Statistically significant at {u_stat}, {p_value}.")

    else:

        print(f"{pc} Not statistically significant at {u_stat}, {p_value}.")