# Mosaic Analysis
In this notebook we will use convolutional features and image metadata to generate visual mosaics over time.

To do so, we must reduce our high-dimensional data to 2 dimensions. There is a trove of dimensionality-reduction techniques, but in this case we'll be using an algorithm called UMAP. UMAP is well suited here because a fitted model can be persisted and reused, which lets us project new data into the same 2-dimensional latent space as the data it was fit on.

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import datetime
import json
import os
import pickle
import random

import dill
import matplotlib.pyplot
import numpy as np
import pandas as pd
import umap.umap_ as umap
from dateutil import relativedelta
from matplotlib.pyplot import imshow
from PIL import Image, ImageFont, ImageDraw
from rasterfairy import transformPointCloud2D
from tqdm import tqdm

# scikit-learn 0.23 removed its vendored copy of joblib; prefer the standalone
# package and fall back to the vendored one only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

from config import cols_conv_feats, image_lookup_file, skip_hash, logits_file, working_dir



In [3]:
# Read the first 100,000 rows of the gzip-compressed table at `logits_file`,
# using the file's first column as the DataFrame index.
df_conv = pd.read_csv(
    logits_file,
    compression='gzip',
    index_col=0,
    nrows=100000,
)

In [4]:
# Sanity check: number of rows that were actually loaded.
df_conv.shape[0]

100000

## Dimensionality Reduction

In [5]:
# --- UMAP hyper-parameters ---
n_neighbors = 25
min_dist = 0.25
metric = 'minkowski'

# Number of rows sampled to fit the encoder.
sample_size = 8000

# --- Model file ---
# The file name encodes min_dist (with "." written as "-"), the metric and the
# sample size, so encoders fit with different settings get different files.
# NOTE(review): n_neighbors is not part of the name, so changing it alone
# would reuse (and overwrite) the same file.
min_dist_tag = str(min_dist).replace(".", "-")
encoder_file = f'{working_dir}/encoder_{min_dist_tag}_dist_{metric}_sample_{sample_size}.pkl'

In [None]:
# Draw a reproducible subset of the feature columns and keep a CSV copy of
# exactly what the encoder is fit on.
umap_seed = 303
sample_dataset = df_conv[cols_conv_feats].sample(n=sample_size,
                                                 random_state=umap_seed)
sample_dataset.to_csv(f'{working_dir}/umap_training_data.csv')

# Fit the UMAP model on the sampled rows.
umap_params = {
    'n_neighbors': n_neighbors,
    'min_dist': min_dist,
    'metric': metric,
    'random_state': umap_seed,
    'verbose': 1,
}
encoder = umap.UMAP(**umap_params)
encoder.fit(sample_dataset.values)

# Persist the fitted model so later runs can load it instead of refitting.
joblib.dump(encoder, encoder_file)

In [25]:
# dill.dump(encoder, open(encoder_file.replace('.pkl', '__dill.pkl'), 'wb'))

This creates a scatterplot. We then use Mario Klingemann's RasterFairy software to convert this point cloud into neat rows and columns.

## Generate Mosaic

In [6]:
# Load the previously fitted UMAP encoder from disk.
# NOTE: joblib files are pickle-based, and unpickling can execute arbitrary
# code, so only load an encoder file that you created yourself or trust.
encoder = joblib.load(encoder_file)

In [26]:
# encoder = dill.load(open(encoder_file.replace('.pkl', '__dill.pkl'), 'rb'))

In [15]:
# Image lookup table: gzip-compressed JSON Lines, one record per line.
df_media = pd.read_json(
    image_lookup_file,
    orient='records',
    lines=True,
    compression='gzip',
)

In [16]:
# Drop every record whose d_hash appears in skip_hash.
is_skipped = df_media['d_hash'].isin(skip_hash)
df_media = df_media.loc[~is_skipped]

In [None]:
# Inner-join the feature table with the image metadata.
# NOTE(review): no `on=` / `left_index=` / `right_index=` is given, so pandas
# joins on whatever column names the two frames share after d_hash has been
# moved into the metadata index -- confirm these are the intended keys.
media_by_hash = df_media.set_index('d_hash')
df_merged = df_conv.merge(media_by_hash, how='inner')

In [None]:
# Mosaic layout: an nx-by-ny grid of tile_width x tile_height pixel tiles.
i = 10  # 0-based index of the consecutive batch of df_merged to render
tile_width = 72
tile_height = 56
nx = 50
ny = 40

# Number of images in one mosaic. This deliberately does NOT reuse the name
# `sample_size`: that variable is the UMAP training-sample size and is encoded
# in `encoder_file`, so overwriting it here would make a later re-run of the
# training cell fit on the wrong number of rows.
n_tiles = nx * ny

# Take the i-th batch of n_tiles rows by position.
batch_start = i * n_tiles
df_sample = df_merged.iloc[batch_start:batch_start + n_tiles]
if df_sample.empty:
    raise ValueError(
        f'Batch i={i} starts at row {batch_start}, but df_merged only has '
        f'{len(df_merged)} rows.'
    )

images = df_sample.f_img
# Project the batch into the 2-D space of the already-fitted encoder.
embeddings = encoder.transform(df_sample[cols_conv_feats].values)

In [None]:
# Canvas size in pixels, and the longest side allowed for a pasted thumbnail.
width = 4000
height = 3000
max_dim = 100

# Min-max scale each embedding axis to [0, 1]. np.ptp is max - min; if an axis
# has zero extent, divide by 1 instead so the points collapse to 0 rather than
# producing NaN coordinates.
tx, ty = embeddings[:, 0], embeddings[:, 1]
tx = (tx - np.min(tx)) / (np.ptp(tx) or 1.0)
ty = (ty - np.min(ty)) / (np.ptp(ty) or 1.0)

full_image = Image.new('RGB',
                       size=(width, height),
                       color=(55, 61, 71))

for img, x, y in tqdm(zip(images, tx, ty), total=len(images)):
    # The context manager closes the source file once the thumbnail exists.
    with Image.open(img) as tile:
        # Shrink (never enlarge) so the longest side is at most max_dim.
        rs = max(1, tile.width / max_dim, tile.height / max_dim)
        # Local names on purpose: the grid layout relies on the globals
        # tile_width / tile_height, which must not be overwritten here.
        # Clamp to 1px so extreme aspect ratios never yield a 0-sized image.
        thumb_dims = (max(1, int(tile.width / rs)),
                      max(1, int(tile.height / rs)))
        # Image.LANCZOS is the same filter as the removed Image.ANTIALIAS alias.
        thumb = tile.resize(size=thumb_dims,
                            resample=Image.LANCZOS)
    # Place the thumbnail's top-left corner; subtracting max_dim keeps it on
    # the canvas.
    x_coord = int((width - max_dim) * x)
    y_coord = int((height - max_dim) * y)
    full_image.paste(thumb, box=(x_coord, y_coord),
                     mask=thumb.convert('RGBA'))

matplotlib.pyplot.figure(figsize=(16, 12))
imshow(full_image);

In [None]:
# Snap the 2-D embedding onto a regular nx-by-ny grid. The first element of
# the result is iterated below as one (column, row) grid position per image.
grid_assignment = transformPointCloud2D(embeddings,
                                        target=(nx, ny))

# Expects tile_width / tile_height to hold the layout values set with nx / ny.
# One extra row of height is reserved at the bottom for the caption.
full_width = tile_width * nx
full_height = tile_height * (ny + 1)
aspect_ratio = float(tile_width) / tile_height

grid_image = Image.new('RGB', (full_width, full_height))

# Tiles are best-effort: an image that fails leaves its cell blank, but every
# failure is recorded and reported instead of being silently discarded.
skipped = []
for img, grid_pos in tqdm(zip(images, grid_assignment[0]), total=len(images)):
    idx_x, idx_y = grid_pos
    x, y = tile_width * idx_x, tile_height * idx_y
    try:
        # The context manager closes the source file once the tile is built.
        with Image.open(img) as src:
            # Centre-crop the source so it matches the tile aspect ratio.
            tile_ar = float(src.width) / src.height
            if tile_ar > aspect_ratio:
                # Too wide: trim equal margins from the left and right.
                margin = 0.5 * (src.width - aspect_ratio * src.height)
                crop_box = (margin, 0,
                            margin + aspect_ratio * src.height, src.height)
            else:
                # Too tall: trim equal margins from the top and bottom.
                margin = 0.5 * (src.height - float(src.width) / aspect_ratio)
                crop_box = (0, margin,
                            src.width, margin + float(src.width) / aspect_ratio)
            # Image.LANCZOS is the same filter as the removed Image.ANTIALIAS.
            tile = src.crop(crop_box).resize((tile_width, tile_height),
                                             Image.LANCZOS)
        grid_image.paste(tile, (int(x), int(y)))
    except Exception as exc:  # unlike a bare except, lets Ctrl-C through
        skipped.append((img, exc))

if skipped:
    first_img, first_exc = skipped[0]
    print(f'Skipped {len(skipped)} of {len(images)} tiles; '
          f'first failure: {first_img!r} -> {first_exc!r}')

# write an annotation
try:
    fnt = ImageFont.truetype('Pillow/Tests/fonts/FreeMono.ttf', tile_height - 6)
except OSError:
    # The font path only resolves inside a Pillow source checkout; fall back
    # to Pillow's built-in font rather than failing after the mosaic is built.
    fnt = ImageFont.load_default()
draw = ImageDraw.Draw(grid_image)
draw.text((4, tile_height * ny),
          "Mosaic of r/dankmemes via PushShift.io @LeonYin",
          (128, 255, 0), font=fnt)

grid_image

## Animations
We can create animations (e.g. GIFs) from these mosaics by sorting our initial dataset by time.

In [None]:
# import glob
# import imageio

# pattern = '/beegfs/ly501/tiles/pol/charlottesville_100/pol_2000_[0-9][0-9][0-9][0-9][0-9][0-9].jpg'
# out = '/beegfs/ly501/tiles/pol/gif/charlotte_100_out_30fps_august.gif'

# def make_gif(pattern, dest, duration= .25):
#     '''
#     Collects the image files matching `pattern` (sorted by filename)
#     and uses ImageIO to combine them into a gif saved at `dest`.
#     `duration` is passed through to `imageio.mimsave`.
#     '''

#     filenames = glob.glob(pattern)
#     filenames.sort()
#     images = []

#     for filename in filenames:
#         images.append(imageio.imread(filename))

#     kwargs = { 'duration': duration }
#     imageio.mimsave(dest, images,  **kwargs)

# def make_mp4(pattern, dest, duration=30):
#     '''
#     Collects the image files matching `pattern` (sorted by filename)
#     and uses ImageIO to write them as frames of a video saved at `dest`.
#     Note: `duration` is passed to the writer as `fps` (frames per second).
#     '''
#     filenames = glob.glob(pattern)
#     filenames.sort()
#     images = []

#     writer = imageio.get_writer(dest, fps=duration)
#     for filename in filenames:
#         writer.append_data(imageio.imread(filename))
#     writer.close()

# make_mp4(pattern, out, duration = 23)