In [1]:
import json

import umap
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from helper.constants import *
from sklearn.datasets import load_digits
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
%matplotlib inline
sns.set(style='white', context='notebook', rc={'figure.figsize':(14,10)})

In [3]:
class Visualizer():
    '''
    Visualizer for csv/json data.

    Holds a pandas DataFrame built from the supplied data together with a
    dimensionality reducer, and offers a seaborn pair plot and a 2-D scatter
    plot of the reducer's embedding.

    Attributes:
        name: Label passed in by the caller; stored but not used by the
            methods of this class.
        data: DataFrame produced by ``pd.DataFrame.from_dict(data)``.
        reducer: Object exposing ``fit_transform``; a default ``umap.UMAP()``
            unless one is supplied.
        embeddings: Result of the most recent ``get_embeddings`` call, or
            ``None`` if no embedding has been computed yet.
    '''
    def __init__(self, name, data, reducer=None):
        '''
        Initialize Visualizer class with data and processing models.

        Args:
            name: Label for this visualizer.
            data: Anything accepted by ``pd.DataFrame.from_dict`` with its
                default orientation (outer keys become columns).
            reducer: Optional object with a ``fit_transform`` method. Pass a
                configured reducer (for example ``umap.UMAP(random_state=42)``)
                to get reproducible embeddings; when omitted a default
                ``umap.UMAP()`` is created.
        '''
        print('VISUALIZER: Initiating.')

        # Base
        self.name = name
        self.data = pd.DataFrame.from_dict(data)

        # Models
        self.reducer = umap.UMAP() if reducer is None else reducer

        # Defined up front so the attribute exists before the first embedding.
        self.embeddings = None

    def get_plots(self, variable_name):
        '''
        Pair plot showing both the distribution of single variables and
        relationships between any two variables in a dataset.

        Args:
            variable_name: Column of ``self.data`` passed to seaborn as ``hue``.
        '''
        # Pair Plot
        sns.pairplot(self.data, hue=variable_name)

    def get_embeddings(self, scaled_data):
        '''
        Return UMAP embeddings from data.

        Fits ``self.reducer`` on ``scaled_data``, stores the result on
        ``self.embeddings`` and returns it.

        Args:
            scaled_data: Feature matrix handed to ``self.reducer.fit_transform``.

        Returns:
            Whatever ``self.reducer.fit_transform`` returns.
        '''
        self.embeddings = self.reducer.fit_transform(scaled_data)
        return self.embeddings

    def get_UMAP(self, params, mapping, key,
                 title='UMAP projection of the Penguin dataset'):
        '''
        Plot UMAP

        Standardizes the ``params`` columns, embeds them with the reducer and
        draws a scatter plot of the first two embedding dimensions, coloured
        per row from seaborn's current colour palette.

        Args:
            params: Column label(s) of ``self.data`` used as features.
            mapping: Mapping (or anything ``Series.map`` accepts) from the
                values of ``self.data[key]`` to integer palette indices.
            key: Column of ``self.data`` that determines each point's colour.
            title: Plot title. The default is the previously hard-coded text,
                kept so existing calls render unchanged; pass a title that
                matches the data being plotted.

        Raises:
            ValueError: If some value in ``self.data[key]`` has no entry in
                ``mapping``.
        '''
        # Resolve colours first so a bad mapping fails before the (potentially
        # slow) embedding is computed.
        palette_indices = self.data[key].map(mapping)
        unmapped = palette_indices.isna()
        if unmapped.any():
            missing = sorted(set(self.data.loc[unmapped, key].astype(str)))
            raise ValueError(
                'No colour mapping for value(s) of {!r}: {}'.format(key, missing)
            )

        # Build the palette once instead of once per row.
        palette = sns.color_palette()
        colours = [palette[int(index)] for index in palette_indices]

        data_values = self.data[params].values
        scaled_values = StandardScaler().fit_transform(data_values)
        embedding = self.get_embeddings(scaled_values)

        plt.scatter(
            embedding[:, 0],
            embedding[:, 1],
            c=colours
        )
        plt.gca().set_aspect('equal', 'datalim')
        plt.title(title, fontsize=24)


In [4]:
# Load the author JSON and draw a pair plot coloured by the 'DOB' column.
# The file is only read, so it is opened read-only, and it is closed again
# before any plotting starts.
# NOTE(review): the output recorded for this cell ends in
# "ValueError: No variables found for grid columns." - seaborn appears to have
# found no numeric columns to plot. Check the orientation and dtypes of the
# frame built from this JSON - TODO confirm.
with open(NEW_AUTHOR_DATA, 'r', encoding='latin1') as file:
    umap_visual = Visualizer('UMAP', json.load(file))
umap_visual.get_plots('DOB')

VISUALIZER: Initiating.
Index(['60235486', '43931362', '11987873', '98696422', '36964375', '87714692',
       '44590193', '99638465', '11444993', '68344390',
       ...
       '62701945', '15167666', '81824613', '77261938', '61518089', '57539380',
       '41521459', '48640556', '52392612', '51358801'],
      dtype='object', length=668)


ValueError: No variables found for grid columns.