In [5]:
%load_ext autoreload

%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
import os 

import pickle

import numpy as np
from scipy.stats import sem
from scipy.interpolate import interp1d

import pandas as pd

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

from modules.models.unsupervised.engagement_clusterers import auto_kmeans

AttributeError: module 'scipy.linalg' has no attribute 'decomp'

In [None]:
def sns_styleset():
    """Apply the plotting style used throughout this notebook.

    Sets the seaborn theme ('paper' context, 'ticks' style, DejaVu Sans font)
    and then overrides a fixed set of matplotlib rcParams (figure dpi, axis and
    tick geometry, font sizes). The change is global: every figure created
    afterwards is affected.
    """
    sns.set(context='paper', style='ticks', font='DejaVu Sans')

    # rcParams overrides, applied in the listed order after the seaborn theme
    # so that they take precedence over the values seaborn just installed.
    rc_overrides = {
        'figure.dpi': 300,
        'axes.linewidth': 1,
        'xtick.major.width': 1,
        'ytick.major.width': 1,
        'xtick.major.size': 3,
        'ytick.major.size': 3,
        'xtick.minor.size': 2,
        'ytick.minor.size': 2,
        'font.size': 11,
        'axes.titlesize': 11,
        'axes.labelsize': 12,
        'legend.fontsize': 10,
        'xtick.labelsize': 10,
        'ytick.labelsize': 10,
    }
    for rc_key, rc_value in rc_overrides.items():
        matplotlib.rcParams[rc_key] = rc_value


sns_styleset()

### 1 - LOAD GLOBALS

In [None]:
# Path templates: '{}' is filled in with a sub-folder name through str.format.
# Built with os.path.join so the separators are valid on every OS (on Windows
# the resulting strings are identical to the previous hard-coded ones).
INPUTS_PATH = os.path.join('data', 'train', 'inputs', '{}')
TARGETS_PATH = os.path.join('data', 'train', 'targets', '{}')

# Colormap used to colour the partitions. plt.get_cmap is used because
# matplotlib.cm.get_cmap was deprecated in Matplotlib 3.7 and removed in 3.9.
CMAP = plt.get_cmap('Paired')

# Every 20th index (0, 20, 40, ...) among the entries found in the
# 'continuous_features' inputs folder.
BTCH = list(
    range(0, len(os.listdir(INPUTS_PATH.format('continuous_features'))), 20)
)

SNAPSHOTS = 10

# Names of the input metrics.
INPUTS = [
    'delta_sessions',
    'active_time',
    'session_time',
    'activity'
]
# Names of the prediction targets. The order matters: the cluster-profile cell
# zips this list with its metric axes, and the first entry is the one drawn
# without x ticks.
TARGETS = [
    'tar_sessions',
    'tar_delta_sessions',
    'tar_active_time',
    'tar_session_time',
    'tar_activity'
]

### 2 - LOAD REMAPPERS

In [None]:
# Display names for the prediction targets, keyed by target name.
TARGETS_RMP = {
    'tar_delta_sessions': 'Future Absence',
    'tar_active_time': 'Future Active Time',
    'tar_session_time': 'Future Session Time',
    'tar_activity': 'Future Session Activity',
    'tar_sessions': 'Future N° Sessions'
}
# Display names without the 'Future' prefix. The keys are deliberately the
# target names ('tar_*'): the cluster-profile cell looks titles up with
# INPUTS_RMP[target_name].
INPUTS_RMP = {
    'tar_delta_sessions': 'Absence',
    'tar_active_time': 'Active Time',
    'tar_session_time': 'Session Time',
    'tar_activity': 'Session Activity',
    'tar_sessions': 'N° Sessions'
}
# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# only load this file if it comes from a trusted source.
# The path is built with os.path.join so it resolves on every OS (identical to
# the previous hard-coded string on Windows).
with open(os.path.join('results', 'saved_objects', 'mappers', 'context.pkl'), 'rb') as pickle_file:
    CONTEXT_RMP = pickle.load(pickle_file)
# Invert the loaded mapping so that it is keyed by the pickled mapping's values
# (the cluster-profile cell indexes it with the entries of the context array).
CONTEXT_RMP = {value: key for key, value in CONTEXT_RMP.items()}

### 3 - LOAD DATA

In [None]:
# Load the pickled data container used by the cells below.
# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# only load this file if it comes from a trusted source.
# The path is built with os.path.join so it resolves on every OS (identical to
# the previous hard-coded string on Windows).
with open(os.path.join('results', 'saved_data_containers', 'melchior.pkl'), 'rb') as container:
    DATA_CONTAINER = pickle.load(container)

In [None]:
# Display names keyed by consecutive integer codes starting at 1
# (presumably the game-context ids — TODO confirm against the context mapper).
name_remap = dict(
    enumerate(
        [
            'Hitman Sniper',
            'JustCause 3',
            'LIS BeSt',
            'LIS',
            'JustCause 4',
            'Hitman Go',
        ],
        start=1,
    )
)

# First-snapshot context array and first-snapshot session-time prediction.
contexts = DATA_CONTAINER['context'][0]
color = DATA_CONTAINER['prediction']['tar_session_time'][0]
# A disabled post-processing step used to follow here:
# group_wise_binning(array=color, grouper=contexts, n_bins=100, method='discret')

### 4 - CLUSTER PROFILES

In [None]:
# For each snapshot and each game context: cluster the 10-D embedding with
# auto_kmeans, store the labels in DATA_CONTAINER['partitions'][snapshot], and
# draw one figure showing (left) the 2-D embedding coloured by partition and
# (right) one standardised profile panel per target in TARGETS.

# Reset on every run, so re-executing the cell starts from an empty mapping.
DATA_CONTAINER['partitions'] = {}

# z-value turning a standard error into a 95% confidence half-width.
Z_95 = 1.96

for snapshot in [3]:

    # Paths are built with os.path.join so they resolve on every OS (identical
    # to the previous hard-coded strings on Windows).
    reduction = np.load(os.path.join(
        'results', 'saved_dim_reduction', '2D',
        f'umap_melchior_eng_emb_{snapshot}.npy'
    ))
    reduction = reduction[~np.isnan(reduction).any(axis=1)]

    features = np.load(os.path.join(
        'results', 'saved_dim_reduction', '10D',
        f'umap_melchior_eng_emb_{snapshot}.npy'
    ))
    features = features[~np.isnan(features).any(axis=1)]

    contexts = DATA_CONTAINER['context'][snapshot]
    # NOTE(review): rows containing NaN are dropped from `reduction` and
    # `features` above, but `contexts` and the prediction/metric arrays indexed
    # below are not filtered here. The positional indexing assumes all of them
    # stay row-aligned — TODO confirm that no rows are actually dropped, or
    # that DATA_CONTAINER was filtered identically.

    DATA_CONTAINER['partitions'][snapshot] = np.empty(contexts.shape)

    for context in np.unique(contexts):

        context_idx = np.argwhere(contexts == context).flatten()
        context_name = CONTEXT_RMP[context]
        context_features = features[context_idx, :]
        # z-score every embedding dimension within the context
        context_features = (context_features - context_features.mean(axis=0)) / context_features.std(axis=0)

        # clustering
        clusterer, partition_labels, centroids = auto_kmeans(
            X=context_features,
            min_k=2,
            max_k=10,
            verbose=1,
            fast=True,
            save_name=f'{context_name}_step_{snapshot+1}',
            max_no_improvement=100,
            batch_size=1000,
            max_iter=3000,
            n_init=3000,
            reassignment_ratio=0.1
        )

        DATA_CONTAINER['partitions'][snapshot][context_idx] = partition_labels

        # (label, row positions within the context) pairs, computed once and
        # reused by the scatter plot and by every metric panel.
        partition_members = [
            (label, np.argwhere(partition_labels == label).flatten())
            for label in np.unique(partition_labels)
        ]

        # define the gridspec: left 4 columns for the embedding, remaining
        # columns for the five metric panels (3 on top, 2 centred below)
        fig = plt.figure(figsize=(15, 6))
        spec = fig.add_gridspec(ncols=10, nrows=2)

        ax_context = fig.add_subplot(spec[:, :4])
        axs_metrics = [
            fig.add_subplot(spec[0, 4:6]),
            fig.add_subplot(spec[0, 6:8]),
            fig.add_subplot(spec[0, 8:]),
            fig.add_subplot(spec[1, 5:7]),
            fig.add_subplot(spec[1, 7:9])
        ]

        # plotting context: every point outside the current context in grey.
        # edgecolor='none' disables marker edges; the empty string previously
        # used here is rejected as an invalid colour by current Matplotlib.
        background = np.delete(reduction, context_idx, axis=0)
        ax_context.scatter(
            background[:, 0],
            background[:, 1],
            s=0.25,
            alpha=0.5,
            marker='o',
            edgecolor='none',
            color='darkgray',
        )

        # points of the current context, one colour per partition
        context_reduction = reduction[context_idx, :]
        for partition_label, partition_idx in partition_members:
            context_partition_reduction = context_reduction[partition_idx, :]
            ax_context.scatter(
                context_partition_reduction[:, 0],
                context_partition_reduction[:, 1],
                s=0.25,
                marker='o',
                edgecolor='none',
                c=CMAP([partition_label] * len(partition_idx)),
                label=f'Partition {partition_label+1}',
            )

        for index, (target_name, ax_metric) in enumerate(zip(TARGETS, axs_metrics)):

            prediction = DATA_CONTAINER['prediction'][target_name][snapshot]
            context_prediction = prediction[context_idx, :]

            if target_name != 'tar_sessions':
                # Line panel: observed metric columns followed by the
                # prediction, standardised with context-wide statistics.
                metric_name = target_name[4:]  # strip the 'tar_' prefix
                metric = DATA_CONTAINER['input_metrics'][metric_name][snapshot]
                context_metric = metric[context_idx, :]

                context_line = np.hstack(
                    (context_metric, context_prediction)
                )
                context_mean = np.mean(context_line, axis=0)
                context_std = np.std(context_line, axis=0)

                # For delta_sessions the first column is dropped and the x
                # axis therefore starts at step 2 instead of 1.
                drop_first = metric_name == 'delta_sessions'
                first_step = 2 if drop_first else 1

                for partition_label, partition_idx in partition_members:

                    context_partition_line = context_line[partition_idx, :]
                    context_partition_line = (context_partition_line - context_mean) / context_std
                    if drop_first:
                        context_partition_line = context_partition_line[:, 1:]
                    steps = list(
                        range(first_step, first_step + context_partition_line.shape[1])
                    )

                    context_partition_sem = sem(context_partition_line)
                    context_partition_ci = Z_95 * context_partition_sem
                    context_partition_line = np.mean(
                        context_partition_line,
                        axis=0
                    )
                    partition_color = CMAP(partition_label)

                    # all steps but the last: solid line with confidence band
                    ax_metric.plot(
                        steps[:-1],
                        context_partition_line[:-1],
                        c=partition_color
                    )
                    ax_metric.fill_between(
                        steps[:-1],
                        context_partition_line[:-1] + context_partition_ci[:-1],
                        context_partition_line[:-1] - context_partition_ci[:-1],
                        color=partition_color,
                        alpha=0.25
                    )
                    # last segment (ending on the prediction column): dashed
                    # line with markers and its own confidence band
                    ax_metric.plot(
                        steps[-2:],
                        context_partition_line[-2:],
                        c=partition_color,
                        linestyle='--',
                        marker='o',
                        markersize=2
                    )
                    ax_metric.fill_between(
                        np.array(steps[-2:]),
                        context_partition_line[-2:] + context_partition_ci[-2:],
                        context_partition_line[-2:] - context_partition_ci[-2:],
                        color=partition_color,
                        alpha=0.25
                    )
            else:
                # Bar panel for 'tar_sessions': the last column holds the
                # prediction and each earlier column is the next one plus 1;
                # only the last column is standardised against and plotted.
                context_line = np.zeros(
                    shape=(context_prediction.shape[0], snapshot + 1)
                )
                context_line[:, -1] = context_prediction.flatten()
                for r_index in range(snapshot - 1, -1, -1):
                    context_line[:, r_index] = context_line[:, r_index + 1] + 1

                context_mean = np.mean(context_line[:, -1])
                context_std = np.std(context_line[:, -1])

                for partition_label, partition_idx in partition_members:

                    context_partition_line = context_line[partition_idx, :]
                    context_partition_line = (context_partition_line - context_mean) / context_std

                    context_partition_sem = sem(context_partition_line)
                    context_partition_line = np.mean(
                        context_partition_line,
                        axis=0
                    )

                    ax_metric.bar(
                        partition_label,
                        context_partition_line[-1],
                        color=CMAP(partition_label),
                        yerr=Z_95 * context_partition_sem[-1]
                    )

            # make the y axis symmetric around zero and mark the zero level
            yabs_max = abs(max(ax_metric.get_ylim(), key=abs))
            ax_metric.set_ylim(ymin=-yabs_max, ymax=yabs_max)
            ax_metric.axhline(
                0,
                c='k',
                linestyle=':'
            )

            metric_title = INPUTS_RMP[target_name]
            ax_metric.set_title(f'{metric_title} - $t$ {snapshot+1}')
            if index == 0:
                # first panel is the bar chart: its x positions are partition
                # labels, not sessions
                ax_metric.set_xticks([])
            else:
                ax_metric.set_xlabel('Session')

        ax_context.set_title(f'Partition Game Context {context_name} - $t$ {snapshot+1}')
        ax_context.set_xlabel('Dimension 1')
        ax_context.set_ylabel('Dimension 2')
        ax_context.legend(markerscale=8)
        plt.tight_layout()
        plt.savefig(
            os.path.join(
                'results', 'figures', 'clusterer', 'clusterer_profiles',
                f'{context_name}_{snapshot+1}.png'
            ),
            dpi=500,
            bbox_inches='tight'
        )
        plt.show()
        # release the figure so that figures do not accumulate across contexts
        plt.close(fig)