In [57]:

from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

import warnings
import pandas
import numpy
import os
import matplotlib.pyplot as plot
import seaborn as sns
from math import sqrt


# NOTE(review): MIN_THRESHOLD is not referenced anywhere in the visible code —
# confirm it is still needed.
MIN_THRESHOLD = 0.8
# Folder holding the raw anomaly CSV files. File names are appended to this
# string directly, so the trailing "/" is required.
ANOMALIES_FOLDER = "Anomalies/"
# Folder the cleaned CSV copies are written to and later read back from.
# NOTE(review): the first letter of this identifier appears to be the Cyrillic
# capital "Es" (U+0421) rather than the Latin "C" — every reference has to use
# exactly the same character, so retyping the name on a Latin keyboard will
# produce a NameError. Confirm and consider renaming everywhere at once.
СLEANED_ANOMALIES_FOLDER = "CleanAnomalies/"

# NOTE(review): not referenced in the visible code; presumably related to the
# 'mod_' file-name prefix stripped by extract_persona_metrics — confirm.
MOD_ANOMALIES_FOLDER = "mod_Anomalies/"

# TODO: A separate specific workflow for UA metrics. E.g. - strongly preferred days or hours... etc.
# Metrics left out of the analysis: files whose name contains one of them are
# skipped, and columns with these names are dropped while cleaning.
IGNORED_METRICS = ['ua_day_of_week', 'ua_hour_of_day', 'players_fraction', 'Unnamed: 0']



<h4>Tools</h4>

In [58]:
def clean_anomalies_files(ANOMALIES_FOLDER, СLEANED_ANOMALIES_FOLDER, IGNORED_METRICS):
    """Create the cleaned-anomalies folder from the raw anomaly files, once.

    If the target folder already exists nothing is done at all (the existing
    content is reused as is). Otherwise every file of ANOMALIES_FOLDER that
    survives the IGNORED_METRICS file-name filter is loaded, has the
    IGNORED_METRICS columns dropped, and is written under the same name into
    the newly created target folder.

    Both folder arguments are concatenated with file names directly, so they
    are expected to end with a path separator.
    """
    if os.path.exists(СLEANED_ANOMALIES_FOLDER):
        # An existing folder is treated as an already prepared result.
        return

    source_names = get_folder_files(ANOMALIES_FOLDER, IGNORED_METRICS)
    os.makedirs(СLEANED_ANOMALIES_FOLDER)
    for source_name in source_names:
        frame = get_anomaly_data_from_file(ANOMALIES_FOLDER, source_name, IGNORED_METRICS)
        target_path = СLEANED_ANOMALIES_FOLDER + source_name
        # Written with to_csv defaults apart from the separator.
        frame.to_csv(target_path, sep=",")
    return

def create_folder_if_not_exists(folder):
    """Create `folder` (including missing parents) unless the path already exists.

    The directory is created directly and an "already exists" error is
    swallowed, instead of checking for existence first. This removes the
    window between the check and the creation in which another process could
    create the same path and make os.makedirs fail.

    Returns None. Errors other than the path already existing still propagate.
    """
    try:
        os.makedirs(folder)
    except FileExistsError:
        # Same outcome as the previous "exists -> do nothing" branch.
        pass
    return

def get_folder_files(_folder_path, exclude_metrics_list = []):
    """List the names of the files located directly in `_folder_path`.

    Sub-folders are not descended into. A path that os.walk cannot traverse
    yields an empty list. When `exclude_metrics_list` is non-empty the names
    are additionally passed through exclude_files.
    """
    # The first triple produced by os.walk describes the top folder itself;
    # nothing beyond it is needed.
    _top, _subfolders, top_level_names = next(os.walk(_folder_path), (None, None, []))
    names = list(top_level_names)

    if len(exclude_metrics_list) == 0:
        return names
    return exclude_files(exclude_metrics_list, names)

def exclude_files(exclude_metrics_list, _folder_files):
    """Return the file names that mention none of the excluded metrics.

    The metrics of a file are taken from its name via extract_persona_metrics.
    A file is dropped as soon as one of them is found in
    `exclude_metrics_list`. The input list is left untouched and the relative
    order of the remaining names is preserved.
    """
    def mentions_excluded_metric(candidate_name):
        return any(
            metric in exclude_metrics_list
            for metric in extract_persona_metrics(candidate_name)
        )

    return [
        candidate_name
        for candidate_name in _folder_files
        if not mentions_excluded_metric(candidate_name)
    ]

def extract_persona_metrics(_file_name):
    """Split an anomaly file name into the metric names it encodes.

    Every occurrence of 'mod_' is removed, the file extension is stripped and
    the remainder is split on the '__' separator, e.g.
    'mod_m1__m2.csv' -> ['m1', 'm2'].

    The extension is removed with os.path.splitext instead of cutting a fixed
    four characters, so names whose extension is not exactly four characters
    long (or that have none) no longer lose or keep the wrong characters.
    """
    base_name = _file_name.replace('mod_', '')
    stem, _extension = os.path.splitext(base_name)
    persona_metrics = stem.split('__')
    return persona_metrics


def get_metrics_correlation(_table, _metric1, _metric2):
    """Return the correlation between two columns of `_table`.

    Uses the `corr` method of the first column with its default settings.
    """
    first_column = _table[_metric1]
    second_column = _table[_metric2]
    return first_column.corr(second_column)

def get_anomaly_data_from_file(folder, file_name, exclude_metrics_list = []):
    """Load one anomaly CSV file and drop the unwanted columns.

    Parameters:
        folder: directory containing the file, with or without a trailing
            path separator.
        file_name: name of the CSV file inside `folder`.
        exclude_metrics_list: column labels to remove. The list is only read,
            never modified, so the shared default stays empty.

    Returns:
        A DataFrame with the remaining columns.

    Raises:
        KeyError: if a label of `exclude_metrics_list` is not a column of the
            file (pandas `drop` default behaviour).
    """
    # os.path.join gives the same path as plain concatenation when `folder`
    # ends with a separator and a correct one when it does not.
    df = pandas.read_csv(os.path.join(folder, file_name), sep=",")

    # Note kept from the original code: column 0 of the file is a row ordinal
    # (callers drop it by passing 'Unnamed: 0').
    return df.drop(exclude_metrics_list, axis = 1)


def get_3d_plot_data(persona_metrics_names, interest_matrix, _formula):
    """Build the per-metric "interest" table for one pair of persona metrics.

    Parameters:
        persona_metrics_names: metric names of the pair. Elements 0 and 1 are
            used as M1 and M2.
        interest_matrix: DataFrame indexed and labelled by metric name (the
            caller passes 1 - |correlation|).
        _formula: callable (m1_x_m2, m1_x_m3, m2_x_m3) -> number that combines
            the three pairwise values into one score.

    Returns:
        A DataFrame with one row per metric of `interest_matrix` and the
        columns, in this order: 'M1xE', 'M2xE', 'M1xM2_value',
        'vector_length', '3d_subscription', 'M3', 'M1xM2_metric_names'.
        Callers select columns by position, so the order matters.

        The rows that belong to M1 and M2 themselves are not scored: they keep
        vector_length 0.0 and empty '3d_subscription' / 'M3' strings.
    """
    first_metric = persona_metrics_names[0]
    second_metric = persona_metrics_names[1]

    plot_data_frame = interest_matrix.filter(persona_metrics_names, axis=1).copy()
    m1_x_m2 = plot_data_frame.loc[first_metric, second_metric]
    plot_data_frame['M1xM2_value'] = m1_x_m2
    # Float from the start: the column receives float scores below, and
    # writing floats into an int column is a deprecated implicit upcast.
    plot_data_frame['vector_length'] = 0.0
    plot_data_frame['3d_subscription'] = ''
    plot_data_frame['M3'] = ''

    column_x = plot_data_frame[first_metric]
    column_y = plot_data_frame[second_metric]
    pair_label = first_metric + ' - ' + second_metric

    # Every metric except the pair itself acts as the third metric (M3).
    list_without_m1m2_metrics = list(set(plot_data_frame.index) - set(persona_metrics_names))
    for index in list_without_m1m2_metrics:
        vector_length = _formula(m1_x_m2, column_x.loc[index], column_y.loc[index])
        plot_data_frame.loc[index, 'vector_length'] = vector_length
        subscription_for_3d = str(round(vector_length, 2)) + ' ' + str(index) + ' - ' + pair_label
        plot_data_frame.loc[index, '3d_subscription'] = subscription_for_3d
        plot_data_frame.loc[index, 'M3'] = str(index)

    plot_data_frame['M1xM2_metric_names'] = pair_label

    new_col_names = {first_metric: 'M1xE', second_metric: 'M2xE'}
    plot_data_frame = plot_data_frame.rename(columns = new_col_names)

    return plot_data_frame

def collect_3d_plot_data_from_files(files, folder, _formula):
    """Score every anomaly file and return all rows sorted by score.

    For each file the metric pair is taken from the file name, the file is
    loaded without its 'Unnamed: 0' column, the interest matrix
    1 - |correlation| is computed and handed to get_3d_plot_data together with
    `_formula`. Each resulting row is tagged with its 'file_name'.

    Returns:
        One DataFrame with the rows of all files, ordered by 'vector_length'
        descending. For an empty `files` an empty DataFrame with the usual
        columns is returned instead of failing inside `sort_values`.
    """
    per_file_frames = []
    for file_name in files:
        persona_metric_names = extract_persona_metrics(file_name)
        anomalies_data_frame = get_anomaly_data_from_file(folder, file_name, ['Unnamed: 0'])
        corr_matrix = anomalies_data_frame.corr()
        interest_matrix = 1-abs(corr_matrix)
        scatter_plot_df = get_3d_plot_data(persona_metric_names, interest_matrix, _formula)
        scatter_plot_df['file_name'] = file_name
        per_file_frames.append(scatter_plot_df)

    if not per_file_frames:
        # Same column layout as a populated result, so that downstream
        # filtering by 'vector_length' / 'M3' keeps working.
        return pandas.DataFrame(columns=[
            'M1xE', 'M2xE', 'M1xM2_value', 'vector_length',
            '3d_subscription', 'M3', 'M1xM2_metric_names', 'file_name',
        ])

    # Concatenate once: growing a frame inside the loop copies all previously
    # collected rows on every iteration.
    all_plot_data = pandas.concat(per_file_frames)
    return all_plot_data.sort_values(by=['vector_length'], ascending=False)


def prepare_table(all_files_plot_data, ROWS_TO_LIST):
    """Return the first `ROWS_TO_LIST` rows reduced to columns 0, 1, 2 and 4.

    The result is an independent copy with a fresh 0..n-1 index. As a side
    effect the pandas display option 'display.max_colwidth' is set to 100.
    """
    shown_column_positions = [0, 1, 2, 4]
    top_rows = all_files_plot_data.iloc[:ROWS_TO_LIST, shown_column_positions]
    renumbered = top_rows.copy().reset_index(drop = True)
    pandas.set_option('display.max_colwidth', 100)
    return renumbered


def interest_formula(x, y, z):
    """Length of the vector (x, y, z): square root of the sum of squares."""
    squared_total = sum(component ** 2 for component in (x, y, z))
    return sqrt(squared_total)

def interest_formula2(x, y, z):
    """Plain sum of the three values."""
    # Starting the sum at x keeps the evaluation order (x + y) + z.
    return sum((y, z), x)

def interest_formula3(x, y, z):
    """Cube root of the sum of the cubes of the three values."""
    cube_total = x ** 3 + y ** 3 + z ** 3
    one_third = 1 / 3
    return cube_total ** one_third

def interest_formula4(x, y, z):
    """Return 3 minus the product of the complements (1 - x)(1 - y)(1 - z)."""
    complement_product = (1 - x) * (1 - y) * (1 - z)
    return 3 - complement_product

def interest_formula5(x, y, z):
    """Largest of the three values."""
    return max(x, y, z)



def get_full_metric_names_list(files, folder=None):
    """Return every distinct, non-empty 'M3' metric name found in `files`.

    The previous body read `plot_data` before assigning it and therefore
    raised UnboundLocalError on every call. The plot data is now built from
    `files` first.
    NOTE(review): deriving the data from `files` is the presumed intent of the
    signature — confirm.

    Parameters:
        files: anomaly file names to scan.
        folder: folder the files live in; defaults to the cleaned-anomalies
            folder constant of this notebook.

    Returns:
        List of metric names in order of first appearance in the collected
        data.
    """
    if folder is None:
        folder = СLEANED_ANOMALIES_FOLDER

    # Only the 'M3' labels are used below and they are filled in regardless of
    # the scoring formula, so the default formula is as good as any.
    plot_data = collect_3d_plot_data_from_files(files, folder, interest_formula)

    # Rows of the metric pair itself carry an empty M3 and are skipped here.
    # Note kept from the original code: alternatively drop M3 and use the
    # index instead, which is less convenient.
    plot_data = plot_data.query("M3.notnull() and M3 != ''")
    return plot_data['M3'].unique().tolist()

def get_metric_names_list(plot_data):
    """Return 'all' followed by the distinct M3 names, best score first.

    Rows whose 'M3' is null or an empty string are ignored. The remaining
    names are ordered by their 'vector_length', descending, and de-duplicated
    keeping the first (highest scored) occurrence.
    """
    # Original author's open question (translated): for some reason many empty
    # values end up in M3 — find out where they come from, or give up on M3
    # and use the index instead (which is not very convenient).
    named_rows = plot_data.query("M3.notnull() and M3 != ''")
    ranked_rows = named_rows.sort_values(by=['vector_length'], ascending=False)
    ranked_names = ranked_rows['M3'].unique().tolist()
    return ['all'] + ranked_names

def get_table_for_display(metric_name, threshold_value, plot_data):
    """Select the rows to show and reduce them to two columns.

    Rows need a 'vector_length' of at least `threshold_value`. Unless
    `metric_name` is 'all' their 'M3' must also equal `metric_name`. The
    result is a copy holding the columns at positions 6 and 3 of `plot_data`,
    in that order, with the original index kept.
    """
    # The '@name' references are resolved by DataFrame.query against the
    # parameters of this function.
    query_parts = ["(vector_length >= @threshold_value)"]
    if metric_name != 'all':
        query_parts.append(' and (M3 == @metric_name)')

    matching_rows = plot_data.query("".join(query_parts))
    return matching_rows.iloc[:, [6, 3]].copy()

def get_table_filtered_by_threshold(threshold_value, plot_data):
    """Return a copy of the rows whose 'vector_length' is >= `threshold_value`."""
    # '@threshold_value' is resolved by DataFrame.query against this
    # function's parameter.
    rows_at_or_above = plot_data.query("(vector_length >= @threshold_value)")
    return rows_at_or_above.copy()

def get_formula_dict():
    """Map the formula labels shown in the UI to the scoring functions.

    The insertion order is the order in which the labels are offered. The
    first entry is used as the initial selection.
    """
    labelled_formulas = (
        ('vector_length (minkovskij, p = 2)', interest_formula),
        ('sum(x, y, z)', interest_formula2),
        ('root_of_3 (minkovskij, p = 3)', interest_formula3),
        ('3 - mult(1 - *)', interest_formula4),
        ('max(x, y, z)', interest_formula5),
    )
    return dict(labelled_formulas)


def get_formula_from_dict(_value):
    """Look up the scoring function for a label; unknown labels give interest_formula."""
    known_formulas = get_formula_dict()
    if _value in known_formulas:
        return known_formulas[_value]
    return interest_formula




<h4>Main</h4>

In [59]:

# NOTE(review): PERSONAS_AMOUNT is not referenced anywhere in the visible code — confirm it is still needed.
PERSONAS_AMOUNT = 3
# Only referenced from commented-out display calls in the visible code
# (prepare_table takes a parameter of the same name).
ROWS_TO_LIST = 15
# Silences every warning for the rest of the session, including pandas
# deprecation warnings. NOTE(review): consider narrowing the filter.
warnings.filterwarnings("ignore")

def analyze_anomalies(anomalies_files):
    """Show the interactive anomaly explorer for the given files.

    Three widgets are created: a formula drop-down, a metric drop-down and a
    threshold slider. Whenever the formula or the threshold changes, the
    metric drop-down is refilled with the metrics that still pass the
    threshold, keeping the current selection when it is still available. The
    widgets are then bound to pocess_choice_and_display_result through
    `interact`.

    Parameters:
        anomalies_files: names of the files inside the cleaned-anomalies
            folder to analyse.
    """
    formula_list = list(get_formula_dict().keys())
    formula_drop_down = widgets.Dropdown(
        options = formula_list,
        value = formula_list[0],
        description = 'Formula:',
        disabled=False,
    )

    metric_drop_down = widgets.Dropdown(description='Metric name:', disabled=False,)

    threshold_slider = widgets.FloatSlider(min=0.0, max=3, step=0.05, value=0)

    def metric_drop_down_update(*args):
        """Refill the metric drop-down for the current formula and threshold."""
        old_metric_value = metric_drop_down.value
        formula_value = formula_drop_down.value
        threshold_value = threshold_slider.value
        formula = get_formula_from_dict(formula_value)

        # Every update re-reads and re-scores all files.
        all_files_plot_data = collect_3d_plot_data_from_files(anomalies_files, СLEANED_ANOMALIES_FOLDER, formula)
        filtered_by_threshold_data = get_table_filtered_by_threshold(threshold_value, all_files_plot_data)
        metric_list = get_metric_names_list(filtered_by_threshold_data)

        metric_drop_down.options = metric_list
        if len(metric_list) > 0 and old_metric_value in metric_drop_down.options:
            metric_drop_down.value = old_metric_value
        else:
            metric_drop_down.value = metric_list[0]

        # TODO: adapt threshold_slider.min / .max to the smallest and largest
        # 'vector_length' of all_files_plot_data.
        return

    # Tie the other widgets' settings to the formula (and threshold) choice.
    # names='value' restricts the callback to real value changes. Without it
    # the handler runs for every trait change of the widget and repeats the
    # file processing several times per interaction.
    formula_drop_down.observe(metric_drop_down_update, names='value')
    threshold_slider.observe(metric_drop_down_update, names='value')

    # Fill the metric drop-down once before the UI is shown.
    metric_drop_down_update()

    interact(
        pocess_choice_and_display_result,
        anomalies_files = fixed(anomalies_files),
        anomalies_folder = fixed(СLEANED_ANOMALIES_FOLDER),
        formula_value = formula_drop_down,
        metric_name = metric_drop_down,
        threshold_value = threshold_slider
    )

    return

def pocess_choice_and_display_result(anomalies_files, anomalies_folder, formula_value, metric_name, threshold_value):
    """Compute and display the result table for the current widget selection.

    Parameters:
        anomalies_files: names of the files to analyse.
        anomalies_folder: folder that contains those files.
        formula_value: label of the scoring formula (see get_formula_dict).
        metric_name: metric to show, or 'all'.
        threshold_value: minimal 'vector_length' a row needs to be shown.

    Nothing is done while any of the three selections is still None. A
    threshold of 0 is a valid selection. Returns None. The table is rendered
    with `display`, and the pandas option 'display.max_colwidth' is set to 100
    as a side effect.
    """
    # Identity checks are the idiomatic way to test for None and cannot be
    # fooled by objects that overload ==.
    if formula_value is None or metric_name is None or threshold_value is None:
        return

    formula = get_formula_from_dict(formula_value)
    all_files_plot_data = collect_3d_plot_data_from_files(anomalies_files, anomalies_folder, formula)

    table_to_show = get_table_for_display(metric_name, threshold_value, all_files_plot_data)
    pandas.set_option('display.max_colwidth', 100)
    display(table_to_show)

    return


    


## Задачи
- интерактивно связать список Formula и Metric_name
- попробовать убрать run_interact
- автоматически менять threshold на средний при смене списков
- поработать с формулами

<h2>Debug</h2>

In [60]:
# Prepare the cleaned copies of the anomaly files; this does nothing when the
# cleaned folder already exists.
clean_anomalies_files(ANOMALIES_FOLDER, СLEANED_ANOMALIES_FOLDER,IGNORED_METRICS)
# Open question: is it worth looking for triples of metrics that are more
# independent of each other? None are visible so far.

# Analyse every file found directly in the cleaned folder.
anomalies_files = get_folder_files(СLEANED_ANOMALIES_FOLDER)

analyze_anomalies(anomalies_files)

interactive(children=(Dropdown(description='Formula:', options=('vector_length ((minkovskij, p = 2))', 'sum(x,…