In [3]:
import json
import numpy as np
import pandas as pd
import plotly.express as px

from src.clustering.pipeline_helper_retriever import PipelineHelperRetriever
from src.core.distribution_data import *
from src.core import file_manager as fm
from src.embeddings.constants import EMBEDDING_MODELS_TRANSLATION

In [4]:
def prepare_df_metrics_of_range_n_neighbors(embedding):
    """Summarise the mutual-information distribution per ``n_neighbor``.

    Reads ``output/patient/k100/<embedding>/mutual_information_by_range_n_neighbors.csv``
    (path resolved through ``fm.filename_from_data_dir``) and, for every
    distinct ``n_neighbor`` value in it, hands the matching
    ``mutual_information`` values to ``get_distribution_data``.

    Args:
        embedding: Name of the sub-directory whose CSV is loaded.

    Returns:
        A DataFrame with one row per distinct ``n_neighbor``. Each row holds
        the attributes of the object returned by ``get_distribution_data``
        plus ``n_neighbor`` and ``embedding`` columns.
    """
    csv_path = fm.filename_from_data_dir(
        f'output/patient/k100/{embedding}/mutual_information_by_range_n_neighbors.csv'
    )
    scores = pd.read_csv(csv_path)

    # From here on the label comes from the file's own ``embedding`` column
    # (its first distinct value), not from the argument used to find the file.
    embedding = scores['embedding'].unique()[0]

    rows = []
    for n_neighbor in scores['n_neighbor'].unique():
        is_current = scores['n_neighbor'] == n_neighbor
        summary = get_distribution_data(scores.loc[is_current, 'mutual_information'])

        # vars() exposes the summary object's attribute dict; the two extra
        # keys are appended after the attributes it already contains.
        row = vars(summary)
        row.update(n_neighbor=n_neighbor, embedding=embedding)
        rows.append(row)

    return pd.DataFrame(rows)

def prepare_data_for_all_embeddings():
    """Build one summary table covering every key of ``EMBEDDING_MODELS_TRANSLATION``.

    Returns:
        The per-embedding frames produced by
        ``prepare_df_metrics_of_range_n_neighbors``, concatenated in key order.
    """
    frames = []
    for embedding_name in EMBEDDING_MODELS_TRANSLATION.keys():
        frames.append(prepare_df_metrics_of_range_n_neighbors(embedding_name))

    return pd.concat(frames)

# Summary table for every embedding; `df` is reused by the line-chart cell below.
df = prepare_data_for_all_embeddings()

# Preview the first rows.
df.head()

Unnamed: 0,q1,q3,med,lower_bound,upper_bound,n_neighbor,embedding
0,0.26427,0.352531,0.306179,0.131879,0.484922,3,bert_pt
1,0.255042,0.344334,0.297789,0.121105,0.478271,4,bert_pt
2,0.249414,0.340764,0.293055,0.112389,0.47779,5,bert_pt
3,0.247971,0.338188,0.290152,0.112647,0.473512,6,bert_pt
4,0.248738,0.337102,0.290668,0.116191,0.469649,7,bert_pt


In [8]:
# Plot the 'med' column against n_neighbor, one line per embedding.
chart = px.line(df, x='n_neighbor', y='med', color='embedding').update_layout(
    xaxis_title='n_neighbor',
    yaxis_title='Mediana',
)
chart.show()

In [None]:
from src.core.chart_helper import *
from src.embeddings.constants import  EMBEDDING_MODELS_TRANSLATION

def json_to_numpy(mutual_information):
    """Decode a JSON string and return its first element as a NumPy array.

    Args:
        mutual_information: JSON text whose top-level value is indexable
            with ``[0]``.

    Returns:
        ``np.ndarray`` built from the first element of the decoded value.
    """
    return np.array(json.loads(mutual_information)[0])

def load_mutual_info_df(embedding_name, use_intent_index):
    """Load an embedding's mutual-information CSV and decode it into arrays.

    Args:
        embedding_name: Sub-directory of ``output/patient/k100`` to read from.
        use_intent_index: When truthy, read
            ``grouped_by_intent_mutual_information.csv``; otherwise read
            ``mutual_information.csv``.

    Returns:
        The DataFrame read from the CSV with an extra ``dependency`` column
        holding, for each row, the array ``json_to_numpy`` produces from that
        row's ``mutual_information`` string.
    """
    prefix_name = 'grouped_by_intent_' if use_intent_index else ''
    path_file = fm.filename_from_data_dir(
        f'output/patient/k100/{embedding_name}/{prefix_name}mutual_information.csv'
    )

    print(f'Reading file: {path_file}')
    df = pd.read_csv(path_file)

    # Only one column is needed, so map over that Series directly instead of
    # materialising a row Series per record with DataFrame.apply(axis=1).
    df['dependency'] = df['mutual_information'].map(json_to_numpy)

    return df

def get_key(row):
    """Build a display label of the form ``"<embedding>: <variation>"``.

    When ``row['intents']`` equals ``'without_others'`` the variation is
    preceded by the marker ``'~ (others) - '``.

    Args:
        row: Mapping-like object with ``intents``, ``embedding`` and
            ``variation`` entries.

    Returns:
        The label string.
    """
    if row['intents'] == 'without_others':
        others_marker = '~ (others) - '
    else:
        others_marker = ''

    return f"{row['embedding']}: {others_marker}{row['variation']}"


def extract_dependency_to_collumn(df):
    """Unpack each row's ``dependency`` array into one output row per value.

    Args:
        df: DataFrame with ``embedding``, ``intents``, ``variation`` and
            ``dependency`` columns, where ``dependency`` holds a sized
            sequence per row.

    Returns:
        The concatenation of one frame per input row, with columns
        ``embedding``, ``intents``, ``variation``, ``feature`` (position of
        the value inside its array) and ``dependency`` (the value itself).
        Each per-row frame keeps its own 0-based index, so index labels
        repeat across input rows.
    """
    def expand(row):
        # One long-format frame for a single input row.
        size = len(row['dependency'])
        return pd.DataFrame({
            'embedding': [row['embedding']] * size,
            'intents': [row['intents']] * size,
            'variation': [row['variation']] * size,
            'feature': range(size),
            'dependency': row['dependency'],
        })

    return pd.concat([expand(row) for _, row in df.iterrows()])

def show_charts(embedding_name, use_intent_index=False):
    """Build a box plot of per-feature dependency values for one embedding.

    Args:
        embedding_name: Embedding whose mutual-information CSV is loaded.
        use_intent_index: Forwarded to ``load_mutual_info_df`` to select the
            ``grouped_by_intent_`` variant of the file.

    Returns:
        The Plotly figure (it is returned, not shown), with ``intents`` on
        the x axis, ``dependency`` on the y axis and one colour per
        ``variation``.
    """
    mutual_info = load_mutual_info_df(embedding_name, use_intent_index)

    figure = px.box(
        extract_dependency_to_collumn(mutual_info),
        x='intents',
        y='dependency',
        color='variation',
    )

    # The feature count shown in the axis title is taken from the row labelled 0.
    n_features = len(mutual_info['dependency'][0])
    x_title = f'variações  do {embedding_name} com {n_features} features'

    figure.update_layout(yaxis_title='Dependencia', xaxis_title=x_title)

    return figure
# df = pd.concat(dfs)
# df['x'] = df.apply(get_key, axis=1)
# df['y'] = df.apply(lambda x: x[0], axis=1)

In [15]:
# Scratch check: median of a small hand-written sample.
np.median([1,2,3,4,5,4,3,2,1])

3.0

In [2]:
from src.core.chart_helper import *
from src.embeddings.constants import  EMBEDDING_MODELS_TRANSLATION

def json_to_numpy(mutual_information):
    """Decode a JSON string and return its first element as a NumPy array.

    Args:
        mutual_information: JSON text whose top-level value is indexable
            with ``[0]``.

    Returns:
        ``np.ndarray`` built from the first element of the decoded value.
    """
    return np.array(json.loads(mutual_information)[0])

def load_mutual_info_df(embedding_name, use_intent_index):
    """Load an embedding's mutual-information CSV and decode it into arrays.

    Args:
        embedding_name: Sub-directory of ``output/patient/k100`` to read from.
        use_intent_index: When truthy, read
            ``grouped_by_intent_mutual_information.csv``; otherwise read
            ``mutual_information.csv``.

    Returns:
        The DataFrame read from the CSV with an extra ``dependency`` column
        holding, for each row, the array ``json_to_numpy`` produces from that
        row's ``mutual_information`` string.
    """
    prefix_name = 'grouped_by_intent_' if use_intent_index else ''
    path_file = fm.filename_from_data_dir(
        f'output/patient/k100/{embedding_name}/{prefix_name}mutual_information.csv'
    )

    print(f'Reading file: {path_file}')
    df = pd.read_csv(path_file)

    # Only one column is needed, so map over that Series directly instead of
    # materialising a row Series per record with DataFrame.apply(axis=1).
    df['dependency'] = df['mutual_information'].map(json_to_numpy)

    return df

def get_key(row):
    """Build a display label of the form ``"<embedding>: <variation>"``.

    When ``row['intents']`` equals ``'without_others'`` the variation is
    preceded by the marker ``'~ (others) - '``.

    Args:
        row: Mapping-like object with ``intents``, ``embedding`` and
            ``variation`` entries.

    Returns:
        The label string.
    """
    if row['intents'] == 'without_others':
        others_marker = '~ (others) - '
    else:
        others_marker = ''

    return f"{row['embedding']}: {others_marker}{row['variation']}"


def extract_dependency_to_collumn(df):
    """Unpack each row's ``dependency`` array into one output row per value.

    Args:
        df: DataFrame with ``embedding``, ``intents``, ``variation`` and
            ``dependency`` columns, where ``dependency`` holds a sized
            sequence per row.

    Returns:
        The concatenation of one frame per input row, with columns
        ``embedding``, ``intents``, ``variation``, ``feature`` (position of
        the value inside its array) and ``dependency`` (the value itself).
        Each per-row frame keeps its own 0-based index, so index labels
        repeat across input rows.
    """
    def expand(row):
        # One long-format frame for a single input row.
        size = len(row['dependency'])
        return pd.DataFrame({
            'embedding': [row['embedding']] * size,
            'intents': [row['intents']] * size,
            'variation': [row['variation']] * size,
            'feature': range(size),
            'dependency': row['dependency'],
        })

    return pd.concat([expand(row) for _, row in df.iterrows()])

def show_charts(embedding_name, use_intent_index=False):
    """Build a box plot of per-feature dependency values for one embedding.

    Args:
        embedding_name: Embedding whose mutual-information CSV is loaded.
        use_intent_index: Forwarded to ``load_mutual_info_df`` to select the
            ``grouped_by_intent_`` variant of the file.

    Returns:
        The Plotly figure (it is returned, not shown), with ``intents`` on
        the x axis, ``dependency`` on the y axis and one colour per
        ``variation``.
    """
    mutual_info = load_mutual_info_df(embedding_name, use_intent_index)

    figure = px.box(
        extract_dependency_to_collumn(mutual_info),
        x='intents',
        y='dependency',
        color='variation',
    )

    # The feature count shown in the axis title is taken from the row labelled 0.
    n_features = len(mutual_info['dependency'][0])
    x_title = f'variações  do {embedding_name} com {n_features} features'

    figure.update_layout(yaxis_title='Dependencia', xaxis_title=x_title)

    return figure
# df = pd.concat(dfs)
# df['x'] = df.apply(get_key, axis=1)
# df['y'] = df.apply(lambda x: x[0], axis=1)

In [3]:
# Dependency box plot for bert_pt, read from mutual_information.csv.
show_charts('bert_pt')

Reading file: /home/valmir/dev/python/intent_classifier/data/output/patient/k100/bert_pt/mutual_information.csv


In [4]:
# Dependency box plot for bert_pt, read from grouped_by_intent_mutual_information.csv.
show_charts('bert_pt', use_intent_index=True)

Reading file: /home/valmir/dev/python/intent_classifier/data/output/patient/k100/bert_pt/grouped_by_intent_mutual_information.csv


In [5]:
# Dependency box plot for flair_pt, read from mutual_information.csv.
show_charts('flair_pt')

Reading file: /home/valmir/dev/python/intent_classifier/data/output/patient/k100/flair_pt/mutual_information.csv


In [6]:
# Dependency box plot for flair_pt, read from grouped_by_intent_mutual_information.csv.
show_charts('flair_pt', use_intent_index=True)

Reading file: /home/valmir/dev/python/intent_classifier/data/output/patient/k100/flair_pt/grouped_by_intent_mutual_information.csv


In [7]:
# Dependency box plot for glove, read from mutual_information.csv.
show_charts('glove')

Reading file: /home/valmir/dev/python/intent_classifier/data/output/patient/k100/glove/mutual_information.csv


In [8]:
# Dependency box plot for glove, read from grouped_by_intent_mutual_information.csv.
show_charts('glove', use_intent_index=True)

Reading file: /home/valmir/dev/python/intent_classifier/data/output/patient/k100/glove/grouped_by_intent_mutual_information.csv


In [9]:
# Dependency box plot for lasbe, read from mutual_information.csv.
show_charts('lasbe')

Reading file: /home/valmir/dev/python/intent_classifier/data/output/patient/k100/lasbe/mutual_information.csv


In [10]:
# Dependency box plot for lasbe, read from grouped_by_intent_mutual_information.csv.
show_charts('lasbe', use_intent_index=True)

Reading file: /home/valmir/dev/python/intent_classifier/data/output/patient/k100/lasbe/grouped_by_intent_mutual_information.csv


In [11]:
# Dependency box plot for use, read from mutual_information.csv.
show_charts('use')

Reading file: /home/valmir/dev/python/intent_classifier/data/output/patient/k100/use/mutual_information.csv


In [12]:
# Dependency box plot for use, read from grouped_by_intent_mutual_information.csv.
show_charts('use', use_intent_index=True)

Reading file: /home/valmir/dev/python/intent_classifier/data/output/patient/k100/use/grouped_by_intent_mutual_information.csv


In [None]:
# def convert_to_list(row):
#     list_as_str = row['mutual_information']
#
#     return [np.array([word for word in re.sub('[\[\]]', '', list_as_str).replace('\n', '').split(' ') if word])]

# def is_float(element):
#     try:
#         float(element)
#         return True
#     except ValueError:
#         return False
#
#
# def convert_to_list(row):
#     list_as_str = row['mutual_information']
#
#     list_as_str = re.sub("[\\\'\",\n\[\]aray()]", '', list_as_str)
#
#     result = [float(word) for word in list_as_str.split(' ') if is_float(word)]
#
#     return [result]

# def convert_to_list(row):
#     result = json.loads(row['mutual_information'])
#
#     return result[0]

In [None]:
# def convert_to_list(row):
#     list_as_str = row['mutual_information']
#
#     return [np.array([word for word in re.sub('[\[\]]', '', list_as_str).replace('\n', '').split(' ') if word])]

# def is_float(element):
#     try:
#         float(element)
#         return True
#     except ValueError:
#         return False
#
#
# def convert_to_list(row):
#     list_as_str = row['mutual_information']
#
#     list_as_str = re.sub("[\\\'\",\n\[\]aray()]", '', list_as_str)
#
#     result = [float(word) for word in list_as_str.split(' ') if is_float(word)]
#
#     return [result]

# def convert_to_list(row):
#     result = json.loads(row['mutual_information'])
#
#     return result[0]