In [1]:
%load_ext autoreload

%autoreload 2

In [2]:
import pandas as pd
from src.core import file_manager as fm
from src.embeddings.constants import EMBEDDING_MODELS_TRANSLATION

In [3]:
# Names of the embedding models compared throughout this notebook: the keys of
# EMBEDDING_MODELS_TRANSLATION, in its iteration order.
# NOTE(review): presumably a dict keyed by embedding name -- confirm in src.embeddings.constants.
embeddings = list(EMBEDDING_MODELS_TRANSLATION.keys())

In [4]:
def describe_intersections_models(suffix='train', column_to_compared='correct_txt'):
  """Print how many sentences the per-embedding neural-model CSVs share.

  For every name in the module-level ``embeddings`` list, the file
  ``output/neural_models/patient/<name>/data_<suffix>.csv`` (resolved through
  ``fm.filename_from_data_dir``) is read and its ``column_to_compared`` column
  is compared against the same column of the other models.

  Two reports are printed:

  * pairwise: for each ordered pair (earlier model, later model), the number of
    non-null rows of the earlier model whose value also occurs in the later one;
  * global: the number of non-null rows of the *last* model whose value occurs
    in every other model.

  Counts are row counts, not unique-sentence counts: a sentence repeated inside
  one model's file is counted once per occurrence.

  Args:
    suffix: File-name suffix selecting which CSV to read (``data_<suffix>.csv``).
    column_to_compared: Name of the column holding the sentences to compare.

  Returns:
    None. The report is written to stdout.

  Raises:
    IndexError: If ``embeddings`` is empty.
    KeyError: If ``column_to_compared`` is missing from one of the CSV files.
  """
  # Read each CSV exactly once and keep only the compared column. Re-reading the
  # files inside the nested loops below would parse every file several times.
  sentences_by_model = {
    name: pd.read_csv(
      fm.filename_from_data_dir(f'output/neural_models/patient/{name}/data_{suffix}.csv')
    )[column_to_compared]
    for name in embeddings
  }

  # The last model has no later partner, so it never opens a section of its own.
  for position, name in enumerate(embeddings[:-1]):
    sentences = sentences_by_model[name]
    print(f'{name} has {sentences.count()} sentences')

    for other_name in embeddings[position + 1:]:
      other_sentences = sentences_by_model[other_name]
      sentences_in_common = sentences[sentences.isin(other_sentences)].count()

      print(f'Has in common: {sentences_in_common} with the {other_name}, that has {other_sentences.count()} sentences')

    print('==================================================================\n')

  # Chain the filter through every model: after each step only the rows of the
  # current model that survived all previous models remain.
  sentences_in_all_models = sentences_by_model[embeddings[0]]
  for name in embeddings[1:]:
    sentences = sentences_by_model[name]
    sentences_in_all_models = sentences[sentences.isin(sentences_in_all_models)]

  print(f'\nThere are {sentences_in_all_models.count()} present in all models')

In [23]:
# Sentence overlap between the embedding models' `data_train.csv` files.
describe_intersections_models(suffix='train')

bert_pt has 6185 sentences
Has in common: 2376 with the flair_pt, that has 6188 sentences
Has in common: 2007 with the glove, that has 6230 sentences
Has in common: 2094 with the lasbe, that has 5604 sentences
Has in common: 1836 with the use, that has 4849 sentences

flair_pt has 6188 sentences
Has in common: 1946 with the glove, that has 6230 sentences
Has in common: 1909 with the lasbe, that has 5604 sentences
Has in common: 1713 with the use, that has 4849 sentences

glove has 6230 sentences
Has in common: 1623 with the lasbe, that has 5604 sentences
Has in common: 1431 with the use, that has 4849 sentences

lasbe has 5604 sentences
Has in common: 2136 with the use, that has 4849 sentences


There are 206 present in all models


In [24]:
# Sentence overlap between the embedding models' `data_test.csv` files.
describe_intersections_models(suffix='test')

bert_pt has 2652 sentences
Has in common: 416 with the flair_pt, that has 2652 sentences
Has in common: 388 with the glove, that has 2670 sentences
Has in common: 398 with the lasbe, that has 2403 sentences
Has in common: 311 with the use, that has 2079 sentences

flair_pt has 2652 sentences
Has in common: 331 with the glove, that has 2670 sentences
Has in common: 342 with the lasbe, that has 2403 sentences
Has in common: 290 with the use, that has 2079 sentences

glove has 2670 sentences
Has in common: 297 with the lasbe, that has 2403 sentences
Has in common: 261 with the use, that has 2079 sentences

lasbe has 2403 sentences
Has in common: 376 with the use, that has 2079 sentences


There are 3 present in all models


In [4]:
def describe_intersections(variation):
  """Print and return the sentences shared by the annotated data of every embedding.

  For every name in the module-level ``embeddings`` list, the annotated frame
  is loaded with ``fm.read_annotated_df_with_embeddings`` for the given
  ``variation`` and its ``'txt'`` column is compared against the other models.

  Two reports are printed:

  * pairwise: for each ordered pair (earlier model, later model), the number of
    non-null ``txt`` rows of the earlier model that also occur in the later one;
  * global: the number of non-null ``txt`` rows of the *last* model that occur
    in every other model.

  Counts are row counts, not unique-sentence counts: a sentence repeated inside
  one model's frame is counted once per occurrence.

  Args:
    variation: Passed unchanged as the ``variation`` argument of
      ``fm.read_annotated_df_with_embeddings`` to select the dataset variant.

  Returns:
    The ``'txt'`` Series of the last embedding's frame, restricted to the rows
    whose text is present in every other embedding's frame (original index kept).

  Raises:
    IndexError: If ``embeddings`` is empty.
  """
  # Load each frame exactly once and keep only its text column. The loader used
  # to be called again for every pair and once more in the final pass.
  # Only the 'txt' Series is kept, so the rest of each frame is not held in
  # memory for the whole comparison.
  sentences_by_model = {
    name: fm.read_annotated_df_with_embeddings(embedding_name=name, variation=variation)['txt']
    for name in embeddings
  }

  # The last model has no later partner, so it never opens a section of its own.
  for position, name in enumerate(embeddings[:-1]):
    sentences = sentences_by_model[name]
    print(f'{name} has {sentences.count()} sentences')

    for other_name in embeddings[position + 1:]:
      other_sentences = sentences_by_model[other_name]
      sentences_in_common = sentences[sentences.isin(other_sentences)].count()

      print(f'Has in common: {sentences_in_common} with the {other_name}, that has {other_sentences.count()} sentences')

    print('==================================================================\n')

  # Chain the filter through every model: after each step only the rows of the
  # current model that survived all previous models remain.
  sentences_in_all_models = sentences_by_model[embeddings[0]]
  for name in embeddings[1:]:
    sentences = sentences_by_model[name]
    sentences_in_all_models = sentences[sentences.isin(sentences_in_all_models)]

  print(f'\nThere are {sentences_in_all_models.count()} present in all models')

  return sentences_in_all_models
     

In [5]:
# Report the overlap for this variation and keep the sentences found in every
# embedding's data; a later cell writes this Series to CSV.
sentences_in_all_models = describe_intersections('without_others_intent/k100_without_sentences_higher_than_median')

bert_pt has 8837 sentences
Has in common: 4762 with the flair_pt, that has 8840 sentences
Has in common: 4054 with the glove, that has 8900 sentences
Has in common: 4264 with the lasbe, that has 8007 sentences
Has in common: 3746 with the use, that has 6928 sentences

flair_pt has 8840 sentences
Has in common: 3879 with the glove, that has 8900 sentences
Has in common: 3907 with the lasbe, that has 8007 sentences
Has in common: 3493 with the use, that has 6928 sentences

glove has 8900 sentences
Has in common: 3376 with the lasbe, that has 8007 sentences
Has in common: 3009 with the use, that has 6928 sentences

lasbe has 8007 sentences
Has in common: 4296 with the use, that has 6928 sentences


There are 1068 present in all models


In [8]:
# Export the sentences shared by all embedding models, plus a fixed-seed sample of 300.
# Depends on `sentences_in_all_models` from the earlier describe_intersections cell;
# `variation` repeats the value passed there, so keep the two in sync when changing it.
actor = 'patient'
variation = 'without_others_intent/k100_without_sentences_higher_than_median'

# Full intersection, written without the index column.
path = fm.filename_from_data_dir( f'output/{actor}/{variation}/intersection_sentences.csv')
sentences_in_all_models.to_csv(path, index=False)

# Reproducible subset (random_state=42). `sample(n=300)` raises ValueError when
# fewer than 300 sentences are available, because sampling is without replacement.
path_300 = fm.filename_from_data_dir( f'output/{actor}/{variation}/intersection_300_sentences.csv')
sentences_in_all_models.sample(n=300, random_state=42).to_csv(path_300, index=False)

In [12]:
# Overlap report for the 'without_others_intent/k100_without_outliers' variation.
describe_intersections('without_others_intent/k100_without_outliers')

bert_pt has 17309 sentences
Has in common: 13488 with the flair_pt, that has 17341 sentences
Has in common: 12646 with the glove, that has 17330 sentences
Has in common: 12787 with the lasbe, that has 15790 sentences
Has in common: 11562 with the use, that has 13636 sentences

flair_pt has 17341 sentences
Has in common: 12272 with the glove, that has 17330 sentences
Has in common: 12330 with the lasbe, that has 15790 sentences
Has in common: 11041 with the use, that has 13636 sentences

glove has 17330 sentences
Has in common: 11496 with the lasbe, that has 15790 sentences
Has in common: 10201 with the use, that has 13636 sentences

lasbe has 15790 sentences
Has in common: 11461 with the use, that has 13636 sentences


There are 7072 present in all models


In [13]:
# Overlap report for the 'k100_without_sentences_higher_than_median' variation.
describe_intersections('k100_without_sentences_higher_than_median')

bert_pt has 13402 sentences
Has in common: 8303 with the flair_pt, that has 13401 sentences
Has in common: 7158 with the glove, that has 12933 sentences
Has in common: 7817 with the lasbe, that has 13398 sentences
Has in common: 7589 with the use, that has 13397 sentences

flair_pt has 13401 sentences
Has in common: 7269 with the glove, that has 12933 sentences
Has in common: 7322 with the lasbe, that has 13398 sentences
Has in common: 7166 with the use, that has 13397 sentences

glove has 12933 sentences
Has in common: 6937 with the lasbe, that has 13398 sentences
Has in common: 6836 with the use, that has 13397 sentences

lasbe has 13398 sentences
Has in common: 9118 with the use, that has 13397 sentences


There are 2226 present in all models


In [15]:
# Overlap report for the 'k100_without_outliers' variation.
describe_intersections('k100_without_outliers')

bert_pt has 26312 sentences
Has in common: 25854 with the flair_pt, that has 26249 sentences
Has in common: 24758 with the glove, that has 25141 sentences
Has in common: 25959 with the lasbe, that has 26378 sentences
Has in common: 26045 with the use, that has 26471 sentences

flair_pt has 26249 sentences
Has in common: 24727 with the glove, that has 25141 sentences
Has in common: 25881 with the lasbe, that has 26378 sentences
Has in common: 25971 with the use, that has 26471 sentences

glove has 25141 sentences
Has in common: 24801 with the lasbe, that has 26378 sentences
Has in common: 24871 with the use, that has 26471 sentences

lasbe has 26378 sentences
Has in common: 26137 with the use, that has 26471 sentences


There are 23855 present in all models
