# Contextual BERT embeddings

In [None]:
# Directory the simulation CSV files are read from; file names are appended
# with `+`, so the trailing slash is required.
data = '/Gradient_shuffle/'
# Directory the CSV files with the added BERT columns are written to
# (same trailing-slash requirement).
result = '/BERTsimul/'

# 1. Setup

In [None]:
# data analysis in python
import pandas as pd
import numpy as np
import torch
from scipy.spatial.distance import cosine

Install the transformers package from Hugging Face which will give us a pytorch interface for working with BERT.



In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.23.1-py3-none-any.whl (5.3 MB)
[K     |████████████████████████████████| 5.3 MB 27.7 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 11.4 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 49.3 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.1 tokenizers-0.13.1 transformers-4.23.1


Next load the pre-trained BERT model and tokenizer

In [None]:
# Pick the compute device: the GPU ('cuda') when one is available, otherwise the CPU.
# https://www.databricks.com/blog/2021/10/28/gpu-accelerated-sentiment-analysis-using-pytorch-and-huggingface-on-databricks.html
if torch.cuda.is_available():
  DEVICE = torch.device('cuda')
else:
  DEVICE = torch.device('cpu')
DEVICE

device(type='cuda')

In [None]:
from transformers import BertModel, BertTokenizer

# output_hidden_states=True makes the forward pass also return the hidden
# states of every layer, which get_bert_embeddings reads.
# NOTE: defining DEVICE does not place the model on that device; that would
# need an explicit model.to(DEVICE), with the input tensors moved to match.
model = BertModel.from_pretrained(
    'bert-base-uncased',
    output_hidden_states=True,
)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

# 2. Get contextual embeddings

Put the input text into the format that BERT can read: add the ```[CLS]``` token to the beginning and the ```[SEP]``` token to the end of the input, tokenize it, and convert the resulting token ids to tensors.

In [None]:
def bert_text_preparation(text, tokenizer):
  """
  Turn raw text into the token list and tensors that are fed to BERT.

  The text is wrapped in the [CLS] / [SEP] markers, tokenized, and mapped
  to vocabulary ids.

  Args:
    text: the string to encode.
    tokenizer: object providing `tokenize` and `convert_tokens_to_ids`.

  Returns:
    A tuple (tokenized_text, tokens_tensor, segments_tensor): the tokens
    including both markers, the token ids as a tensor with a leading
    dimension of 1, and a tensor of the same shape filled with ones.
  """
  tokenized_text = tokenizer.tokenize(" ".join(("[CLS]", text, "[SEP]")))
  token_ids = tokenizer.convert_tokens_to_ids(tokenized_text)

  # wrap each id list in an outer list to add the leading dimension
  tokens_tensor = torch.tensor([token_ids])
  segments_tensor = torch.tensor([[1 for _ in token_ids]])

  return tokenized_text, tokens_tensor, segments_tensor

To obtain the actual BERT embeddings, we pass the preprocessed input text, which is now represented by tensors, through our pre-trained BERT model.

Which vector works best as a contextualized embedding depends on the task.

According to Devlin et al. (2019), the sum of the last four layers of the model worked well.

In [None]:
def get_bert_embeddings(tokens_tensor, segments_tensor, model):
    """
    Obtains BERT embeddings for tokens, in context of the given response.

    Each token is represented by the element-wise sum of its hidden states
    from the last four layers returned by the model.

    Args:
        tokens_tensor: token ids with a leading batch dimension of size 1,
            as produced by `bert_text_preparation`.
        segments_tensor: tensor of the same shape, forwarded unchanged as
            the model's second positional argument.
        model: callable whose result holds the tuple of per-layer hidden
            states at index 2 (for Hugging Face `BertModel` this requires
            `output_hidden_states=True`).

    Returns:
        A list with one 1-D tensor per input position (the [CLS] and [SEP]
        positions included), on the CPU.
    """
    # Run the inputs on the device the model lives on, so the function keeps
    # working when the model has been moved to a GPU. Callables without a
    # `device` attribute receive the tensors as given.
    device = getattr(model, "device", None)
    if device is not None:
        tokens_tensor = tokens_tensor.to(device)
        segments_tensor = segments_tensor.to(device)

    # gradient calculation is disabled
    with torch.no_grad():
        # NOTE(review): Hugging Face `BertModel.forward` takes
        # `attention_mask` as its second positional parameter, so
        # `segments_tensor` is received as the attention mask rather than as
        # token type ids. Confirm the intended meaning before switching this
        # call to keyword arguments, since that changes the embeddings.
        outputs = model(tokens_tensor, segments_tensor)
        hidden_states = outputs[2]

    # stack the per-layer tensors along a new leading dimension:
    # [layers x 1 x tokens x hidden]
    token_embeddings = torch.stack(hidden_states, dim=0)

    # remove dimension 1, the "batches"
    token_embeddings = torch.squeeze(token_embeddings, dim=1)

    # swap dimensions 0 and 1 so tokens come first: [tokens x layers x hidden]
    # (per the Hugging Face docs the layer axis holds the embedding output
    # plus one entry per encoder layer, i.e. 13 for bert-base, not 12)
    token_embeddings = token_embeddings.permute(1, 0, 2)

    # sum the vectors from the last four layers for all tokens at once
    token_vecs_sum = token_embeddings[:, -4:, :].sum(dim=1)

    # one tensor per token; moved to the CPU so the vectors can be handed to
    # numpy / scipy regardless of where the model ran
    return list(token_vecs_sum.cpu())

Create contextual embeddings for a response.

## experiment

In [None]:
from collections import OrderedDict
sentences = ["he eventually sold the shares back to the bank at a premium. the river flowed over the bank. the next day a little girl walked by the river bank and picked a bouquet of flowers."
]

# tokens and their embeddings, collected across all entries of `sentences`;
# the two lists stay index-aligned
context_embeddings = []
context_tokens = []

for sentence in sentences:
  tokenized_text, tokens_tensor, segments_tensors = bert_text_preparation(sentence, tokenizer)
  list_token_embeddings = get_bert_embeddings(tokens_tensor, segments_tensors, model)

  # The embeddings are position-aligned with tokenized_text, so dropping the
  # first ([CLS]) and last ([SEP]) entry of both keeps every token paired
  # with its own vector, including tokens that occur more than once.
  context_tokens.extend(tokenized_text[1:-1])
  context_embeddings.extend(list_token_embeddings[1:-1])

In [None]:
# number of token embeddings collected for the example text
len(context_embeddings)

38

In [None]:
# number of tokens collected; tokens and embeddings are appended together,
# so this matches the number of embeddings
len(context_tokens)

38

In [None]:
# position of the first occurrence of 'the' among the collected tokens
context_tokens.index('the')

3

In [None]:
# all collected tokens in order; use this listing to look up the positions
# used in the similarity comparisons
context_tokens

['he',
 'eventually',
 'sold',
 'the',
 'shares',
 'back',
 'to',
 'the',
 'bank',
 'at',
 'a',
 'premium',
 '.',
 'the',
 'river',
 'flowed',
 'over',
 'the',
 'bank',
 '.',
 'the',
 'next',
 'day',
 'a',
 'little',
 'girl',
 'walked',
 'by',
 'the',
 'river',
 'bank',
 'and',
 'picked',
 'a',
 'bouquet',
 'of',
 'flowers',
 '.']

In [None]:
# token two positions from the end of the collected tokens
context_tokens[-2]

'flowers'

In [None]:
# token twenty positions from the end of the collected tokens
context_tokens[-20]

'bank'

In [None]:
# token at position 8 of the collected tokens
context_tokens[8]

'bank'

In [None]:
# token six positions from the end of the collected tokens
context_tokens[-6]

'picked'

In [None]:
from scipy.spatial.distance import cosine

# positions -20 and -8 are the two 'bank' tokens of the river sentences
token1, token2 = context_embeddings[-20], context_embeddings[-8]
similarity = 1 - cosine(token1, token2)
print('similarity between bank_river vs. bank_river but diff positions: ', similarity)

similarity between bank_river vs. bank_river but diff positions:  0.8264751434326172


In [None]:
from scipy.spatial.distance import cosine

# Position 8 is the 'bank' of the financial sentence and position -8 is the
# 'bank' of "river bank". Position -2 is 'flowers', so it must not be used
# for this comparison.
assert context_tokens[8] == 'bank' and context_tokens[-8] == 'bank'
token1 = context_embeddings[8]
token2 = context_embeddings[-8]
print('similarity between bank_financial vs. bank_river: ', 1-cosine(token1, token2))

similarity between bank_financial vs. bank_river:  0.6709258556365967


In [None]:
from scipy.spatial.distance import cosine

# position 8 is the financial 'bank'; position 9 is the 'at' that follows it
token1, token2 = context_embeddings[8], context_embeddings[9]
similarity = 1 - cosine(token1, token2)
print('similarity between bank_financial vs. at: ', similarity)

similarity between bank_financial vs. at:  0.43222352862358093


In [None]:
from scipy.spatial.distance import cosine

# Position 8 is the financial 'bank' and position -9 is the 'river' of
# "river bank". Position -6 is 'picked', so it must not be used for this
# comparison.
assert context_tokens[8] == 'bank' and context_tokens[-9] == 'river'
token1 = context_embeddings[8]
token2 = context_embeddings[-9]
print('similarity between bank_financial vs. river: ', 1-cosine(token1, token2))

similarity between bank_financial vs. river:  0.2084815353155136


In [None]:
from scipy.spatial.distance import cosine

# position 8 is the financial 'bank'; position 2 is 'sold'
token1, token2 = context_embeddings[8], context_embeddings[2]
similarity = 1 - cosine(token1, token2)
print('similarity between bank_financial vs. sold: ', similarity)

similarity between bank_financial vs. sold:  0.3736796975135803


In [None]:
# Position -8 is the 'bank' of "river bank" and position -9 is the 'river'
# right before it. Positions -2 and -6 are 'flowers' and 'picked', so they
# must not be used for this comparison.
assert context_tokens[-8] == 'bank' and context_tokens[-9] == 'river'
token1 = context_embeddings[-8]
token2 = context_embeddings[-9]
print('similarity between bank_river vs. river: ', 1-cosine(token1, token2))

similarity between bank_river vs. river:  0.5334401726722717


In [None]:
# first occurrence of 'the' compared with the 'the' at position 7
first_the_position = context_tokens.index('the')
token1, token2 = context_embeddings[first_the_position], context_embeddings[7]
print('similarity between the vs. the (different syntactic positions): ', 1-cosine(token1, token2))

similarity between the vs. the (different syntactic positions):  0.7172697186470032


In [None]:
# sanity check: the embedding at position 7 compared with itself
token1 = token2 = context_embeddings[7]
similarity = 1 - cosine(token1, token2)
print('similarity between the vs. the (same syntactic positions): ', similarity)

similarity between the vs. the (same syntactic positions):  1


## baseline

In [None]:
baseline = pd.read_csv(data + 'simulation_HV_baseline_vb_response_deid_v3.csv')

# Placeholder columns (object dtype, so single cells can hold lists); rows
# that are skipped below keep the empty string.
baseline['bert_tokens'] = ''
baseline['bert_emb'] = ''
baseline['len_bert_tokens_emb'] = ''

for i in baseline.index:
  if baseline.at[i, 'n_words'] > 4: # only process lines with response len bigger than 4
    # Keep at most the first 431 whitespace-separated words. BERT uses a
    # subword tokenizer (WordPiece), so the maximum length corresponds to
    # 512 subword tokens.
    lst = baseline.at[i, 'content'].split(' ')[:431]
    sentence = ' '.join(lst)
    tokenized_text, tokens_tensor, segments_tensors = bert_text_preparation(sentence, tokenizer)
    list_token_embeddings = get_bert_embeddings(tokens_tensor, segments_tensors, model)

    # The embeddings are position-aligned with tokenized_text, so dropping
    # the first ([CLS]) and last ([SEP]) entry of both keeps every token
    # paired with its own vector.
    context_tokens = tokenized_text[1:-1]
    context_embeddings = list_token_embeddings[1:-1]

    # .at writes into the frame itself; chained indexing
    # (frame[col][i] = ...) may assign to a temporary copy instead.
    baseline.at[i, 'bert_tokens'] = context_tokens
    baseline.at[i, 'bert_emb'] = context_embeddings
    baseline.at[i, 'len_bert_tokens_emb'] = len(context_embeddings)

  if i % 5 == 0:
    print('progress: ', i)

baseline.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


progress:  0


Unnamed: 0,grid,content,n_words,bert_tokens,bert_emb,len_bert_tokens_emb
0,10455,"I'm a young man , an en an en- an engineer by ...",421,"[i, ', m, a, young, man, ,, an, en, an, en, -,...","[[tensor(0.7688), tensor(0.7937), tensor(0.167...",462
1,11689,Sure . I'm thirty three years old . My name is...,159,"[sure, ., i, ', m, thirty, three, years, old, ...","[[tensor(5.6645), tensor(-2.5050), tensor(3.09...",194
2,12376,Alright . um I live in not especially cool Spr...,468,"[alright, ., um, i, live, in, not, especially,...","[[tensor(5.2083), tensor(1.4897), tensor(4.913...",494
3,12630,um So I'm currently twenty-nine . I was born a...,966,"[um, so, i, ', m, currently, twenty, -, nine, ...","[[tensor(-0.2509), tensor(-2.1763), tensor(4.0...",456
4,13493,Mhm . I'm a thirty five year old man who uh um...,134,"[m, ##hm, ., i, ', m, a, thirty, five, year, o...","[[tensor(2.5036), tensor(3.9237), tensor(4.951...",171


In [None]:
# Save the frame with the added BERT columns to the results directory.
# NOTE(review): 'bert_tokens' and 'bert_emb' hold Python lists (the latter of
# torch tensors), which CSV can only store as text — confirm that downstream
# readers expect that representation.
baseline.to_csv(result + 'simulation_HV_baseline_vb_response_deid_v3_bert.csv')

## incoh10

In [None]:
# Load the incoh10 responses (default integer row index) and preview the
# last rows.
incoh10 = pd.read_csv(data + 'simulation_HV_incoh_vb_response_deid_10v3.csv')
incoh10.tail()

Unnamed: 0,grid,content,n_words
0,10455,"I'm a young man , an en an en- an engineer by ...",421
1,11689,Sure . I'm thirty three years good . My name i...,159
2,12376,Alright . um I live in not especially cool Spr...,468
3,12630,um So I'm currently twenty-nine . I was born a...,966
4,13493,Mhm . I'm a thirty five year old man who uh um...,134


In [None]:
# Placeholder columns (object dtype, so single cells can hold lists); rows
# that are skipped below keep the empty string.
incoh10['bert_tokens'] = ''
incoh10['bert_emb'] = ''
incoh10['len_bert_tokens_emb'] = ''

for i in incoh10.index:
  if incoh10.at[i, 'n_words'] > 4:
    # Keep at most the first 431 whitespace-separated words. Without a limit
    # on the sequence length (punctuation included) the model raises a
    # RuntimeError: BERT is incapable of processing long texts due to its
    # quadratically increasing memory and time consumption.
    lst = incoh10.at[i, 'content'].split(' ')[:431]
    sentence = ' '.join(lst)
    tokenized_text, tokens_tensor, segments_tensors = bert_text_preparation(sentence, tokenizer)
    list_token_embeddings = get_bert_embeddings(tokens_tensor, segments_tensors, model)

    # The embeddings are position-aligned with tokenized_text, so dropping
    # the first ([CLS]) and last ([SEP]) entry of both keeps every token
    # paired with its own vector.
    context_tokens = tokenized_text[1:-1]
    context_embeddings = list_token_embeddings[1:-1]

    # .at writes into the frame itself; chained indexing
    # (frame[col][i] = ...) may assign to a temporary copy instead.
    incoh10.at[i, 'bert_tokens'] = context_tokens
    incoh10.at[i, 'bert_emb'] = context_embeddings
    incoh10.at[i, 'len_bert_tokens_emb'] = len(context_embeddings)

  if i % 5 == 0:
    print('progress: ', i)

incoh10.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


progress:  0


Unnamed: 0,grid,content,n_words,bert_tokens,bert_emb,len_bert_tokens_emb
0,10455,"I'm a young man , an en an en- an engineer by ...",421,"[i, ', m, a, young, man, ,, an, en, an, en, -,...","[[tensor(0.6725), tensor(1.0466), tensor(0.263...",464
1,11689,Sure . I'm thirty three years good . My name i...,159,"[sure, ., i, ', m, thirty, three, years, good,...","[[tensor(5.6659), tensor(-2.2755), tensor(3.61...",190
2,12376,Alright . um I live in not especially cool Spr...,468,"[alright, ., um, i, live, in, not, especially,...","[[tensor(5.2656), tensor(1.1820), tensor(4.869...",496
3,12630,um So I'm currently twenty-nine . I was born a...,966,"[um, so, i, ', m, currently, twenty, -, nine, ...","[[tensor(-0.4018), tensor(-2.1731), tensor(4.3...",456
4,13493,Mhm . I'm a thirty five year old man who uh um...,134,"[m, ##hm, ., i, ', m, a, thirty, five, year, o...","[[tensor(2.4953), tensor(3.7805), tensor(5.160...",171


In [None]:
# Save the frame with the added BERT columns to the results directory.
# NOTE(review): 'bert_tokens' and 'bert_emb' hold Python lists (the latter of
# torch tensors), which CSV can only store as text — confirm that downstream
# readers expect that representation.
incoh10.to_csv(result + 'simulation_HV_incoh_vb_response_deid_10v3_bert.csv')

## incoh20

In [None]:
# Load the incoh20 responses, using the first CSV column as the row index,
# and preview the first rows.
incoh20 = pd.read_csv(data + 'simulation_HV_incoh_vb_response_deid_20v3.csv', index_col = 0)
incoh20.head()

Unnamed: 0_level_0,content,n_words
grid,Unnamed: 1_level_1,Unnamed: 2_level_1
10455,"I'm a young man , an en an en- an things by tr...",421
11689,Sure . I'm thirty three years medical . My nam...,159
12376,Alright . um I live in not especially lazy Spr...,468
12630,um So I'm currently twenty-nine . I was born a...,966
13493,Mhm . I'm a thirty five year old man who uh um...,134


In [None]:
# Placeholder columns (object dtype, so single cells can hold lists); a row
# whose processing raises a RuntimeError keeps the empty string.
incoh20['bert_tokens'] = ''
incoh20['bert_emb'] = ''
incoh20['len_bert_tokens_emb'] = ''

for row_number, i in enumerate(incoh20.index):
  # rows with 4 words or fewer are not embedded and are stored as empty lists
  context_tokens = []
  context_embeddings = []

  try:
    if incoh20.at[i, 'n_words'] > 4:
      # keep at most the first 431 whitespace-separated words to limit the
      # number of WordPiece tokens passed to BERT
      lst = incoh20.at[i, 'content'].split(' ')[:431]
      sentence = ' '.join(lst)
      tokenized_text, tokens_tensor, segments_tensors = bert_text_preparation(sentence, tokenizer)
      list_token_embeddings = get_bert_embeddings(tokens_tensor, segments_tensors, model)

      # The embeddings are position-aligned with tokenized_text, so dropping
      # the first ([CLS]) and last ([SEP]) entry of both keeps every token
      # paired with its own vector.
      context_tokens = tokenized_text[1:-1]
      context_embeddings = list_token_embeddings[1:-1]

    # .at writes into the frame itself; chained indexing
    # (frame[col][i] = ...) may assign to a temporary copy instead.
    incoh20.at[i, 'bert_tokens'] = context_tokens
    incoh20.at[i, 'bert_emb'] = context_embeddings
    incoh20.at[i, 'len_bert_tokens_emb'] = len(context_embeddings)

  except RuntimeError:
    print('RuntimeError: ', i, incoh20.at[i, 'n_words'])

  # The index labels come from the first CSV column rather than counting
  # rows, so progress is reported by row position.
  if row_number % 5 == 0:
    print('progress: ', row_number)

incoh20.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


progress:  10455
progress:  12630


Unnamed: 0_level_0,content,n_words,bert_tokens,bert_emb,len_bert_tokens_emb
grid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10455,"I'm a young man , an en an en- an things by tr...",421,"[i, ', m, a, young, man, ,, an, en, an, en, -,...","[[tensor(0.5740), tensor(0.6673), tensor(-0.25...",464
11689,Sure . I'm thirty three years medical . My nam...,159,"[sure, ., i, ', m, thirty, three, years, medic...","[[tensor(5.7015), tensor(-2.7486), tensor(3.70...",190
12376,Alright . um I live in not especially lazy Spr...,468,"[alright, ., um, i, live, in, not, especially,...","[[tensor(5.0813), tensor(1.3699), tensor(4.995...",496
12630,um So I'm currently twenty-nine . I was born a...,966,"[um, so, i, ', m, currently, twenty, -, nine, ...","[[tensor(-0.4019), tensor(-2.1877), tensor(4.2...",456
13493,Mhm . I'm a thirty five year old man who uh um...,134,"[m, ##hm, ., i, ', m, a, thirty, five, year, o...","[[tensor(2.1580), tensor(3.9266), tensor(5.391...",171


In [None]:
# Save the frame with the added BERT columns to the results directory.
# NOTE(review): 'bert_tokens' and 'bert_emb' hold Python lists (the latter of
# torch tensors), which CSV can only store as text — confirm that downstream
# readers expect that representation.
incoh20.to_csv(result + 'simulation_HV_incoh_vb_response_deid_20v3_bert.csv')

## incoh50

In [None]:
# Load the incoh50 responses, using the first CSV column as the row index,
# and preview the first rows.
incoh50 = pd.read_csv(data + 'simulation_HV_incoh_vb_response_deid_50v3.csv', index_col = 0)
incoh50.head()

Unnamed: 0_level_0,content,n_words
grid,Unnamed: 1_level_1,Unnamed: 2_level_1
10455,"I'm a good year , an en an en- an responsibili...",421
11689,Sure . I'm thirty three years anxious . My nam...,159
12376,Alright . um something live in not especially ...,468
12630,um So I'm currently twenty-nine . I was born a...,966
13493,Mhm . I'm a thirty five hospital old man anyth...,134


In [None]:
# Placeholder columns (object dtype, so single cells can hold lists); a row
# whose processing raises a RuntimeError keeps the empty string.
incoh50['bert_tokens'] = ''
incoh50['bert_emb'] = ''
incoh50['len_bert_tokens_emb'] = ''

for row_number, i in enumerate(incoh50.index):
  # rows with 4 words or fewer are not embedded and are stored as empty lists
  context_tokens = []
  context_embeddings = []

  try:
    if incoh50.at[i, 'n_words'] > 4:
      # keep at most the first 431 whitespace-separated words to limit the
      # number of WordPiece tokens passed to BERT
      lst = incoh50.at[i, 'content'].split(' ')[:431]
      sentence = ' '.join(lst)
      tokenized_text, tokens_tensor, segments_tensors = bert_text_preparation(sentence, tokenizer)
      list_token_embeddings = get_bert_embeddings(tokens_tensor, segments_tensors, model)

      # The embeddings are position-aligned with tokenized_text, so dropping
      # the first ([CLS]) and last ([SEP]) entry of both keeps every token
      # paired with its own vector.
      context_tokens = tokenized_text[1:-1]
      context_embeddings = list_token_embeddings[1:-1]

    # .at writes into the frame itself; chained indexing
    # (frame[col][i] = ...) may assign to a temporary copy instead.
    incoh50.at[i, 'bert_tokens'] = context_tokens
    incoh50.at[i, 'bert_emb'] = context_embeddings
    incoh50.at[i, 'len_bert_tokens_emb'] = len(context_embeddings)

  except RuntimeError:
    print('RuntimeError: ', i, incoh50.at[i, 'n_words'])

  # The index labels come from the first CSV column rather than counting
  # rows, so progress is reported by row position.
  if row_number % 5 == 0:
    print('progress: ', row_number)

incoh50.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


progress:  10455
progress:  12630


Unnamed: 0_level_0,content,n_words,bert_tokens,bert_emb,len_bert_tokens_emb
grid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10455,"I'm a good year , an en an en- an responsibili...",421,"[i, ', m, a, good, year, ,, an, en, an, en, -,...","[[tensor(-1.4038), tensor(-0.1184), tensor(-0....",459
11689,Sure . I'm thirty three years anxious . My nam...,159,"[sure, ., i, ', m, thirty, three, years, anxio...","[[tensor(4.6395), tensor(-2.2006), tensor(4.34...",192
12376,Alright . um something live in not especially ...,468,"[alright, ., um, something, live, in, not, esp...","[[tensor(4.7609), tensor(0.7377), tensor(4.862...",498
12630,um So I'm currently twenty-nine . I was born a...,966,"[um, so, i, ', m, currently, twenty, -, nine, ...","[[tensor(-0.8654), tensor(-2.2503), tensor(4.9...",457
13493,Mhm . I'm a thirty five hospital old man anyth...,134,"[m, ##hm, ., i, ', m, a, thirty, five, hospita...","[[tensor(2.0145), tensor(3.1620), tensor(5.582...",172


In [None]:
# Save the frame with the added BERT columns to the results directory.
# NOTE(review): 'bert_tokens' and 'bert_emb' hold Python lists (the latter of
# torch tensors), which CSV can only store as text — confirm that downstream
# readers expect that representation.
incoh50.to_csv(result + 'simulation_HV_incoh_vb_response_deid_50v3_bert.csv')

## ineff10

In [None]:
# the first CSV column becomes the row index
ineff10 = pd.read_csv(data + 'simulation_HV_ineff_vb_response_deid_10v3.csv', index_col = 0)

# Placeholder columns (object dtype, so single cells can hold lists); a row
# whose processing raises a RuntimeError keeps the empty string.
ineff10['bert_tokens'] = ''
ineff10['bert_emb'] = ''
ineff10['len_bert_tokens_emb'] = ''

for row_number, i in enumerate(ineff10.index):
  # rows with 4 words or fewer are not embedded and are stored as empty lists
  context_tokens = []
  context_embeddings = []

  try:
    if ineff10.at[i, 'n_words'] > 4:
      # keep at most the first 431 whitespace-separated words to limit the
      # number of WordPiece tokens passed to BERT
      lst = ineff10.at[i, 'content'].split(' ')[:431]
      sentence = ' '.join(lst)
      tokenized_text, tokens_tensor, segments_tensors = bert_text_preparation(sentence, tokenizer)
      list_token_embeddings = get_bert_embeddings(tokens_tensor, segments_tensors, model)

      # The embeddings are position-aligned with tokenized_text, so dropping
      # the first ([CLS]) and last ([SEP]) entry of both keeps every token
      # paired with its own vector.
      context_tokens = tokenized_text[1:-1]
      context_embeddings = list_token_embeddings[1:-1]

    # .at writes into the frame itself; chained indexing
    # (frame[col][i] = ...) may assign to a temporary copy instead.
    ineff10.at[i, 'bert_tokens'] = context_tokens
    ineff10.at[i, 'bert_emb'] = context_embeddings
    ineff10.at[i, 'len_bert_tokens_emb'] = len(context_embeddings)

  except RuntimeError:
    print('RuntimeError: ', i, ineff10.at[i, 'n_words'])

  # The index labels come from the first CSV column rather than counting
  # rows, so progress is reported by row position.
  if row_number % 5 == 0:
    print('progress: ', row_number)

ineff10.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


progress:  10455
progress:  12630


Unnamed: 0_level_0,content,n_words,bert_tokens,bert_emb,len_bert_tokens_emb
grid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10455,"I'm a young man , an en an en- an engineer by ...",421,"[i, ', m, a, young, man, ,, an, en, an, en, -,...","[[tensor(0.9260), tensor(0.7718), tensor(0.139...",469
11689,Sure . I'm thirty three years old . My name is...,159,"[sure, ., i, ', m, thirty, three, years, old, ...","[[tensor(6.0613), tensor(-1.9364), tensor(2.85...",212
12376,Alright . um I live in not especially cool Spr...,468,"[alright, ., um, i, live, in, not, especially,...","[[tensor(5.3717), tensor(1.2232), tensor(4.933...",497
12630,um So I was born and raised in South Washingto...,966,"[um, so, i, was, born, and, raised, in, south,...","[[tensor(-0.9319), tensor(-1.5137), tensor(3.7...",463
13493,Mhm . I'm a thirty five year old man who uh um...,134,"[m, ##hm, ., i, ', m, a, thirty, five, year, o...","[[tensor(2.5872), tensor(3.8303), tensor(5.034...",178


In [None]:
# Save the frame with the added BERT columns to the results directory.
# NOTE(review): 'bert_tokens' and 'bert_emb' hold Python lists (the latter of
# torch tensors), which CSV can only store as text — confirm that downstream
# readers expect that representation.
ineff10.to_csv(result + 'simulation_HV_ineff_vb_response_deid_10v3_bert.csv')

## ineff20

In [None]:
# the first CSV column becomes the row index
ineff20 = pd.read_csv(data + 'simulation_HV_ineff_vb_response_deid_20v3.csv', index_col = 0)

# Placeholder columns (object dtype, so single cells can hold lists); a row
# whose processing raises a RuntimeError keeps the empty string.
ineff20['bert_tokens'] = ''
ineff20['bert_emb'] = ''
ineff20['len_bert_tokens_emb'] = ''

for row_number, i in enumerate(ineff20.index):
  # rows with 4 words or fewer are not embedded and are stored as empty lists
  context_tokens = []
  context_embeddings = []

  try:
    if ineff20.at[i, 'n_words'] > 4:
      # keep at most the first 431 whitespace-separated words to limit the
      # number of WordPiece tokens passed to BERT
      lst = ineff20.at[i, 'content'].split(' ')[:431]
      sentence = ' '.join(lst)
      tokenized_text, tokens_tensor, segments_tensors = bert_text_preparation(sentence, tokenizer)
      list_token_embeddings = get_bert_embeddings(tokens_tensor, segments_tensors, model)

      # The embeddings are position-aligned with tokenized_text, so dropping
      # the first ([CLS]) and last ([SEP]) entry of both keeps every token
      # paired with its own vector.
      context_tokens = tokenized_text[1:-1]
      context_embeddings = list_token_embeddings[1:-1]

    # .at writes into the frame itself; chained indexing
    # (frame[col][i] = ...) may assign to a temporary copy instead.
    ineff20.at[i, 'bert_tokens'] = context_tokens
    ineff20.at[i, 'bert_emb'] = context_embeddings
    ineff20.at[i, 'len_bert_tokens_emb'] = len(context_embeddings)

  except RuntimeError:
    print('RuntimeError: ', i, ineff20.at[i, 'n_words'])

  # The index labels come from the first CSV column rather than counting
  # rows, so progress is reported by row position.
  if row_number % 5 == 0:
    print('progress: ', row_number)

ineff20.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


progress:  10455
progress:  12630


Unnamed: 0_level_0,content,n_words,bert_tokens,bert_emb,len_bert_tokens_emb
grid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10455,"I'm a young man , an en an en- an engineer by ...",421,"[i, ', m, a, young, man, ,, an, en, an, en, -,...","[[tensor(0.8871), tensor(0.7492), tensor(0.043...",465
11689,Sure . I'm thirty three years old . My name is...,159,"[sure, ., i, ', m, thirty, three, years, old, ...","[[tensor(5.7484), tensor(-1.9574), tensor(3.33...",181
12376,Alright . um I live in not especially cool Spr...,468,"[alright, ., um, i, live, in, not, especially,...","[[tensor(4.6833), tensor(0.4654), tensor(5.237...",486
12630,um So I was born and raised in South Washingto...,966,"[um, so, i, was, born, and, raised, in, south,...","[[tensor(-1.1326), tensor(-1.8485), tensor(3.1...",471
13493,Mhm . I'm a thirty five year old man who uh um...,134,"[m, ##hm, ., i, ', m, a, thirty, five, year, o...","[[tensor(3.0104), tensor(3.4729), tensor(5.414...",142


In [None]:
# Save the frame with the added BERT columns to the results directory.
# NOTE(review): 'bert_tokens' and 'bert_emb' hold Python lists (the latter of
# torch tensors), which CSV can only store as text — confirm that downstream
# readers expect that representation.
ineff20.to_csv(result + 'simulation_HV_ineff_vb_response_deid_20v3_bert.csv')

## ineff50

In [None]:
# the first CSV column becomes the row index
ineff50 = pd.read_csv(data + 'simulation_HV_ineff_vb_response_deid_50v3.csv', index_col = 0)

# Placeholder columns (object dtype, so single cells can hold lists); a row
# whose processing raises a RuntimeError keeps the empty string.
ineff50['bert_tokens'] = ''
ineff50['bert_emb'] = ''
ineff50['len_bert_tokens_emb'] = ''

for row_number, i in enumerate(ineff50.index):
  # rows with 4 words or fewer are not embedded and are stored as empty lists
  context_tokens = []
  context_embeddings = []

  try:
    if ineff50.at[i, 'n_words'] > 4:
      # keep at most the first 431 whitespace-separated words to limit the
      # number of WordPiece tokens passed to BERT
      lst = ineff50.at[i, 'content'].split(' ')[:431]
      sentence = ' '.join(lst)
      tokenized_text, tokens_tensor, segments_tensors = bert_text_preparation(sentence, tokenizer)
      list_token_embeddings = get_bert_embeddings(tokens_tensor, segments_tensors, model)

      # The embeddings are position-aligned with tokenized_text, so dropping
      # the first ([CLS]) and last ([SEP]) entry of both keeps every token
      # paired with its own vector.
      context_tokens = tokenized_text[1:-1]
      context_embeddings = list_token_embeddings[1:-1]

    # .at writes into the frame itself; chained indexing
    # (frame[col][i] = ...) may assign to a temporary copy instead.
    ineff50.at[i, 'bert_tokens'] = context_tokens
    ineff50.at[i, 'bert_emb'] = context_embeddings
    ineff50.at[i, 'len_bert_tokens_emb'] = len(context_embeddings)

  except RuntimeError:
    print('RuntimeError: ', i, ineff50.at[i, 'n_words'])

  # The index labels come from the first CSV column rather than counting
  # rows, so progress is reported by row position.
  if row_number % 5 == 0:
    print('progress: ', row_number)

ineff50.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


progress:  10455
progress:  12630


Unnamed: 0_level_0,content,n_words,bert_tokens,bert_emb,len_bert_tokens_emb
grid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10455,"I'm a young man , an en an en- an engineer by ...",421,"[i, ', m, a, young, man, ,, an, en, an, en, -,...","[[tensor(1.0818), tensor(1.0875), tensor(-0.18...",431
11689,We have been using that opportunity to do more...,159,"[we, have, been, using, that, opportunity, to,...","[[tensor(3.7089), tensor(0.8028), tensor(-1.74...",228
12376,Alright . um I live in not especially cool Spr...,468,"[alright, ., um, i, live, in, not, especially,...","[[tensor(5.3717), tensor(1.6071), tensor(5.350...",480
12630,"My is things are fantastic . No , I mean My uh...",966,"[my, is, things, are, fantastic, ., no, ,, i, ...","[[tensor(-2.2958), tensor(2.2740), tensor(1.50...",474
13493,Mhm . I still get to play . And my stock inves...,134,"[m, ##hm, ., i, still, get, to, play, ., and, ...","[[tensor(1.7302), tensor(3.9717), tensor(6.281...",183


In [None]:
# Save ineff50, including the token/embedding columns filled in above, to the results directory.
ineff50.to_csv(result + 'simulation_HV_ineff_vb_response_deid_50v3_bert.csv')

# K 2:10

## baseline

In [None]:
# Coherence at inter-token distance k (k = 2..10): for each response, the mean
# cosine similarity between every token embedding and the embedding k positions later.
ks=['2', '3', '4', '5', '6', '7', '8', '9', '10']

# One empty placeholder column per k.
for k in ks:
    baseline['bert_word_k' + k] = ''

for k in ks:
    cur = 'bert_word_k' + k
    offset = int(k)
    print('Coherence k ', k)
    for i in baseline.index:
        emb = baseline.at[i, 'bert_emb']
        # A float here is a missing value (NaN); such rows keep the '' placeholder.
        if not isinstance(emb, float):
            # zip stops at the shorter sequence, so pairs that would run past
            # the end of the response are dropped without try/except.
            sims = [1 - cosine(w1, w2) for w1, w2 in zip(emb, emb[offset:])]
            # .at writes into the frame itself and avoids SettingWithCopyWarning.
            baseline.at[i, cur] = np.average(sims)  # add other stats here
baseline.head()

Coherence k  2


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Coherence k  3
Coherence k  4
Coherence k  5
Coherence k  6
Coherence k  7
Coherence k  8
Coherence k  9
Coherence k  10


Unnamed: 0,grid,content,n_words,bert_tokens,bert_emb,len_bert_tokens_emb,bert_word_mv5,bert_word_mv10,bert_word_k2,bert_word_k3,bert_word_k4,bert_word_k5,bert_word_k6,bert_word_k7,bert_word_k8,bert_word_k9,bert_word_k10
0,10455,"I'm a young man , an en an en- an engineer by ...",421,"[i, ', m, a, young, man, ,, an, en, an, en, -,...","[[tensor(0.7688), tensor(0.7937), tensor(0.167...",462,0.471428,0.474674,0.41295,0.38864,0.367721,0.353955,0.342045,0.338082,0.324235,0.318819,0.318824
1,11689,Sure . I'm thirty three years old . My name is...,159,"[sure, ., i, ', m, thirty, three, years, old, ...","[[tensor(5.6645), tensor(-2.5050), tensor(3.09...",194,0.504627,0.501954,0.426685,0.372436,0.354607,0.339884,0.335918,0.32825,0.32619,0.308189,0.301729
2,12376,Alright . um I live in not especially cool Spr...,468,"[alright, ., um, i, live, in, not, especially,...","[[tensor(5.2083), tensor(1.4897), tensor(4.913...",494,0.4818,0.474423,0.411201,0.375869,0.353776,0.349704,0.333051,0.326555,0.318976,0.323212,0.314139
3,12630,um So I'm currently twenty-nine . I was born a...,966,"[um, so, i, ', m, currently, twenty, -, nine, ...","[[tensor(-0.2509), tensor(-2.1763), tensor(4.0...",456,0.500832,0.494821,0.415659,0.388839,0.366737,0.354915,0.347091,0.341731,0.336869,0.334747,0.329355
4,13493,Mhm . I'm a thirty five year old man who uh um...,134,"[m, ##hm, ., i, ', m, a, thirty, five, year, o...","[[tensor(2.5036), tensor(3.9237), tensor(4.951...",171,0.478361,0.477872,0.384679,0.363791,0.32135,0.309973,0.297819,0.283803,0.286407,0.29583,0.309508


## incoh10

In [None]:
# Create one empty placeholder column per inter-token distance k = 2..10.
for distance in range(2, 11):
    incoh10['bert_word_k' + str(distance)] = ''

In [None]:
# Coherence at inter-token distance k (k = 2..10): for each response, the mean
# cosine similarity between every token embedding and the embedding k positions later.
ks=['2', '3', '4', '5', '6', '7', '8', '9', '10']
for k in ks:
    cur = 'bert_word_k' + k
    offset = int(k)
    print('Coherence k ', k)
    for i in incoh10.index:
        emb = incoh10.at[i, 'bert_emb']
        # A float here is a missing value (NaN); such rows keep their existing value.
        if not isinstance(emb, float):
            # zip stops at the shorter sequence, so pairs that would run past
            # the end of the response are dropped without try/except.
            sims = [1 - cosine(w1, w2) for w1, w2 in zip(emb, emb[offset:])]
            # .at writes into the frame itself and avoids SettingWithCopyWarning.
            incoh10.at[i, cur] = np.average(sims)
incoh10.head()

Coherence k  2
Coherence k  3


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Coherence k  4
Coherence k  5
Coherence k  6
Coherence k  7
Coherence k  8
Coherence k  9
Coherence k  10


Unnamed: 0,grid,content,n_words,bert_tokens,bert_emb,len_bert_tokens_emb,bert_word_k2,bert_word_k3,bert_word_k4,bert_word_k5,bert_word_k6,bert_word_k7,bert_word_k8,bert_word_k9,bert_word_k10
0,10455,"I'm a young man , an en an en- an engineer by ...",421,"[i, ', m, a, young, man, ,, an, en, an, en, -,...","[[tensor(0.6725), tensor(1.0466), tensor(0.263...",464,0.419732,0.398102,0.373147,0.363936,0.352692,0.342627,0.329379,0.325641,0.325631
1,11689,Sure . I'm thirty three years good . My name i...,159,"[sure, ., i, ', m, thirty, three, years, good,...","[[tensor(5.6659), tensor(-2.2755), tensor(3.61...",190,0.437061,0.38617,0.367037,0.349232,0.34827,0.336172,0.340942,0.324468,0.319427
2,12376,Alright . um I live in not especially cool Spr...,468,"[alright, ., um, i, live, in, not, especially,...","[[tensor(5.2656), tensor(1.1820), tensor(4.869...",496,0.420912,0.384082,0.360058,0.358527,0.341441,0.338162,0.331947,0.332439,0.323517
3,12630,um So I'm currently twenty-nine . I was born a...,966,"[um, so, i, ', m, currently, twenty, -, nine, ...","[[tensor(-0.4018), tensor(-2.1731), tensor(4.3...",456,0.41965,0.394277,0.370679,0.357445,0.353164,0.345605,0.341885,0.341371,0.333201
4,13493,Mhm . I'm a thirty five year old man who uh um...,134,"[m, ##hm, ., i, ', m, a, thirty, five, year, o...","[[tensor(2.4953), tensor(3.7805), tensor(5.160...",171,0.388267,0.370461,0.327668,0.31585,0.301831,0.289095,0.295041,0.301678,0.316763


In [None]:
# Save incoh10, now including the bert_word_k2..k10 columns, to the results directory.
incoh10.to_csv(result + 'simulation_HV_incoh_vb_response_deid_10v3_bert.csv')

## incoh20

In [None]:
# Coherence at inter-token distance k (k = 2..10): for each response, the mean
# cosine similarity between every token embedding and the embedding k positions later.
ks=['2', '3', '4', '5', '6', '7', '8', '9', '10']

# One empty placeholder column per k.
for k in ks:
    incoh20['bert_word_k' + k] = ''

for k in ks:
    cur = 'bert_word_k' + k
    offset = int(k)
    print('Coherence k ', k)
    for i in incoh20.index:
        emb = incoh20.at[i, 'bert_emb']
        # A float here is a missing value (NaN); such rows keep the '' placeholder.
        if not isinstance(emb, float):
            # zip stops at the shorter sequence, so pairs that would run past
            # the end of the response are dropped without try/except.
            sims = [1 - cosine(w1, w2) for w1, w2 in zip(emb, emb[offset:])]
            # .at writes into the frame itself and avoids SettingWithCopyWarning.
            incoh20.at[i, cur] = np.average(sims)
incoh20.head()

Coherence k  2


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Coherence k  3
Coherence k  4
Coherence k  5
Coherence k  6
Coherence k  7
Coherence k  8
Coherence k  9
Coherence k  10


Unnamed: 0_level_0,content,n_words,bert_tokens,bert_emb,len_bert_tokens_emb,bert_word_k2,bert_word_k3,bert_word_k4,bert_word_k5,bert_word_k6,bert_word_k7,bert_word_k8,bert_word_k9,bert_word_k10
grid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
10455,"I'm a young man , an en an en- an things by tr...",421,"[i, ', m, a, young, man, ,, an, en, an, en, -,...","[[tensor(0.5740), tensor(0.6673), tensor(-0.25...",464,0.423546,0.40207,0.3811,0.369743,0.355661,0.348424,0.335308,0.327706,0.329574
11689,Sure . I'm thirty three years medical . My nam...,159,"[sure, ., i, ', m, thirty, three, years, medic...","[[tensor(5.7015), tensor(-2.7486), tensor(3.70...",190,0.448121,0.402427,0.387234,0.370825,0.367952,0.355175,0.361326,0.347132,0.337009
12376,Alright . um I live in not especially lazy Spr...,468,"[alright, ., um, i, live, in, not, especially,...","[[tensor(5.0813), tensor(1.3699), tensor(4.995...",496,0.432127,0.393979,0.372034,0.370072,0.350946,0.349551,0.342697,0.344309,0.338485
12630,um So I'm currently twenty-nine . I was born a...,966,"[um, so, i, ', m, currently, twenty, -, nine, ...","[[tensor(-0.4019), tensor(-2.1877), tensor(4.2...",456,0.423406,0.397939,0.376072,0.365122,0.358143,0.355902,0.350251,0.349574,0.340883
13493,Mhm . I'm a thirty five year old man who uh um...,134,"[m, ##hm, ., i, ', m, a, thirty, five, year, o...","[[tensor(2.1580), tensor(3.9266), tensor(5.391...",171,0.393248,0.378669,0.337608,0.327369,0.313787,0.299737,0.30466,0.31099,0.318905


In [None]:
# Save incoh20, now including the bert_word_k2..k10 columns, to the results directory.
incoh20.to_csv(result + 'simulation_HV_incoh_vb_response_deid_20v3_bert.csv')

## incoh50

In [None]:
# Coherence at inter-token distance k (k = 2..10): for each response, the mean
# cosine similarity between every token embedding and the embedding k positions later.
ks=['2', '3', '4', '5', '6', '7', '8', '9', '10']

# One empty placeholder column per k.
for k in ks:
    incoh50['bert_word_k' + k] = ''

for k in ks:
    cur = 'bert_word_k' + k
    offset = int(k)
    print('Coherence k ', k)
    for i in incoh50.index:
        emb = incoh50.at[i, 'bert_emb']
        # A float here is a missing value (NaN); such rows keep the '' placeholder.
        if not isinstance(emb, float):
            # zip stops at the shorter sequence, so pairs that would run past
            # the end of the response are dropped without try/except.
            sims = [1 - cosine(w1, w2) for w1, w2 in zip(emb, emb[offset:])]
            # .at writes into the frame itself and avoids SettingWithCopyWarning.
            incoh50.at[i, cur] = np.average(sims)
incoh50.head()

Coherence k  2
Coherence k  3


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Coherence k  4
Coherence k  5
Coherence k  6
Coherence k  7
Coherence k  8
Coherence k  9
Coherence k  10


Unnamed: 0_level_0,content,n_words,bert_tokens,bert_emb,len_bert_tokens_emb,bert_word_k2,bert_word_k3,bert_word_k4,bert_word_k5,bert_word_k6,bert_word_k7,bert_word_k8,bert_word_k9,bert_word_k10
grid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
10455,"I'm a good year , an en an en- an responsibili...",421,"[i, ', m, a, good, year, ,, an, en, an, en, -,...","[[tensor(-1.4038), tensor(-0.1184), tensor(-0....",459,0.441923,0.424328,0.402711,0.393327,0.382006,0.375118,0.368116,0.360517,0.363095
11689,Sure . I'm thirty three years anxious . My nam...,159,"[sure, ., i, ', m, thirty, three, years, anxio...","[[tensor(4.6395), tensor(-2.2006), tensor(4.34...",192,0.474055,0.428855,0.419308,0.400513,0.400701,0.387484,0.391745,0.380941,0.367475
12376,Alright . um something live in not especially ...,468,"[alright, ., um, something, live, in, not, esp...","[[tensor(4.7609), tensor(0.7377), tensor(4.862...",498,0.461128,0.427746,0.405324,0.404731,0.387605,0.384336,0.380061,0.384769,0.378545
12630,um So I'm currently twenty-nine . I was born a...,966,"[um, so, i, ', m, currently, twenty, -, nine, ...","[[tensor(-0.8654), tensor(-2.2503), tensor(4.9...",457,0.443361,0.420886,0.402643,0.390756,0.382505,0.383815,0.375148,0.372052,0.370879
13493,Mhm . I'm a thirty five hospital old man anyth...,134,"[m, ##hm, ., i, ', m, a, thirty, five, hospita...","[[tensor(2.0145), tensor(3.1620), tensor(5.582...",172,0.416152,0.407561,0.361486,0.353147,0.342681,0.324768,0.334419,0.334229,0.33699


In [None]:
# Save incoh50, now including the bert_word_k2..k10 columns, to the results directory.
incoh50.to_csv(result + 'simulation_HV_incoh_vb_response_deid_50v3_bert.csv')

## ineff10

In [None]:
# Coherence at inter-token distance k (k = 2..10): for each response, the mean
# cosine similarity between every token embedding and the embedding k positions later.
ks=['2', '3', '4', '5', '6', '7', '8', '9', '10']

# One empty placeholder column per k.
for k in ks:
    ineff10['bert_word_k' + k] = ''

for k in ks:
    cur = 'bert_word_k' + k
    offset = int(k)
    print('Coherence k ', k)
    for i in ineff10.index:
        emb = ineff10.at[i, 'bert_emb']
        # A float here is a missing value (NaN); such rows keep the '' placeholder.
        if not isinstance(emb, float):
            # zip stops at the shorter sequence, so pairs that would run past
            # the end of the response are dropped without try/except.
            sims = [1 - cosine(w1, w2) for w1, w2 in zip(emb, emb[offset:])]
            # .at writes into the frame itself and avoids SettingWithCopyWarning.
            ineff10.at[i, cur] = np.average(sims)
ineff10.head()

Coherence k  2


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Coherence k  3
Coherence k  4
Coherence k  5
Coherence k  6
Coherence k  7
Coherence k  8
Coherence k  9
Coherence k  10


Unnamed: 0_level_0,content,n_words,bert_tokens,bert_emb,len_bert_tokens_emb,bert_word_k2,bert_word_k3,bert_word_k4,bert_word_k5,bert_word_k6,bert_word_k7,bert_word_k8,bert_word_k9,bert_word_k10
grid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
10455,"I'm a young man , an en an en- an engineer by ...",421,"[i, ', m, a, young, man, ,, an, en, an, en, -,...","[[tensor(0.9260), tensor(0.7718), tensor(0.139...",469,0.415859,0.389433,0.366713,0.355071,0.340469,0.339874,0.323313,0.315986,0.318991
11689,Sure . I'm thirty three years old . My name is...,159,"[sure, ., i, ', m, thirty, three, years, old, ...","[[tensor(6.0613), tensor(-1.9364), tensor(2.85...",212,0.451335,0.394343,0.379211,0.361111,0.364849,0.34994,0.352626,0.342873,0.332989
12376,Alright . um I live in not especially cool Spr...,468,"[alright, ., um, i, live, in, not, especially,...","[[tensor(5.3717), tensor(1.2232), tensor(4.933...",497,0.411034,0.368746,0.348774,0.350121,0.327221,0.325798,0.316564,0.321827,0.310504
12630,um So I was born and raised in South Washingto...,966,"[um, so, i, was, born, and, raised, in, south,...","[[tensor(-0.9319), tensor(-1.5137), tensor(3.7...",463,0.421071,0.38107,0.360023,0.358633,0.345952,0.337948,0.335499,0.329621,0.320563
13493,Mhm . I'm a thirty five year old man who uh um...,134,"[m, ##hm, ., i, ', m, a, thirty, five, year, o...","[[tensor(2.5872), tensor(3.8303), tensor(5.034...",178,0.393846,0.371014,0.330708,0.321578,0.30734,0.294739,0.296669,0.307008,0.319991


In [None]:
# Save ineff10, now including the bert_word_k2..k10 columns, to the results directory.
ineff10.to_csv(result + 'simulation_HV_ineff_vb_response_deid_10v3_bert.csv')

## ineff20

In [None]:
# Coherence at inter-token distance k (k = 2..10): for each response, the mean
# cosine similarity between every token embedding and the embedding k positions later.
ks=['2', '3', '4', '5', '6', '7', '8', '9', '10']

# One empty placeholder column per k.
for k in ks:
    ineff20['bert_word_k' + k] = ''

for k in ks:
    cur = 'bert_word_k' + k
    offset = int(k)
    print('Coherence k ', k)
    for i in ineff20.index:
        emb = ineff20.at[i, 'bert_emb']
        # A float here is a missing value (NaN); such rows keep the '' placeholder.
        if not isinstance(emb, float):
            # zip stops at the shorter sequence, so pairs that would run past
            # the end of the response are dropped without try/except.
            sims = [1 - cosine(w1, w2) for w1, w2 in zip(emb, emb[offset:])]
            # .at writes into the frame itself and avoids SettingWithCopyWarning.
            ineff20.at[i, cur] = np.average(sims)
ineff20.head()

Coherence k  2


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Coherence k  3
Coherence k  4
Coherence k  5
Coherence k  6
Coherence k  7
Coherence k  8
Coherence k  9
Coherence k  10


Unnamed: 0_level_0,content,n_words,bert_tokens,bert_emb,len_bert_tokens_emb,bert_word_k2,bert_word_k3,bert_word_k4,bert_word_k5,bert_word_k6,bert_word_k7,bert_word_k8,bert_word_k9,bert_word_k10
grid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
10455,"I'm a young man , an en an en- an engineer by ...",421,"[i, ', m, a, young, man, ,, an, en, an, en, -,...","[[tensor(0.8871), tensor(0.7492), tensor(0.043...",465,0.419734,0.394527,0.36742,0.357507,0.342382,0.339261,0.321388,0.318035,0.319717
11689,Sure . I'm thirty three years old . My name is...,159,"[sure, ., i, ', m, thirty, three, years, old, ...","[[tensor(5.7484), tensor(-1.9574), tensor(3.33...",181,0.446248,0.396125,0.359399,0.359655,0.345632,0.331845,0.328439,0.323529,0.317078
12376,Alright . um I live in not especially cool Spr...,468,"[alright, ., um, i, live, in, not, especially,...","[[tensor(4.6833), tensor(0.4654), tensor(5.237...",486,0.406104,0.375603,0.348739,0.345745,0.32947,0.329442,0.315648,0.323214,0.314113
12630,um So I was born and raised in South Washingto...,966,"[um, so, i, was, born, and, raised, in, south,...","[[tensor(-1.1326), tensor(-1.8485), tensor(3.1...",471,0.422655,0.37827,0.365426,0.36086,0.34634,0.337186,0.332113,0.324751,0.319289
13493,Mhm . I'm a thirty five year old man who uh um...,134,"[m, ##hm, ., i, ', m, a, thirty, five, year, o...","[[tensor(3.0104), tensor(3.4729), tensor(5.414...",142,0.405599,0.384211,0.34458,0.326265,0.315097,0.304823,0.308425,0.320578,0.324224


In [None]:
# Save ineff20 with the bert_word_k2..k10 columns added.
# NOTE: same path as the earlier ineff20 export in this notebook, so that file is overwritten.
ineff20.to_csv(result + 'simulation_HV_ineff_vb_response_deid_20v3_bert.csv')

## ineff50

In [None]:
# Coherence at inter-token distance k (k = 2..10): for each response, the mean
# cosine similarity between every token embedding and the embedding k positions later.
ks=['2', '3', '4', '5', '6', '7', '8', '9', '10']

# One empty placeholder column per k.
for k in ks:
    ineff50['bert_word_k' + k] = ''

for k in ks:
    cur = 'bert_word_k' + k
    offset = int(k)
    print('Coherence k ', k)
    for i in ineff50.index:
        emb = ineff50.at[i, 'bert_emb']
        # A float here is a missing value (NaN); such rows keep the '' placeholder.
        if not isinstance(emb, float):
            # zip stops at the shorter sequence, so pairs that would run past
            # the end of the response are dropped without try/except.
            sims = [1 - cosine(w1, w2) for w1, w2 in zip(emb, emb[offset:])]
            # .at writes into the frame itself and avoids SettingWithCopyWarning.
            ineff50.at[i, cur] = np.average(sims)
ineff50.head()

Coherence k  2
Coherence k  3


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Coherence k  4
Coherence k  5
Coherence k  6
Coherence k  7
Coherence k  8
Coherence k  9
Coherence k  10


Unnamed: 0_level_0,content,n_words,bert_tokens,bert_emb,len_bert_tokens_emb,bert_word_k2,bert_word_k3,bert_word_k4,bert_word_k5,bert_word_k6,bert_word_k7,bert_word_k8,bert_word_k9,bert_word_k10
grid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
10455,"I'm a young man , an en an en- an engineer by ...",421,"[i, ', m, a, young, man, ,, an, en, an, en, -,...","[[tensor(1.0818), tensor(1.0875), tensor(-0.18...",431,0.431574,0.402701,0.375233,0.360198,0.355781,0.357323,0.337893,0.335999,0.332884
11689,We have been using that opportunity to do more...,159,"[we, have, been, using, that, opportunity, to,...","[[tensor(3.7089), tensor(0.8028), tensor(-1.74...",228,0.450827,0.392143,0.364411,0.346152,0.339268,0.32745,0.327672,0.334092,0.321685
12376,Alright . um I live in not especially cool Spr...,468,"[alright, ., um, i, live, in, not, especially,...","[[tensor(5.3717), tensor(1.6071), tensor(5.350...",480,0.41699,0.371672,0.356068,0.360196,0.338796,0.349111,0.318997,0.324514,0.311103
12630,"My is things are fantastic . No , I mean My uh...",966,"[my, is, things, are, fantastic, ., no, ,, i, ...","[[tensor(-2.2958), tensor(2.2740), tensor(1.50...",474,0.422702,0.385004,0.365272,0.352853,0.345306,0.342191,0.332007,0.327148,0.323434
13493,Mhm . I still get to play . And my stock inves...,134,"[m, ##hm, ., i, still, get, to, play, ., and, ...","[[tensor(1.7302), tensor(3.9717), tensor(6.281...",183,0.407593,0.392881,0.342866,0.318033,0.314927,0.301858,0.292772,0.300559,0.304624


In [None]:
# Save ineff50 with the bert_word_k2..k10 columns added.
# NOTE: same path as the earlier ineff50 export in this notebook, so that file is overwritten.
ineff50.to_csv(result + 'simulation_HV_ineff_vb_response_deid_50v3_bert.csv')

# MV 5 10

In [None]:
# Average semantic similarity of each word in 5- or 10- words window

def divide_chunks(l, n):
    """Yield consecutive slices of ``l`` that hold ``n`` items each.

    The last slice is shorter when ``len(l)`` is not a multiple of ``n``;
    an empty ``l`` yields nothing.

    Parameters
    ----------
    l : sequence
        Any object supporting ``len()`` and slicing (list, str, ...).
    n : int
        Number of items per chunk; must be positive.

    Raises
    ------
    ValueError
        If ``n`` is not positive. Because this is a generator, the error is
        raised on first iteration, not at call time.
    """
    if n <= 0:
        raise ValueError('n must be a positive integer')
    for start in range(0, len(l), n):
        yield l[start:start + n]

# Quick sanity check: 14 items in chunks of 5 -> sizes 5, 5, 4
test = ['1','5','99','34','109','gh','io','wer','90','901','98','iop','er4','op0']
print(list(divide_chunks(test,5)))

[['1', '5', '99', '34', '109'], ['gh', 'io', 'wer', '90', '901'], ['98', 'iop', 'er4', 'op0']]


## baseline

In [None]:
# Moving-window coherence: split each response's embeddings into consecutive
# chunks of 5 (or 10) tokens, average the cosine similarity of adjacent tokens
# inside each chunk, then average those chunk means over the response.
baseline['bert_word_mv5'] = ''
baseline['bert_word_mv10'] = ''

ks = ['5', '10']
for k in ks:
    cur = 'bert_word_mv' + k
    window = int(k)
    print('current: ', cur)
    for i in baseline.index:
        emb = baseline.at[i, 'bert_emb']
        if isinstance(emb, float):
            # Missing embedding (NaN): record NaN instead of failing inside
            # divide_chunks; the original guard was disabled with `if True:`.
            baseline.at[i, cur] = np.nan
            continue

        # One mean similarity per chunk; the list length equals the number of
        # chunks the response was divided into.
        chunk_means = []
        for chunk in divide_chunks(emb, window):
            # Adjacent pairs within the chunk; a one-token chunk has no pair
            # and contributes NaN, which the outer nanmean ignores.
            pair_sims = [1 - cosine(w1, w2) for w1, w2 in zip(chunk, chunk[1:])]
            chunk_means.append(np.nanmean(pair_sims))  # don't use np.average

        # .at writes into the frame itself and avoids SettingWithCopyWarning.
        baseline.at[i, cur] = np.nanmean(chunk_means)  # add other stats here

baseline.head()

current:  bert_word_mv5
current:  bert_word_mv10


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,grid,content,n_words,bert_tokens,bert_emb,len_bert_tokens_emb,bert_word_mv5,bert_word_mv10,bert_word_k2,bert_word_k3,bert_word_k4,bert_word_k5,bert_word_k6,bert_word_k7,bert_word_k8,bert_word_k9,bert_word_k10
0,10455,"I'm a young man , an en an en- an engineer by ...",421,"[i, ', m, a, young, man, ,, an, en, an, en, -,...","[[tensor(0.7688), tensor(0.7937), tensor(0.167...",462,0.471428,0.474674,0.41295,0.38864,0.367721,0.353955,0.342045,0.338082,0.324235,0.318819,0.318824
1,11689,Sure . I'm thirty three years old . My name is...,159,"[sure, ., i, ', m, thirty, three, years, old, ...","[[tensor(5.6645), tensor(-2.5050), tensor(3.09...",194,0.504627,0.501954,0.426685,0.372436,0.354607,0.339884,0.335918,0.32825,0.32619,0.308189,0.301729
2,12376,Alright . um I live in not especially cool Spr...,468,"[alright, ., um, i, live, in, not, especially,...","[[tensor(5.2083), tensor(1.4897), tensor(4.913...",494,0.4818,0.474423,0.411201,0.375869,0.353776,0.349704,0.333051,0.326555,0.318976,0.323212,0.314139
3,12630,um So I'm currently twenty-nine . I was born a...,966,"[um, so, i, ', m, currently, twenty, -, nine, ...","[[tensor(-0.2509), tensor(-2.1763), tensor(4.0...",456,0.500832,0.494821,0.415659,0.388839,0.366737,0.354915,0.347091,0.341731,0.336869,0.334747,0.329355
4,13493,Mhm . I'm a thirty five year old man who uh um...,134,"[m, ##hm, ., i, ', m, a, thirty, five, year, o...","[[tensor(2.5036), tensor(3.9237), tensor(4.951...",171,0.478361,0.477872,0.384679,0.363791,0.32135,0.309973,0.297819,0.283803,0.286407,0.29583,0.309508


## incoh10

In [None]:
# Placeholder columns for the 5- and 10-token window scores.
for window in ('5', '10'):
    incoh10['bert_word_mv' + window] = ''

In [None]:
# Moving-window coherence: split each response's embeddings into consecutive
# chunks of 5 (or 10) tokens, average the cosine similarity of adjacent tokens
# inside each chunk, then average those chunk means over the response.
ks = ['5', '10']
for k in ks:
    cur = 'bert_word_mv' + k
    window = int(k)
    print('current: ', cur)
    for i in incoh10.index:
        emb = incoh10.at[i, 'bert_emb']
        if isinstance(emb, float):
            # Missing embedding (NaN): record NaN explicitly. Previously the
            # assignment sat outside the guard, so such a row silently received
            # the previous row's chunk scores.
            incoh10.at[i, cur] = np.nan
            continue

        chunk_means = []
        for chunk in divide_chunks(emb, window):
            # Adjacent pairs within the chunk; a one-token chunk has no pair
            # and contributes NaN, which the outer nanmean ignores.
            pair_sims = [1 - cosine(w1, w2) for w1, w2 in zip(chunk, chunk[1:])]
            chunk_means.append(np.nanmean(pair_sims))

        # .at writes into the frame itself and avoids SettingWithCopyWarning.
        incoh10.at[i, cur] = np.nanmean(chunk_means)

incoh10.head()

current:  bert_word_mv5


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


current:  bert_word_mv10


Unnamed: 0,grid,content,n_words,bert_tokens,bert_emb,len_bert_tokens_emb,bert_word_k2,bert_word_k3,bert_word_k4,bert_word_k5,bert_word_k6,bert_word_k7,bert_word_k8,bert_word_k9,bert_word_k10,bert_word_mv5,bert_word_mv10
0,10455,"I'm a young man , an en an en- an engineer by ...",421,"[i, ', m, a, young, man, ,, an, en, an, en, -,...","[[tensor(0.6725), tensor(1.0466), tensor(0.263...",464,0.419732,0.398102,0.373147,0.363936,0.352692,0.342627,0.329379,0.325641,0.325631,0.48515,0.483163
1,11689,Sure . I'm thirty three years good . My name i...,159,"[sure, ., i, ', m, thirty, three, years, good,...","[[tensor(5.6659), tensor(-2.2755), tensor(3.61...",190,0.437061,0.38617,0.367037,0.349232,0.34827,0.336172,0.340942,0.324468,0.319427,0.500808,0.503623
2,12376,Alright . um I live in not especially cool Spr...,468,"[alright, ., um, i, live, in, not, especially,...","[[tensor(5.2656), tensor(1.1820), tensor(4.869...",496,0.420912,0.384082,0.360058,0.358527,0.341441,0.338162,0.331947,0.332439,0.323517,0.485034,0.483255
3,12630,um So I'm currently twenty-nine . I was born a...,966,"[um, so, i, ', m, currently, twenty, -, nine, ...","[[tensor(-0.4018), tensor(-2.1731), tensor(4.3...",456,0.41965,0.394277,0.370679,0.357445,0.353164,0.345605,0.341885,0.341371,0.333201,0.501024,0.495825
4,13493,Mhm . I'm a thirty five year old man who uh um...,134,"[m, ##hm, ., i, ', m, a, thirty, five, year, o...","[[tensor(2.4953), tensor(3.7805), tensor(5.160...",171,0.388267,0.370461,0.327668,0.31585,0.301831,0.289095,0.295041,0.301678,0.316763,0.480811,0.478044


In [None]:
# Save incoh10 with the bert_word_mv5 / bert_word_mv10 columns added.
# NOTE: same path as the earlier incoh10 export in this notebook, so that file is overwritten.
incoh10.to_csv(result + 'simulation_HV_incoh_vb_response_deid_10v3_bert.csv')

## incoh20

In [None]:
# Moving-window coherence: split each response's embeddings into consecutive
# chunks of 5 (or 10) tokens, average the cosine similarity of adjacent tokens
# inside each chunk, then average those chunk means over the response.
incoh20['bert_word_mv5'] = ''
incoh20['bert_word_mv10'] = ''

ks = ['5', '10']
for k in ks:
    cur = 'bert_word_mv' + k
    window = int(k)
    print('current: ', cur)
    for i in incoh20.index:
        emb = incoh20.at[i, 'bert_emb']
        if isinstance(emb, float):
            # Missing embedding (NaN): record NaN explicitly. Previously the
            # assignment sat outside the guard, so such a row silently received
            # the previous row's chunk scores.
            incoh20.at[i, cur] = np.nan
            continue

        chunk_means = []
        for chunk in divide_chunks(emb, window):
            # Adjacent pairs within the chunk; a one-token chunk has no pair
            # and contributes NaN, which the outer nanmean ignores.
            pair_sims = [1 - cosine(w1, w2) for w1, w2 in zip(chunk, chunk[1:])]
            chunk_means.append(np.nanmean(pair_sims))

        # .at writes into the frame itself and avoids SettingWithCopyWarning.
        incoh20.at[i, cur] = np.nanmean(chunk_means)

incoh20.head()

current:  bert_word_mv5


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


current:  bert_word_mv10


Unnamed: 0_level_0,content,n_words,bert_tokens,bert_emb,len_bert_tokens_emb,bert_word_k2,bert_word_k3,bert_word_k4,bert_word_k5,bert_word_k6,bert_word_k7,bert_word_k8,bert_word_k9,bert_word_k10,bert_word_mv5,bert_word_mv10
grid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
10455,"I'm a young man , an en an en- an things by tr...",421,"[i, ', m, a, young, man, ,, an, en, an, en, -,...","[[tensor(0.5740), tensor(0.6673), tensor(-0.25...",464,0.423546,0.40207,0.3811,0.369743,0.355661,0.348424,0.335308,0.327706,0.329574,0.488356,0.485694
11689,Sure . I'm thirty three years medical . My nam...,159,"[sure, ., i, ', m, thirty, three, years, medic...","[[tensor(5.7015), tensor(-2.7486), tensor(3.70...",190,0.448121,0.402427,0.387234,0.370825,0.367952,0.355175,0.361326,0.347132,0.337009,0.50886,0.511256
12376,Alright . um I live in not especially lazy Spr...,468,"[alright, ., um, i, live, in, not, especially,...","[[tensor(5.0813), tensor(1.3699), tensor(4.995...",496,0.432127,0.393979,0.372034,0.370072,0.350946,0.349551,0.342697,0.344309,0.338485,0.495238,0.493909
12630,um So I'm currently twenty-nine . I was born a...,966,"[um, so, i, ', m, currently, twenty, -, nine, ...","[[tensor(-0.4019), tensor(-2.1877), tensor(4.2...",456,0.423406,0.397939,0.376072,0.365122,0.358143,0.355902,0.350251,0.349574,0.340883,0.504377,0.499642
13493,Mhm . I'm a thirty five year old man who uh um...,134,"[m, ##hm, ., i, ', m, a, thirty, five, year, o...","[[tensor(2.1580), tensor(3.9266), tensor(5.391...",171,0.393248,0.378669,0.337608,0.327369,0.313787,0.299737,0.30466,0.31099,0.318905,0.487613,0.486644


In [None]:
# Save incoh20 (now carrying bert_word_mv5 / bert_word_mv10) to CSV under `result`.
# NOTE(review): object columns such as bert_emb are written as text; confirm the
# numeric embeddings do not need to be recovered from this file.
incoh20.to_csv(result + 'simulation_HV_incoh_vb_response_deid_20v3_bert.csv')

## incoh50

In [None]:
# Windowed word-to-word similarity for incoh50.
# For each window size k (5 and 10): cut a response's token embeddings into
# chunks of k items, average the cosine similarity of adjacent tokens inside
# each chunk, then average those chunk means. The result is stored in
# bert_word_mv5 / bert_word_mv10.
# NOTE(review): divide_chunks is defined earlier in the notebook; it is assumed
# to yield consecutive, indexable slices of at most k items -- confirm.
incoh50['bert_word_mv5'] = ''
incoh50['bert_word_mv10'] = ''

ks = ['5', '10']
for k in ks:
    cur = 'bert_word_mv' + k
    print('current: ', cur)
    window = int(k)
    for i in incoh50.index:
        emb = incoh50.at[i, 'bert_emb']
        if isinstance(emb, float):
            # A float here is the NaN of a missing embedding. Skip the row so it
            # keeps the '' placeholder instead of inheriting the previous row's score.
            continue
        chunk_means = []
        for word_embed in divide_chunks(emb, window):
            # A chunk holding a single token has no adjacent pair to compare.
            if len(word_embed) < 2:
                continue
            pair_sims = [
                1 - cosine(word_embed[j], word_embed[j + 1])
                for j in range(len(word_embed) - 1)
            ]
            chunk_means.append(np.nanmean(pair_sims))
        # .at writes into the frame itself; the chained incoh50[cur][i] = ... form
        # raises SettingWithCopyWarning and may only update a temporary copy.
        incoh50.at[i, cur] = np.nanmean(chunk_means) if chunk_means else np.nan

incoh50.head()

current:  bert_word_mv5
current:  bert_word_mv10


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0_level_0,content,n_words,bert_tokens,bert_emb,len_bert_tokens_emb,bert_word_k2,bert_word_k3,bert_word_k4,bert_word_k5,bert_word_k6,bert_word_k7,bert_word_k8,bert_word_k9,bert_word_k10,bert_word_mv5,bert_word_mv10
grid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
10455,"I'm a good year , an en an en- an responsibili...",421,"[i, ', m, a, good, year, ,, an, en, an, en, -,...","[[tensor(-1.4038), tensor(-0.1184), tensor(-0....",459,0.441923,0.424328,0.402711,0.393327,0.382006,0.375118,0.368116,0.360517,0.363095,0.498728,0.494
11689,Sure . I'm thirty three years anxious . My nam...,159,"[sure, ., i, ', m, thirty, three, years, anxio...","[[tensor(4.6395), tensor(-2.2006), tensor(4.34...",192,0.474055,0.428855,0.419308,0.400513,0.400701,0.387484,0.391745,0.380941,0.367475,0.529403,0.527393
12376,Alright . um something live in not especially ...,468,"[alright, ., um, something, live, in, not, esp...","[[tensor(4.7609), tensor(0.7377), tensor(4.862...",498,0.461128,0.427746,0.405324,0.404731,0.387605,0.384336,0.380061,0.384769,0.378545,0.514372,0.511451
12630,um So I'm currently twenty-nine . I was born a...,966,"[um, so, i, ', m, currently, twenty, -, nine, ...","[[tensor(-0.8654), tensor(-2.2503), tensor(4.9...",457,0.443361,0.420886,0.402643,0.390756,0.382505,0.383815,0.375148,0.372052,0.370879,0.511806,0.509827
13493,Mhm . I'm a thirty five hospital old man anyth...,134,"[m, ##hm, ., i, ', m, a, thirty, five, hospita...","[[tensor(2.0145), tensor(3.1620), tensor(5.582...",172,0.416152,0.407561,0.361486,0.353147,0.342681,0.324768,0.334419,0.334229,0.33699,0.486157,0.478926


In [None]:
# Save incoh50 (now carrying bert_word_mv5 / bert_word_mv10) to CSV under `result`.
# NOTE(review): object columns such as bert_emb are written as text; confirm the
# numeric embeddings do not need to be recovered from this file.
incoh50.to_csv(result + 'simulation_HV_incoh_vb_response_deid_50v3_bert.csv')

## ineff10

In [None]:
# Windowed word-to-word similarity for ineff10.
# For each window size k (5 and 10): cut a response's token embeddings into
# chunks of k items, average the cosine similarity of adjacent tokens inside
# each chunk, then average those chunk means. The result is stored in
# bert_word_mv5 / bert_word_mv10.
# NOTE(review): divide_chunks is defined earlier in the notebook; it is assumed
# to yield consecutive, indexable slices of at most k items -- confirm.
ineff10['bert_word_mv5'] = ''
ineff10['bert_word_mv10'] = ''

ks = ['5', '10']
for k in ks:
    cur = 'bert_word_mv' + k
    print('current: ', cur)
    window = int(k)
    for i in ineff10.index:
        emb = ineff10.at[i, 'bert_emb']
        if isinstance(emb, float):
            # A float here is the NaN of a missing embedding. Skip the row so it
            # keeps the '' placeholder instead of inheriting the previous row's score.
            continue
        chunk_means = []
        for word_embed in divide_chunks(emb, window):
            # A chunk holding a single token has no adjacent pair to compare.
            if len(word_embed) < 2:
                continue
            pair_sims = [
                1 - cosine(word_embed[j], word_embed[j + 1])
                for j in range(len(word_embed) - 1)
            ]
            chunk_means.append(np.nanmean(pair_sims))
        # .at writes into the frame itself; the chained ineff10[cur][i] = ... form
        # raises SettingWithCopyWarning and may only update a temporary copy.
        ineff10.at[i, cur] = np.nanmean(chunk_means) if chunk_means else np.nan

ineff10.head()

current:  bert_word_mv5
current:  bert_word_mv10


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0_level_0,content,n_words,bert_tokens,bert_emb,len_bert_tokens_emb,bert_word_k2,bert_word_k3,bert_word_k4,bert_word_k5,bert_word_k6,bert_word_k7,bert_word_k8,bert_word_k9,bert_word_k10,bert_word_mv5,bert_word_mv10
grid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
10455,"I'm a young man , an en an en- an engineer by ...",421,"[i, ', m, a, young, man, ,, an, en, an, en, -,...","[[tensor(0.9260), tensor(0.7718), tensor(0.139...",469,0.415859,0.389433,0.366713,0.355071,0.340469,0.339874,0.323313,0.315986,0.318991,0.479298,0.47775
11689,Sure . I'm thirty three years old . My name is...,159,"[sure, ., i, ', m, thirty, three, years, old, ...","[[tensor(6.0613), tensor(-1.9364), tensor(2.85...",212,0.451335,0.394343,0.379211,0.361111,0.364849,0.34994,0.352626,0.342873,0.332989,0.512381,0.515649
12376,Alright . um I live in not especially cool Spr...,468,"[alright, ., um, i, live, in, not, especially,...","[[tensor(5.3717), tensor(1.2232), tensor(4.933...",497,0.411034,0.368746,0.348774,0.350121,0.327221,0.325798,0.316564,0.321827,0.310504,0.480417,0.471675
12630,um So I was born and raised in South Washingto...,966,"[um, so, i, was, born, and, raised, in, south,...","[[tensor(-0.9319), tensor(-1.5137), tensor(3.7...",463,0.421071,0.38107,0.360023,0.358633,0.345952,0.337948,0.335499,0.329621,0.320563,0.482921,0.482146
13493,Mhm . I'm a thirty five year old man who uh um...,134,"[m, ##hm, ., i, ', m, a, thirty, five, year, o...","[[tensor(2.5872), tensor(3.8303), tensor(5.034...",178,0.393846,0.371014,0.330708,0.321578,0.30734,0.294739,0.296669,0.307008,0.319991,0.484459,0.490165


In [None]:
# Save ineff10 (now carrying bert_word_mv5 / bert_word_mv10) to CSV under `result`.
# NOTE(review): object columns such as bert_emb are written as text; confirm the
# numeric embeddings do not need to be recovered from this file.
ineff10.to_csv(result + 'simulation_HV_ineff_vb_response_deid_10v3_bert.csv')

## ineff20

In [None]:
# Windowed word-to-word similarity for ineff20.
# For each window size k (5 and 10): cut a response's token embeddings into
# chunks of k items, average the cosine similarity of adjacent tokens inside
# each chunk, then average those chunk means. The result is stored in
# bert_word_mv5 / bert_word_mv10.
# NOTE(review): divide_chunks is defined earlier in the notebook; it is assumed
# to yield consecutive, indexable slices of at most k items -- confirm.
ineff20['bert_word_mv5'] = ''
ineff20['bert_word_mv10'] = ''

ks = ['5', '10']
for k in ks:
    cur = 'bert_word_mv' + k
    print('current: ', cur)
    window = int(k)
    for i in ineff20.index:
        emb = ineff20.at[i, 'bert_emb']
        if isinstance(emb, float):
            # A float here is the NaN of a missing embedding. Skip the row so it
            # keeps the '' placeholder instead of inheriting the previous row's score.
            continue
        chunk_means = []
        for word_embed in divide_chunks(emb, window):
            # A chunk holding a single token has no adjacent pair to compare.
            if len(word_embed) < 2:
                continue
            pair_sims = [
                1 - cosine(word_embed[j], word_embed[j + 1])
                for j in range(len(word_embed) - 1)
            ]
            chunk_means.append(np.nanmean(pair_sims))
        # .at writes into the frame itself; the chained ineff20[cur][i] = ... form
        # raises SettingWithCopyWarning and may only update a temporary copy.
        ineff20.at[i, cur] = np.nanmean(chunk_means) if chunk_means else np.nan

ineff20.head()

current:  bert_word_mv5
current:  bert_word_mv10


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0_level_0,content,n_words,bert_tokens,bert_emb,len_bert_tokens_emb,bert_word_k2,bert_word_k3,bert_word_k4,bert_word_k5,bert_word_k6,bert_word_k7,bert_word_k8,bert_word_k9,bert_word_k10,bert_word_mv5,bert_word_mv10
grid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
10455,"I'm a young man , an en an en- an engineer by ...",421,"[i, ', m, a, young, man, ,, an, en, an, en, -,...","[[tensor(0.8871), tensor(0.7492), tensor(0.043...",465,0.419734,0.394527,0.36742,0.357507,0.342382,0.339261,0.321388,0.318035,0.319717,0.481598,0.48208
11689,Sure . I'm thirty three years old . My name is...,159,"[sure, ., i, ', m, thirty, three, years, old, ...","[[tensor(5.7484), tensor(-1.9574), tensor(3.33...",181,0.446248,0.396125,0.359399,0.359655,0.345632,0.331845,0.328439,0.323529,0.317078,0.515076,0.510636
12376,Alright . um I live in not especially cool Spr...,468,"[alright, ., um, i, live, in, not, especially,...","[[tensor(4.6833), tensor(0.4654), tensor(5.237...",486,0.406104,0.375603,0.348739,0.345745,0.32947,0.329442,0.315648,0.323214,0.314113,0.47378,0.475669
12630,um So I was born and raised in South Washingto...,966,"[um, so, i, was, born, and, raised, in, south,...","[[tensor(-1.1326), tensor(-1.8485), tensor(3.1...",471,0.422655,0.37827,0.365426,0.36086,0.34634,0.337186,0.332113,0.324751,0.319289,0.482398,0.485622
13493,Mhm . I'm a thirty five year old man who uh um...,134,"[m, ##hm, ., i, ', m, a, thirty, five, year, o...","[[tensor(3.0104), tensor(3.4729), tensor(5.414...",142,0.405599,0.384211,0.34458,0.326265,0.315097,0.304823,0.308425,0.320578,0.324224,0.488125,0.476935


In [None]:
# Save ineff20 (now carrying bert_word_mv5 / bert_word_mv10) to CSV under `result`.
# NOTE(review): object columns such as bert_emb are written as text; confirm the
# numeric embeddings do not need to be recovered from this file.
ineff20.to_csv(result + 'simulation_HV_ineff_vb_response_deid_20v3_bert.csv')

## ineff50

In [None]:
# Windowed word-to-word similarity for ineff50.
# For each window size k (5 and 10): cut a response's token embeddings into
# chunks of k items, average the cosine similarity of adjacent tokens inside
# each chunk, then average those chunk means. The result is stored in
# bert_word_mv5 / bert_word_mv10.
# NOTE(review): divide_chunks is defined earlier in the notebook; it is assumed
# to yield consecutive, indexable slices of at most k items -- confirm.
ineff50['bert_word_mv5'] = ''
ineff50['bert_word_mv10'] = ''

ks = ['5', '10']
for k in ks:
    cur = 'bert_word_mv' + k
    print('current: ', cur)
    window = int(k)
    for i in ineff50.index:
        emb = ineff50.at[i, 'bert_emb']
        if isinstance(emb, float):
            # A float here is the NaN of a missing embedding. Skip the row so it
            # keeps the '' placeholder instead of inheriting the previous row's score.
            continue
        chunk_means = []
        for word_embed in divide_chunks(emb, window):
            # A chunk holding a single token has no adjacent pair to compare.
            if len(word_embed) < 2:
                continue
            pair_sims = [
                1 - cosine(word_embed[j], word_embed[j + 1])
                for j in range(len(word_embed) - 1)
            ]
            chunk_means.append(np.nanmean(pair_sims))
        # .at writes into the frame itself; the chained ineff50[cur][i] = ... form
        # raises SettingWithCopyWarning and may only update a temporary copy.
        ineff50.at[i, cur] = np.nanmean(chunk_means) if chunk_means else np.nan

ineff50.head()

current:  bert_word_mv5
current:  bert_word_mv10


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0_level_0,content,n_words,bert_tokens,bert_emb,len_bert_tokens_emb,bert_word_k2,bert_word_k3,bert_word_k4,bert_word_k5,bert_word_k6,bert_word_k7,bert_word_k8,bert_word_k9,bert_word_k10,bert_word_mv5,bert_word_mv10
grid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
10455,"I'm a young man , an en an en- an engineer by ...",421,"[i, ', m, a, young, man, ,, an, en, an, en, -,...","[[tensor(1.0818), tensor(1.0875), tensor(-0.18...",431,0.431574,0.402701,0.375233,0.360198,0.355781,0.357323,0.337893,0.335999,0.332884,0.496595,0.498576
11689,We have been using that opportunity to do more...,159,"[we, have, been, using, that, opportunity, to,...","[[tensor(3.7089), tensor(0.8028), tensor(-1.74...",228,0.450827,0.392143,0.364411,0.346152,0.339268,0.32745,0.327672,0.334092,0.321685,0.503106,0.507054
12376,Alright . um I live in not especially cool Spr...,468,"[alright, ., um, i, live, in, not, especially,...","[[tensor(5.3717), tensor(1.6071), tensor(5.350...",480,0.41699,0.371672,0.356068,0.360196,0.338796,0.349111,0.318997,0.324514,0.311103,0.48519,0.483654
12630,"My is things are fantastic . No , I mean My uh...",966,"[my, is, things, are, fantastic, ., no, ,, i, ...","[[tensor(-2.2958), tensor(2.2740), tensor(1.50...",474,0.422702,0.385004,0.365272,0.352853,0.345306,0.342191,0.332007,0.327148,0.323434,0.479335,0.483949
13493,Mhm . I still get to play . And my stock inves...,134,"[m, ##hm, ., i, still, get, to, play, ., and, ...","[[tensor(1.7302), tensor(3.9717), tensor(6.281...",183,0.407593,0.392881,0.342866,0.318033,0.314927,0.301858,0.292772,0.300559,0.304624,0.488476,0.489403


In [None]:
# Save ineff50 (now carrying bert_word_mv5 / bert_word_mv10) to CSV under `result`.
# NOTE(review): object columns such as bert_emb are written as text; confirm the
# numeric embeddings do not need to be recovered from this file.
ineff50.to_csv(result + 'simulation_HV_ineff_vb_response_deid_50v3_bert.csv')

# FOC and SOC

## Setup and experiments

In [None]:
# install sentence-transformers from hugging face
!pip install sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 5.3 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 56.5 MB/s 
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone
  Created wheel for sentence-transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125938 sha256=a368ee69347e5421c1a9a4bd1bfe517920f36afcd516675f582a7eeebc0a02ba
  Stored in directory: /root/.cache/pip/wheels/bf/06/fb/d59c1e5bd1dac7f6cf61ec0036cc3a10ab8fecaa6b2c3d3ee9
Successfully built sentence-transformers
Installing collected packages: sentencepiece, sentence-transformers
Successfully installed sentenc

In [None]:
# Load the sentence-embedding model used for the FOC/SOC measures below.
# The checkpoint name is 'stsb-roberta-large', i.e. a RoBERTa-based model rather than BERT.
# NOTE: this rebinds `model`, which the setup cell at the top of the notebook bound to
# BertModel; re-running earlier token-embedding cells after this one would use the wrong model.
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('stsb-roberta-large')
type(model) # expected output on success: sentence_transformers.SentenceTransformer.SentenceTransformer

Downloading:   0%|          | 0.00/748 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/191 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.92k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/674 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

sentence_transformers.SentenceTransformer.SentenceTransformer

In [None]:
# "With context" variant: cut one paragraph on '.' and encode all pieces in one call.
# Because the paragraph ends with '.', the split leaves a trailing empty string, so
# five items are encoded (see the recorded length below).
# NOTE(review): SentenceTransformer.encode is documented as embedding each list element
# on its own, so batching may not add cross-sentence context; the small score differences
# versus the "without context" cell may come from the dropped '.' and leading spaces -- confirm.
sentence1 = (
    "he eventually sold the shares back to the bank at a premium. the river flowed over the bank. the next day a little girl walked by the river bank. she picked a bouquet of flowers."
).split('.')
embedding1 = model.encode(sentence1, convert_to_tensor=True)
len(embedding1)

5

In [None]:
# Show the split sentences without the trailing empty string left after the final '.'.
# This only displays a slice; sentence1 itself is not modified.
sentence1[:-1]

['he eventually sold the shares back to the bank at a premium',
 ' the river flowed over the bank',
 ' the next day a little girl walked by the river bank',
 ' she picked a bouquet of flowers']

In [None]:
# Inspect the stacked embeddings: one row per element of sentence1, including the
# trailing empty string (five rows in the recorded output).
embedding1

tensor([[-1.0473,  1.3119,  0.1855,  ..., -0.0362, -0.5574, -0.3642],
        [ 0.5865,  0.8791,  0.1452,  ...,  0.0166,  0.1725, -1.0531],
        [ 1.4020,  0.0524, -0.0772,  ...,  0.2691,  0.4424, -0.2052],
        [ 0.7256,  0.1863, -0.2016,  ...,  1.6148, -0.4648,  0.4464],
        [-0.4258, -0.1729, -0.4653,  ..., -0.0790, -0.3457, -0.0386]],
       device='cuda:0')

In [None]:
# Adjacent pair: sentence 0 (bank as a financial institution) vs sentence 1 (river bank).
cosine_scores = util.pytorch_cos_sim(embedding1[0], embedding1[1])
cosine_scores.item()

0.35455918312072754

In [None]:
# Adjacent pair: sentence 1 vs sentence 2 (both mention the river bank).
cosine_scores = util.pytorch_cos_sim(embedding1[1], embedding1[2])
cosine_scores.item()

0.430277943611145

In [None]:
# One intervening sentence: sentence 0 vs sentence 2.
cosine_scores = util.pytorch_cos_sim(embedding1[0], embedding1[2])
cosine_scores.item()

0.025652559474110603

In [None]:
# Adjacent pair: sentence 2 (girl by the river bank) vs sentence 3 (bouquet of flowers).
cosine_scores = util.pytorch_cos_sim(embedding1[2], embedding1[3])
cosine_scores.item()

0.2187005579471588

In [None]:
# Two intervening sentences: sentence 0 vs sentence 3.
cosine_scores = util.pytorch_cos_sim(embedding1[0], embedding1[3])
cosine_scores.item()

0.16904005408287048

In [None]:
# "Without context" variant: the same four sentences (final '.' kept), each passed
# to model.encode in its own call, then the first adjacent pair is compared.
sentence1 = [
    "he eventually sold the shares back to the bank at a premium.",
    "the river flowed over the bank.",
    "the next day a little girl walked by the river bank.",
    "she picked a bouquet of flowers.",
]

# One encode call per sentence, in list order.
sent_0, sent_1, sent_2, sent_3 = (
    model.encode(text, convert_to_tensor = True) for text in sentence1
)
cosine_scores = util.pytorch_cos_sim(sent_0, sent_1)
cosine_scores.item()

0.3437160849571228

In [None]:
# Adjacent pair, separately encoded: sentence 1 vs sentence 2.
cosine_scores = util.pytorch_cos_sim(sent_1, sent_2)
cosine_scores.item()

0.423532098531723

In [None]:
# One intervening sentence, separately encoded: sentence 0 vs sentence 2.
cosine_scores = util.pytorch_cos_sim(sent_0, sent_2)
cosine_scores.item()

0.014571648091077805

In [None]:
# Adjacent pair, separately encoded: sentence 2 vs sentence 3.
cosine_scores = util.pytorch_cos_sim(sent_2, sent_3)
cosine_scores.item()

0.22313213348388672

In [None]:
# Two intervening sentences, separately encoded: sentence 0 vs sentence 3.
cosine_scores = util.pytorch_cos_sim(sent_0, sent_3)
cosine_scores.item()

0.1720224916934967

## baseline

In [None]:
# Sentence-level coherence for the baseline responses.
# Step 1: split each response on '.' and embed the pieces with the sentence model.
# Step 2: bert_foc = mean cosine similarity of adjacent sentences (j, j+1);
#         bert_soc = mean cosine similarity of sentences one apart (j, j+2).
baseline['bert_sent_context_emb'] = ''

# get sentence embeddings from the sentence-transformers model
for i in baseline.index:
    if baseline.at[i, 'n_words'] > 4:
        # [:-1] drops the piece after the last '.'.
        # NOTE(review): this assumes every response ends with '.'; otherwise the
        # final sentence is discarded -- confirm against the data.
        context = baseline.at[i, 'content'].split('.')[:-1]
        # .at stores the tensor in the frame itself; chained
        # baseline[col][i] = ... may only update a temporary copy.
        baseline.at[i, 'bert_sent_context_emb'] = model.encode(context, convert_to_tensor=True)
    # progress trace, keyed on the index label
    if i % 5 == 0:
        print('current: ', i)

# calculate sentence-pair similarity, either adjacent or with one intervening sentence
baseline['bert_foc'] = ''
baseline['bert_soc'] = ''

for i in baseline.index:
    sent_embs = baseline.at[i, 'bert_sent_context_emb']
    n_sent = len(sent_embs)  # 0 for rows that kept the '' placeholder

    # Collect every pair score for the response before averaging. Resetting the
    # lists per sentence would leave only the last pair's score in the column.
    foc_scores = [
        util.pytorch_cos_sim(sent_embs[j], sent_embs[j + 1]).item()
        for j in range(n_sent - 1)
    ]
    soc_scores = [
        util.pytorch_cos_sim(sent_embs[j], sent_embs[j + 2]).item()
        for j in range(n_sent - 2)
    ]

    # Rows with too few sentences keep the '' placeholder.
    if foc_scores:
        baseline.at[i, 'bert_foc'] = np.average(foc_scores)
    if soc_scores:
        baseline.at[i, 'bert_soc'] = np.average(soc_scores)

baseline.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


current:  0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,grid,content,n_words,bert_tokens,bert_emb,len_bert_tokens_emb,bert_word_mv5,bert_word_mv10,bert_word_k2,bert_word_k3,bert_word_k4,bert_word_k5,bert_word_k6,bert_word_k7,bert_word_k8,bert_word_k9,bert_word_k10,bert_sent_context_emb,bert_foc,bert_soc
0,10455,"I'm a young man , an en an en- an engineer by ...",421,"[i, ', m, a, young, man, ,, an, en, an, en, -,...","[[tensor(0.7688), tensor(0.7937), tensor(0.167...",462,0.471428,0.474674,0.41295,0.38864,0.367721,0.353955,0.342045,0.338082,0.324235,0.318819,0.318824,"[[tensor(2.2570, device='cuda:0'), tensor(0.14...",0.408088,0.266888
1,11689,Sure . I'm thirty three years old . My name is...,159,"[sure, ., i, ', m, thirty, three, years, old, ...","[[tensor(5.6645), tensor(-2.5050), tensor(3.09...",194,0.504627,0.501954,0.426685,0.372436,0.354607,0.339884,0.335918,0.32825,0.32619,0.308189,0.301729,"[[tensor(-1.0070, device='cuda:0'), tensor(-0....",0.073789,0.009538
2,12376,Alright . um I live in not especially cool Spr...,468,"[alright, ., um, i, live, in, not, especially,...","[[tensor(5.2083), tensor(1.4897), tensor(4.913...",494,0.4818,0.474423,0.411201,0.375869,0.353776,0.349704,0.333051,0.326555,0.318976,0.323212,0.314139,"[[tensor(-0.4359, device='cuda:0'), tensor(-0....",0.410767,0.568263
3,12630,um So I'm currently twenty-nine . I was born a...,966,"[um, so, i, ', m, currently, twenty, -, nine, ...","[[tensor(-0.2509), tensor(-2.1763), tensor(4.0...",456,0.500832,0.494821,0.415659,0.388839,0.366737,0.354915,0.347091,0.341731,0.336869,0.334747,0.329355,"[[tensor(0.8687, device='cuda:0'), tensor(-0.6...",0.361507,0.327566
4,13493,Mhm . I'm a thirty five year old man who uh um...,134,"[m, ##hm, ., i, ', m, a, thirty, five, year, o...","[[tensor(2.5036), tensor(3.9237), tensor(4.951...",171,0.478361,0.477872,0.384679,0.363791,0.32135,0.309973,0.297819,0.283803,0.286407,0.29583,0.309508,"[[tensor(-0.6955, device='cuda:0'), tensor(-0....",-0.010623,0.392595


In [None]:
# Save baseline (now carrying bert_sent_context_emb, bert_foc and bert_soc) to CSV under `result`.
# NOTE(review): tensor-valued columns are written as text; confirm the numeric
# embeddings do not need to be recovered from this file.
baseline.to_csv(result + 'simulation_HV_baseline_vb_response_deid_v3_bert.csv')

## incoh10

In [None]:
# Preview incoh10 before the sentence-level columns are added.
incoh10.head()

Unnamed: 0,grid,content,n_words,bert_tokens,bert_emb,len_bert_tokens_emb,bert_word_k2,bert_word_k3,bert_word_k4,bert_word_k5,bert_word_k6,bert_word_k7,bert_word_k8,bert_word_k9,bert_word_k10,bert_word_mv5,bert_word_mv10
0,10455,"I'm a young man , an en an en- an engineer by ...",421,"[i, ', m, a, young, man, ,, an, en, an, en, -,...","[[tensor(0.6725), tensor(1.0466), tensor(0.263...",464,0.419732,0.398102,0.373147,0.363936,0.352692,0.342627,0.329379,0.325641,0.325631,0.48515,0.483163
1,11689,Sure . I'm thirty three years good . My name i...,159,"[sure, ., i, ', m, thirty, three, years, good,...","[[tensor(5.6659), tensor(-2.2755), tensor(3.61...",190,0.437061,0.38617,0.367037,0.349232,0.34827,0.336172,0.340942,0.324468,0.319427,0.500808,0.503623
2,12376,Alright . um I live in not especially cool Spr...,468,"[alright, ., um, i, live, in, not, especially,...","[[tensor(5.2656), tensor(1.1820), tensor(4.869...",496,0.420912,0.384082,0.360058,0.358527,0.341441,0.338162,0.331947,0.332439,0.323517,0.485034,0.483255
3,12630,um So I'm currently twenty-nine . I was born a...,966,"[um, so, i, ', m, currently, twenty, -, nine, ...","[[tensor(-0.4018), tensor(-2.1731), tensor(4.3...",456,0.41965,0.394277,0.370679,0.357445,0.353164,0.345605,0.341885,0.341371,0.333201,0.501024,0.495825
4,13493,Mhm . I'm a thirty five year old man who uh um...,134,"[m, ##hm, ., i, ', m, a, thirty, five, year, o...","[[tensor(2.4953), tensor(3.7805), tensor(5.160...",171,0.388267,0.370461,0.327668,0.31585,0.301831,0.289095,0.295041,0.301678,0.316763,0.480811,0.478044


In [None]:
# Sentence embeddings for incoh10: split each response on '.' and embed the
# pieces with the sentence model in a single encode call per response.
incoh10['bert_sent_context_emb'] = ''

for i in incoh10.index:
    if incoh10.at[i, 'n_words'] > 4:
        # [:-1] drops the piece after the last '.'.
        # NOTE(review): this assumes every response ends with '.'; otherwise the
        # final sentence is discarded -- confirm against the data.
        context = incoh10.at[i, 'content'].split('.')[:-1]
        # .at stores the tensor in the frame itself; chained
        # incoh10[col][i] = ... may only update a temporary copy.
        incoh10.at[i, 'bert_sent_context_emb'] = model.encode(context, convert_to_tensor=True)
    # progress trace, keyed on the index label
    if i % 5 == 0:
        print('current: ', i)

incoh10.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


current:  0


Unnamed: 0,grid,content,n_words,bert_tokens,bert_emb,len_bert_tokens_emb,bert_word_k2,bert_word_k3,bert_word_k4,bert_word_k5,bert_word_k6,bert_word_k7,bert_word_k8,bert_word_k9,bert_word_k10,bert_word_mv5,bert_word_mv10,bert_sent_context_emb
0,10455,"I'm a young man , an en an en- an engineer by ...",421,"[i, ', m, a, young, man, ,, an, en, an, en, -,...","[[tensor(0.6725), tensor(1.0466), tensor(0.263...",464,0.419732,0.398102,0.373147,0.363936,0.352692,0.342627,0.329379,0.325641,0.325631,0.48515,0.483163,"[[tensor(2.2570, device='cuda:0'), tensor(0.14..."
1,11689,Sure . I'm thirty three years good . My name i...,159,"[sure, ., i, ', m, thirty, three, years, good,...","[[tensor(5.6659), tensor(-2.2755), tensor(3.61...",190,0.437061,0.38617,0.367037,0.349232,0.34827,0.336172,0.340942,0.324468,0.319427,0.500808,0.503623,"[[tensor(-1.0070, device='cuda:0'), tensor(-0...."
2,12376,Alright . um I live in not especially cool Spr...,468,"[alright, ., um, i, live, in, not, especially,...","[[tensor(5.2656), tensor(1.1820), tensor(4.869...",496,0.420912,0.384082,0.360058,0.358527,0.341441,0.338162,0.331947,0.332439,0.323517,0.485034,0.483255,"[[tensor(-0.4359, device='cuda:0'), tensor(-0...."
3,12630,um So I'm currently twenty-nine . I was born a...,966,"[um, so, i, ', m, currently, twenty, -, nine, ...","[[tensor(-0.4018), tensor(-2.1731), tensor(4.3...",456,0.41965,0.394277,0.370679,0.357445,0.353164,0.345605,0.341885,0.341371,0.333201,0.501024,0.495825,"[[tensor(0.8687, device='cuda:0'), tensor(-0.6..."
4,13493,Mhm . I'm a thirty five year old man who uh um...,134,"[m, ##hm, ., i, ', m, a, thirty, five, year, o...","[[tensor(2.4953), tensor(3.7805), tensor(5.160...",171,0.388267,0.370461,0.327668,0.31585,0.301831,0.289095,0.295041,0.301678,0.316763,0.480811,0.478044,"[[tensor(-0.6955, device='cuda:0'), tensor(-0...."


In [None]:
# Number of sentence embeddings stored for the row labelled 0 (19 in the recorded run).
len(incoh10['bert_sent_context_emb'][0])

19

In [None]:
# Save incoh10 with the sentence-embedding column; this overwrites the file written
# by the earlier incoh10 save and is itself overwritten by the save after the FOC/SOC cell.
incoh10.to_csv(result + 'simulation_HV_incoh_vb_response_deid_10v3_bert.csv')

In [None]:
# Sentence-pair coherence for incoh10.
# bert_foc = mean cosine similarity of adjacent sentences (j, j+1);
# bert_soc = mean cosine similarity of sentences one apart (j, j+2).
incoh10['bert_foc'] = ''
incoh10['bert_soc'] = ''

for i in incoh10.index:
    sent_embs = incoh10.at[i, 'bert_sent_context_emb']
    n_sent = len(sent_embs)  # 0 for rows that kept the '' placeholder

    # Collect every pair score for the response before averaging. Resetting the
    # lists per sentence would leave only the last pair's score in the column.
    foc_scores = [
        util.pytorch_cos_sim(sent_embs[j], sent_embs[j + 1]).item()
        for j in range(n_sent - 1)
    ]
    soc_scores = [
        util.pytorch_cos_sim(sent_embs[j], sent_embs[j + 2]).item()
        for j in range(n_sent - 2)
    ]

    # .at writes into the frame itself (no chained assignment); rows with too few
    # sentences keep the '' placeholder.
    if foc_scores:
        incoh10.at[i, 'bert_foc'] = np.average(foc_scores)
    if soc_scores:
        incoh10.at[i, 'bert_soc'] = np.average(soc_scores)

incoh10.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


Unnamed: 0,grid,content,n_words,bert_tokens,bert_emb,len_bert_tokens_emb,bert_word_k2,bert_word_k3,bert_word_k4,bert_word_k5,bert_word_k6,bert_word_k7,bert_word_k8,bert_word_k9,bert_word_k10,bert_word_mv5,bert_word_mv10,bert_sent_context_emb,bert_foc,bert_soc
0,10455,"I'm a young man , an en an en- an engineer by ...",421,"[i, ', m, a, young, man, ,, an, en, an, en, -,...","[[tensor(0.6725), tensor(1.0466), tensor(0.263...",464,0.419732,0.398102,0.373147,0.363936,0.352692,0.342627,0.329379,0.325641,0.325631,0.48515,0.483163,"[[tensor(2.2570, device='cuda:0'), tensor(0.14...",0.399801,0.248768
1,11689,Sure . I'm thirty three years good . My name i...,159,"[sure, ., i, ', m, thirty, three, years, good,...","[[tensor(5.6659), tensor(-2.2755), tensor(3.61...",190,0.437061,0.38617,0.367037,0.349232,0.34827,0.336172,0.340942,0.324468,0.319427,0.500808,0.503623,"[[tensor(-1.0070, device='cuda:0'), tensor(-0....",0.160284,0.009538
2,12376,Alright . um I live in not especially cool Spr...,468,"[alright, ., um, i, live, in, not, especially,...","[[tensor(5.2656), tensor(1.1820), tensor(4.869...",496,0.420912,0.384082,0.360058,0.358527,0.341441,0.338162,0.331947,0.332439,0.323517,0.485034,0.483255,"[[tensor(-0.4359, device='cuda:0'), tensor(-0....",0.410767,0.548882
3,12630,um So I'm currently twenty-nine . I was born a...,966,"[um, so, i, ', m, currently, twenty, -, nine, ...","[[tensor(-0.4018), tensor(-2.1731), tensor(4.3...",456,0.41965,0.394277,0.370679,0.357445,0.353164,0.345605,0.341885,0.341371,0.333201,0.501024,0.495825,"[[tensor(0.8687, device='cuda:0'), tensor(-0.6...",0.348991,0.303899
4,13493,Mhm . I'm a thirty five year old man who uh um...,134,"[m, ##hm, ., i, ', m, a, thirty, five, year, o...","[[tensor(2.4953), tensor(3.7805), tensor(5.160...",171,0.388267,0.370461,0.327668,0.31585,0.301831,0.289095,0.295041,0.301678,0.316763,0.480811,0.478044,"[[tensor(-0.6955, device='cuda:0'), tensor(-0....",0.085526,0.482344


In [None]:
# Save incoh10 with bert_foc / bert_soc added; same file name as the earlier incoh10
# saves, so this version replaces them.
incoh10.to_csv(result + 'simulation_HV_incoh_vb_response_deid_10v3_bert.csv')

## incoh20

In [None]:
# Preview incoh20 before (re)computing its sentence embeddings.
# NOTE(review): in the recorded output the list/tensor columns appear as quoted strings
# and some bert_word_mv values are missing, which suggests the frame was reloaded from
# CSV -- confirm where this frame comes from.
incoh20.head()

Unnamed: 0,grid,content,n_words,bert_tokens,bert_emb,len_bert_tokens_emb,bert_word_k2,bert_word_k3,bert_word_k4,bert_word_k5,bert_word_k6,bert_word_k7,bert_word_k8,bert_word_k9,bert_word_k10,bert_word_mv5,bert_word_mv10,bert_sent_context_emb
0,10455,"I'm a young man , an en an en- an things by tr...",421,"['i', ""'"", 'm', 'a', 'young', 'man', ',', 'an'...","[tensor([ 5.7402e-01, 6.6729e-01, -2.5507e-01...",464,0.423546,0.40207,0.3811,0.369743,0.355661,0.348424,0.335308,0.327706,0.329574,0.488356,0.485694,"tensor([[ 1.3454, 0.2357, -0.1815, ..., -0.7..."
1,11689,Sure . I'm thirty three years medical . My nam...,159,"['sure', '.', 'i', ""'"", 'm', 'thirty', 'three'...","[tensor([ 5.7015e+00, -2.7486e+00, 3.7004e+00...",190,0.448121,0.402427,0.387234,0.370825,0.367952,0.355175,0.361326,0.347132,0.337009,0.50886,0.511256,"tensor([[-1.0070, -0.4818, -0.6671, ..., -0.0..."
2,12376,Alright . um I live in not especially lazy Spr...,468,"['alright', '.', 'um', 'i', 'live', 'in', 'not...","[tensor([ 5.0813e+00, 1.3699e+00, 4.9952e+00...",496,0.432127,0.393979,0.372034,0.370072,0.350946,0.349551,0.342697,0.344309,0.338485,,0.493909,"tensor([[-0.4359, -0.2084, -0.6767, ..., -0.7..."
3,12630,um So I'm currently twenty-nine . I was born a...,966,"['um', 'so', 'i', ""'"", 'm', 'currently', 'twen...","[tensor([-4.0193e-01, -2.1877e+00, 4.2517e+00...",456,0.423406,0.397939,0.376072,0.365122,0.358143,0.355902,0.350251,0.349574,0.340883,,0.499642,"tensor([[ 0.8687, -0.6463, -0.0449, ..., -0.2..."
4,13493,Mhm . I'm a thirty five year old man who uh um...,134,"['m', '##hm', '.', 'i', ""'"", 'm', 'a', 'thirty...","[tensor([ 2.1580e+00, 3.9266e+00, 5.3917e+00...",171,0.393248,0.378669,0.337608,0.327369,0.313787,0.299737,0.30466,0.31099,0.318905,,,"tensor([[-0.6955, -0.3440, -0.6190, ..., -0.3..."


In [None]:
# with context
# Encode every transcript in incoh20 sentence by sentence and store the result of
# model.encode(...) in the 'bert_sent_context_emb' column.
# NOTE(review): `model` must expose .encode(...) here — presumably a
# sentence-transformers model bound earlier in the notebook; confirm.
incoh20['bert_sent_context_emb'] = ''

for row_num, i in enumerate(incoh20.index):
  # Transcripts with 4 words or fewer are skipped and keep the '' placeholder.
  if incoh20.at[i, 'n_words'] > 4:
    # Split on '.' and drop the last piece (whatever follows the final period).
    context = incoh20.at[i, 'content'].split('.')[:-1]
    # .at writes into the frame itself; the chained form df[col][i] = ... assigns
    # through a temporary Series and triggers SettingWithCopyWarning.
    incoh20.at[i, 'bert_sent_context_emb'] = model.encode(context, convert_to_tensor=True)
  # Report progress every 5th row. The index holds grid IDs, so testing the
  # index label itself (i % 5) only fired for IDs that happen to be multiples of 5.
  if row_num % 5 == 0:
    print('current: ', i)

incoh20.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


current:  10455
current:  12630


Unnamed: 0_level_0,content,n_words,bert_tokens,bert_emb,len_bert_tokens_emb,bert_word_k2,bert_word_k3,bert_word_k4,bert_word_k5,bert_word_k6,bert_word_k7,bert_word_k8,bert_word_k9,bert_word_k10,bert_word_mv5,bert_word_mv10,bert_sent_context_emb
grid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
10455,"I'm a young man , an en an en- an things by tr...",421,"[i, ', m, a, young, man, ,, an, en, an, en, -,...","[[tensor(0.5740), tensor(0.6673), tensor(-0.25...",464,0.423546,0.40207,0.3811,0.369743,0.355661,0.348424,0.335308,0.327706,0.329574,0.488356,0.485694,"[[tensor(1.3454, device='cuda:0'), tensor(0.23..."
11689,Sure . I'm thirty three years medical . My nam...,159,"[sure, ., i, ', m, thirty, three, years, medic...","[[tensor(5.7015), tensor(-2.7486), tensor(3.70...",190,0.448121,0.402427,0.387234,0.370825,0.367952,0.355175,0.361326,0.347132,0.337009,0.50886,0.511256,"[[tensor(-1.0070, device='cuda:0'), tensor(-0...."
12376,Alright . um I live in not especially lazy Spr...,468,"[alright, ., um, i, live, in, not, especially,...","[[tensor(5.0813), tensor(1.3699), tensor(4.995...",496,0.432127,0.393979,0.372034,0.370072,0.350946,0.349551,0.342697,0.344309,0.338485,0.495238,0.493909,"[[tensor(-0.4359, device='cuda:0'), tensor(-0...."
12630,um So I'm currently twenty-nine . I was born a...,966,"[um, so, i, ', m, currently, twenty, -, nine, ...","[[tensor(-0.4019), tensor(-2.1877), tensor(4.2...",456,0.423406,0.397939,0.376072,0.365122,0.358143,0.355902,0.350251,0.349574,0.340883,0.504377,0.499642,"[[tensor(0.8687, device='cuda:0'), tensor(-0.6..."
13493,Mhm . I'm a thirty five year old man who uh um...,134,"[m, ##hm, ., i, ', m, a, thirty, five, year, o...","[[tensor(2.1580), tensor(3.9266), tensor(5.391...",171,0.393248,0.378669,0.337608,0.327369,0.313787,0.299737,0.30466,0.31099,0.318905,0.487613,0.486644,"[[tensor(-0.6955, device='cuda:0'), tensor(-0...."


In [None]:
# Sentence-to-sentence similarity for incoh20:
#   bert_foc = mean cosine similarity between each sentence embedding and the next one
#   bert_soc = mean cosine similarity between each sentence embedding and the one two ahead
# Rows with too few sentences to form a pair keep the '' placeholder.
incoh20['bert_foc'] = ''
incoh20['bert_soc'] = ''

for i in incoh20.index:
    # Rows that were never encoded still hold ''; len('') == 0, so no pairs are formed.
    sent_embs = incoh20.at[i, 'bert_sent_context_emb']
    n_sents = len(sent_embs)

    # Gather ALL pairs before averaging. Previously the lists were re-created for
    # every sentence, so each "average" covered a single value and only the final
    # pair of the transcript ended up in the column.
    # NOTE: recompute any dataset you compare against with this same logic.
    foc_sims = [util.pytorch_cos_sim(sent_embs[k], sent_embs[k + 1]).item()
                for k in range(n_sents - 1)]
    soc_sims = [util.pytorch_cos_sim(sent_embs[k], sent_embs[k + 2]).item()
                for k in range(n_sents - 2)]

    # .at writes into the frame itself (the chained df[col][i] = ... form raised
    # SettingWithCopyWarning).
    if foc_sims:
        incoh20.at[i, 'bert_foc'] = np.average(foc_sims)
    if soc_sims:
        incoh20.at[i, 'bert_soc'] = np.average(soc_sims)

incoh20.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


Unnamed: 0_level_0,content,n_words,bert_tokens,bert_emb,len_bert_tokens_emb,bert_word_k2,bert_word_k3,bert_word_k4,bert_word_k5,bert_word_k6,bert_word_k7,bert_word_k8,bert_word_k9,bert_word_k10,bert_word_mv5,bert_word_mv10,bert_sent_context_emb,bert_foc,bert_soc
grid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
10455,"I'm a young man , an en an en- an things by tr...",421,"[i, ', m, a, young, man, ,, an, en, an, en, -,...","[[tensor(0.5740), tensor(0.6673), tensor(-0.25...",464,0.423546,0.40207,0.3811,0.369743,0.355661,0.348424,0.335308,0.327706,0.329574,0.488356,0.485694,"[[tensor(1.3454, device='cuda:0'), tensor(0.23...",0.268502,0.261329
11689,Sure . I'm thirty three years medical . My nam...,159,"[sure, ., i, ', m, thirty, three, years, medic...","[[tensor(5.7015), tensor(-2.7486), tensor(3.70...",190,0.448121,0.402427,0.387234,0.370825,0.367952,0.355175,0.361326,0.347132,0.337009,0.50886,0.511256,"[[tensor(-1.0070, device='cuda:0'), tensor(-0....",0.160284,0.009538
12376,Alright . um I live in not especially lazy Spr...,468,"[alright, ., um, i, live, in, not, especially,...","[[tensor(5.0813), tensor(1.3699), tensor(4.995...",496,0.432127,0.393979,0.372034,0.370072,0.350946,0.349551,0.342697,0.344309,0.338485,0.495238,0.493909,"[[tensor(-0.4359, device='cuda:0'), tensor(-0....",0.535564,0.366907
12630,um So I'm currently twenty-nine . I was born a...,966,"[um, so, i, ', m, currently, twenty, -, nine, ...","[[tensor(-0.4019), tensor(-2.1877), tensor(4.2...",456,0.423406,0.397939,0.376072,0.365122,0.358143,0.355902,0.350251,0.349574,0.340883,0.504377,0.499642,"[[tensor(0.8687, device='cuda:0'), tensor(-0.6...",0.28413,0.31138
13493,Mhm . I'm a thirty five year old man who uh um...,134,"[m, ##hm, ., i, ', m, a, thirty, five, year, o...","[[tensor(2.1580), tensor(3.9266), tensor(5.391...",171,0.393248,0.378669,0.337608,0.327369,0.313787,0.299737,0.30466,0.31099,0.318905,0.487613,0.486644,"[[tensor(-0.6955, device='cuda:0'), tensor(-0....",0.061188,0.539256


In [None]:
# Persist the incoh20 frame as CSV; the target path is the `result` prefix plus the file name.
# NOTE(review): object columns that hold tensors are presumably written as their text repr,
# so they cannot be restored as tensors from this file — confirm that only the scalar
# metric columns are needed downstream.
incoh20.to_csv(result + 'simulation_HV_incoh_vb_response_deid_20v3_bert.csv')

## incoh50

In [None]:
# Display the first five rows of incoh50 as a quick look at its current columns.
incoh50.head()

Unnamed: 0_level_0,content,n_words,bert_tokens,bert_emb,len_bert_tokens_emb,bert_word_k2,bert_word_k3,bert_word_k4,bert_word_k5,bert_word_k6,bert_word_k7,bert_word_k8,bert_word_k9,bert_word_k10,bert_word_mv5,bert_word_mv10
grid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
10455,"I'm a good year , an en an en- an responsibili...",421,"[i, ', m, a, good, year, ,, an, en, an, en, -,...","[[tensor(-1.4038), tensor(-0.1184), tensor(-0....",459,0.441923,0.424328,0.402711,0.393327,0.382006,0.375118,0.368116,0.360517,0.363095,0.498728,0.494
11689,Sure . I'm thirty three years anxious . My nam...,159,"[sure, ., i, ', m, thirty, three, years, anxio...","[[tensor(4.6395), tensor(-2.2006), tensor(4.34...",192,0.474055,0.428855,0.419308,0.400513,0.400701,0.387484,0.391745,0.380941,0.367475,0.529403,0.527393
12376,Alright . um something live in not especially ...,468,"[alright, ., um, something, live, in, not, esp...","[[tensor(4.7609), tensor(0.7377), tensor(4.862...",498,0.461128,0.427746,0.405324,0.404731,0.387605,0.384336,0.380061,0.384769,0.378545,0.514372,0.511451
12630,um So I'm currently twenty-nine . I was born a...,966,"[um, so, i, ', m, currently, twenty, -, nine, ...","[[tensor(-0.8654), tensor(-2.2503), tensor(4.9...",457,0.443361,0.420886,0.402643,0.390756,0.382505,0.383815,0.375148,0.372052,0.370879,0.511806,0.509827
13493,Mhm . I'm a thirty five hospital old man anyth...,134,"[m, ##hm, ., i, ', m, a, thirty, five, hospita...","[[tensor(2.0145), tensor(3.1620), tensor(5.582...",172,0.416151,0.407561,0.361486,0.353147,0.342681,0.324768,0.334419,0.334229,0.33699,0.486157,0.478926


In [None]:
# with context
# Encode every transcript in incoh50 sentence by sentence and store the result of
# model.encode(...) in the 'bert_sent_context_emb' column.
# NOTE(review): `model` must expose .encode(...) here — presumably a
# sentence-transformers model bound earlier in the notebook; confirm.
incoh50['bert_sent_context_emb'] = ''

for row_num, i in enumerate(incoh50.index):
  # Transcripts with 4 words or fewer are skipped and keep the '' placeholder.
  if incoh50.at[i, 'n_words'] > 4:
    # Split on '.' and drop the last piece (whatever follows the final period).
    context = incoh50.at[i, 'content'].split('.')[:-1]
    # .at writes into the frame itself; the chained form df[col][i] = ... assigns
    # through a temporary Series and triggers SettingWithCopyWarning.
    incoh50.at[i, 'bert_sent_context_emb'] = model.encode(context, convert_to_tensor=True)
  # Report progress every 5th row. The index holds grid IDs, so testing the
  # index label itself (i % 5) only fired for IDs that happen to be multiples of 5.
  if row_num % 5 == 0:
    print('current: ', i)

incoh50.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


current:  10455
current:  12630


Unnamed: 0_level_0,content,n_words,bert_tokens,bert_emb,len_bert_tokens_emb,bert_word_k2,bert_word_k3,bert_word_k4,bert_word_k5,bert_word_k6,bert_word_k7,bert_word_k8,bert_word_k9,bert_word_k10,bert_word_mv5,bert_word_mv10,bert_sent_context_emb
grid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
10455,"I'm a good year , an en an en- an responsibili...",421,"[i, ', m, a, good, year, ,, an, en, an, en, -,...","[[tensor(-1.4038), tensor(-0.1184), tensor(-0....",459,0.441923,0.424328,0.402711,0.393327,0.382006,0.375118,0.368116,0.360517,0.363095,0.498728,0.494,"[[tensor(0.4119, device='cuda:0'), tensor(0.31..."
11689,Sure . I'm thirty three years anxious . My nam...,159,"[sure, ., i, ', m, thirty, three, years, anxio...","[[tensor(4.6395), tensor(-2.2006), tensor(4.34...",192,0.474055,0.428855,0.419308,0.400513,0.400701,0.387484,0.391745,0.380941,0.367475,0.529403,0.527393,"[[tensor(-1.0070, device='cuda:0'), tensor(-0...."
12376,Alright . um something live in not especially ...,468,"[alright, ., um, something, live, in, not, esp...","[[tensor(4.7609), tensor(0.7377), tensor(4.862...",498,0.461128,0.427746,0.405324,0.404731,0.387605,0.384336,0.380061,0.384769,0.378545,0.514372,0.511451,"[[tensor(-0.4359, device='cuda:0'), tensor(-0...."
12630,um So I'm currently twenty-nine . I was born a...,966,"[um, so, i, ', m, currently, twenty, -, nine, ...","[[tensor(-0.8654), tensor(-2.2503), tensor(4.9...",457,0.443361,0.420886,0.402643,0.390756,0.382505,0.383815,0.375148,0.372052,0.370879,0.511806,0.509827,"[[tensor(0.8687, device='cuda:0'), tensor(-0.6..."
13493,Mhm . I'm a thirty five hospital old man anyth...,134,"[m, ##hm, ., i, ', m, a, thirty, five, hospita...","[[tensor(2.0145), tensor(3.1620), tensor(5.582...",172,0.416152,0.407561,0.361486,0.353147,0.342681,0.324768,0.334419,0.334229,0.33699,0.486157,0.478926,"[[tensor(-0.6955, device='cuda:0'), tensor(-0...."


In [None]:
# Sentence-to-sentence similarity for incoh50:
#   bert_foc = mean cosine similarity between each sentence embedding and the next one
#   bert_soc = mean cosine similarity between each sentence embedding and the one two ahead
# Rows with too few sentences to form a pair keep the '' placeholder.
incoh50['bert_foc'] = ''
incoh50['bert_soc'] = ''

for i in incoh50.index:
    # Rows that were never encoded still hold ''; len('') == 0, so no pairs are formed.
    sent_embs = incoh50.at[i, 'bert_sent_context_emb']
    n_sents = len(sent_embs)

    # Gather ALL pairs before averaging. Previously the lists were re-created for
    # every sentence, so each "average" covered a single value and only the final
    # pair of the transcript ended up in the column.
    # NOTE: recompute any dataset you compare against with this same logic.
    foc_sims = [util.pytorch_cos_sim(sent_embs[k], sent_embs[k + 1]).item()
                for k in range(n_sents - 1)]
    soc_sims = [util.pytorch_cos_sim(sent_embs[k], sent_embs[k + 2]).item()
                for k in range(n_sents - 2)]

    # .at writes into the frame itself (the chained df[col][i] = ... form raised
    # SettingWithCopyWarning).
    if foc_sims:
        incoh50.at[i, 'bert_foc'] = np.average(foc_sims)
    if soc_sims:
        incoh50.at[i, 'bert_soc'] = np.average(soc_sims)

incoh50.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


Unnamed: 0_level_0,content,n_words,bert_tokens,bert_emb,len_bert_tokens_emb,bert_word_k2,bert_word_k3,bert_word_k4,bert_word_k5,bert_word_k6,bert_word_k7,bert_word_k8,bert_word_k9,bert_word_k10,bert_word_mv5,bert_word_mv10,bert_sent_context_emb,bert_foc,bert_soc
grid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
10455,"I'm a good year , an en an en- an responsibili...",421,"[i, ', m, a, good, year, ,, an, en, an, en, -,...","[[tensor(-1.4038), tensor(-0.1184), tensor(-0....",459,0.441923,0.424328,0.402711,0.393327,0.382006,0.375118,0.368116,0.360517,0.363095,0.498728,0.494,"[[tensor(0.4119, device='cuda:0'), tensor(0.31...",0.344729,0.377332
11689,Sure . I'm thirty three years anxious . My nam...,159,"[sure, ., i, ', m, thirty, three, years, anxio...","[[tensor(4.6395), tensor(-2.2006), tensor(4.34...",192,0.474055,0.428855,0.419308,0.400513,0.400701,0.387484,0.391745,0.380941,0.367475,0.529403,0.527393,"[[tensor(-1.0070, device='cuda:0'), tensor(-0....",0.160284,-0.010957
12376,Alright . um something live in not especially ...,468,"[alright, ., um, something, live, in, not, esp...","[[tensor(4.7609), tensor(0.7377), tensor(4.862...",498,0.461128,0.427746,0.405324,0.404731,0.387605,0.384336,0.380061,0.384769,0.378545,0.514372,0.511451,"[[tensor(-0.4359, device='cuda:0'), tensor(-0....",0.521745,0.472009
12630,um So I'm currently twenty-nine . I was born a...,966,"[um, so, i, ', m, currently, twenty, -, nine, ...","[[tensor(-0.8654), tensor(-2.2503), tensor(4.9...",457,0.443361,0.420886,0.402643,0.390756,0.382505,0.383815,0.375148,0.372052,0.370879,0.511806,0.509827,"[[tensor(0.8687, device='cuda:0'), tensor(-0.6...",0.192683,0.218711
13493,Mhm . I'm a thirty five hospital old man anyth...,134,"[m, ##hm, ., i, ', m, a, thirty, five, hospita...","[[tensor(2.0145), tensor(3.1620), tensor(5.582...",172,0.416152,0.407561,0.361486,0.353147,0.342681,0.324768,0.334419,0.334229,0.33699,0.486157,0.478926,"[[tensor(-0.6955, device='cuda:0'), tensor(-0....",0.116799,0.528694


In [None]:
# Persist the incoh50 frame as CSV; the target path is the `result` prefix plus the file name.
# NOTE(review): object columns that hold tensors are presumably written as their text repr,
# so they cannot be restored as tensors from this file — confirm that only the scalar
# metric columns are needed downstream.
incoh50.to_csv(result + 'simulation_HV_incoh_vb_response_deid_50v3_bert.csv')

## ineff10

In [None]:
# with context
# Step 1: encode every transcript in ineff10 sentence by sentence and store the
# result of model.encode(...) in 'bert_sent_context_emb'.
# NOTE(review): `model` must expose .encode(...) here — presumably a
# sentence-transformers model bound earlier in the notebook; confirm.
ineff10['bert_sent_context_emb'] = ''

for row_num, i in enumerate(ineff10.index):
  # Transcripts with 4 words or fewer are skipped and keep the '' placeholder.
  if ineff10.at[i, 'n_words'] > 4:
    # Split on '.' and drop the last piece (whatever follows the final period).
    context = ineff10.at[i, 'content'].split('.')[:-1]
    # .at writes into the frame itself; the chained form df[col][i] = ... assigns
    # through a temporary Series and triggers SettingWithCopyWarning.
    ineff10.at[i, 'bert_sent_context_emb'] = model.encode(context, convert_to_tensor=True)
  # Report progress every 5th row. The index holds grid IDs, so testing the
  # index label itself (i % 5) only fired for IDs that happen to be multiples of 5.
  if row_num % 5 == 0:
    print('current: ', i)

# Step 2: sentence-to-sentence similarity.
#   bert_foc = mean cosine similarity between each sentence embedding and the next one
#   bert_soc = mean cosine similarity between each sentence embedding and the one two ahead
# Rows with too few sentences to form a pair keep the '' placeholder.
ineff10['bert_foc'] = ''
ineff10['bert_soc'] = ''

for i in ineff10.index:
    # Rows that were never encoded still hold ''; len('') == 0, so no pairs are formed.
    sent_embs = ineff10.at[i, 'bert_sent_context_emb']
    n_sents = len(sent_embs)

    # Gather ALL pairs before averaging. Previously the lists were re-created for
    # every sentence, so each "average" covered a single value and only the final
    # pair of the transcript ended up in the column.
    # NOTE: recompute any dataset you compare against with this same logic.
    foc_sims = [util.pytorch_cos_sim(sent_embs[k], sent_embs[k + 1]).item()
                for k in range(n_sents - 1)]
    soc_sims = [util.pytorch_cos_sim(sent_embs[k], sent_embs[k + 2]).item()
                for k in range(n_sents - 2)]

    if foc_sims:
        ineff10.at[i, 'bert_foc'] = np.average(foc_sims)
    if soc_sims:
        ineff10.at[i, 'bert_soc'] = np.average(soc_sims)

ineff10.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


current:  10455
current:  12630


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0_level_0,content,n_words,bert_tokens,bert_emb,len_bert_tokens_emb,bert_word_k2,bert_word_k3,bert_word_k4,bert_word_k5,bert_word_k6,bert_word_k7,bert_word_k8,bert_word_k9,bert_word_k10,bert_word_mv5,bert_word_mv10,bert_sent_context_emb,bert_foc,bert_soc
grid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
10455,"I'm a young man , an en an en- an engineer by ...",421,"[i, ', m, a, young, man, ,, an, en, an, en, -,...","[[tensor(0.9260), tensor(0.7718), tensor(0.139...",469,0.415859,0.389433,0.366713,0.355071,0.340469,0.339874,0.323313,0.315986,0.318991,0.479298,0.47775,"[[tensor(2.2570, device='cuda:0'), tensor(0.14...",0.377784,0.266888
11689,Sure . I'm thirty three years old . My name is...,159,"[sure, ., i, ', m, thirty, three, years, old, ...","[[tensor(6.0613), tensor(-1.9364), tensor(2.85...",212,0.451335,0.394343,0.379211,0.361111,0.364849,0.34994,0.352626,0.342873,0.332989,0.512381,0.515649,"[[tensor(-1.0070, device='cuda:0'), tensor(-0....",0.277705,0.42147
12376,Alright . um I live in not especially cool Spr...,468,"[alright, ., um, i, live, in, not, especially,...","[[tensor(5.3717), tensor(1.2232), tensor(4.933...",497,0.411034,0.368746,0.348774,0.350121,0.327221,0.325798,0.316564,0.321827,0.310504,0.480417,0.471675,"[[tensor(-0.4359, device='cuda:0'), tensor(-0....",0.410767,0.568263
12630,um So I was born and raised in South Washingto...,966,"[um, so, i, was, born, and, raised, in, south,...","[[tensor(-0.9319), tensor(-1.5137), tensor(3.7...",463,0.421071,0.38107,0.360023,0.358633,0.345952,0.337948,0.335499,0.329621,0.320563,0.482921,0.482146,"[[tensor(-0.2357, device='cuda:0'), tensor(-0....",0.281601,0.047029
13493,Mhm . I'm a thirty five year old man who uh um...,134,"[m, ##hm, ., i, ', m, a, thirty, five, year, o...","[[tensor(2.5872), tensor(3.8303), tensor(5.034...",178,0.393846,0.371014,0.330708,0.321578,0.30734,0.294739,0.296669,0.307008,0.319991,0.484459,0.490165,"[[tensor(-0.6955, device='cuda:0'), tensor(-0....",0.109099,0.155571


In [None]:
# Persist the ineff10 frame as CSV; the target path is the `result` prefix plus the file name.
# NOTE(review): object columns that hold tensors are presumably written as their text repr,
# so they cannot be restored as tensors from this file — confirm that only the scalar
# metric columns are needed downstream.
ineff10.to_csv(result + 'simulation_HV_ineff_vb_response_deid_10v3_bert.csv')

## ineff20

In [None]:
# with context
# Step 1: encode every transcript in ineff20 sentence by sentence and store the
# result of model.encode(...) in 'bert_sent_context_emb'.
# NOTE(review): `model` must expose .encode(...) here — presumably a
# sentence-transformers model bound earlier in the notebook; confirm.
ineff20['bert_sent_context_emb'] = ''

for row_num, i in enumerate(ineff20.index):
  # Transcripts with 4 words or fewer are skipped and keep the '' placeholder.
  if ineff20.at[i, 'n_words'] > 4:
    # Split on '.' and drop the last piece (whatever follows the final period).
    context = ineff20.at[i, 'content'].split('.')[:-1]
    # .at writes into the frame itself; the chained form df[col][i] = ... assigns
    # through a temporary Series and triggers SettingWithCopyWarning.
    ineff20.at[i, 'bert_sent_context_emb'] = model.encode(context, convert_to_tensor=True)
  # Report progress every 5th row. The index holds grid IDs, so testing the
  # index label itself (i % 5) only fired for IDs that happen to be multiples of 5.
  if row_num % 5 == 0:
    print('current: ', i)

# Step 2: sentence-to-sentence similarity.
#   bert_foc = mean cosine similarity between each sentence embedding and the next one
#   bert_soc = mean cosine similarity between each sentence embedding and the one two ahead
# Rows with too few sentences to form a pair keep the '' placeholder.
ineff20['bert_foc'] = ''
ineff20['bert_soc'] = ''

for i in ineff20.index:
    # Rows that were never encoded still hold ''; len('') == 0, so no pairs are formed.
    sent_embs = ineff20.at[i, 'bert_sent_context_emb']
    n_sents = len(sent_embs)

    # Gather ALL pairs before averaging. Previously the lists were re-created for
    # every sentence, so each "average" covered a single value and only the final
    # pair of the transcript ended up in the column.
    # NOTE: recompute any dataset you compare against with this same logic.
    foc_sims = [util.pytorch_cos_sim(sent_embs[k], sent_embs[k + 1]).item()
                for k in range(n_sents - 1)]
    soc_sims = [util.pytorch_cos_sim(sent_embs[k], sent_embs[k + 2]).item()
                for k in range(n_sents - 2)]

    if foc_sims:
        ineff20.at[i, 'bert_foc'] = np.average(foc_sims)
    if soc_sims:
        ineff20.at[i, 'bert_soc'] = np.average(soc_sims)

ineff20.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


current:  10455
current:  12630


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0_level_0,content,n_words,bert_tokens,bert_emb,len_bert_tokens_emb,bert_word_k2,bert_word_k3,bert_word_k4,bert_word_k5,bert_word_k6,bert_word_k7,bert_word_k8,bert_word_k9,bert_word_k10,bert_word_mv5,bert_word_mv10,bert_sent_context_emb,bert_foc,bert_soc
grid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
10455,"I'm a young man , an en an en- an engineer by ...",421,"[i, ', m, a, young, man, ,, an, en, an, en, -,...","[[tensor(0.8871), tensor(0.7492), tensor(0.043...",465,0.419734,0.394527,0.36742,0.357507,0.342382,0.339261,0.321388,0.318035,0.319717,0.481598,0.48208,"[[tensor(2.2570, device='cuda:0'), tensor(0.14...",0.036486,0.266888
11689,Sure . I'm thirty three years old . My name is...,159,"[sure, ., i, ', m, thirty, three, years, old, ...","[[tensor(5.7484), tensor(-1.9574), tensor(3.33...",181,0.446248,0.396125,0.359399,0.359655,0.345632,0.331845,0.328439,0.323529,0.317078,0.515076,0.510636,"[[tensor(-1.0070, device='cuda:0'), tensor(-0....",0.277704,0.498985
12376,Alright . um I live in not especially cool Spr...,468,"[alright, ., um, i, live, in, not, especially,...","[[tensor(4.6833), tensor(0.4654), tensor(5.237...",486,0.406104,0.375603,0.348739,0.345745,0.32947,0.329442,0.315648,0.323214,0.314113,0.47378,0.475669,"[[tensor(-0.4359, device='cuda:0'), tensor(-0....",0.410767,0.568263
12630,um So I was born and raised in South Washingto...,966,"[um, so, i, was, born, and, raised, in, south,...","[[tensor(-1.1326), tensor(-1.8485), tensor(3.1...",471,0.422655,0.37827,0.365426,0.36086,0.34634,0.337186,0.332113,0.324751,0.319289,0.482398,0.485622,"[[tensor(-0.2357, device='cuda:0'), tensor(-0....",0.281601,0.047029
13493,Mhm . I'm a thirty five year old man who uh um...,134,"[m, ##hm, ., i, ', m, a, thirty, five, year, o...","[[tensor(3.0104), tensor(3.4729), tensor(5.414...",142,0.405599,0.384211,0.34458,0.326265,0.315097,0.304823,0.308425,0.320578,0.324224,0.488125,0.476935,"[[tensor(-0.6955, device='cuda:0'), tensor(-0....",0.009786,0.397892


In [None]:
# Persist the ineff20 frame as CSV; the target path is the `result` prefix plus the file name.
# NOTE(review): object columns that hold tensors are presumably written as their text repr,
# so they cannot be restored as tensors from this file — confirm that only the scalar
# metric columns are needed downstream.
ineff20.to_csv(result + 'simulation_HV_ineff_vb_response_deid_20v3_bert.csv')

## ineff50

In [None]:
# with context
# Step 1: encode every transcript in ineff50 sentence by sentence and store the
# result of model.encode(...) in 'bert_sent_context_emb'.
# NOTE(review): `model` must expose .encode(...) here — presumably a
# sentence-transformers model bound earlier in the notebook; confirm.
ineff50['bert_sent_context_emb'] = ''

for row_num, i in enumerate(ineff50.index):
  # Transcripts with 4 words or fewer are skipped and keep the '' placeholder.
  if ineff50.at[i, 'n_words'] > 4:
    # Split on '.' and drop the last piece (whatever follows the final period).
    context = ineff50.at[i, 'content'].split('.')[:-1]
    # .at writes into the frame itself; the chained form df[col][i] = ... assigns
    # through a temporary Series and triggers SettingWithCopyWarning.
    ineff50.at[i, 'bert_sent_context_emb'] = model.encode(context, convert_to_tensor=True)
  # Report progress every 5th row. The index holds grid IDs, so testing the
  # index label itself (i % 5) only fired for IDs that happen to be multiples of 5.
  if row_num % 5 == 0:
    print('current: ', i)

# Step 2: sentence-to-sentence similarity.
#   bert_foc = mean cosine similarity between each sentence embedding and the next one
#   bert_soc = mean cosine similarity between each sentence embedding and the one two ahead
# Rows with too few sentences to form a pair keep the '' placeholder.
ineff50['bert_foc'] = ''
ineff50['bert_soc'] = ''

for i in ineff50.index:
    # Rows that were never encoded still hold ''; len('') == 0, so no pairs are formed.
    sent_embs = ineff50.at[i, 'bert_sent_context_emb']
    n_sents = len(sent_embs)

    # Gather ALL pairs before averaging. Previously the lists were re-created for
    # every sentence, so each "average" covered a single value and only the final
    # pair of the transcript ended up in the column.
    # NOTE: recompute any dataset you compare against with this same logic.
    foc_sims = [util.pytorch_cos_sim(sent_embs[k], sent_embs[k + 1]).item()
                for k in range(n_sents - 1)]
    soc_sims = [util.pytorch_cos_sim(sent_embs[k], sent_embs[k + 2]).item()
                for k in range(n_sents - 2)]

    if foc_sims:
        ineff50.at[i, 'bert_foc'] = np.average(foc_sims)
    if soc_sims:
        ineff50.at[i, 'bert_soc'] = np.average(soc_sims)

ineff50.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


current:  10455
current:  12630


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0_level_0,content,n_words,bert_tokens,bert_emb,len_bert_tokens_emb,bert_word_k2,bert_word_k3,bert_word_k4,bert_word_k5,bert_word_k6,bert_word_k7,bert_word_k8,bert_word_k9,bert_word_k10,bert_word_mv5,bert_word_mv10,bert_sent_context_emb,bert_foc,bert_soc
grid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
10455,"I'm a young man , an en an en- an engineer by ...",421,"[i, ', m, a, young, man, ,, an, en, an, en, -,...","[[tensor(1.0818), tensor(1.0875), tensor(-0.18...",431,0.431574,0.402701,0.375233,0.360198,0.355781,0.357323,0.337893,0.335999,0.332884,0.496595,0.498576,"[[tensor(2.2570, device='cuda:0'), tensor(0.14...",0.156946,0.250388
11689,We have been using that opportunity to do more...,159,"[we, have, been, using, that, opportunity, to,...","[[tensor(3.7089), tensor(0.8028), tensor(-1.74...",228,0.450827,0.392143,0.364411,0.346152,0.339268,0.32745,0.327672,0.334092,0.321685,0.503106,0.507054,"[[tensor(-0.8558, device='cuda:0'), tensor(0.5...",0.277705,0.254643
12376,Alright . um I live in not especially cool Spr...,468,"[alright, ., um, i, live, in, not, especially,...","[[tensor(5.3717), tensor(1.6071), tensor(5.350...",480,0.41699,0.371672,0.356068,0.360196,0.338796,0.349111,0.318997,0.324514,0.311103,0.48519,0.483654,"[[tensor(-0.4359, device='cuda:0'), tensor(-0....",0.079694,0.006841
12630,"My is things are fantastic . No , I mean My uh...",966,"[my, is, things, are, fantastic, ., no, ,, i, ...","[[tensor(-2.2958), tensor(2.2740), tensor(1.50...",474,0.422702,0.385004,0.365272,0.352853,0.345306,0.342191,0.332007,0.327148,0.323434,0.479335,0.483949,"[[tensor(-1.4369, device='cuda:0'), tensor(-0....",0.281601,0.055963
13493,Mhm . I still get to play . And my stock inves...,134,"[m, ##hm, ., i, still, get, to, play, ., and, ...","[[tensor(1.7302), tensor(3.9717), tensor(6.281...",183,0.407593,0.392881,0.342866,0.318033,0.314927,0.301858,0.292772,0.300559,0.304624,0.488476,0.489403,"[[tensor(-0.6955, device='cuda:0'), tensor(-0....",0.287085,0.330583


In [None]:
# Persist the ineff50 frame as CSV; the target path is the `result` prefix plus the file name.
# NOTE(review): object columns that hold tensors are presumably written as their text repr,
# so they cannot be restored as tensors from this file — confirm that only the scalar
# metric columns are needed downstream.
ineff50.to_csv(result + 'simulation_HV_ineff_vb_response_deid_50v3_bert.csv')

# Prep dataframe for R

In [None]:
# Build the table exported for R: the baseline plus the three 'ineff' levels,
# stacked row-wise. Each source keeps only the metric columns and gains two labels:
# SSDvHC (0.0 / 0.1 / 0.2 / 0.5) and group ('HV_<pct>_shuffle').
# NOTE(review): these CSVs are read relative to the current working directory, while
# the cells that write the per-dataset files use the `result` prefix — confirm both
# resolve to the same folder.
metric_cols = ['bert_word_mv5', 'bert_word_mv10',
               'bert_word_k2', 'bert_word_k3', 'bert_word_k4', 'bert_word_k5',
               'bert_word_k6', 'bert_word_k7', 'bert_word_k8', 'bert_word_k9',
               'bert_word_k10', 'bert_foc', 'bert_soc']

# (file name, SSDvHC value, group label) — one entry per source file.
ineff_sources = [
    ('simulation_HV_baseline_vb_response_deid_v3_bert.csv', 0.0, 'HV_0_shuffle'),
    ('simulation_HV_ineff_vb_response_deid_10v3_bert.csv', 0.1, 'HV_10_shuffle'),
    ('simulation_HV_ineff_vb_response_deid_20v3_bert.csv', 0.2, 'HV_20_shuffle'),
    ('simulation_HV_ineff_vb_response_deid_50v3_bert.csv', 0.5, 'HV_50_shuffle'),
]

# One loop replaces four copy-pasted stanzas. .assign returns a new frame, so no
# column is added to a column-subset view and no in-place drop is needed ('grid'
# was previously selected only to be dropped again).
ineff_frames = [
    pd.read_csv(csv_name)[metric_cols].assign(SSDvHC=ssd_v_hc, group=group_label)
    for csv_name, ssd_v_hc, group_label in ineff_sources
]

# Keep the per-level names bound, as before, for any later cell that uses them.
baseline, ineff10, ineff20, ineff50 = ineff_frames

ineff = pd.concat(ineff_frames).reset_index(drop = True)

ineff.to_csv('Analysis/simulation_HV_baseline_ineff_vb_response_deid_v3_bert.csv')
ineff.head()

Unnamed: 0,bert_word_mv5,bert_word_mv10,bert_word_k2,bert_word_k3,bert_word_k4,bert_word_k5,bert_word_k6,bert_word_k7,bert_word_k8,bert_word_k9,bert_word_k10,bert_foc,bert_soc,SSDvHC,group
0,0.471428,0.474674,0.41295,0.38864,0.367721,0.353955,0.342045,0.338082,0.324235,0.318819,0.318824,0.408088,0.266888,0.0,HV_0_shuffle
1,0.504627,0.501954,0.426685,0.372436,0.354607,0.339884,0.335918,0.32825,0.32619,0.308189,0.301729,0.073789,0.009538,0.0,HV_0_shuffle
2,0.4818,0.474423,0.411201,0.375869,0.353776,0.349704,0.333051,0.326555,0.318976,0.323212,0.314139,0.410767,0.568263,0.0,HV_0_shuffle
3,0.500832,0.494821,0.415659,0.388839,0.366737,0.354915,0.347091,0.341731,0.336869,0.334747,0.329355,0.361507,0.327566,0.0,HV_0_shuffle
4,0.478361,0.477872,0.384679,0.363791,0.32135,0.309973,0.297819,0.283803,0.286407,0.29583,0.309508,-0.010623,0.392595,0.0,HV_0_shuffle


In [None]:
# Build the table exported for R: the baseline plus the three 'incoh' levels,
# stacked row-wise. Each source keeps only the metric columns and gains two labels:
# SSDvHC (0.0 / 0.1 / 0.2 / 0.5) and group ('HV_<pct>_shuffle').
# NOTE(review): these CSVs are read relative to the current working directory, while
# the cells that write the per-dataset files use the `result` prefix — confirm both
# resolve to the same folder.
metric_cols = ['bert_word_mv5', 'bert_word_mv10',
               'bert_word_k2', 'bert_word_k3', 'bert_word_k4', 'bert_word_k5',
               'bert_word_k6', 'bert_word_k7', 'bert_word_k8', 'bert_word_k9',
               'bert_word_k10', 'bert_foc', 'bert_soc']

# (file name, SSDvHC value, group label) — one entry per source file.
incoh_sources = [
    ('simulation_HV_baseline_vb_response_deid_v3_bert.csv', 0.0, 'HV_0_shuffle'),
    ('simulation_HV_incoh_vb_response_deid_10v3_bert.csv', 0.1, 'HV_10_shuffle'),
    ('simulation_HV_incoh_vb_response_deid_20v3_bert.csv', 0.2, 'HV_20_shuffle'),
    ('simulation_HV_incoh_vb_response_deid_50v3_bert.csv', 0.5, 'HV_50_shuffle'),
]

# One loop replaces four copy-pasted stanzas. .assign returns a new frame, so no
# column is added to a column-subset view and no in-place drop is needed ('grid'
# was previously selected only to be dropped again).
incoh_frames = [
    pd.read_csv(csv_name)[metric_cols].assign(SSDvHC=ssd_v_hc, group=group_label)
    for csv_name, ssd_v_hc, group_label in incoh_sources
]

# Keep the per-level names bound, as before, for any later cell that uses them.
baseline, incoh10, incoh20, incoh50 = incoh_frames

incoh = pd.concat(incoh_frames).reset_index(drop = True)

incoh.to_csv('Analysis/simulation_HV_baseline_incoh_vb_response_deid_v3_bert.csv')
incoh.head()

Unnamed: 0,bert_word_mv5,bert_word_mv10,bert_word_k2,bert_word_k3,bert_word_k4,bert_word_k5,bert_word_k6,bert_word_k7,bert_word_k8,bert_word_k9,bert_word_k10,bert_foc,bert_soc,SSDvHC,group
0,0.471428,0.474674,0.41295,0.38864,0.367721,0.353955,0.342045,0.338082,0.324235,0.318819,0.318824,0.408088,0.266888,0.0,HV_0_shuffle
1,0.504627,0.501954,0.426685,0.372436,0.354607,0.339884,0.335918,0.32825,0.32619,0.308189,0.301729,0.073789,0.009538,0.0,HV_0_shuffle
2,0.4818,0.474423,0.411201,0.375869,0.353776,0.349704,0.333051,0.326555,0.318976,0.323212,0.314139,0.410767,0.568263,0.0,HV_0_shuffle
3,0.500832,0.494821,0.415659,0.388839,0.366737,0.354915,0.347091,0.341731,0.336869,0.334747,0.329355,0.361507,0.327566,0.0,HV_0_shuffle
4,0.478361,0.477872,0.384679,0.363791,0.32135,0.309973,0.297819,0.283803,0.286407,0.29583,0.309508,-0.010623,0.392595,0.0,HV_0_shuffle
