# Setup

In [None]:
# install transformers from hugging face
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.22.2-py3-none-any.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 31.0 MB/s 
Collecting huggingface-hub<1.0,>=0.9.0
  Downloading huggingface_hub-0.10.0-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 67.1 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 64.3 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.0 tokenizers-0.12.1 transformers-4.22.2


In [None]:
# T5 uses SentencePiece tokenizer, 
# which is implemented in C and is opaque to Python.
!pip install SentencePiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting SentencePiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 33.0 MB/s 
[?25hInstalling collected packages: SentencePiece
Successfully installed SentencePiece-0.1.97


In [None]:
from transformers import T5Tokenizer, TFT5Model

tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = TFT5Model.from_pretrained('t5-small')

# tf: tensorflow
inputs = tokenizer.encode("Hello, my dog is cute", return_tensors="tf") # Batch size 1
outputs = model(inputs, decoder_input_ids=inputs)

# The last hidden-state is the first element of the output tuple
last_hidden_states = outputs[0]  

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


Downloading:   0%|          | 0.00/242M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFT5Model.

All the layers of TFT5Model were initialized from the model checkpoint at t5-small.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5Model for predictions without further training.


In [None]:
last_hidden_states

<tf.Tensor: shape=(1, 7, 512), dtype=float32, numpy=
array([[[ 1.4377134e-01,  1.4388986e-01, -4.7584544e-03, ...,
         -2.3170823e-02,  1.0850930e-04,  1.6130598e-01],
        [ 1.1936245e-01,  3.9282650e-02,  1.6315565e-02, ...,
         -7.3915541e-02,  1.6595981e-04,  2.1468359e-01],
        [ 1.8629685e-01,  9.6943088e-02, -5.1533658e-02, ...,
         -2.0246001e-01,  2.4400113e-04, -4.0399268e-01],
        ...,
        [-1.0247106e-01,  1.7289273e-01, -9.0522192e-02, ...,
         -1.0925368e-01,  1.9884549e-04, -9.8187111e-02],
        [ 4.9900740e-02,  3.9950600e-01, -7.3473006e-02, ...,
         -4.2685643e-02, -1.4135287e-04, -1.9327033e-02],
        [ 9.0925939e-02,  3.1315956e-01, -2.9675631e-02, ...,
         -4.4643156e-02,  3.7587879e-04,  4.6855807e-02]]], dtype=float32)>

In [None]:
# for math in python
!pip install scipy
import scipy

# Get contextualized embd

In [None]:
# define a function to get contextualized embeddings
def get_last_hidden_state(sent):
  inputs = tokenizer.encode(sent, return_tensors="tf")  # Batch size 1
  outputs = model(inputs, decoder_input_ids=inputs)
  # The last hidden-state is the first element of the output tuple (c.f. Raffel et al 2020)
  last_hidden_states = outputs[0]  
  return last_hidden_states

# Prep dataframes

In [None]:
import pandas as pd
import numpy as np

In [None]:
drive_data_path = '/Gradient_shuffle/'
result = '/T5simul/'

In [None]:
baseline = pd.read_csv(drive_data_path + 'simulation_HV_baseline_vb_response_deid_v3.csv') 
baseline['t5_word_emb'] = ''
baseline.head() 

Unnamed: 0,grid,content,n_words,t5_word_emb
0,10455,"I'm a young man , an en an en- an engineer by ...",421,
1,11689,Sure . I'm thirty three years old . My name is...,159,
2,12376,Alright . um I live in not especially cool Spr...,468,
3,12630,um So I'm currently twenty-nine . I was born a...,966,
4,13493,Mhm . I'm a thirty five year old man who uh um...,134,


In [None]:
incoh10 = pd.read_csv(drive_data_path + 'simulation_HV_incoh_vb_response_deid_10v3.csv') 
incoh10['t5_word_emb'] = ''
incoh10.head() 

Unnamed: 0,grid,content,n_words,t5_word_emb
0,10455,"I'm a young man , an en an en- an engineer by ...",421,
1,11689,Sure . I'm thirty three years good . My name i...,159,
2,12376,Alright . um I live in not especially cool Spr...,468,
3,12630,um So I'm currently twenty-nine . I was born a...,966,
4,13493,Mhm . I'm a thirty five year old man who uh um...,134,


In [None]:
incoh20 = pd.read_csv(drive_data_path + 'simulation_HV_incoh_vb_response_deid_20v3.csv') 
incoh20['t5_word_emb'] = ''
incoh20.head() 

Unnamed: 0,grid,content,n_words,t5_word_emb
0,10455,"I'm a young man , an en an en- an things by tr...",421,
1,11689,Sure . I'm thirty three years medical . My nam...,159,
2,12376,Alright . um I live in not especially lazy Spr...,468,
3,12630,um So I'm currently twenty-nine . I was born a...,966,
4,13493,Mhm . I'm a thirty five year old man who uh um...,134,


In [None]:
incoh50 = pd.read_csv(drive_data_path + 'simulation_HV_incoh_vb_response_deid_50v3.csv') 
incoh50['t5_word_emb'] = ''
incoh50.head() 

Unnamed: 0,grid,content,n_words,t5_word_emb
0,10455,"I'm a good year , an en an en- an responsibili...",421,
1,11689,Sure . I'm thirty three years anxious . My nam...,159,
2,12376,Alright . um something live in not especially ...,468,
3,12630,um So I'm currently twenty-nine . I was born a...,966,
4,13493,Mhm . I'm a thirty five hospital old man anyth...,134,


In [None]:
ineff10 = pd.read_csv(drive_data_path + 'simulation_HV_ineff_vb_response_deid_10v3.csv') 
ineff10['t5_word_emb'] = ''
ineff10.head() 

Unnamed: 0,grid,content,n_words,t5_word_emb
0,10455,"I'm a young man , an en an en- an engineer by ...",421,
1,11689,Sure . I'm thirty three years old . My name is...,159,
2,12376,Alright . um I live in not especially cool Spr...,468,
3,12630,um So I was born and raised in South Washingto...,966,
4,13493,Mhm . I'm a thirty five year old man who uh um...,134,


In [None]:
ineff20 = pd.read_csv(drive_data_path + 'simulation_HV_ineff_vb_response_deid_20v3.csv') 
ineff20['t5_word_emb'] = ''
ineff20.head() 

Unnamed: 0,grid,content,n_words,t5_word_emb
0,10455,"I'm a young man , an en an en- an engineer by ...",421,
1,11689,Sure . I'm thirty three years old . My name is...,159,
2,12376,Alright . um I live in not especially cool Spr...,468,
3,12630,um So I was born and raised in South Washingto...,966,
4,13493,Mhm . I'm a thirty five year old man who uh um...,134,


In [None]:
ineff50 = pd.read_csv(drive_data_path + 'simulation_HV_ineff_vb_response_deid_50v3.csv') 
ineff50['t5_word_emb'] = ''
ineff50.head() 

Unnamed: 0,grid,content,n_words,t5_word_emb
0,10455,"I'm a young man , an en an en- an engineer by ...",421,
1,11689,We have been using that opportunity to do more...,159,
2,12376,Alright . um I live in not especially cool Spr...,468,
3,12630,"My is things are fantastic . No , I mean My uh...",966,
4,13493,Mhm . I still get to play . And my stock inves...,134,


# Get LongText Word Embeddings

In [None]:
# call the function 
# apply it to the dataframe
baseline['t5_word_emb'] = baseline['content'].apply(lambda x: get_last_hidden_state(x))

# double check if it's in good shape
baseline['t5_word_emb'][0].shape

Token indices sequence length is longer than the specified maximum sequence length for this model (601 > 512). Running this sequence through the model will result in indexing errors


TensorShape([1, 601, 512])

In [None]:
baseline.head()

Unnamed: 0,grid,content,n_words,t5_word_emb
0,10455,"I'm a young man , an en an en- an engineer by ...",421,"(((tf.Tensor(-0.031055467, shape=(), dtype=flo..."
1,11689,Sure . I'm thirty three years old . My name is...,159,"(((tf.Tensor(0.1017546, shape=(), dtype=float3..."
2,12376,Alright . um I live in not especially cool Spr...,468,"(((tf.Tensor(0.2531035, shape=(), dtype=float3..."
3,12630,um So I'm currently twenty-nine . I was born a...,966,"(((tf.Tensor(-0.0014505647, shape=(), dtype=fl..."
4,13493,Mhm . I'm a thirty five year old man who uh um...,134,"(((tf.Tensor(0.0027231025, shape=(), dtype=flo..."


In [None]:
baseline.to_csv(result + 'simulation_HV_baseline_vb_response_deid_v3_t5.csv')

In [None]:
incoh10['t5_word_emb'] = incoh10['content'].apply(lambda x: get_last_hidden_state(x))
incoh10['t5_word_emb'][0].shape

TensorShape([1, 598, 512])

In [None]:
incoh10.tail()

Unnamed: 0,grid,content,n_words,t5_word_emb
0,10455,"I'm a young man , an en an en- an engineer by ...",421,"(((tf.Tensor(-0.032051105, shape=(), dtype=flo..."
1,11689,Sure . I'm thirty three years good . My name i...,159,"(((tf.Tensor(0.11520728, shape=(), dtype=float..."
2,12376,Alright . um I live in not especially cool Spr...,468,"(((tf.Tensor(0.24389347, shape=(), dtype=float..."
3,12630,um So I'm currently twenty-nine . I was born a...,966,"(((tf.Tensor(0.0067433813, shape=(), dtype=flo..."
4,13493,Mhm . I'm a thirty five year old man who uh um...,134,"(((tf.Tensor(0.009083907, shape=(), dtype=floa..."


In [None]:
incoh10.to_csv(result + 'simulation_HV_incoh_vb_response_deid_10v3_t5.csv')

In [None]:
incoh20['t5_word_emb'] = incoh20['content'].apply(lambda x: get_last_hidden_state(x))
incoh20['t5_word_emb'][0].shape

TensorShape([1, 603, 512])

In [None]:
incoh20.tail()

Unnamed: 0,grid,content,n_words,t5_word_emb
0,10455,"I'm a young man , an en an en- an things by tr...",421,"(((tf.Tensor(-0.026766412, shape=(), dtype=flo..."
1,11689,Sure . I'm thirty three years medical . My nam...,159,"(((tf.Tensor(0.1074589, shape=(), dtype=float3..."
2,12376,Alright . um I live in not especially lazy Spr...,468,"(((tf.Tensor(0.24678273, shape=(), dtype=float..."
3,12630,um So I'm currently twenty-nine . I was born a...,966,"(((tf.Tensor(0.004973783, shape=(), dtype=floa..."
4,13493,Mhm . I'm a thirty five year old man who uh um...,134,"(((tf.Tensor(0.014040765, shape=(), dtype=floa..."


In [None]:
incoh20.to_csv(result + 'simulation_HV_incoh_vb_response_deid_20v3_t5.csv')

In [None]:
incoh50['t5_word_emb'] = incoh50['content'].apply(lambda x: get_last_hidden_state(x))
incoh50['t5_word_emb'][0].shape

TensorShape([1, 591, 512])

In [None]:
incoh50.head()

Unnamed: 0,grid,content,n_words,t5_word_emb
0,10455,"I'm a good year , an en an en- an responsibili...",421,"(((tf.Tensor(-0.06148188, shape=(), dtype=floa..."
1,11689,Sure . I'm thirty three years anxious . My nam...,159,"(((tf.Tensor(0.1386814, shape=(), dtype=float3..."
2,12376,Alright . um something live in not especially ...,468,"(((tf.Tensor(0.2296029, shape=(), dtype=float3..."
3,12630,um So I'm currently twenty-nine . I was born a...,966,"(((tf.Tensor(-0.005068084, shape=(), dtype=flo..."
4,13493,Mhm . I'm a thirty five hospital old man anyth...,134,"(((tf.Tensor(0.086834975, shape=(), dtype=floa..."


In [None]:
incoh50.to_csv(result + 'simulation_HV_incoh_vb_response_deid_50v3_t5.csv')

In [None]:
ineff10['t5_word_emb'] = ineff10['content'].apply(lambda x: get_last_hidden_state(x))
ineff10['t5_word_emb'][0].shape

TensorShape([1, 604, 512])

In [None]:
ineff10.head()

Unnamed: 0,grid,content,n_words,t5_word_emb
0,10455,"I'm a young man , an en an en- an engineer by ...",421,"(((tf.Tensor(-0.028594766, shape=(), dtype=flo..."
1,11689,Sure . I'm thirty three years old . My name is...,159,"(((tf.Tensor(0.0902944, shape=(), dtype=float3..."
2,12376,Alright . um I live in not especially cool Spr...,468,"(((tf.Tensor(0.2577885, shape=(), dtype=float3..."
3,12630,um So I was born and raised in South Washingto...,966,"(((tf.Tensor(-0.002386301, shape=(), dtype=flo..."
4,13493,Mhm . I'm a thirty five year old man who uh um...,134,"(((tf.Tensor(-0.011912695, shape=(), dtype=flo..."


In [None]:
ineff10.to_csv(result + 'simulation_HV_ineff_vb_response_deid_10v3_t5.csv')

In [None]:
ineff20['t5_word_emb'] = ineff20['content'].apply(lambda x: get_last_hidden_state(x))
ineff20['t5_word_emb'][0].shape

TensorShape([1, 640, 512])

In [None]:
ineff20.head()

Unnamed: 0,grid,content,n_words,t5_word_emb
0,10455,"I'm a young man , an en an en- an engineer by ...",421,"(((tf.Tensor(-0.030041385, shape=(), dtype=flo..."
1,11689,Sure . I'm thirty three years old . My name is...,159,"(((tf.Tensor(0.116554916, shape=(), dtype=floa..."
2,12376,Alright . um I live in not especially cool Spr...,468,"(((tf.Tensor(0.27444038, shape=(), dtype=float..."
3,12630,um So I was born and raised in South Washingto...,966,"(((tf.Tensor(0.0054256218, shape=(), dtype=flo..."
4,13493,Mhm . I'm a thirty five year old man who uh um...,134,"(((tf.Tensor(-0.009741404, shape=(), dtype=flo..."


In [None]:
ineff20.to_csv(result + 'simulation_HV_ineff_vb_response_deid_20v3_t5.csv')

In [None]:
ineff50['t5_word_emb'] = ineff50['content'].apply(lambda x: get_last_hidden_state(x))
ineff50['t5_word_emb'][0].shape

TensorShape([1, 529, 512])

In [None]:
ineff50.head()

Unnamed: 0,grid,content,n_words,t5_word_emb
0,10455,"I'm a young man , an en an en- an engineer by ...",421,"(((tf.Tensor(-0.013111238, shape=(), dtype=flo..."
1,11689,We have been using that opportunity to do more...,159,"(((tf.Tensor(-0.12850149, shape=(), dtype=floa..."
2,12376,Alright . um I live in not especially cool Spr...,468,"(((tf.Tensor(0.2802947, shape=(), dtype=float3..."
3,12630,"My is things are fantastic . No , I mean My uh...",966,"(((tf.Tensor(0.18845989, shape=(), dtype=float..."
4,13493,Mhm . I still get to play . And my stock inves...,134,"(((tf.Tensor(0.004609654, shape=(), dtype=floa..."


In [None]:
ineff50.to_csv(result + 'simulation_HV_ineff_vb_response_deid_50v3_t5.csv')

# Coherence K2:10

### baseline

In [None]:
# add new empty columns
ks=['2', '3', '4', '5', '6', '7', '8', '9', '10'] 
for k in ks:
  baseline['t5_word_k' + k] = '' 
baseline.head()

Unnamed: 0,grid,content,n_words,t5_word_emb,t5_word_k2,t5_word_k3,t5_word_k4,t5_word_k5,t5_word_k6,t5_word_k7,t5_word_k8,t5_word_k9,t5_word_k10
0,10455,"I'm a young man , an en an en- an engineer by ...",421,"(((tf.Tensor(-0.031055467, shape=(), dtype=flo...",,,,,,,,,
1,11689,Sure . I'm thirty three years old . My name is...,159,"(((tf.Tensor(0.1017546, shape=(), dtype=float3...",,,,,,,,,
2,12376,Alright . um I live in not especially cool Spr...,468,"(((tf.Tensor(0.2531035, shape=(), dtype=float3...",,,,,,,,,
3,12630,um So I'm currently twenty-nine . I was born a...,966,"(((tf.Tensor(-0.0014505647, shape=(), dtype=fl...",,,,,,,,,
4,13493,Mhm . I'm a thirty five year old man who uh um...,134,"(((tf.Tensor(0.0027231025, shape=(), dtype=flo...",,,,,,,,,


In [None]:
buglist = {} # always have a buglist
for k in ks:
  cur = 't5_word_k' + k
  for i in baseline.index:

    # record progress
    if i % 5 == 0:
      print('current line: ', i, " coherence k: ", k)

    if baseline['n_words'][i] > 4: # ignore short sequence
      sent = baseline['content'][i]   

      # initiate a dictionary
      # because we need to keep track of word indexing
      # in addition to the cosine similarity scores 
      # of the embedding vectors 
      baseline[cur][i] = {} 

      # loop over each token in the response sequence
      # calculate cosine similarity of token pairs at k inter-token distance
      for word_id in range(len(sent.strip().split(' '))):
        try:
          w1 = baseline['t5_word_emb'][i][[0]][word_id]
          w2 = baseline['t5_word_emb'][i][[0]][word_id+int(k)]
          baseline[cur][i][word_id] = 1 - scipy.spatial.distance.cosine(w1, w2)       
        except:
          buglist[i] = word_id
          continue
      baseline.to_csv(result + 'simulation_HV_baseline_vb_response_deid_v3_t5.csv')

current line:  0  coherence k:  2


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


current line:  0  coherence k:  3
current line:  0  coherence k:  4
current line:  0  coherence k:  5
current line:  0  coherence k:  6
current line:  0  coherence k:  7
current line:  0  coherence k:  8
current line:  0  coherence k:  9
current line:  0  coherence k:  10


In [None]:
baseline.head()

Unnamed: 0,grid,content,n_words,t5_word_emb,t5_word_k2,t5_word_k3,t5_word_k4,t5_word_k5,t5_word_k6,t5_word_k7,t5_word_k8,t5_word_k9,t5_word_k10
0,10455,"I'm a young man , an en an en- an engineer by ...",421,"(((tf.Tensor(-0.031055467, shape=(), dtype=flo...","{0: 0.9385946393013, 1: 0.9258050918579102, 2:...","{0: 0.9019278287887573, 1: 0.920067548751831, ...","{0: 0.9029392004013062, 1: 0.9147455096244812,...","{0: 0.9051128625869751, 1: 0.8612462282180786,...","{0: 0.8570860624313354, 1: 0.9214776754379272,...","{0: 0.8903406858444214, 1: 0.8851216435432434,...","{0: 0.8832559585571289, 1: 0.8904533386230469,...","{0: 0.8601043224334717, 1: 0.9124910235404968,...","{0: 0.8664131760597229, 1: 0.8918132185935974,..."
1,11689,Sure . I'm thirty three years old . My name is...,159,"(((tf.Tensor(0.1017546, shape=(), dtype=float3...","{0: 0.9589922428131104, 1: 0.9340031743049622,...","{0: 0.9381313920021057, 1: 0.9499139785766602,...","{0: 0.931617021560669, 1: 0.8904510140419006, ...","{0: 0.9201892614364624, 1: 0.9153001308441162,...","{0: 0.9502213001251221, 1: 0.9147319793701172,...","{0: 0.9525159001350403, 1: 0.9129528999328613,...","{0: 0.9399510025978088, 1: 0.8716737031936646,...","{0: 0.9208395481109619, 1: 0.9545168876647949,...","{0: 0.9450736045837402, 1: 0.9524898529052734,..."
2,12376,Alright . um I live in not especially cool Spr...,468,"(((tf.Tensor(0.2531035, shape=(), dtype=float3...","{0: 0.9821363687515259, 1: 0.9487715363502502,...","{0: 0.9518461227416992, 1: 0.9405548572540283,...","{0: 0.9367880821228027, 1: 0.9300448298454285,...","{0: 0.9244054555892944, 1: 0.9083446264266968,...","{0: 0.9351586699485779, 1: 0.9212504029273987,...","{0: 0.9248273968696594, 1: 0.9024133086204529,...","{0: 0.9406172633171082, 1: 0.910184383392334, ...","{0: 0.9614045023918152, 1: 0.9525409936904907,...","{0: 0.9527647495269775, 1: 0.9149569869041443,..."
3,12630,um So I'm currently twenty-nine . I was born a...,966,"(((tf.Tensor(-0.0014505647, shape=(), dtype=fl...","{0: 0.9561870694160461, 1: 0.843988299369812, ...","{0: 0.8871630430221558, 1: 0.9309291839599609,...","{0: 0.9419207572937012, 1: 0.9425077438354492,...","{0: 0.9435474276542664, 1: 0.9479793906211853,...","{0: 0.9482212066650391, 1: 0.9059748649597168,...","{0: 0.923056960105896, 1: 0.8556352853775024, ...","{0: 0.8662787079811096, 1: 0.9448565244674683,...","{0: 0.9278807640075684, 1: 0.8874595165252686,...","{0: 0.9098501205444336, 1: 0.9587600231170654,..."
4,13493,Mhm . I'm a thirty five year old man who uh um...,134,"(((tf.Tensor(0.0027231025, shape=(), dtype=flo...","{0: 0.9873718619346619, 1: 0.9673035144805908,...","{0: 0.9620503783226013, 1: 0.9596797227859497,...","{0: 0.9439643025398254, 1: 0.9171977639198303,...","{0: 0.8917063474655151, 1: 0.9548231959342957,...","{0: 0.9416873455047607, 1: 0.9162192940711975,...","{0: 0.9068412184715271, 1: 0.9328790903091431,...","{0: 0.9184191226959229, 1: 0.9241153001785278,...","{0: 0.9159093499183655, 1: 0.9162837266921997,...","{0: 0.9063208103179932, 1: 0.9213224649429321,..."


### incoh

In [None]:
for k in ks:
  incoh10['t5_word_k' + k] = '' 
incoh10.head()

Unnamed: 0,grid,content,n_words,t5_word_emb,t5_word_k2,t5_word_k3,t5_word_k4,t5_word_k5,t5_word_k6,t5_word_k7,t5_word_k8,t5_word_k9,t5_word_k10
0,10455,"I'm a young man , an en an en- an engineer by ...",421,"(((tf.Tensor(-0.032051105, shape=(), dtype=flo...",,,,,,,,,
1,11689,Sure . I'm thirty three years good . My name i...,159,"(((tf.Tensor(0.11520728, shape=(), dtype=float...",,,,,,,,,
2,12376,Alright . um I live in not especially cool Spr...,468,"(((tf.Tensor(0.24389347, shape=(), dtype=float...",,,,,,,,,
3,12630,um So I'm currently twenty-nine . I was born a...,966,"(((tf.Tensor(0.0067433813, shape=(), dtype=flo...",,,,,,,,,
4,13493,Mhm . I'm a thirty five year old man who uh um...,134,"(((tf.Tensor(0.009083907, shape=(), dtype=floa...",,,,,,,,,


In [None]:
buglist = {}
for k in ks:
  cur = 't5_word_k' + k
  for i in incoh10.index:

    if i % 5 == 0:
      print('current line: ', i, " coherence k: ", k)

    if incoh10['n_words'][i] > 4:
      sent = incoh10['content'][i]   
      incoh10[cur][i] = {}
      for word_id in range(len(sent.strip().split(' '))):
        try:
          w1 = incoh10['t5_word_emb'][i][[0]][word_id]
          w2 = incoh10['t5_word_emb'][i][[0]][word_id+int(k)]
          incoh10[cur][i][word_id] = 1 - scipy.spatial.distance.cosine(w1, w2)       
        except:
          buglist[i] = word_id
          continue
      incoh10.to_csv(result + 'simulation_HV_incoh_vb_response_deid_10v3_t5.csv')

current line:  0  coherence k:  2


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


current line:  0  coherence k:  3
current line:  0  coherence k:  4
current line:  0  coherence k:  5
current line:  0  coherence k:  6
current line:  0  coherence k:  7
current line:  0  coherence k:  8
current line:  0  coherence k:  9
current line:  0  coherence k:  10


In [None]:
incoh10.head()

Unnamed: 0,grid,content,n_words,t5_word_emb,t5_word_k2,t5_word_k3,t5_word_k4,t5_word_k5,t5_word_k6,t5_word_k7,t5_word_k8,t5_word_k9,t5_word_k10
0,10455,"I'm a young man , an en an en- an engineer by ...",421,"(((tf.Tensor(-0.032051105, shape=(), dtype=flo...","{0: 0.9373521208763123, 1: 0.9286782145500183,...","{0: 0.9038329720497131, 1: 0.9229267835617065,...","{0: 0.9039842486381531, 1: 0.919945240020752, ...","{0: 0.9082852005958557, 1: 0.8758352398872375,...","{0: 0.8706684112548828, 1: 0.9219260811805725,...","{0: 0.8888946771621704, 1: 0.8911484479904175,...","{0: 0.8852677345275879, 1: 0.8907956480979919,...","{0: 0.8580191731452942, 1: 0.9099339246749878,...","{0: 0.8612217307090759, 1: 0.8918495774269104,..."
1,11689,Sure . I'm thirty three years good . My name i...,159,"(((tf.Tensor(0.11520728, shape=(), dtype=float...","{0: 0.953263521194458, 1: 0.9438183903694153, ...","{0: 0.9423492550849915, 1: 0.9521161317825317,...","{0: 0.9389676451683044, 1: 0.8653549551963806,...","{0: 0.9104220867156982, 1: 0.9233895540237427,...","{0: 0.9548823237419128, 1: 0.9286745190620422,...","{0: 0.9592475295066833, 1: 0.9401805400848389,...","{0: 0.9575971961021423, 1: 0.8827087879180908,...","{0: 0.9294165968894958, 1: 0.9600871205329895,...","{0: 0.9550607800483704, 1: 0.9594836831092834,..."
2,12376,Alright . um I live in not especially cool Spr...,468,"(((tf.Tensor(0.24389347, shape=(), dtype=float...","{0: 0.9816436171531677, 1: 0.9363804459571838,...","{0: 0.9410749673843384, 1: 0.9352023005485535,...","{0: 0.937721848487854, 1: 0.9280068278312683, ...","{0: 0.9265216588973999, 1: 0.9037998914718628,...","{0: 0.9335740208625793, 1: 0.9175204634666443,...","{0: 0.9206990003585815, 1: 0.8973239064216614,...","{0: 0.9360299706459045, 1: 0.9074298739433289,...","{0: 0.959687352180481, 1: 0.9503894448280334, ...","{0: 0.9489991068840027, 1: 0.9080482125282288,..."
3,12630,um So I'm currently twenty-nine . I was born a...,966,"(((tf.Tensor(0.0067433813, shape=(), dtype=flo...","{0: 0.9565696120262146, 1: 0.8485756516456604,...","{0: 0.891523003578186, 1: 0.9299404621124268, ...","{0: 0.9414768218994141, 1: 0.9414905905723572,...","{0: 0.9399988651275635, 1: 0.9501848816871643,...","{0: 0.9498633742332458, 1: 0.9115596413612366,...","{0: 0.9249114990234375, 1: 0.8631559610366821,...","{0: 0.8663604259490967, 1: 0.9504864811897278,...","{0: 0.9337615370750427, 1: 0.891411542892456, ...","{0: 0.9114145040512085, 1: 0.9567343592643738,..."
4,13493,Mhm . I'm a thirty five year old man who uh um...,134,"(((tf.Tensor(0.009083907, shape=(), dtype=floa...","{0: 0.9885735511779785, 1: 0.965068519115448, ...","{0: 0.960785984992981, 1: 0.9622253775596619, ...","{0: 0.9488998055458069, 1: 0.9223074913024902,...","{0: 0.8982526063919067, 1: 0.9517715573310852,...","{0: 0.9391500949859619, 1: 0.9144243001937866,...","{0: 0.9067327380180359, 1: 0.9329472184181213,...","{0: 0.9201116561889648, 1: 0.9211083054542542,...","{0: 0.9139900803565979, 1: 0.9118857979774475,...","{0: 0.9017068147659302, 1: 0.9178103804588318,..."


In [None]:
buglist = {}
for k in ks:
  incoh20['t5_word_k' + k] = '' 

for k in ks:
  cur = 't5_word_k' + k
  for i in incoh20.index:

    if i % 5 == 0:
      print('current line: ', i, " coherence k: ", k)

    if incoh20['n_words'][i] > 4:
      sent = incoh20['content'][i]   
      incoh20[cur][i] = {}
      for word_id in range(len(sent.strip().split(' '))):
        try:
          w1 = incoh20['t5_word_emb'][i][[0]][word_id]
          w2 = incoh20['t5_word_emb'][i][[0]][word_id+int(k)]
          incoh20[cur][i][word_id] = 1 - scipy.spatial.distance.cosine(w1, w2)       
        except:
          buglist[i] = word_id
          continue
      incoh20.to_csv(result + 'simulation_HV_incoh_vb_response_deid_20v3_t5.csv')

current line:  0  coherence k:  2


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


current line:  0  coherence k:  3
current line:  0  coherence k:  4
current line:  0  coherence k:  5
current line:  0  coherence k:  6
current line:  0  coherence k:  7
current line:  0  coherence k:  8
current line:  0  coherence k:  9
current line:  0  coherence k:  10


In [None]:
incoh20.head()

Unnamed: 0,grid,content,n_words,t5_word_emb,t5_word_k2,t5_word_k3,t5_word_k4,t5_word_k5,t5_word_k6,t5_word_k7,t5_word_k8,t5_word_k9,t5_word_k10
0,10455,"I'm a young man , an en an en- an things by tr...",421,"(((tf.Tensor(-0.026766412, shape=(), dtype=flo...","{0: 0.9311738610267639, 1: 0.9156302213668823,...","{0: 0.8918306231498718, 1: 0.9145900011062622,...","{0: 0.8945391774177551, 1: 0.9154523611068726,...","{0: 0.908588171005249, 1: 0.8569351434707642, ...","{0: 0.8551972508430481, 1: 0.9111636877059937,...","{0: 0.8784613013267517, 1: 0.8749086260795593,...","{0: 0.8740364909172058, 1: 0.9044381380081177,...","{0: 0.8796566128730774, 1: 0.9189912676811218,...","{0: 0.8715235590934753, 1: 0.9048242568969727,..."
1,11689,Sure . I'm thirty three years medical . My nam...,159,"(((tf.Tensor(0.1074589, shape=(), dtype=float3...","{0: 0.9399039149284363, 1: 0.9386727213859558,...","{0: 0.9383133053779602, 1: 0.9497955441474915,...","{0: 0.9280366897583008, 1: 0.8977587819099426,...","{0: 0.9301880598068237, 1: 0.9372321367263794,...","{0: 0.9585370421409607, 1: 0.9416293501853943,...","{0: 0.9623845219612122, 1: 0.9378107190132141,...","{0: 0.9588989019393921, 1: 0.8571140766143799,...","{0: 0.901921272277832, 1: 0.9490194916725159, ...","{0: 0.9390887022018433, 1: 0.9414582252502441,..."
2,12376,Alright . um I live in not especially lazy Spr...,468,"(((tf.Tensor(0.24678273, shape=(), dtype=float...","{0: 0.9819713830947876, 1: 0.9351614117622375,...","{0: 0.9405555725097656, 1: 0.9413254857063293,...","{0: 0.9461461305618286, 1: 0.9341707229614258,...","{0: 0.9330058097839355, 1: 0.9139630198478699,...","{0: 0.9385047554969788, 1: 0.9258426427841187,...","{0: 0.9225903749465942, 1: 0.9073160290718079,...","{0: 0.9364153146743774, 1: 0.9246516227722168,...","{0: 0.9563087224960327, 1: 0.9587914943695068,...","{0: 0.9566920399665833, 1: 0.9196252822875977,..."
3,12630,um So I'm currently twenty-nine . I was born a...,966,"(((tf.Tensor(0.004973783, shape=(), dtype=floa...","{0: 0.9550102353096008, 1: 0.8532222509384155,...","{0: 0.8977928757667542, 1: 0.930566132068634, ...","{0: 0.9416907429695129, 1: 0.9373212456703186,...","{0: 0.9345315098762512, 1: 0.9513927102088928,...","{0: 0.9480693340301514, 1: 0.9160832762718201,...","{0: 0.9283491373062134, 1: 0.8675307631492615,...","{0: 0.8708361387252808, 1: 0.9483070969581604,...","{0: 0.9260889887809753, 1: 0.8936328887939453,...","{0: 0.913896918296814, 1: 0.9597697257995605, ..."
4,13493,Mhm . I'm a thirty five year old man who uh um...,134,"(((tf.Tensor(0.014040765, shape=(), dtype=floa...","{0: 0.9883755445480347, 1: 0.9669952392578125,...","{0: 0.9618869423866272, 1: 0.96144038438797, 2...","{0: 0.9472982883453369, 1: 0.9171295762062073,...","{0: 0.8940153121948242, 1: 0.9496671557426453,...","{0: 0.936241090297699, 1: 0.9066365361213684, ...","{0: 0.8979589939117432, 1: 0.9263139367103577,...","{0: 0.9102492332458496, 1: 0.9146944284439087,...","{0: 0.905790388584137, 1: 0.9117351770401001, ...","{0: 0.9004874229431152, 1: 0.9159801602363586,..."


In [None]:
buglist = {}
for k in ks:
  incoh50['t5_word_k' + k] = '' 

for k in ks:
  cur = 't5_word_k' + k
  for i in incoh50.index:

    if i % 5 == 0:
      print('current line: ', i, " coherence k: ", k)

    if incoh50['n_words'][i] > 4:
      sent = incoh50['content'][i]   
      incoh50[cur][i] = {}
      for word_id in range(len(sent.strip().split(' '))):
        try:
          w1 = incoh50['t5_word_emb'][i][[0]][word_id]
          w2 = incoh50['t5_word_emb'][i][[0]][word_id+int(k)]
          incoh50[cur][i][word_id] = 1 - scipy.spatial.distance.cosine(w1, w2)       
        except:
          buglist[i] = word_id
          continue
      incoh50.to_csv(result + 'simulation_HV_incoh_vb_response_deid_50v3_t5.csv')

current line:  0  coherence k:  2


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


current line:  0  coherence k:  3
current line:  0  coherence k:  4
current line:  0  coherence k:  5
current line:  0  coherence k:  6
current line:  0  coherence k:  7
current line:  0  coherence k:  8
current line:  0  coherence k:  9
current line:  0  coherence k:  10


In [None]:
incoh50.head()

Unnamed: 0,grid,content,n_words,t5_word_emb,t5_word_k2,t5_word_k3,t5_word_k4,t5_word_k5,t5_word_k6,t5_word_k7,t5_word_k8,t5_word_k9,t5_word_k10
0,10455,"I'm a good year , an en an en- an responsibili...",421,"(((tf.Tensor(-0.06148188, shape=(), dtype=floa...","{0: 0.9321799278259277, 1: 0.9218968152999878,...","{0: 0.9035277366638184, 1: 0.9098954796791077,...","{0: 0.9057110548019409, 1: 0.8806290030479431,...","{0: 0.8907807469367981, 1: 0.8589577674865723,...","{0: 0.8734230995178223, 1: 0.887036144733429, ...","{0: 0.854229748249054, 1: 0.8469346165657043, ...","{0: 0.8579909801483154, 1: 0.8821069598197937,...","{0: 0.8578127026557922, 1: 0.9118313193321228,...","{0: 0.8580976128578186, 1: 0.8822059631347656,..."
1,11689,Sure . I'm thirty three years anxious . My nam...,159,"(((tf.Tensor(0.1386814, shape=(), dtype=float3...","{0: 0.9465814828872681, 1: 0.9467255473136902,...","{0: 0.946717381477356, 1: 0.9604710340499878, ...","{0: 0.9371103048324585, 1: 0.9106454253196716,...","{0: 0.9294113516807556, 1: 0.9404101967811584,...","{0: 0.9566912651062012, 1: 0.939863383769989, ...","{0: 0.9589848518371582, 1: 0.9299648404121399,...","{0: 0.9429657459259033, 1: 0.8906495571136475,...","{0: 0.9161661863327026, 1: 0.9675136804580688,...","{0: 0.9437569975852966, 1: 0.9583231806755066,..."
2,12376,Alright . um something live in not especially ...,468,"(((tf.Tensor(0.2296029, shape=(), dtype=float3...","{0: 0.9793382883071899, 1: 0.9280641674995422,...","{0: 0.9631506204605103, 1: 0.8845401406288147,...","{0: 0.8699365258216858, 1: 0.8641678094863892,...","{0: 0.8529810309410095, 1: 0.8757504820823669,...","{0: 0.8971152305603027, 1: 0.8880547881126404,...","{0: 0.8871526718139648, 1: 0.8790754079818726,...","{0: 0.9138968586921692, 1: 0.8940346240997314,...","{0: 0.9289028644561768, 1: 0.9067235589027405,...","{0: 0.9343280792236328, 1: 0.8785805702209473,..."
3,12630,um So I'm currently twenty-nine . I was born a...,966,"(((tf.Tensor(-0.005068084, shape=(), dtype=flo...","{0: 0.9611235857009888, 1: 0.8513863682746887,...","{0: 0.8886478543281555, 1: 0.9352755546569824,...","{0: 0.9448012709617615, 1: 0.9441168904304504,...","{0: 0.9476511478424072, 1: 0.955906331539154, ...","{0: 0.9564458727836609, 1: 0.9115990400314331,...","{0: 0.9250994920730591, 1: 0.8643674850463867,...","{0: 0.8707772493362427, 1: 0.9503769874572754,...","{0: 0.9378501176834106, 1: 0.8748310804367065,...","{0: 0.8926768898963928, 1: 0.9571195840835571,..."
4,13493,Mhm . I'm a thirty five hospital old man anyth...,134,"(((tf.Tensor(0.086834975, shape=(), dtype=floa...","{0: 0.9856374859809875, 1: 0.9595798850059509,...","{0: 0.9542460441589355, 1: 0.9372810125350952,...","{0: 0.9260008335113525, 1: 0.8819143176078796,...","{0: 0.8537870645523071, 1: 0.9364335536956787,...","{0: 0.9220288395881653, 1: 0.8871068358421326,...","{0: 0.8714859485626221, 1: 0.9122299551963806,...","{0: 0.8988337516784668, 1: 0.9003046751022339,...","{0: 0.8829924464225769, 1: 0.9176974296569824,...","{0: 0.9044548273086548, 1: 0.9129231572151184,..."


### ineff

In [None]:
# add empty new column
for k in ks:
  ineff10['t5_word_k' + k] = '' 

buglist = {}
for k in ks:
  cur = 't5_word_k' + k
  for i in ineff10.index:

    if i % 5 == 0:
      print('current line: ', i, " coherence k: ", k)

    if ineff10['n_words'][i] > 4:
      sent = ineff10['content'][i]   
      ineff10[cur][i] = {}
      for word_id in range(len(sent.strip().split(' '))):
        try:
          w1 = ineff10['t5_word_emb'][i][[0]][word_id]
          w2 = ineff10['t5_word_emb'][i][[0]][word_id+int(k)]
          ineff10[cur][i][word_id] = 1 - scipy.spatial.distance.cosine(w1, w2)       
        except:
          buglist[i] = word_id
          continue
      ineff10.to_csv(result + 'simulation_HV_ineff_vb_response_deid_10v3_t5.csv')

current line:  0  coherence k:  2


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


current line:  0  coherence k:  3
current line:  0  coherence k:  4
current line:  0  coherence k:  5
current line:  0  coherence k:  6
current line:  0  coherence k:  7
current line:  0  coherence k:  8
current line:  0  coherence k:  9
current line:  0  coherence k:  10


In [None]:
ineff10.head()

Unnamed: 0,grid,content,n_words,t5_word_emb,t5_word_k2,t5_word_k3,t5_word_k4,t5_word_k5,t5_word_k6,t5_word_k7,t5_word_k8,t5_word_k9,t5_word_k10
0,10455,"I'm a young man , an en an en- an engineer by ...",421,"(((tf.Tensor(-0.028594766, shape=(), dtype=flo...","{0: 0.93770831823349, 1: 0.9271951913833618, 2...","{0: 0.9028423428535461, 1: 0.9205344319343567,...","{0: 0.9036267399787903, 1: 0.9146499037742615,...","{0: 0.9054772853851318, 1: 0.8583513498306274,...","{0: 0.8545929789543152, 1: 0.9226922988891602,...","{0: 0.8916099667549133, 1: 0.885294497013092, ...","{0: 0.8828140497207642, 1: 0.8900607824325562,...","{0: 0.8598988056182861, 1: 0.9147745966911316,...","{0: 0.8689389824867249, 1: 0.89243084192276, 2..."
1,11689,Sure . I'm thirty three years old . My name is...,159,"(((tf.Tensor(0.0902944, shape=(), dtype=float3...","{0: 0.9474658966064453, 1: 0.9307212829589844,...","{0: 0.940139651298523, 1: 0.9485692977905273, ...","{0: 0.9309148192405701, 1: 0.8935031890869141,...","{0: 0.9258096814155579, 1: 0.9164783954620361,...","{0: 0.9545851945877075, 1: 0.9266420006752014,...","{0: 0.9633139967918396, 1: 0.920332133769989, ...","{0: 0.9488325715065002, 1: 0.8852593898773193,...","{0: 0.93753981590271, 1: 0.962739109992981, 2:...","{0: 0.9461092948913574, 1: 0.9556964635848999,..."
2,12376,Alright . um I live in not especially cool Spr...,468,"(((tf.Tensor(0.2577885, shape=(), dtype=float3...","{0: 0.9807088375091553, 1: 0.9471968412399292,...","{0: 0.9613189101219177, 1: 0.947767436504364, ...","{0: 0.9438981413841248, 1: 0.9247361421585083,...","{0: 0.9241453409194946, 1: 0.9050881266593933,...","{0: 0.9338926076889038, 1: 0.9174119830131531,...","{0: 0.9197142720222473, 1: 0.8997848629951477,...","{0: 0.9389548897743225, 1: 0.9117051362991333,...","{0: 0.9603933095932007, 1: 0.9544113874435425,...","{0: 0.9506217837333679, 1: 0.9115670323371887,..."
3,12630,um So I was born and raised in South Washingto...,966,"(((tf.Tensor(-0.002386301, shape=(), dtype=flo...","{0: 0.9592886567115784, 1: 0.9517082571983337,...","{0: 0.9580650329589844, 1: 0.9411338567733765,...","{0: 0.9478919506072998, 1: 0.9021629691123962,...","{0: 0.9135681986808777, 1: 0.919169545173645, ...","{0: 0.911134660243988, 1: 0.9060215353965759, ...","{0: 0.9334521293640137, 1: 0.8396468758583069,...","{0: 0.8837360143661499, 1: 0.921373724937439, ...","{0: 0.9360584020614624, 1: 0.8534910082817078,...","{0: 0.882914662361145, 1: 0.9415048956871033, ..."
4,13493,Mhm . I'm a thirty five year old man who uh um...,134,"(((tf.Tensor(-0.011912695, shape=(), dtype=flo...","{0: 0.9865770936012268, 1: 0.9645763635635376,...","{0: 0.9579520225524902, 1: 0.9565127491950989,...","{0: 0.9390543103218079, 1: 0.920798122882843, ...","{0: 0.895594596862793, 1: 0.9516465067863464, ...","{0: 0.9384924173355103, 1: 0.9181991815567017,...","{0: 0.9085126519203186, 1: 0.9266952276229858,...","{0: 0.9099721908569336, 1: 0.9225653409957886,...","{0: 0.9125969409942627, 1: 0.9155382513999939,...","{0: 0.9041608572006226, 1: 0.9197299480438232,..."


In [None]:
# add new empty column
for k in ks:
  ineff20['t5_word_k' + k] = '' 

buglist = {}
for k in ks:
  cur = 't5_word_k' + k
  for i in ineff20.index:

    if i % 5 == 0:
      print('current line: ', i, " coherence k: ", k)

    if ineff20['n_words'][i] > 4:
      sent = ineff20['content'][i]   
      ineff20[cur][i] = {}
      for word_id in range(len(sent.strip().split(' '))):
        try:
          w1 = ineff20['t5_word_emb'][i][[0]][word_id]
          w2 = ineff20['t5_word_emb'][i][[0]][word_id+int(k)]
          ineff20[cur][i][word_id] = 1 - scipy.spatial.distance.cosine(w1, w2)       
        except:
          buglist[i] = word_id
          continue
      ineff20.to_csv(result + 'simulation_HV_ineff_vb_response_deid_20v3_t5.csv')

current line:  0  coherence k:  2


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


current line:  0  coherence k:  3
current line:  0  coherence k:  4
current line:  0  coherence k:  5
current line:  0  coherence k:  6
current line:  0  coherence k:  7
current line:  0  coherence k:  8
current line:  0  coherence k:  9
current line:  0  coherence k:  10


In [None]:
ineff20.head()

Unnamed: 0,grid,content,n_words,t5_word_emb,t5_word_k2,t5_word_k3,t5_word_k4,t5_word_k5,t5_word_k6,t5_word_k7,t5_word_k8,t5_word_k9,t5_word_k10
0,10455,"I'm a young man , an en an en- an engineer by ...",421,"(((tf.Tensor(-0.030041385, shape=(), dtype=flo...","{0: 0.9393248558044434, 1: 0.9268133044242859,...","{0: 0.9028229713439941, 1: 0.9217650294303894,...","{0: 0.9052221179008484, 1: 0.9161338806152344,...","{0: 0.9069098234176636, 1: 0.863053023815155, ...","{0: 0.8604294061660767, 1: 0.924543559551239, ...","{0: 0.893572986125946, 1: 0.8857474327087402, ...","{0: 0.8853638172149658, 1: 0.8907713890075684,...","{0: 0.8618413805961609, 1: 0.9143766760826111,...","{0: 0.8681659698486328, 1: 0.8945172429084778,..."
1,11689,Sure . I'm thirty three years old . My name is...,159,"(((tf.Tensor(0.116554916, shape=(), dtype=floa...","{0: 0.9512450098991394, 1: 0.939349353313446, ...","{0: 0.9414694905281067, 1: 0.9516593813896179,...","{0: 0.9313738346099854, 1: 0.902847945690155, ...","{0: 0.9215516448020935, 1: 0.9141810536384583,...","{0: 0.9425660371780396, 1: 0.9139177203178406,...","{0: 0.9457446932792664, 1: 0.9142404794692993,...","{0: 0.9372137784957886, 1: 0.8834353685379028,...","{0: 0.9210046529769897, 1: 0.9554736614227295,...","{0: 0.9394871592521667, 1: 0.9547345638275146,..."
2,12376,Alright . um I live in not especially cool Spr...,468,"(((tf.Tensor(0.27444038, shape=(), dtype=float...","{0: 0.982474684715271, 1: 0.9495822191238403, ...","{0: 0.9692144989967346, 1: 0.9290608167648315,...","{0: 0.9267149567604065, 1: 0.9037085175514221,...","{0: 0.8967114686965942, 1: 0.8978587985038757,...","{0: 0.9219183921813965, 1: 0.9057729244232178,...","{0: 0.9091338515281677, 1: 0.9018996953964233,...","{0: 0.9337750673294067, 1: 0.9130892753601074,...","{0: 0.9556854367256165, 1: 0.9470876455307007,...","{0: 0.943213939666748, 1: 0.9051385521888733, ..."
3,12630,um So I was born and raised in South Washingto...,966,"(((tf.Tensor(0.0054256218, shape=(), dtype=flo...","{0: 0.9524444341659546, 1: 0.9556344151496887,...","{0: 0.9477974772453308, 1: 0.9516834020614624,...","{0: 0.9401721358299255, 1: 0.9148193597793579,...","{0: 0.9078903198242188, 1: 0.9202605485916138,...","{0: 0.8981078863143921, 1: 0.9145378470420837,...","{0: 0.9257009029388428, 1: 0.8543397188186646,...","{0: 0.8814666271209717, 1: 0.93388831615448, 2...","{0: 0.9289781451225281, 1: 0.8605290651321411,...","{0: 0.8730365633964539, 1: 0.959425687789917, ..."
4,13493,Mhm . I'm a thirty five year old man who uh um...,134,"(((tf.Tensor(-0.009741404, shape=(), dtype=flo...","{0: 0.9895449280738831, 1: 0.9505131244659424,...","{0: 0.946247398853302, 1: 0.9623329043388367, ...","{0: 0.9502137303352356, 1: 0.9238979816436768,...","{0: 0.9034985303878784, 1: 0.9303346276283264,...","{0: 0.918174684047699, 1: 0.9229857921600342, ...","{0: 0.9171938300132751, 1: 0.9215077757835388,...","{0: 0.9072182178497314, 1: 0.9167945981025696,...","{0: 0.9079545140266418, 1: 0.9038049578666687,...","{0: 0.8897554278373718, 1: 0.9097139239311218,..."


In [None]:
# add empty new column
for k in ks:
  ineff50['t5_word_k' + k] = '' 

for k in ks:
  cur = 't5_word_k' + k
  for i in ineff50.index:

    if i % 5 == 0:
      print('current line: ', i, " coherence k: ", k)

    if ineff50['n_words'][i] > 4:
      sent = ineff50['content'][i]   
      ineff50[cur][i] = {}
      for word_id in range(len(sent.strip().split(' '))):
        try:
          w1 = ineff50['t5_word_emb'][i][[0]][word_id]
          w2 = ineff50['t5_word_emb'][i][[0]][word_id+int(k)]
          ineff50[cur][i][word_id] = 1 - scipy.spatial.distance.cosine(w1, w2)       
        except:
          buglist[i] = word_id
          continue
      ineff50.to_csv(result + 'simulation_HV_ineff_vb_response_deid_50v3_t5.csv')

current line:  0  coherence k:  2


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


current line:  0  coherence k:  3
current line:  0  coherence k:  4
current line:  0  coherence k:  5
current line:  0  coherence k:  6
current line:  0  coherence k:  7
current line:  0  coherence k:  8
current line:  0  coherence k:  9
current line:  0  coherence k:  10


In [None]:
ineff50.head()

Unnamed: 0,grid,content,n_words,t5_word_emb,t5_word_k2,t5_word_k3,t5_word_k4,t5_word_k5,t5_word_k6,t5_word_k7,t5_word_k8,t5_word_k9,t5_word_k10
0,10455,"I'm a young man , an en an en- an engineer by ...",421,"(((tf.Tensor(-0.013111238, shape=(), dtype=flo...","{0: 0.9365220665931702, 1: 0.9148172736167908,...","{0: 0.9017055034637451, 1: 0.8717079162597656,...","{0: 0.871260404586792, 1: 0.8609915375709534, ...","{0: 0.8649638891220093, 1: 0.7779065370559692,...","{0: 0.7916779518127441, 1: 0.9149607419967651,...","{0: 0.8805050849914551, 1: 0.8690401315689087,...","{0: 0.8658959865570068, 1: 0.8592397570610046,...","{0: 0.8236812949180603, 1: 0.914543092250824, ...","{0: 0.8554473519325256, 1: 0.8556543588638306,..."
1,11689,We have been using that opportunity to do more...,159,"(((tf.Tensor(-0.12850149, shape=(), dtype=floa...","{0: 0.9383153915405273, 1: 0.9557745456695557,...","{0: 0.931777834892273, 1: 0.9409135580062866, ...","{0: 0.9208097457885742, 1: 0.9250185489654541,...","{0: 0.9166374206542969, 1: 0.9056608080863953,...","{0: 0.9018595814704895, 1: 0.9243329763412476,...","{0: 0.9215403199195862, 1: 0.925862193107605, ...","{0: 0.9126757383346558, 1: 0.9166074395179749,...","{0: 0.9175317883491516, 1: 0.9206204414367676,...","{0: 0.9280173778533936, 1: 0.9213743805885315,..."
2,12376,Alright . um I live in not especially cool Spr...,468,"(((tf.Tensor(0.2802947, shape=(), dtype=float3...","{0: 0.9825338125228882, 1: 0.9633914828300476,...","{0: 0.984454333782196, 1: 0.9628877639770508, ...","{0: 0.9690616726875305, 1: 0.922105610370636, ...","{0: 0.9330462217330933, 1: 0.9091662764549255,...","{0: 0.9344704151153564, 1: 0.9184996485710144,...","{0: 0.921454668045044, 1: 0.9163060188293457, ...","{0: 0.9532913565635681, 1: 0.9266148805618286,...","{0: 0.9637266397476196, 1: 0.9591965675354004,...","{0: 0.9525310397148132, 1: 0.919654369354248, ..."
3,12630,"My is things are fantastic . No , I mean My uh...",966,"(((tf.Tensor(0.18845989, shape=(), dtype=float...","{0: 0.919815719127655, 1: 0.9304170608520508, ...","{0: 0.9212266802787781, 1: 0.9312571883201599,...","{0: 0.9209229946136475, 1: 0.9350417256355286,...","{0: 0.9305492043495178, 1: 0.9560334086418152,...","{0: 0.9351797103881836, 1: 0.9267600178718567,...","{0: 0.9179043173789978, 1: 0.9416138529777527,...","{0: 0.9336761832237244, 1: 0.9579785466194153,...","{0: 0.9359200596809387, 1: 0.9408994913101196,...","{0: 0.9292818903923035, 1: 0.9607641696929932,..."
4,13493,Mhm . I still get to play . And my stock inves...,134,"(((tf.Tensor(0.004609654, shape=(), dtype=floa...","{0: 0.9850203990936279, 1: 0.9424539804458618,...","{0: 0.9499500393867493, 1: 0.9570140838623047,...","{0: 0.9476947784423828, 1: 0.9172466397285461,...","{0: 0.9070841670036316, 1: 0.895824670791626, ...","{0: 0.9051885008811951, 1: 0.8856618404388428,...","{0: 0.8997299075126648, 1: 0.9010818600654602,...","{0: 0.9113234281539917, 1: 0.8966946601867676,...","{0: 0.8873902559280396, 1: 0.9071586728096008,...","{0: 0.8998860716819763, 1: 0.9086571931838989,..."


# MV 510

In [None]:
def divide_chunks(l, n):
    # looping till length l
    for i in range(0, len(l), n): 
        yield l[i:i + n]

### baseline

In [None]:
df_sim = baseline

df_sim['t5_word_mv5'] = ''
df_sim['t5_word_mv10'] = ''

ks = ['5','10']
for k in ks:
  cur = 't5_word_mv' + k 

  for i in df_sim.index:
    if df_sim['n_words'][i] > 4: 
      response = df_sim['content'][i]
      t5_embed = df_sim['t5_word_emb'][i][[0]]

      # dictionary
      # with chunk_id and word_id in that 5-token chunk as key
      # and cosine similarity scores as value
      df_sim[cur][i] = {}

      word_embed_chunk = list(divide_chunks(range(len(response.strip().split(' '))),int(k)))

      # loop over each 5-token chunk
      for chunck_id, word_embed in enumerate(word_embed_chunk):
        for word_id in word_embed:
          try:
              w1 = t5_embed[word_id]
              w2 = t5_embed[word_id+1]
              df_sim[cur][i][str(chunck_id) + '_' + str(word_id)] = 1 - scipy.spatial.distance.cosine(w1, w2)
              # things that should have been done
              # to keep consistency across LMs
              # take average of similarities for each 5-token chunk
              # then append to a list that collects similarity means for that response
              # prep for stats

          except IndexError:
            continue
      
      # check progress
      if i % 50 == 0:
        print('finished line, len T5 mv510 ', i, len(df_sim['t5_word_mv5'][i])) 

      df_sim.to_csv(result + 'simulation_HV_baseline_vb_response_deid_v3_t5.csv')

baseline = df_sim
baseline.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


finished line, len T5 mv510  0 451
finished line, len T5 mv510  0 451


Unnamed: 0,grid,content,n_words,t5_word_emb,t5_word_k2,t5_word_k3,t5_word_k4,t5_word_k5,t5_word_k6,t5_word_k7,t5_word_k8,t5_word_k9,t5_word_k10,t5_word_mv5,t5_word_mv10
0,10455,"I'm a young man , an en an en- an engineer by ...",421,"(((tf.Tensor(-0.031055467, shape=(), dtype=flo...","{0: 0.9385946393013, 1: 0.9258050918579102, 2:...","{0: 0.9019278287887573, 1: 0.920067548751831, ...","{0: 0.9029392004013062, 1: 0.9147455096244812,...","{0: 0.9051128625869751, 1: 0.8612462282180786,...","{0: 0.8570860624313354, 1: 0.9214776754379272,...","{0: 0.8903406858444214, 1: 0.8851216435432434,...","{0: 0.8832559585571289, 1: 0.8904533386230469,...","{0: 0.8601043224334717, 1: 0.9124910235404968,...","{0: 0.8664131760597229, 1: 0.8918132185935974,...","{'0_0': 0.9797008633613586, '0_1': 0.915917336...","{'0_0': 0.9797008633613586, '0_1': 0.915917336..."
1,11689,Sure . I'm thirty three years old . My name is...,159,"(((tf.Tensor(0.1017546, shape=(), dtype=float3...","{0: 0.9589922428131104, 1: 0.9340031743049622,...","{0: 0.9381313920021057, 1: 0.9499139785766602,...","{0: 0.931617021560669, 1: 0.8904510140419006, ...","{0: 0.9201892614364624, 1: 0.9153001308441162,...","{0: 0.9502213001251221, 1: 0.9147319793701172,...","{0: 0.9525159001350403, 1: 0.9129528999328613,...","{0: 0.9399510025978088, 1: 0.8716737031936646,...","{0: 0.9208395481109619, 1: 0.9545168876647949,...","{0: 0.9450736045837402, 1: 0.9524898529052734,...","{'0_0': 0.970119297504425, '0_1': 0.9612366557...","{'0_0': 0.970119297504425, '0_1': 0.9612366557..."
2,12376,Alright . um I live in not especially cool Spr...,468,"(((tf.Tensor(0.2531035, shape=(), dtype=float3...","{0: 0.9821363687515259, 1: 0.9487715363502502,...","{0: 0.9518461227416992, 1: 0.9405548572540283,...","{0: 0.9367880821228027, 1: 0.9300448298454285,...","{0: 0.9244054555892944, 1: 0.9083446264266968,...","{0: 0.9351586699485779, 1: 0.9212504029273987,...","{0: 0.9248273968696594, 1: 0.9024133086204529,...","{0: 0.9406172633171082, 1: 0.910184383392334, ...","{0: 0.9614045023918152, 1: 0.9525409936904907,...","{0: 0.9527647495269775, 1: 0.9149569869041443,...","{'0_0': 0.9651936292648315, '0_1': 0.931818366...","{'0_0': 0.9651936292648315, '0_1': 0.931818366..."
3,12630,um So I'm currently twenty-nine . I was born a...,966,"(((tf.Tensor(-0.0014505647, shape=(), dtype=fl...","{0: 0.9561870694160461, 1: 0.843988299369812, ...","{0: 0.8871630430221558, 1: 0.9309291839599609,...","{0: 0.9419207572937012, 1: 0.9425077438354492,...","{0: 0.9435474276542664, 1: 0.9479793906211853,...","{0: 0.9482212066650391, 1: 0.9059748649597168,...","{0: 0.923056960105896, 1: 0.8556352853775024, ...","{0: 0.8662787079811096, 1: 0.9448565244674683,...","{0: 0.9278807640075684, 1: 0.8874595165252686,...","{0: 0.9098501205444336, 1: 0.9587600231170654,...","{'0_0': 0.9844767451286316, '0_1': 0.946709275...","{'0_0': 0.9844767451286316, '0_1': 0.946709275..."
4,13493,Mhm . I'm a thirty five year old man who uh um...,134,"(((tf.Tensor(0.0027231025, shape=(), dtype=flo...","{0: 0.9873718619346619, 1: 0.9673035144805908,...","{0: 0.9620503783226013, 1: 0.9596797227859497,...","{0: 0.9439643025398254, 1: 0.9171977639198303,...","{0: 0.8917063474655151, 1: 0.9548231959342957,...","{0: 0.9416873455047607, 1: 0.9162192940711975,...","{0: 0.9068412184715271, 1: 0.9328790903091431,...","{0: 0.9184191226959229, 1: 0.9241153001785278,...","{0: 0.9159093499183655, 1: 0.9162837266921997,...","{0: 0.9063208103179932, 1: 0.9213224649429321,...","{'0_0': 0.993344247341156, '0_1': 0.9902709126...","{'0_0': 0.993344247341156, '0_1': 0.9902709126..."


### incoh

In [None]:
df_sim = incoh10

df_sim['t5_word_mv5'] = ''
df_sim['t5_word_mv10'] = ''

ks = ['5','10']
for k in ks:
  cur = 't5_word_mv' + k 

  for i in df_sim.index:
    if df_sim['n_words'][i] > 4: 
      response = df_sim['content'][i]
      t5_embed = df_sim['t5_word_emb'][i][[0]]
      df_sim[cur][i] = {}
      word_embed_chunk = list(divide_chunks(range(len(response.strip().split(' '))),int(k)))

      for chunck_id, word_embed in enumerate(word_embed_chunk):
        for word_id in word_embed:
          try:
              w1 = t5_embed[word_id]
              w2 = t5_embed[word_id+1]
              df_sim[cur][i][str(chunck_id) + '_' + str(word_id)] = 1 - scipy.spatial.distance.cosine(w1, w2)
          except IndexError:
            continue
      if i % 50 == 0:
        print('finished line, len T5 mv510 ', i, len(df_sim['t5_word_mv5'][i]))  
      df_sim.to_csv(result + 'simulation_HV_incoh_vb_response_deid_10v3_t5.csv')

incoh10 = df_sim
incoh10.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


finished line, len T5 mv510  0 448
finished line, len T5 mv510  0 448


Unnamed: 0,grid,content,n_words,t5_word_emb,t5_word_k2,t5_word_k3,t5_word_k4,t5_word_k5,t5_word_k6,t5_word_k7,t5_word_k8,t5_word_k9,t5_word_k10,t5_word_mv5,t5_word_mv10
0,10455,"I'm a young man , an en an en- an engineer by ...",421,"(((tf.Tensor(-0.032051105, shape=(), dtype=flo...","{0: 0.9373521208763123, 1: 0.9286782145500183,...","{0: 0.9038329720497131, 1: 0.9229267835617065,...","{0: 0.9039842486381531, 1: 0.919945240020752, ...","{0: 0.9082852005958557, 1: 0.8758352398872375,...","{0: 0.8706684112548828, 1: 0.9219260811805725,...","{0: 0.8888946771621704, 1: 0.8911484479904175,...","{0: 0.8852677345275879, 1: 0.8907956480979919,...","{0: 0.8580191731452942, 1: 0.9099339246749878,...","{0: 0.8612217307090759, 1: 0.8918495774269104,...","{'0_0': 0.979918360710144, '0_1': 0.9164171218...","{'0_0': 0.979918360710144, '0_1': 0.9164171218..."
1,11689,Sure . I'm thirty three years good . My name i...,159,"(((tf.Tensor(0.11520728, shape=(), dtype=float...","{0: 0.953263521194458, 1: 0.9438183903694153, ...","{0: 0.9423492550849915, 1: 0.9521161317825317,...","{0: 0.9389676451683044, 1: 0.8653549551963806,...","{0: 0.9104220867156982, 1: 0.9233895540237427,...","{0: 0.9548823237419128, 1: 0.9286745190620422,...","{0: 0.9592475295066833, 1: 0.9401805400848389,...","{0: 0.9575971961021423, 1: 0.8827087879180908,...","{0: 0.9294165968894958, 1: 0.9600871205329895,...","{0: 0.9550607800483704, 1: 0.9594836831092834,...","{'0_0': 0.9725017547607422, '0_1': 0.962101936...","{'0_0': 0.9725017547607422, '0_1': 0.962101936..."
2,12376,Alright . um I live in not especially cool Spr...,468,"(((tf.Tensor(0.24389347, shape=(), dtype=float...","{0: 0.9816436171531677, 1: 0.9363804459571838,...","{0: 0.9410749673843384, 1: 0.9352023005485535,...","{0: 0.937721848487854, 1: 0.9280068278312683, ...","{0: 0.9265216588973999, 1: 0.9037998914718628,...","{0: 0.9335740208625793, 1: 0.9175204634666443,...","{0: 0.9206990003585815, 1: 0.8973239064216614,...","{0: 0.9360299706459045, 1: 0.9074298739433289,...","{0: 0.959687352180481, 1: 0.9503894448280334, ...","{0: 0.9489991068840027, 1: 0.9080482125282288,...","{'0_0': 0.9651522636413574, '0_1': 0.930575370...","{'0_0': 0.9651522636413574, '0_1': 0.930575370..."
3,12630,um So I'm currently twenty-nine . I was born a...,966,"(((tf.Tensor(0.0067433813, shape=(), dtype=flo...","{0: 0.9565696120262146, 1: 0.8485756516456604,...","{0: 0.891523003578186, 1: 0.9299404621124268, ...","{0: 0.9414768218994141, 1: 0.9414905905723572,...","{0: 0.9399988651275635, 1: 0.9501848816871643,...","{0: 0.9498633742332458, 1: 0.9115596413612366,...","{0: 0.9249114990234375, 1: 0.8631559610366821,...","{0: 0.8663604259490967, 1: 0.9504864811897278,...","{0: 0.9337615370750427, 1: 0.891411542892456, ...","{0: 0.9114145040512085, 1: 0.9567343592643738,...","{'0_0': 0.9809640049934387, '0_1': 0.947322368...","{'0_0': 0.9809640049934387, '0_1': 0.947322368..."
4,13493,Mhm . I'm a thirty five year old man who uh um...,134,"(((tf.Tensor(0.009083907, shape=(), dtype=floa...","{0: 0.9885735511779785, 1: 0.965068519115448, ...","{0: 0.960785984992981, 1: 0.9622253775596619, ...","{0: 0.9488998055458069, 1: 0.9223074913024902,...","{0: 0.8982526063919067, 1: 0.9517715573310852,...","{0: 0.9391500949859619, 1: 0.9144243001937866,...","{0: 0.9067327380180359, 1: 0.9329472184181213,...","{0: 0.9201116561889648, 1: 0.9211083054542542,...","{0: 0.9139900803565979, 1: 0.9118857979774475,...","{0: 0.9017068147659302, 1: 0.9178103804588318,...","{'0_0': 0.9933598637580872, '0_1': 0.990649163...","{'0_0': 0.9933598637580872, '0_1': 0.990649163..."


In [None]:
df_sim = incoh20

df_sim['t5_word_mv5'] = ''
df_sim['t5_word_mv10'] = ''

ks = ['5','10']
for k in ks:
  cur = 't5_word_mv' + k 

  for i in df_sim.index:
    if df_sim['n_words'][i] > 4: 
      response = df_sim['content'][i]
      t5_embed = df_sim['t5_word_emb'][i][[0]]
      df_sim[cur][i] = {}
      word_embed_chunk = list(divide_chunks(range(len(response.strip().split(' '))),int(k)))

      for chunck_id, word_embed in enumerate(word_embed_chunk):
        for word_id in word_embed:
          try:
              w1 = t5_embed[word_id]
              w2 = t5_embed[word_id+1]
              df_sim[cur][i][str(chunck_id) + '_' + str(word_id)] = 1 - scipy.spatial.distance.cosine(w1, w2)
          except IndexError:
            continue
      if i % 50 == 0:
        print('finished line, len T5 mv510 ', i, len(df_sim['t5_word_mv5'][i]))  
      df_sim.to_csv(result + 'simulation_HV_incoh_vb_response_deid_20v3_t5.csv')

incoh20 = df_sim
incoh20.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


finished line, len T5 mv510  0 448
finished line, len T5 mv510  0 448


Unnamed: 0,grid,content,n_words,t5_word_emb,t5_word_k2,t5_word_k3,t5_word_k4,t5_word_k5,t5_word_k6,t5_word_k7,t5_word_k8,t5_word_k9,t5_word_k10,t5_word_mv5,t5_word_mv10
0,10455,"I'm a young man , an en an en- an things by tr...",421,"(((tf.Tensor(-0.026766412, shape=(), dtype=flo...","{0: 0.9311738610267639, 1: 0.9156302213668823,...","{0: 0.8918306231498718, 1: 0.9145900011062622,...","{0: 0.8945391774177551, 1: 0.9154523611068726,...","{0: 0.908588171005249, 1: 0.8569351434707642, ...","{0: 0.8551972508430481, 1: 0.9111636877059937,...","{0: 0.8784613013267517, 1: 0.8749086260795593,...","{0: 0.8740364909172058, 1: 0.9044381380081177,...","{0: 0.8796566128730774, 1: 0.9189912676811218,...","{0: 0.8715235590934753, 1: 0.9048242568969727,...","{'0_0': 0.9769889116287231, '0_1': 0.904984772...","{'0_0': 0.9769889116287231, '0_1': 0.904984772..."
1,11689,Sure . I'm thirty three years medical . My nam...,159,"(((tf.Tensor(0.1074589, shape=(), dtype=float3...","{0: 0.9399039149284363, 1: 0.9386727213859558,...","{0: 0.9383133053779602, 1: 0.9497955441474915,...","{0: 0.9280366897583008, 1: 0.8977587819099426,...","{0: 0.9301880598068237, 1: 0.9372321367263794,...","{0: 0.9585370421409607, 1: 0.9416293501853943,...","{0: 0.9623845219612122, 1: 0.9378107190132141,...","{0: 0.9588989019393921, 1: 0.8571140766143799,...","{0: 0.901921272277832, 1: 0.9490194916725159, ...","{0: 0.9390887022018433, 1: 0.9414582252502441,...","{'0_0': 0.9684950113296509, '0_1': 0.955493330...","{'0_0': 0.9684950113296509, '0_1': 0.955493330..."
2,12376,Alright . um I live in not especially lazy Spr...,468,"(((tf.Tensor(0.24678273, shape=(), dtype=float...","{0: 0.9819713830947876, 1: 0.9351614117622375,...","{0: 0.9405555725097656, 1: 0.9413254857063293,...","{0: 0.9461461305618286, 1: 0.9341707229614258,...","{0: 0.9330058097839355, 1: 0.9139630198478699,...","{0: 0.9385047554969788, 1: 0.9258426427841187,...","{0: 0.9225903749465942, 1: 0.9073160290718079,...","{0: 0.9364153146743774, 1: 0.9246516227722168,...","{0: 0.9563087224960327, 1: 0.9587914943695068,...","{0: 0.9566920399665833, 1: 0.9196252822875977,...","{'0_0': 0.9675301909446716, '0_1': 0.933889567...","{'0_0': 0.9675301909446716, '0_1': 0.933889567..."
3,12630,um So I'm currently twenty-nine . I was born a...,966,"(((tf.Tensor(0.004973783, shape=(), dtype=floa...","{0: 0.9550102353096008, 1: 0.8532222509384155,...","{0: 0.8977928757667542, 1: 0.930566132068634, ...","{0: 0.9416907429695129, 1: 0.9373212456703186,...","{0: 0.9345315098762512, 1: 0.9513927102088928,...","{0: 0.9480693340301514, 1: 0.9160832762718201,...","{0: 0.9283491373062134, 1: 0.8675307631492615,...","{0: 0.8708361387252808, 1: 0.9483070969581604,...","{0: 0.9260889887809753, 1: 0.8936328887939453,...","{0: 0.913896918296814, 1: 0.9597697257995605, ...","{'0_0': 0.9800193905830383, '0_1': 0.943966329...","{'0_0': 0.9800193905830383, '0_1': 0.943966329..."
4,13493,Mhm . I'm a thirty five year old man who uh um...,134,"(((tf.Tensor(0.014040765, shape=(), dtype=floa...","{0: 0.9883755445480347, 1: 0.9669952392578125,...","{0: 0.9618869423866272, 1: 0.96144038438797, 2...","{0: 0.9472982883453369, 1: 0.9171295762062073,...","{0: 0.8940153121948242, 1: 0.9496671557426453,...","{0: 0.936241090297699, 1: 0.9066365361213684, ...","{0: 0.8979589939117432, 1: 0.9263139367103577,...","{0: 0.9102492332458496, 1: 0.9146944284439087,...","{0: 0.905790388584137, 1: 0.9117351770401001, ...","{0: 0.9004874229431152, 1: 0.9159801602363586,...","{'0_0': 0.9935663938522339, '0_1': 0.990822911...","{'0_0': 0.9935663938522339, '0_1': 0.990822911..."


In [None]:
df_sim = incoh50

df_sim['t5_word_mv5'] = ''
df_sim['t5_word_mv10'] = ''

ks = ['5','10']
for k in ks:
  cur = 't5_word_mv' + k 

  for i in df_sim.index:
    if df_sim['n_words'][i] > 4: 
      response = df_sim['content'][i]
      t5_embed = df_sim['t5_word_emb'][i][[0]]
      df_sim[cur][i] = {}
      word_embed_chunk = list(divide_chunks(range(len(response.strip().split(' '))),int(k)))

      for chunck_id, word_embed in enumerate(word_embed_chunk):
        for word_id in word_embed:
          try:
              w1 = t5_embed[word_id]
              w2 = t5_embed[word_id+1]
              df_sim[cur][i][str(chunck_id) + '_' + str(word_id)] = 1 - scipy.spatial.distance.cosine(w1, w2)
          except IndexError:
            continue
      if i % 50 == 0:
        print('finished line, len T5 mv510 ', i, len(df_sim['t5_word_mv5'][i]))  
      df_sim.to_csv(result + 'simulation_HV_incoh_vb_response_deid_50v3_t5.csv')

incoh50 = df_sim
incoh50.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


finished line, len T5 mv510  0 448
finished line, len T5 mv510  0 448


Unnamed: 0,grid,content,n_words,t5_word_emb,t5_word_k2,t5_word_k3,t5_word_k4,t5_word_k5,t5_word_k6,t5_word_k7,t5_word_k8,t5_word_k9,t5_word_k10,t5_word_mv5,t5_word_mv10
0,10455,"I'm a good year , an en an en- an responsibili...",421,"(((tf.Tensor(-0.06148188, shape=(), dtype=floa...","{0: 0.9321799278259277, 1: 0.9218968152999878,...","{0: 0.9035277366638184, 1: 0.9098954796791077,...","{0: 0.9057110548019409, 1: 0.8806290030479431,...","{0: 0.8907807469367981, 1: 0.8589577674865723,...","{0: 0.8734230995178223, 1: 0.887036144733429, ...","{0: 0.854229748249054, 1: 0.8469346165657043, ...","{0: 0.8579909801483154, 1: 0.8821069598197937,...","{0: 0.8578127026557922, 1: 0.9118313193321228,...","{0: 0.8580976128578186, 1: 0.8822059631347656,...","{'0_0': 0.9749890565872192, '0_1': 0.899482250...","{'0_0': 0.9749890565872192, '0_1': 0.899482250..."
1,11689,Sure . I'm thirty three years anxious . My nam...,159,"(((tf.Tensor(0.1386814, shape=(), dtype=float3...","{0: 0.9465814828872681, 1: 0.9467255473136902,...","{0: 0.946717381477356, 1: 0.9604710340499878, ...","{0: 0.9371103048324585, 1: 0.9106454253196716,...","{0: 0.9294113516807556, 1: 0.9404101967811584,...","{0: 0.9566912651062012, 1: 0.939863383769989, ...","{0: 0.9589848518371582, 1: 0.9299648404121399,...","{0: 0.9429657459259033, 1: 0.8906495571136475,...","{0: 0.9161661863327026, 1: 0.9675136804580688,...","{0: 0.9437569975852966, 1: 0.9583231806755066,...","{'0_0': 0.974696159362793, '0_1': 0.9549526572...","{'0_0': 0.974696159362793, '0_1': 0.9549526572..."
2,12376,Alright . um something live in not especially ...,468,"(((tf.Tensor(0.2296029, shape=(), dtype=float3...","{0: 0.9793382883071899, 1: 0.9280641674995422,...","{0: 0.9631506204605103, 1: 0.8845401406288147,...","{0: 0.8699365258216858, 1: 0.8641678094863892,...","{0: 0.8529810309410095, 1: 0.8757504820823669,...","{0: 0.8971152305603027, 1: 0.8880547881126404,...","{0: 0.8871526718139648, 1: 0.8790754079818726,...","{0: 0.9138968586921692, 1: 0.8940346240997314,...","{0: 0.9289028644561768, 1: 0.9067235589027405,...","{0: 0.9343280792236328, 1: 0.8785805702209473,...","{'0_0': 0.9626310467720032, '0_1': 0.922147631...","{'0_0': 0.9626310467720032, '0_1': 0.922147631..."
3,12630,um So I'm currently twenty-nine . I was born a...,966,"(((tf.Tensor(-0.005068084, shape=(), dtype=flo...","{0: 0.9611235857009888, 1: 0.8513863682746887,...","{0: 0.8886478543281555, 1: 0.9352755546569824,...","{0: 0.9448012709617615, 1: 0.9441168904304504,...","{0: 0.9476511478424072, 1: 0.955906331539154, ...","{0: 0.9564458727836609, 1: 0.9115990400314331,...","{0: 0.9250994920730591, 1: 0.8643674850463867,...","{0: 0.8707772493362427, 1: 0.9503769874572754,...","{0: 0.9378501176834106, 1: 0.8748310804367065,...","{0: 0.8926768898963928, 1: 0.9571195840835571,...","{'0_0': 0.9859359860420227, '0_1': 0.953445255...","{'0_0': 0.9859359860420227, '0_1': 0.953445255..."
4,13493,Mhm . I'm a thirty five hospital old man anyth...,134,"(((tf.Tensor(0.086834975, shape=(), dtype=floa...","{0: 0.9856374859809875, 1: 0.9595798850059509,...","{0: 0.9542460441589355, 1: 0.9372810125350952,...","{0: 0.9260008335113525, 1: 0.8819143176078796,...","{0: 0.8537870645523071, 1: 0.9364335536956787,...","{0: 0.9220288395881653, 1: 0.8871068358421326,...","{0: 0.8714859485626221, 1: 0.9122299551963806,...","{0: 0.8988337516784668, 1: 0.9003046751022339,...","{0: 0.8829924464225769, 1: 0.9176974296569824,...","{0: 0.9044548273086548, 1: 0.9129231572151184,...","{'0_0': 0.9921316504478455, '0_1': 0.987876653...","{'0_0': 0.9921316504478455, '0_1': 0.987876653..."


### ineff

In [None]:
df_sim = ineff10

df_sim['t5_word_mv5'] = ''
df_sim['t5_word_mv10'] = ''

ks = ['5','10']
for k in ks:
  cur = 't5_word_mv' + k 

  for i in df_sim.index:
    if df_sim['n_words'][i] > 4: 
      response = df_sim['content'][i]
      t5_embed = df_sim['t5_word_emb'][i][[0]]
      df_sim[cur][i] = {}
      word_embed_chunk = list(divide_chunks(range(len(response.strip().split(' '))),int(k)))

      for chunck_id, word_embed in enumerate(word_embed_chunk):
        for word_id in word_embed:
          try:
              w1 = t5_embed[word_id]
              w2 = t5_embed[word_id+1]
              df_sim[cur][i][str(chunck_id) + '_' + str(word_id)] = 1 - scipy.spatial.distance.cosine(w1, w2)
          except IndexError:
            continue
      if i % 50 == 0:
        print('finished line, len T5 mv510 ', i, len(df_sim['t5_word_mv5'][i]))  
      df_sim.to_csv(result + 'simulation_HV_ineff_vb_response_deid_10v3_t5.csv')

ineff10 = df_sim
ineff10.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


finished line, len T5 mv510  0 450
finished line, len T5 mv510  0 450


Unnamed: 0,grid,content,n_words,t5_word_emb,t5_word_k2,t5_word_k3,t5_word_k4,t5_word_k5,t5_word_k6,t5_word_k7,t5_word_k8,t5_word_k9,t5_word_k10,t5_word_mv5,t5_word_mv10
0,10455,"I'm a young man , an en an en- an engineer by ...",421,"(((tf.Tensor(-0.028594766, shape=(), dtype=flo...","{0: 0.93770831823349, 1: 0.9271951913833618, 2...","{0: 0.9028423428535461, 1: 0.9205344319343567,...","{0: 0.9036267399787903, 1: 0.9146499037742615,...","{0: 0.9054772853851318, 1: 0.8583513498306274,...","{0: 0.8545929789543152, 1: 0.9226922988891602,...","{0: 0.8916099667549133, 1: 0.885294497013092, ...","{0: 0.8828140497207642, 1: 0.8900607824325562,...","{0: 0.8598988056182861, 1: 0.9147745966911316,...","{0: 0.8689389824867249, 1: 0.89243084192276, 2...","{'0_0': 0.9800369143486023, '0_1': 0.915655970...","{'0_0': 0.9800369143486023, '0_1': 0.915655970..."
1,11689,Sure . I'm thirty three years old . My name is...,159,"(((tf.Tensor(0.0902944, shape=(), dtype=float3...","{0: 0.9474658966064453, 1: 0.9307212829589844,...","{0: 0.940139651298523, 1: 0.9485692977905273, ...","{0: 0.9309148192405701, 1: 0.8935031890869141,...","{0: 0.9258096814155579, 1: 0.9164783954620361,...","{0: 0.9545851945877075, 1: 0.9266420006752014,...","{0: 0.9633139967918396, 1: 0.920332133769989, ...","{0: 0.9488325715065002, 1: 0.8852593898773193,...","{0: 0.93753981590271, 1: 0.962739109992981, 2:...","{0: 0.9461092948913574, 1: 0.9556964635848999,...","{'0_0': 0.9672327637672424, '0_1': 0.952582061...","{'0_0': 0.9672327637672424, '0_1': 0.952582061..."
2,12376,Alright . um I live in not especially cool Spr...,468,"(((tf.Tensor(0.2577885, shape=(), dtype=float3...","{0: 0.9807088375091553, 1: 0.9471968412399292,...","{0: 0.9613189101219177, 1: 0.947767436504364, ...","{0: 0.9438981413841248, 1: 0.9247361421585083,...","{0: 0.9241453409194946, 1: 0.9050881266593933,...","{0: 0.9338926076889038, 1: 0.9174119830131531,...","{0: 0.9197142720222473, 1: 0.8997848629951477,...","{0: 0.9389548897743225, 1: 0.9117051362991333,...","{0: 0.9603933095932007, 1: 0.9544113874435425,...","{0: 0.9506217837333679, 1: 0.9115670323371887,...","{'0_0': 0.9676647782325745, '0_1': 0.929360747...","{'0_0': 0.9676647782325745, '0_1': 0.929360747..."
3,12630,um So I was born and raised in South Washingto...,966,"(((tf.Tensor(-0.002386301, shape=(), dtype=flo...","{0: 0.9592886567115784, 1: 0.9517082571983337,...","{0: 0.9580650329589844, 1: 0.9411338567733765,...","{0: 0.9478919506072998, 1: 0.9021629691123962,...","{0: 0.9135681986808777, 1: 0.919169545173645, ...","{0: 0.911134660243988, 1: 0.9060215353965759, ...","{0: 0.9334521293640137, 1: 0.8396468758583069,...","{0: 0.8837360143661499, 1: 0.921373724937439, ...","{0: 0.9360584020614624, 1: 0.8534910082817078,...","{0: 0.882914662361145, 1: 0.9415048956871033, ...","{'0_0': 0.982820987701416, '0_1': 0.9491128325...","{'0_0': 0.982820987701416, '0_1': 0.9491128325..."
4,13493,Mhm . I'm a thirty five year old man who uh um...,134,"(((tf.Tensor(-0.011912695, shape=(), dtype=flo...","{0: 0.9865770936012268, 1: 0.9645763635635376,...","{0: 0.9579520225524902, 1: 0.9565127491950989,...","{0: 0.9390543103218079, 1: 0.920798122882843, ...","{0: 0.895594596862793, 1: 0.9516465067863464, ...","{0: 0.9384924173355103, 1: 0.9181991815567017,...","{0: 0.9085126519203186, 1: 0.9266952276229858,...","{0: 0.9099721908569336, 1: 0.9225653409957886,...","{0: 0.9125969409942627, 1: 0.9155382513999939,...","{0: 0.9041608572006226, 1: 0.9197299480438232,...","{'0_0': 0.9932799339294434, '0_1': 0.989856183...","{'0_0': 0.9932799339294434, '0_1': 0.989856183..."


In [None]:
df_sim = ineff20

df_sim['t5_word_mv5'] = ''
df_sim['t5_word_mv10'] = ''

ks = ['5','10']
for k in ks:
  cur = 't5_word_mv' + k 

  for i in df_sim.index:
    if df_sim['n_words'][i] > 4: 
      response = df_sim['content'][i]
      t5_embed = df_sim['t5_word_emb'][i][[0]]
      df_sim[cur][i] = {}
      word_embed_chunk = list(divide_chunks(range(len(response.strip().split(' '))),int(k)))

      for chunck_id, word_embed in enumerate(word_embed_chunk):
        for word_id in word_embed:
          try:
              w1 = t5_embed[word_id]
              w2 = t5_embed[word_id+1]
              df_sim[cur][i][str(chunck_id) + '_' + str(word_id)] = 1 - scipy.spatial.distance.cosine(w1, w2)
          except IndexError:
            continue
      if i % 50 == 0:
        print('finished line, len T5 mv510 ', i, len(df_sim['t5_word_mv5'][i]))  
      df_sim.to_csv(result + 'simulation_HV_ineff_vb_response_deid_20v3_t5.csv')

ineff20 = df_sim
ineff20.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


finished line, len T5 mv510  0 477
finished line, len T5 mv510  0 477


Unnamed: 0,grid,content,n_words,t5_word_emb,t5_word_k2,t5_word_k3,t5_word_k4,t5_word_k5,t5_word_k6,t5_word_k7,t5_word_k8,t5_word_k9,t5_word_k10,t5_word_mv5,t5_word_mv10
0,10455,"I'm a young man , an en an en- an engineer by ...",421,"(((tf.Tensor(-0.030041385, shape=(), dtype=flo...","{0: 0.9393248558044434, 1: 0.9268133044242859,...","{0: 0.9028229713439941, 1: 0.9217650294303894,...","{0: 0.9052221179008484, 1: 0.9161338806152344,...","{0: 0.9069098234176636, 1: 0.863053023815155, ...","{0: 0.8604294061660767, 1: 0.924543559551239, ...","{0: 0.893572986125946, 1: 0.8857474327087402, ...","{0: 0.8853638172149658, 1: 0.8907713890075684,...","{0: 0.8618413805961609, 1: 0.9143766760826111,...","{0: 0.8681659698486328, 1: 0.8945172429084778,...","{'0_0': 0.9797981977462769, '0_1': 0.916694581...","{'0_0': 0.9797981977462769, '0_1': 0.916694581..."
1,11689,Sure . I'm thirty three years old . My name is...,159,"(((tf.Tensor(0.116554916, shape=(), dtype=floa...","{0: 0.9512450098991394, 1: 0.939349353313446, ...","{0: 0.9414694905281067, 1: 0.9516593813896179,...","{0: 0.9313738346099854, 1: 0.902847945690155, ...","{0: 0.9215516448020935, 1: 0.9141810536384583,...","{0: 0.9425660371780396, 1: 0.9139177203178406,...","{0: 0.9457446932792664, 1: 0.9142404794692993,...","{0: 0.9372137784957886, 1: 0.8834353685379028,...","{0: 0.9210046529769897, 1: 0.9554736614227295,...","{0: 0.9394871592521667, 1: 0.9547345638275146,...","{'0_0': 0.9746569991111755, '0_1': 0.957992255...","{'0_0': 0.9746569991111755, '0_1': 0.957992255..."
2,12376,Alright . um I live in not especially cool Spr...,468,"(((tf.Tensor(0.27444038, shape=(), dtype=float...","{0: 0.982474684715271, 1: 0.9495822191238403, ...","{0: 0.9692144989967346, 1: 0.9290608167648315,...","{0: 0.9267149567604065, 1: 0.9037085175514221,...","{0: 0.8967114686965942, 1: 0.8978587985038757,...","{0: 0.9219183921813965, 1: 0.9057729244232178,...","{0: 0.9091338515281677, 1: 0.9018996953964233,...","{0: 0.9337750673294067, 1: 0.9130892753601074,...","{0: 0.9556854367256165, 1: 0.9470876455307007,...","{0: 0.943213939666748, 1: 0.9051385521888733, ...","{'0_0': 0.9703015089035034, '0_1': 0.936060130...","{'0_0': 0.9703015089035034, '0_1': 0.936060130..."
3,12630,um So I was born and raised in South Washingto...,966,"(((tf.Tensor(0.0054256218, shape=(), dtype=flo...","{0: 0.9524444341659546, 1: 0.9556344151496887,...","{0: 0.9477974772453308, 1: 0.9516834020614624,...","{0: 0.9401721358299255, 1: 0.9148193597793579,...","{0: 0.9078903198242188, 1: 0.9202605485916138,...","{0: 0.8981078863143921, 1: 0.9145378470420837,...","{0: 0.9257009029388428, 1: 0.8543397188186646,...","{0: 0.8814666271209717, 1: 0.93388831615448, 2...","{0: 0.9289781451225281, 1: 0.8605290651321411,...","{0: 0.8730365633964539, 1: 0.959425687789917, ...","{'0_0': 0.9843391180038452, '0_1': 0.954006373...","{'0_0': 0.9843391180038452, '0_1': 0.954006373..."
4,13493,Mhm . I'm a thirty five year old man who uh um...,134,"(((tf.Tensor(-0.009741404, shape=(), dtype=flo...","{0: 0.9895449280738831, 1: 0.9505131244659424,...","{0: 0.946247398853302, 1: 0.9623329043388367, ...","{0: 0.9502137303352356, 1: 0.9238979816436768,...","{0: 0.9034985303878784, 1: 0.9303346276283264,...","{0: 0.918174684047699, 1: 0.9229857921600342, ...","{0: 0.9171938300132751, 1: 0.9215077757835388,...","{0: 0.9072182178497314, 1: 0.9167945981025696,...","{0: 0.9079545140266418, 1: 0.9038049578666687,...","{0: 0.8897554278373718, 1: 0.9097139239311218,...","{'0_0': 0.9921557307243347, '0_1': 0.991401374...","{'0_0': 0.9921557307243347, '0_1': 0.991401374..."


In [None]:
df_sim = ineff50

df_sim['t5_word_mv5'] = ''
df_sim['t5_word_mv10'] = ''

ks = ['5','10']
for k in ks:
  cur = 't5_word_mv' + k 

  for i in df_sim.index:
    if df_sim['n_words'][i] > 4: 
      response = df_sim['content'][i]
      t5_embed = df_sim['t5_word_emb'][i][[0]]
      df_sim[cur][i] = {}
      word_embed_chunk = list(divide_chunks(range(len(response.strip().split(' '))),int(k)))

      for chunck_id, word_embed in enumerate(word_embed_chunk):
        for word_id in word_embed:
          try:
              w1 = t5_embed[word_id]
              w2 = t5_embed[word_id+1]
              df_sim[cur][i][str(chunck_id) + '_' + str(word_id)] = 1 - scipy.spatial.distance.cosine(w1, w2)
          except IndexError:
            continue
      if i % 50 == 0:
        print('finished line, len T5 mv510 ', i, len(df_sim['t5_word_mv5'][i]))  
      df_sim.to_csv(result + 'simulation_HV_ineff_vb_response_deid_50v3_t5.csv')

ineff50 = df_sim
ineff50.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


finished line, len T5 mv510  0 399
finished line, len T5 mv510  0 399


Unnamed: 0,grid,content,n_words,t5_word_emb,t5_word_k2,t5_word_k3,t5_word_k4,t5_word_k5,t5_word_k6,t5_word_k7,t5_word_k8,t5_word_k9,t5_word_k10,t5_word_mv5,t5_word_mv10
0,10455,"I'm a young man , an en an en- an engineer by ...",421,"(((tf.Tensor(-0.013111238, shape=(), dtype=flo...","{0: 0.9365220665931702, 1: 0.9148172736167908,...","{0: 0.9017055034637451, 1: 0.8717079162597656,...","{0: 0.871260404586792, 1: 0.8609915375709534, ...","{0: 0.8649638891220093, 1: 0.7779065370559692,...","{0: 0.7916779518127441, 1: 0.9149607419967651,...","{0: 0.8805050849914551, 1: 0.8690401315689087,...","{0: 0.8658959865570068, 1: 0.8592397570610046,...","{0: 0.8236812949180603, 1: 0.914543092250824, ...","{0: 0.8554473519325256, 1: 0.8556543588638306,...","{'0_0': 0.9752288460731506, '0_1': 0.901787817...","{'0_0': 0.9752288460731506, '0_1': 0.901787817..."
1,11689,We have been using that opportunity to do more...,159,"(((tf.Tensor(-0.12850149, shape=(), dtype=floa...","{0: 0.9383153915405273, 1: 0.9557745456695557,...","{0: 0.931777834892273, 1: 0.9409135580062866, ...","{0: 0.9208097457885742, 1: 0.9250185489654541,...","{0: 0.9166374206542969, 1: 0.9056608080863953,...","{0: 0.9018595814704895, 1: 0.9243329763412476,...","{0: 0.9215403199195862, 1: 0.925862193107605, ...","{0: 0.9126757383346558, 1: 0.9166074395179749,...","{0: 0.9175317883491516, 1: 0.9206204414367676,...","{0: 0.9280173778533936, 1: 0.9213743805885315,...","{'0_0': 0.9705877900123596, '0_1': 0.964759409...","{'0_0': 0.9705877900123596, '0_1': 0.964759409..."
2,12376,Alright . um I live in not especially cool Spr...,468,"(((tf.Tensor(0.2802947, shape=(), dtype=float3...","{0: 0.9825338125228882, 1: 0.9633914828300476,...","{0: 0.984454333782196, 1: 0.9628877639770508, ...","{0: 0.9690616726875305, 1: 0.922105610370636, ...","{0: 0.9330462217330933, 1: 0.9091662764549255,...","{0: 0.9344704151153564, 1: 0.9184996485710144,...","{0: 0.921454668045044, 1: 0.9163060188293457, ...","{0: 0.9532913565635681, 1: 0.9266148805618286,...","{0: 0.9637266397476196, 1: 0.9591965675354004,...","{0: 0.9525310397148132, 1: 0.919654369354248, ...","{'0_0': 0.974330723285675, '0_1': 0.9428528547...","{'0_0': 0.974330723285675, '0_1': 0.9428528547..."
3,12630,"My is things are fantastic . No , I mean My uh...",966,"(((tf.Tensor(0.18845989, shape=(), dtype=float...","{0: 0.919815719127655, 1: 0.9304170608520508, ...","{0: 0.9212266802787781, 1: 0.9312571883201599,...","{0: 0.9209229946136475, 1: 0.9350417256355286,...","{0: 0.9305492043495178, 1: 0.9560334086418152,...","{0: 0.9351797103881836, 1: 0.9267600178718567,...","{0: 0.9179043173789978, 1: 0.9416138529777527,...","{0: 0.9336761832237244, 1: 0.9579785466194153,...","{0: 0.9359200596809387, 1: 0.9408994913101196,...","{0: 0.9292818903923035, 1: 0.9607641696929932,...","{'0_0': 0.9920627474784851, '0_1': 0.925420820...","{'0_0': 0.9920627474784851, '0_1': 0.925420820..."
4,13493,Mhm . I still get to play . And my stock inves...,134,"(((tf.Tensor(0.004609654, shape=(), dtype=floa...","{0: 0.9850203990936279, 1: 0.9424539804458618,...","{0: 0.9499500393867493, 1: 0.9570140838623047,...","{0: 0.9476947784423828, 1: 0.9172466397285461,...","{0: 0.9070841670036316, 1: 0.895824670791626, ...","{0: 0.9051885008811951, 1: 0.8856618404388428,...","{0: 0.8997299075126648, 1: 0.9010818600654602,...","{0: 0.9113234281539917, 1: 0.8966946601867676,...","{0: 0.8873902559280396, 1: 0.9071586728096008,...","{0: 0.8998860716819763, 1: 0.9086571931838989,...","{'0_0': 0.9935879707336426, '0_1': 0.991262078...","{'0_0': 0.9935879707336426, '0_1': 0.991262078..."


# Add stats

### baseline

In [None]:
# create new empty columns
ks = ['2','3','4','5','6','7','8','9','10']
for k in ks:
  baseline['t5_word_sum_' + k] = ''

# loop over each k, with k in 2 through 10
# take the average of the similarity scores for each response
# note in the aforementioned calcuations,
# each response got its own list of cosine similarity scores
for k in ks:
  cur = 't5_word_k' + k
  for i in baseline.index:
    baseline['t5_word_sum_' + k][i] = sum(dict(baseline[cur][i]).values()) / len(dict(baseline[cur][i]))
    # more stats to be added

baseline.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


Unnamed: 0,grid,content,n_words,t5_word_emb,t5_word_k2,t5_word_k3,t5_word_k4,t5_word_k5,t5_word_k6,t5_word_k7,...,t5_word_mv10,t5_word_sum_2,t5_word_sum_3,t5_word_sum_4,t5_word_sum_5,t5_word_sum_6,t5_word_sum_7,t5_word_sum_8,t5_word_sum_9,t5_word_sum_10
0,10455,"I'm a young man , an en an en- an engineer by ...",421,"(((tf.Tensor(-0.031055467, shape=(), dtype=flo...","{0: 0.9385946393013, 1: 0.9258050918579102, 2:...","{0: 0.9019278287887573, 1: 0.920067548751831, ...","{0: 0.9029392004013062, 1: 0.9147455096244812,...","{0: 0.9051128625869751, 1: 0.8612462282180786,...","{0: 0.8570860624313354, 1: 0.9214776754379272,...","{0: 0.8903406858444214, 1: 0.8851216435432434,...",...,"{'0_0': 0.9797008633613586, '0_1': 0.915917336...",0.974332,0.974304,0.971742,0.970383,0.971782,0.970549,0.970631,0.969079,0.970284
1,11689,Sure . I'm thirty three years old . My name is...,159,"(((tf.Tensor(0.1017546, shape=(), dtype=float3...","{0: 0.9589922428131104, 1: 0.9340031743049622,...","{0: 0.9381313920021057, 1: 0.9499139785766602,...","{0: 0.931617021560669, 1: 0.8904510140419006, ...","{0: 0.9201892614364624, 1: 0.9153001308441162,...","{0: 0.9502213001251221, 1: 0.9147319793701172,...","{0: 0.9525159001350403, 1: 0.9129528999328613,...",...,"{'0_0': 0.970119297504425, '0_1': 0.9612366557...",0.966319,0.964062,0.959661,0.959033,0.961266,0.96116,0.960819,0.958714,0.959886
2,12376,Alright . um I live in not especially cool Spr...,468,"(((tf.Tensor(0.2531035, shape=(), dtype=float3...","{0: 0.9821363687515259, 1: 0.9487715363502502,...","{0: 0.9518461227416992, 1: 0.9405548572540283,...","{0: 0.9367880821228027, 1: 0.9300448298454285,...","{0: 0.9244054555892944, 1: 0.9083446264266968,...","{0: 0.9351586699485779, 1: 0.9212504029273987,...","{0: 0.9248273968696594, 1: 0.9024133086204529,...",...,"{'0_0': 0.9651936292648315, '0_1': 0.931818366...",0.979953,0.978773,0.97734,0.976732,0.977259,0.976741,0.976805,0.976807,0.977098
3,12630,um So I'm currently twenty-nine . I was born a...,966,"(((tf.Tensor(-0.0014505647, shape=(), dtype=fl...","{0: 0.9561870694160461, 1: 0.843988299369812, ...","{0: 0.8871630430221558, 1: 0.9309291839599609,...","{0: 0.9419207572937012, 1: 0.9425077438354492,...","{0: 0.9435474276542664, 1: 0.9479793906211853,...","{0: 0.9482212066650391, 1: 0.9059748649597168,...","{0: 0.923056960105896, 1: 0.8556352853775024, ...",...,"{'0_0': 0.9844767451286316, '0_1': 0.946709275...",0.983688,0.983234,0.982667,0.982869,0.981908,0.982338,0.982456,0.982929,0.982166
4,13493,Mhm . I'm a thirty five year old man who uh um...,134,"(((tf.Tensor(0.0027231025, shape=(), dtype=flo...","{0: 0.9873718619346619, 1: 0.9673035144805908,...","{0: 0.9620503783226013, 1: 0.9596797227859497,...","{0: 0.9439643025398254, 1: 0.9171977639198303,...","{0: 0.8917063474655151, 1: 0.9548231959342957,...","{0: 0.9416873455047607, 1: 0.9162192940711975,...","{0: 0.9068412184715271, 1: 0.9328790903091431,...",...,"{'0_0': 0.993344247341156, '0_1': 0.9902709126...",0.956534,0.953152,0.950388,0.945983,0.945185,0.94565,0.942052,0.942349,0.943858


In [1]:
13//5 + 1

3

In [None]:
# create new empty columns
ks = ['5','10']
for k in ks:
  baseline['t5_word_sum_mv' + k] = ''
  for i in baseline.index:
    # skip if it's nan
    if type(baseline['t5_word_mv' + k][i]) != float: 

      # get the average of cosine similarity scores for each response
      # through dividing the summation of similarities by the number of 5-token chunks
      # alternative, try np.nanmean()
      baseline['t5_word_sum_mv' + k][i] = sum(dict(baseline['t5_word_mv' + k][i]).values()) / ((len(dict(baseline['t5_word_mv' + k][i])) // int(k)) + 1) 

      # add more stats here

baseline.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


Unnamed: 0,grid,content,n_words,t5_word_emb,t5_word_k2,t5_word_k3,t5_word_k4,t5_word_k5,t5_word_k6,t5_word_k7,...,t5_word_sum_3,t5_word_sum_4,t5_word_sum_5,t5_word_sum_6,t5_word_sum_7,t5_word_sum_8,t5_word_sum_9,t5_word_sum_10,t5_word_sum_mv5,t5_word_sum_mv10
0,10455,"I'm a young man , an en an en- an engineer by ...",421,"(((tf.Tensor(-0.031055467, shape=(), dtype=flo...","{0: 0.9385946393013, 1: 0.9258050918579102, 2:...","{0: 0.9019278287887573, 1: 0.920067548751831, ...","{0: 0.9029392004013062, 1: 0.9147455096244812,...","{0: 0.9051128625869751, 1: 0.8612462282180786,...","{0: 0.8570860624313354, 1: 0.9214776754379272,...","{0: 0.8903406858444214, 1: 0.8851216435432434,...",...,0.974304,0.971742,0.970383,0.971782,0.970549,0.970631,0.969079,0.970284,4.838074,9.570972
1,11689,Sure . I'm thirty three years old . My name is...,159,"(((tf.Tensor(0.1017546, shape=(), dtype=float3...","{0: 0.9589922428131104, 1: 0.9340031743049622,...","{0: 0.9381313920021057, 1: 0.9499139785766602,...","{0: 0.931617021560669, 1: 0.8904510140419006, ...","{0: 0.9201892614364624, 1: 0.9153001308441162,...","{0: 0.9502213001251221, 1: 0.9147319793701172,...","{0: 0.9525159001350403, 1: 0.9129528999328613,...",...,0.964062,0.959661,0.959033,0.961266,0.96116,0.960819,0.958714,0.959886,4.795688,9.591375
2,12376,Alright . um I live in not especially cool Spr...,468,"(((tf.Tensor(0.2531035, shape=(), dtype=float3...","{0: 0.9821363687515259, 1: 0.9487715363502502,...","{0: 0.9518461227416992, 1: 0.9405548572540283,...","{0: 0.9367880821228027, 1: 0.9300448298454285,...","{0: 0.9244054555892944, 1: 0.9083446264266968,...","{0: 0.9351586699485779, 1: 0.9212504029273987,...","{0: 0.9248273968696594, 1: 0.9024133086204529,...",...,0.978773,0.97734,0.976732,0.977259,0.976741,0.976805,0.976807,0.977098,4.876806,9.753612
3,12630,um So I'm currently twenty-nine . I was born a...,966,"(((tf.Tensor(-0.0014505647, shape=(), dtype=fl...","{0: 0.9561870694160461, 1: 0.843988299369812, ...","{0: 0.8871630430221558, 1: 0.9309291839599609,...","{0: 0.9419207572937012, 1: 0.9425077438354492,...","{0: 0.9435474276542664, 1: 0.9479793906211853,...","{0: 0.9482212066650391, 1: 0.9059748649597168,...","{0: 0.923056960105896, 1: 0.8556352853775024, ...",...,0.983234,0.982667,0.982869,0.981908,0.982338,0.982456,0.982929,0.982166,4.91701,9.789722
4,13493,Mhm . I'm a thirty five year old man who uh um...,134,"(((tf.Tensor(0.0027231025, shape=(), dtype=flo...","{0: 0.9873718619346619, 1: 0.9673035144805908,...","{0: 0.9620503783226013, 1: 0.9596797227859497,...","{0: 0.9439643025398254, 1: 0.9171977639198303,...","{0: 0.8917063474655151, 1: 0.9548231959342957,...","{0: 0.9416873455047607, 1: 0.9162192940711975,...","{0: 0.9068412184715271, 1: 0.9328790903091431,...",...,0.953152,0.950388,0.945983,0.945185,0.94565,0.942052,0.942349,0.943858,4.681139,9.362278


In [None]:
baseline.to_csv(result + 'simulation_HV_baseline_vb_response_deid_v3_t5.csv') 

### incoh

In [None]:
ks = ['2','3','4','5','6','7','8','9','10']
for k in ks:
  incoh10['t5_word_sum_' + k] = ''
for k in ks:
  cur = 't5_word_k' + k
  for i in incoh10.index:
    incoh10['t5_word_sum_' + k][i] = sum(dict(incoh10[cur][i]).values()) / len(dict(incoh10[cur][i]))
    
incoh10.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


Unnamed: 0,grid,content,n_words,t5_word_emb,t5_word_k2,t5_word_k3,t5_word_k4,t5_word_k5,t5_word_k6,t5_word_k7,...,t5_word_mv10,t5_word_sum_2,t5_word_sum_3,t5_word_sum_4,t5_word_sum_5,t5_word_sum_6,t5_word_sum_7,t5_word_sum_8,t5_word_sum_9,t5_word_sum_10
0,10455,"I'm a young man , an en an en- an engineer by ...",421,"(((tf.Tensor(-0.032051105, shape=(), dtype=flo...","{0: 0.9373521208763123, 1: 0.9286782145500183,...","{0: 0.9038329720497131, 1: 0.9229267835617065,...","{0: 0.9039842486381531, 1: 0.919945240020752, ...","{0: 0.9082852005958557, 1: 0.8758352398872375,...","{0: 0.8706684112548828, 1: 0.9219260811805725,...","{0: 0.8888946771621704, 1: 0.8911484479904175,...",...,"{'0_0': 0.979918360710144, '0_1': 0.9164171218...",0.974096,0.97394,0.971546,0.970565,0.971492,0.970297,0.970401,0.968482,0.969696
1,11689,Sure . I'm thirty three years good . My name i...,159,"(((tf.Tensor(0.11520728, shape=(), dtype=float...","{0: 0.953263521194458, 1: 0.9438183903694153, ...","{0: 0.9423492550849915, 1: 0.9521161317825317,...","{0: 0.9389676451683044, 1: 0.8653549551963806,...","{0: 0.9104220867156982, 1: 0.9233895540237427,...","{0: 0.9548823237419128, 1: 0.9286745190620422,...","{0: 0.9592475295066833, 1: 0.9401805400848389,...",...,"{'0_0': 0.9725017547607422, '0_1': 0.962101936...",0.967986,0.96494,0.961824,0.960449,0.96211,0.960549,0.960175,0.958724,0.960334
2,12376,Alright . um I live in not especially cool Spr...,468,"(((tf.Tensor(0.24389347, shape=(), dtype=float...","{0: 0.9816436171531677, 1: 0.9363804459571838,...","{0: 0.9410749673843384, 1: 0.9352023005485535,...","{0: 0.937721848487854, 1: 0.9280068278312683, ...","{0: 0.9265216588973999, 1: 0.9037998914718628,...","{0: 0.9335740208625793, 1: 0.9175204634666443,...","{0: 0.9206990003585815, 1: 0.8973239064216614,...",...,"{'0_0': 0.9651522636413574, '0_1': 0.930575370...",0.980699,0.979479,0.978212,0.97754,0.977961,0.977296,0.977518,0.977373,0.977629
3,12630,um So I'm currently twenty-nine . I was born a...,966,"(((tf.Tensor(0.0067433813, shape=(), dtype=flo...","{0: 0.9565696120262146, 1: 0.8485756516456604,...","{0: 0.891523003578186, 1: 0.9299404621124268, ...","{0: 0.9414768218994141, 1: 0.9414905905723572,...","{0: 0.9399988651275635, 1: 0.9501848816871643,...","{0: 0.9498633742332458, 1: 0.9115596413612366,...","{0: 0.9249114990234375, 1: 0.8631559610366821,...",...,"{'0_0': 0.9809640049934387, '0_1': 0.947322368...",0.983063,0.982748,0.982253,0.982338,0.981037,0.981811,0.981926,0.982003,0.981675
4,13493,Mhm . I'm a thirty five year old man who uh um...,134,"(((tf.Tensor(0.009083907, shape=(), dtype=floa...","{0: 0.9885735511779785, 1: 0.965068519115448, ...","{0: 0.960785984992981, 1: 0.9622253775596619, ...","{0: 0.9488998055458069, 1: 0.9223074913024902,...","{0: 0.8982526063919067, 1: 0.9517715573310852,...","{0: 0.9391500949859619, 1: 0.9144243001937866,...","{0: 0.9067327380180359, 1: 0.9329472184181213,...",...,"{'0_0': 0.9933598637580872, '0_1': 0.990649163...",0.957021,0.953261,0.95065,0.946347,0.945511,0.946702,0.942809,0.943825,0.945415


In [None]:
ks = ['5','10']
for k in ks:
  incoh10['t5_word_sum_mv' + k] = ''

  for i in incoh10.index:
    if type(incoh10['t5_word_mv' + k][i]) != float:
      incoh10['t5_word_sum_mv' + k][i] = sum(dict(incoh10['t5_word_mv' + k][i]).values()) / ((len(dict(incoh10['t5_word_mv' + k][i])) // int(k)) + 1) 
incoh10.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


Unnamed: 0,grid,content,n_words,t5_word_emb,t5_word_k2,t5_word_k3,t5_word_k4,t5_word_k5,t5_word_k6,t5_word_k7,...,t5_word_sum_3,t5_word_sum_4,t5_word_sum_5,t5_word_sum_6,t5_word_sum_7,t5_word_sum_8,t5_word_sum_9,t5_word_sum_10,t5_word_sum_mv5,t5_word_sum_mv10
0,10455,"I'm a young man , an en an en- an engineer by ...",421,"(((tf.Tensor(-0.032051105, shape=(), dtype=flo...","{0: 0.9373521208763123, 1: 0.9286782145500183,...","{0: 0.9038329720497131, 1: 0.9229267835617065,...","{0: 0.9039842486381531, 1: 0.919945240020752, ...","{0: 0.9082852005958557, 1: 0.8758352398872375,...","{0: 0.8706684112548828, 1: 0.9219260811805725,...","{0: 0.8888946771621704, 1: 0.8911484479904175,...",...,0.97394,0.971546,0.970565,0.971492,0.970297,0.970401,0.968482,0.969696,4.860547,9.721094
1,11689,Sure . I'm thirty three years good . My name i...,159,"(((tf.Tensor(0.11520728, shape=(), dtype=float...","{0: 0.953263521194458, 1: 0.9438183903694153, ...","{0: 0.9423492550849915, 1: 0.9521161317825317,...","{0: 0.9389676451683044, 1: 0.8653549551963806,...","{0: 0.9104220867156982, 1: 0.9233895540237427,...","{0: 0.9548823237419128, 1: 0.9286745190620422,...","{0: 0.9592475295066833, 1: 0.9401805400848389,...",...,0.96494,0.961824,0.960449,0.96211,0.960549,0.960175,0.958724,0.960334,4.750697,9.501394
2,12376,Alright . um I live in not especially cool Spr...,468,"(((tf.Tensor(0.24389347, shape=(), dtype=float...","{0: 0.9816436171531677, 1: 0.9363804459571838,...","{0: 0.9410749673843384, 1: 0.9352023005485535,...","{0: 0.937721848487854, 1: 0.9280068278312683, ...","{0: 0.9265216588973999, 1: 0.9037998914718628,...","{0: 0.9335740208625793, 1: 0.9175204634666443,...","{0: 0.9206990003585815, 1: 0.8973239064216614,...",...,0.979479,0.978212,0.97754,0.977961,0.977296,0.977518,0.977373,0.977629,4.881069,9.670042
3,12630,um So I'm currently twenty-nine . I was born a...,966,"(((tf.Tensor(0.0067433813, shape=(), dtype=flo...","{0: 0.9565696120262146, 1: 0.8485756516456604,...","{0: 0.891523003578186, 1: 0.9299404621124268, ...","{0: 0.9414768218994141, 1: 0.9414905905723572,...","{0: 0.9399988651275635, 1: 0.9501848816871643,...","{0: 0.9498633742332458, 1: 0.9115596413612366,...","{0: 0.9249114990234375, 1: 0.8631559610366821,...",...,0.982748,0.982253,0.982338,0.981037,0.981811,0.981926,0.982003,0.981675,4.915239,9.830479
4,13493,Mhm . I'm a thirty five year old man who uh um...,134,"(((tf.Tensor(0.009083907, shape=(), dtype=floa...","{0: 0.9885735511779785, 1: 0.965068519115448, ...","{0: 0.960785984992981, 1: 0.9622253775596619, ...","{0: 0.9488998055458069, 1: 0.9223074913024902,...","{0: 0.8982526063919067, 1: 0.9517715573310852,...","{0: 0.9391500949859619, 1: 0.9144243001937866,...","{0: 0.9067327380180359, 1: 0.9329472184181213,...",...,0.953261,0.95065,0.946347,0.945511,0.946702,0.942809,0.943825,0.945415,4.68083,9.36166


In [None]:
incoh10.to_csv(result + 'simulation_HV_incoh_vb_response_deid_10v3_t5.csv')

In [None]:
ks = ['2','3','4','5','6','7','8','9','10']
for k in ks:
  incoh20['t5_word_sum_' + k] = ''
for k in ks:
  cur = 't5_word_k' + k
  for i in incoh20.index:
    incoh20['t5_word_sum_' + k][i] = sum(dict(incoh20[cur][i]).values()) / len(dict(incoh20[cur][i]))

ks = ['5','10']
for k in ks:
  incoh20['t5_word_sum_mv' + k] = ''

  for i in incoh20.index:
    if type(incoh20['t5_word_mv' + k][i]) != float:
      incoh20['t5_word_sum_mv' + k][i] = sum(dict(incoh20['t5_word_mv' + k][i]).values()) / ((len(dict(incoh20['t5_word_mv' + k][i])) // int(k)) + 1) 
incoh20.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


Unnamed: 0,grid,content,n_words,t5_word_emb,t5_word_k2,t5_word_k3,t5_word_k4,t5_word_k5,t5_word_k6,t5_word_k7,...,t5_word_sum_3,t5_word_sum_4,t5_word_sum_5,t5_word_sum_6,t5_word_sum_7,t5_word_sum_8,t5_word_sum_9,t5_word_sum_10,t5_word_sum_mv5,t5_word_sum_mv10
0,10455,"I'm a young man , an en an en- an things by tr...",421,"(((tf.Tensor(-0.026766412, shape=(), dtype=flo...","{0: 0.9311738610267639, 1: 0.9156302213668823,...","{0: 0.8918306231498718, 1: 0.9145900011062622,...","{0: 0.8945391774177551, 1: 0.9154523611068726,...","{0: 0.908588171005249, 1: 0.8569351434707642, ...","{0: 0.8551972508430481, 1: 0.9111636877059937,...","{0: 0.8784613013267517, 1: 0.8749086260795593,...",...,0.971878,0.969595,0.968784,0.969359,0.967657,0.969173,0.967359,0.968008,4.851917,9.703834
1,11689,Sure . I'm thirty three years medical . My nam...,159,"(((tf.Tensor(0.1074589, shape=(), dtype=float3...","{0: 0.9399039149284363, 1: 0.9386727213859558,...","{0: 0.9383133053779602, 1: 0.9497955441474915,...","{0: 0.9280366897583008, 1: 0.8977587819099426,...","{0: 0.9301880598068237, 1: 0.9372321367263794,...","{0: 0.9585370421409607, 1: 0.9416293501853943,...","{0: 0.9623845219612122, 1: 0.9378107190132141,...",...,0.965618,0.963099,0.961573,0.963618,0.962368,0.962349,0.959771,0.961705,4.749319,9.498638
2,12376,Alright . um I live in not especially lazy Spr...,468,"(((tf.Tensor(0.24678273, shape=(), dtype=float...","{0: 0.9819713830947876, 1: 0.9351614117622375,...","{0: 0.9405555725097656, 1: 0.9413254857063293,...","{0: 0.9461461305618286, 1: 0.9341707229614258,...","{0: 0.9330058097839355, 1: 0.9139630198478699,...","{0: 0.9385047554969788, 1: 0.9258426427841187,...","{0: 0.9225903749465942, 1: 0.9073160290718079,...",...,0.980153,0.978914,0.978022,0.978869,0.977863,0.978303,0.978239,0.978393,4.883491,9.674841
3,12630,um So I'm currently twenty-nine . I was born a...,966,"(((tf.Tensor(0.004973783, shape=(), dtype=floa...","{0: 0.9550102353096008, 1: 0.8532222509384155,...","{0: 0.8977928757667542, 1: 0.930566132068634, ...","{0: 0.9416907429695129, 1: 0.9373212456703186,...","{0: 0.9345315098762512, 1: 0.9513927102088928,...","{0: 0.9480693340301514, 1: 0.9160832762718201,...","{0: 0.9283491373062134, 1: 0.8675307631492615,...",...,0.984224,0.984123,0.984177,0.982881,0.983639,0.983732,0.984045,0.983358,4.923423,9.846845
4,13493,Mhm . I'm a thirty five year old man who uh um...,134,"(((tf.Tensor(0.014040765, shape=(), dtype=floa...","{0: 0.9883755445480347, 1: 0.9669952392578125,...","{0: 0.9618869423866272, 1: 0.96144038438797, 2...","{0: 0.9472982883453369, 1: 0.9171295762062073,...","{0: 0.8940153121948242, 1: 0.9496671557426453,...","{0: 0.936241090297699, 1: 0.9066365361213684, ...","{0: 0.8979589939117432, 1: 0.9263139367103577,...",...,0.953108,0.950905,0.946472,0.946095,0.947003,0.94379,0.944168,0.945834,4.680146,9.360293


In [None]:
incoh20.to_csv(result + 'simulation_HV_incoh_vb_response_deid_20v3_t5.csv')

In [None]:
ks = ['2','3','4','5','6','7','8','9','10']
for k in ks:
  incoh50['t5_word_sum_' + k] = ''
for k in ks:
  cur = 't5_word_k' + k
  for i in incoh50.index:
    incoh50['t5_word_sum_' + k][i] = sum(dict(incoh50[cur][i]).values()) / len(dict(incoh50[cur][i]))

ks = ['5','10']
for k in ks:
  incoh50['t5_word_sum_mv' + k] = ''

  for i in incoh50.index:
    if type(incoh50['t5_word_mv' + k][i]) != float:
      incoh50['t5_word_sum_mv' + k][i] = sum(dict(incoh50['t5_word_mv' + k][i]).values()) / ((len(dict(incoh50['t5_word_mv' + k][i])) // int(k)) + 1) 
incoh50.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


Unnamed: 0,grid,content,n_words,t5_word_emb,t5_word_k2,t5_word_k3,t5_word_k4,t5_word_k5,t5_word_k6,t5_word_k7,...,t5_word_sum_3,t5_word_sum_4,t5_word_sum_5,t5_word_sum_6,t5_word_sum_7,t5_word_sum_8,t5_word_sum_9,t5_word_sum_10,t5_word_sum_mv5,t5_word_sum_mv10
0,10455,"I'm a good year , an en an en- an responsibili...",421,"(((tf.Tensor(-0.06148188, shape=(), dtype=floa...","{0: 0.9321799278259277, 1: 0.9218968152999878,...","{0: 0.9035277366638184, 1: 0.9098954796791077,...","{0: 0.9057110548019409, 1: 0.8806290030479431,...","{0: 0.8907807469367981, 1: 0.8589577674865723,...","{0: 0.8734230995178223, 1: 0.887036144733429, ...","{0: 0.854229748249054, 1: 0.8469346165657043, ...",...,0.974263,0.973638,0.971318,0.971767,0.970237,0.971645,0.969698,0.97023,4.862575,9.72515
1,11689,Sure . I'm thirty three years anxious . My nam...,159,"(((tf.Tensor(0.1386814, shape=(), dtype=float3...","{0: 0.9465814828872681, 1: 0.9467255473136902,...","{0: 0.946717381477356, 1: 0.9604710340499878, ...","{0: 0.9371103048324585, 1: 0.9106454253196716,...","{0: 0.9294113516807556, 1: 0.9404101967811584,...","{0: 0.9566912651062012, 1: 0.939863383769989, ...","{0: 0.9589848518371582, 1: 0.9299648404121399,...",...,0.966897,0.965193,0.962541,0.965338,0.964937,0.96287,0.962024,0.96373,4.750188,9.500376
2,12376,Alright . um something live in not especially ...,468,"(((tf.Tensor(0.2296029, shape=(), dtype=float3...","{0: 0.9793382883071899, 1: 0.9280641674995422,...","{0: 0.9631506204605103, 1: 0.8845401406288147,...","{0: 0.8699365258216858, 1: 0.8641678094863892,...","{0: 0.8529810309410095, 1: 0.8757504820823669,...","{0: 0.8971152305603027, 1: 0.8880547881126404,...","{0: 0.8871526718139648, 1: 0.8790754079818726,...",...,0.98165,0.980632,0.979736,0.980423,0.980043,0.979899,0.979851,0.980099,4.891376,9.690462
3,12630,um So I'm currently twenty-nine . I was born a...,966,"(((tf.Tensor(-0.005068084, shape=(), dtype=flo...","{0: 0.9611235857009888, 1: 0.8513863682746887,...","{0: 0.8886478543281555, 1: 0.9352755546569824,...","{0: 0.9448012709617615, 1: 0.9441168904304504,...","{0: 0.9476511478424072, 1: 0.955906331539154, ...","{0: 0.9564458727836609, 1: 0.9115990400314331,...","{0: 0.9250994920730591, 1: 0.8643674850463867,...",...,0.984141,0.984068,0.983559,0.982582,0.983318,0.983399,0.983793,0.982988,4.920508,9.841017
4,13493,Mhm . I'm a thirty five hospital old man anyth...,134,"(((tf.Tensor(0.086834975, shape=(), dtype=floa...","{0: 0.9856374859809875, 1: 0.9595798850059509,...","{0: 0.9542460441589355, 1: 0.9372810125350952,...","{0: 0.9260008335113525, 1: 0.8819143176078796,...","{0: 0.8537870645523071, 1: 0.9364335536956787,...","{0: 0.9220288395881653, 1: 0.8871068358421326,...","{0: 0.8714859485626221, 1: 0.9122299551963806,...",...,0.951984,0.949056,0.944841,0.943335,0.944527,0.941225,0.941018,0.945464,4.669101,9.338201


In [None]:
incoh50.to_csv(result + 'simulation_HV_incoh_vb_response_deid_50v3_t5.csv')

### ineff

In [None]:
ks = ['2','3','4','5','6','7','8','9','10']
for k in ks:
  ineff10['t5_word_sum_' + k] = ''
for k in ks:
  cur = 't5_word_k' + k
  for i in ineff10.index:
    ineff10['t5_word_sum_' + k][i] = sum(dict(ineff10[cur][i]).values()) / len(dict(ineff10[cur][i]))

ks = ['5','10']
for k in ks:
  ineff10['t5_word_sum_mv' + k] = ''

  for i in ineff10.index:
    if type(ineff10['t5_word_mv' + k][i]) != float:
      ineff10['t5_word_sum_mv' + k][i] = sum(dict(ineff10['t5_word_mv' + k][i]).values()) / ((len(dict(ineff10['t5_word_mv' + k][i])) // int(k)) + 1) 
ineff10.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


Unnamed: 0,grid,content,n_words,t5_word_emb,t5_word_k2,t5_word_k3,t5_word_k4,t5_word_k5,t5_word_k6,t5_word_k7,...,t5_word_sum_3,t5_word_sum_4,t5_word_sum_5,t5_word_sum_6,t5_word_sum_7,t5_word_sum_8,t5_word_sum_9,t5_word_sum_10,t5_word_sum_mv5,t5_word_sum_mv10
0,10455,"I'm a young man , an en an en- an engineer by ...",421,"(((tf.Tensor(-0.028594766, shape=(), dtype=flo...","{0: 0.93770831823349, 1: 0.9271951913833618, 2...","{0: 0.9028423428535461, 1: 0.9205344319343567,...","{0: 0.9036267399787903, 1: 0.9146499037742615,...","{0: 0.9054772853851318, 1: 0.8583513498306274,...","{0: 0.8545929789543152, 1: 0.9226922988891602,...","{0: 0.8916099667549133, 1: 0.885294497013092, ...",...,0.973491,0.971034,0.96953,0.970743,0.969479,0.96922,0.968062,0.969276,4.824231,9.543587
1,11689,Sure . I'm thirty three years old . My name is...,159,"(((tf.Tensor(0.0902944, shape=(), dtype=float3...","{0: 0.9474658966064453, 1: 0.9307212829589844,...","{0: 0.940139651298523, 1: 0.9485692977905273, ...","{0: 0.9309148192405701, 1: 0.8935031890869141,...","{0: 0.9258096814155579, 1: 0.9164783954620361,...","{0: 0.9545851945877075, 1: 0.9266420006752014,...","{0: 0.9633139967918396, 1: 0.920332133769989, ...",...,0.963321,0.961352,0.962042,0.961782,0.961776,0.961778,0.960069,0.961392,4.794674,9.589349
2,12376,Alright . um I live in not especially cool Spr...,468,"(((tf.Tensor(0.2577885, shape=(), dtype=float3...","{0: 0.9807088375091553, 1: 0.9471968412399292,...","{0: 0.9613189101219177, 1: 0.947767436504364, ...","{0: 0.9438981413841248, 1: 0.9247361421585083,...","{0: 0.9241453409194946, 1: 0.9050881266593933,...","{0: 0.9338926076889038, 1: 0.9174119830131531,...","{0: 0.9197142720222473, 1: 0.8997848629951477,...",...,0.980036,0.978682,0.978147,0.978696,0.978061,0.977977,0.978022,0.978419,4.88322,9.676011
3,12630,um So I was born and raised in South Washingto...,966,"(((tf.Tensor(-0.002386301, shape=(), dtype=flo...","{0: 0.9592886567115784, 1: 0.9517082571983337,...","{0: 0.9580650329589844, 1: 0.9411338567733765,...","{0: 0.9478919506072998, 1: 0.9021629691123962,...","{0: 0.9135681986808777, 1: 0.919169545173645, ...","{0: 0.911134660243988, 1: 0.9060215353965759, ...","{0: 0.9334521293640137, 1: 0.8396468758583069,...",...,0.983674,0.982677,0.982593,0.982377,0.982412,0.983002,0.98251,0.982417,4.91048,9.779695
4,13493,Mhm . I'm a thirty five year old man who uh um...,134,"(((tf.Tensor(-0.011912695, shape=(), dtype=flo...","{0: 0.9865770936012268, 1: 0.9645763635635376,...","{0: 0.9579520225524902, 1: 0.9565127491950989,...","{0: 0.9390543103218079, 1: 0.920798122882843, ...","{0: 0.895594596862793, 1: 0.9516465067863464, ...","{0: 0.9384924173355103, 1: 0.9181991815567017,...","{0: 0.9085126519203186, 1: 0.9266952276229858,...",...,0.953368,0.951297,0.945886,0.946492,0.946837,0.943176,0.943114,0.945557,4.656752,9.313504


In [None]:
ineff10.to_csv(result + 'simulation_HV_ineff_vb_response_deid_10v3_t5.csv')

In [None]:
ks = ['2','3','4','5','6','7','8','9','10']
for k in ks:
  ineff20['t5_word_sum_' + k] = ''
for k in ks:
  cur = 't5_word_k' + k
  for i in ineff20.index:
    ineff20['t5_word_sum_' + k][i] = sum(dict(ineff20[cur][i]).values()) / len(dict(ineff20[cur][i]))

ks = ['5','10']
for k in ks:
  ineff20['t5_word_sum_mv' + k] = ''

  for i in ineff20.index:
    if type(ineff20['t5_word_mv' + k][i]) != float:
      ineff20['t5_word_sum_mv' + k][i] = sum(dict(ineff20['t5_word_mv' + k][i]).values()) / ((len(dict(ineff20['t5_word_mv' + k][i])) // int(k)) + 1) 
ineff20.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


Unnamed: 0,grid,content,n_words,t5_word_emb,t5_word_k2,t5_word_k3,t5_word_k4,t5_word_k5,t5_word_k6,t5_word_k7,...,t5_word_sum_3,t5_word_sum_4,t5_word_sum_5,t5_word_sum_6,t5_word_sum_7,t5_word_sum_8,t5_word_sum_9,t5_word_sum_10,t5_word_sum_mv5,t5_word_sum_mv10
0,10455,"I'm a young man , an en an en- an engineer by ...",421,"(((tf.Tensor(-0.030041385, shape=(), dtype=flo...","{0: 0.9393248558044434, 1: 0.9268133044242859,...","{0: 0.9028229713439941, 1: 0.9217650294303894,...","{0: 0.9052221179008484, 1: 0.9161338806152344,...","{0: 0.9069098234176636, 1: 0.863053023815155, ...","{0: 0.8604294061660767, 1: 0.924543559551239, ...","{0: 0.893572986125946, 1: 0.8857474327087402, ...",...,0.97501,0.972225,0.971233,0.972128,0.971022,0.971059,0.969482,0.970404,4.853092,9.706183
1,11689,Sure . I'm thirty three years old . My name is...,159,"(((tf.Tensor(0.116554916, shape=(), dtype=floa...","{0: 0.9512450098991394, 1: 0.939349353313446, ...","{0: 0.9414694905281067, 1: 0.9516593813896179,...","{0: 0.9313738346099854, 1: 0.902847945690155, ...","{0: 0.9215516448020935, 1: 0.9141810536384583,...","{0: 0.9425660371780396, 1: 0.9139177203178406,...","{0: 0.9457446932792664, 1: 0.9142404794692993,...",...,0.963927,0.96038,0.958247,0.95844,0.954237,0.955275,0.953828,0.952628,4.726826,9.19105
2,12376,Alright . um I live in not especially cool Spr...,468,"(((tf.Tensor(0.27444038, shape=(), dtype=float...","{0: 0.982474684715271, 1: 0.9495822191238403, ...","{0: 0.9692144989967346, 1: 0.9290608167648315,...","{0: 0.9267149567604065, 1: 0.9037085175514221,...","{0: 0.8967114686965942, 1: 0.8978587985038757,...","{0: 0.9219183921813965, 1: 0.9057729244232178,...","{0: 0.9091338515281677, 1: 0.9018996953964233,...",...,0.980885,0.979998,0.978869,0.979786,0.979669,0.979542,0.979545,0.979517,4.865641,9.731282
3,12630,um So I was born and raised in South Washingto...,966,"(((tf.Tensor(0.0054256218, shape=(), dtype=flo...","{0: 0.9524444341659546, 1: 0.9556344151496887,...","{0: 0.9477974772453308, 1: 0.9516834020614624,...","{0: 0.9401721358299255, 1: 0.9148193597793579,...","{0: 0.9078903198242188, 1: 0.9202605485916138,...","{0: 0.8981078863143921, 1: 0.9145378470420837,...","{0: 0.9257009029388428, 1: 0.8543397188186646,...",...,0.983867,0.982994,0.983493,0.983025,0.982978,0.983745,0.983377,0.983471,4.920959,9.841918
4,13493,Mhm . I'm a thirty five year old man who uh um...,134,"(((tf.Tensor(-0.009741404, shape=(), dtype=flo...","{0: 0.9895449280738831, 1: 0.9505131244659424,...","{0: 0.946247398853302, 1: 0.9623329043388367, ...","{0: 0.9502137303352356, 1: 0.9238979816436768,...","{0: 0.9034985303878784, 1: 0.9303346276283264,...","{0: 0.918174684047699, 1: 0.9229857921600342, ...","{0: 0.9171938300132751, 1: 0.9215077757835388,...",...,0.950532,0.946926,0.941542,0.94007,0.943255,0.938469,0.937093,0.939248,4.684293,9.033993


In [None]:
ineff20.to_csv(result + 'simulation_HV_ineff_vb_response_deid_20v3_t5.csv')

In [None]:
ks = ['2','3','4','5','6','7','8','9','10']
for k in ks:
  ineff50['t5_word_sum_' + k] = ''
for k in ks:
  cur = 't5_word_k' + k
  for i in ineff50.index:
    ineff50['t5_word_sum_' + k][i] = sum(dict(ineff50[cur][i]).values()) / len(dict(ineff50[cur][i]))

ks = ['5','10']
for k in ks:
  ineff50['t5_word_sum_mv' + k] = ''

  for i in ineff50.index:
    if type(ineff50['t5_word_mv' + k][i]) != float:
      ineff50['t5_word_sum_mv' + k][i] = sum(dict(ineff50['t5_word_mv' + k][i]).values()) / ((len(dict(ineff50['t5_word_mv' + k][i])) // int(k)) + 1) 
ineff50.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


Unnamed: 0,grid,content,n_words,t5_word_emb,t5_word_k2,t5_word_k3,t5_word_k4,t5_word_k5,t5_word_k6,t5_word_k7,...,t5_word_sum_3,t5_word_sum_4,t5_word_sum_5,t5_word_sum_6,t5_word_sum_7,t5_word_sum_8,t5_word_sum_9,t5_word_sum_10,t5_word_sum_mv5,t5_word_sum_mv10
0,10455,"I'm a young man , an en an en- an engineer by ...",421,"(((tf.Tensor(-0.013111238, shape=(), dtype=flo...","{0: 0.9365220665931702, 1: 0.9148172736167908,...","{0: 0.9017055034637451, 1: 0.8717079162597656,...","{0: 0.871260404586792, 1: 0.8609915375709534, ...","{0: 0.8649638891220093, 1: 0.7779065370559692,...","{0: 0.7916779518127441, 1: 0.9149607419967651,...","{0: 0.8805050849914551, 1: 0.8690401315689087,...",...,0.968477,0.965851,0.963291,0.964732,0.96386,0.963183,0.960896,0.962762,4.842705,9.68541
1,11689,We have been using that opportunity to do more...,159,"(((tf.Tensor(-0.12850149, shape=(), dtype=floa...","{0: 0.9383153915405273, 1: 0.9557745456695557,...","{0: 0.931777834892273, 1: 0.9409135580062866, ...","{0: 0.9208097457885742, 1: 0.9250185489654541,...","{0: 0.9166374206542969, 1: 0.9056608080863953,...","{0: 0.9018595814704895, 1: 0.9243329763412476,...","{0: 0.9215403199195862, 1: 0.925862193107605, ...",...,0.966824,0.964902,0.964007,0.9635,0.961351,0.960774,0.959771,0.958194,4.797153,9.594306
2,12376,Alright . um I live in not especially cool Spr...,468,"(((tf.Tensor(0.2802947, shape=(), dtype=float3...","{0: 0.9825338125228882, 1: 0.9633914828300476,...","{0: 0.984454333782196, 1: 0.9628877639770508, ...","{0: 0.9690616726875305, 1: 0.922105610370636, ...","{0: 0.9330462217330933, 1: 0.9091662764549255,...","{0: 0.9344704151153564, 1: 0.9184996485710144,...","{0: 0.921454668045044, 1: 0.9163060188293457, ...",...,0.978785,0.976111,0.97577,0.975143,0.975195,0.974943,0.97458,0.975446,4.868101,9.736201
3,12630,"My is things are fantastic . No , I mean My uh...",966,"(((tf.Tensor(0.18845989, shape=(), dtype=float...","{0: 0.919815719127655, 1: 0.9304170608520508, ...","{0: 0.9212266802787781, 1: 0.9312571883201599,...","{0: 0.9209229946136475, 1: 0.9350417256355286,...","{0: 0.9305492043495178, 1: 0.9560334086418152,...","{0: 0.9351797103881836, 1: 0.9267600178718567,...","{0: 0.9179043173789978, 1: 0.9416138529777527,...",...,0.984322,0.983959,0.983258,0.983213,0.983381,0.984108,0.983412,0.983441,4.909972,9.777616
4,13493,Mhm . I still get to play . And my stock inves...,134,"(((tf.Tensor(0.004609654, shape=(), dtype=floa...","{0: 0.9850203990936279, 1: 0.9424539804458618,...","{0: 0.9499500393867493, 1: 0.9570140838623047,...","{0: 0.9476947784423828, 1: 0.9172466397285461,...","{0: 0.9070841670036316, 1: 0.895824670791626, ...","{0: 0.9051885008811951, 1: 0.8856618404388428,...","{0: 0.8997299075126648, 1: 0.9010818600654602,...",...,0.96123,0.959101,0.956043,0.955288,0.955881,0.953745,0.951857,0.951779,4.731948,9.20101


In [None]:
ineff50.to_csv(result + 'simulation_HV_ineff_vb_response_deid_50v3_t5.csv')

# TLC merge

In [None]:
tlc = pd.read_csv('/Users/yancong/Desktop/4 clinical/00 Project Files/crossdx_clin.csv', index_col=0)
tlc = tlc[['grid', 'SSDvHC', 'group', 'tlc_01povspeech', 'tlc_02povcontent',	'tlc_03pressure',	'tlc_04distract',
	'tlc_05tangent', 'tlc_06derail', 'tlc_07incoh',	'tlc_08illogic',	'tlc_09clang',	'tlc_10neologism',
    	'tlc_11wordapprox',	'tlc_12circum',	'tlc_13lossgoal',	'tlc_14persev',	'tlc_15echo',	'tlc_16block',
        	'tlc_17stilt',	'tlc_18selfref', 'tlc_3f_inefficient',	'tlc_3f_incoherent',	'tlc_3f_impexpress']]
tlc.head()

Unnamed: 0,grid,SSDvHC,group,tlc_01povspeech,tlc_02povcontent,tlc_03pressure,tlc_04distract,tlc_05tangent,tlc_06derail,tlc_07incoh,...,tlc_12circum,tlc_13lossgoal,tlc_14persev,tlc_15echo,tlc_16block,tlc_17stilt,tlc_18selfref,tlc_3f_inefficient,tlc_3f_incoherent,tlc_3f_impexpress
1,10308,1.0,SSD,0,1,0,0,0,0,0,...,0,0,1,0,0,2,0,-0.406404,-0.069358,-0.018896
2,10311,0.0,HC,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,-0.713934,-0.463481,0.197262
3,10316,1.0,SSD,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,-0.687947,-0.387319,-0.710348
4,10455,0.0,HC,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,-0.687947,-0.387319,-0.710348
5,10582,1.0,SSD,0,0,0,1,0,1,0,...,1,1,0,0,0,0,0,-0.059923,-0.107333,-0.458561


In [None]:
vb = pd.read_csv('T5_word_LongText_emb_vb.csv', index_col=0)
vb.drop(['n_tokens', 'content', 't5_word_emb', 't5_word_k2', 't5_word_k3', 't5_word_k4', 't5_word_k5',
       't5_word_k6', 't5_word_k7', 't5_word_k8', 't5_word_k9', 't5_word_k10',
       't5_word_mv5', 't5_word_mv10'], axis=1, inplace=True)
vb.head()

Unnamed: 0,grid,study,group,SSDvHC,t5_word_sum_2,t5_word_sum_3,t5_word_sum_4,t5_word_sum_5,t5_word_sum_6,t5_word_sum_7,t5_word_sum_8,t5_word_sum_9,t5_word_sum_10,t5_word_sum_mv5,t5_word_sum_mv10
0,5195,lpop,SSD,1,0.972583,0.971772,0.968676,0.968522,0.969242,0.966855,0.967668,0.968324,0.966018,4.82952,9.659039
1,6798,aces,SSD,1,0.97377,0.973053,0.970613,0.971635,0.970762,0.971595,0.971035,0.971521,0.97084,4.809522,9.501738
2,9202,aces,SSD,1,0.963587,0.962061,0.962367,0.95779,0.959843,0.95926,0.959872,0.958496,0.960442,4.672055,9.08455
3,9394,Remora,HC,0,0.980765,0.981047,0.978921,0.978727,0.979465,0.977928,0.979849,0.979064,0.97876,4.895304,9.790607
4,10308,Remora,SSD,1,0.966373,0.964298,0.960382,0.959926,0.959147,0.955821,0.955732,0.95532,0.955571,4.792469,9.584938


In [None]:
vb['grid'] = vb['grid'].astype(str)
final_gt = pd.merge(vb, tlc, on=['grid', 'group', 'SSDvHC'])
final_gt.to_csv('/Users/yancong/Desktop/4 clinical/02 projects_parsely/05 ssd-lm-stanglab/13 remora_lpop_aces/data_analysis/T5_LongText/T5_word_LongText_open_3df_vb.csv')
final_gt.head()

Unnamed: 0,grid,study,group,SSDvHC,t5_word_sum_2,t5_word_sum_3,t5_word_sum_4,t5_word_sum_5,t5_word_sum_6,t5_word_sum_7,...,tlc_12circum,tlc_13lossgoal,tlc_14persev,tlc_15echo,tlc_16block,tlc_17stilt,tlc_18selfref,tlc_3f_inefficient,tlc_3f_incoherent,tlc_3f_impexpress
0,5195,lpop,SSD,1,0.972583,0.971772,0.968676,0.968522,0.969242,0.966855,...,2,2,3,0,0,0,0,2.278186,0.675166,0.544449
1,6798,aces,SSD,1,0.97377,0.973053,0.970613,0.971635,0.970762,0.971595,...,1,1,1,0,0,0,2,2.36556,1.094802,-0.313254
2,9202,aces,SSD,1,0.963587,0.962061,0.962367,0.95779,0.959843,0.95926,...,0,0,0,0,0,0,0,-0.528266,-0.136307,0.037839
3,9394,Remora,HC,0,0.980765,0.981047,0.978921,0.978727,0.979465,0.977928,...,0,0,0,0,0,0,0,-0.141396,-0.553826,-0.859843
4,10308,Remora,SSD,1,0.966373,0.964298,0.960382,0.959926,0.959147,0.955821,...,0,0,1,0,0,2,0,-0.406404,-0.069358,-0.018896


In [None]:
clean = pd.read_csv('T5_word_LongText_emb_clean.csv', index_col=0)
clean.drop(['n_tokens', 'content', 't5_word_emb', 't5_word_k2', 't5_word_k3', 't5_word_k4', 't5_word_k5',
       't5_word_k6', 't5_word_k7', 't5_word_k8', 't5_word_k9', 't5_word_k10',
       't5_word_mv5', 't5_word_mv10'], axis=1, inplace=True)

clean['grid'] = clean['grid'].astype(str)
final_gt = pd.merge(clean, tlc, on=['grid', 'group', 'SSDvHC'])
final_gt.to_csv('/Users/yancong/Desktop/4 clinical/02 projects_parsely/05 ssd-lm-stanglab/13 remora_lpop_aces/data_analysis/T5_LongText/T5_word_LongText_open_3df_clean.csv')
final_gt.head()

Unnamed: 0,grid,study,group,SSDvHC,t5_word_sum_2,t5_word_sum_3,t5_word_sum_4,t5_word_sum_5,t5_word_sum_6,t5_word_sum_7,...,tlc_12circum,tlc_13lossgoal,tlc_14persev,tlc_15echo,tlc_16block,tlc_17stilt,tlc_18selfref,tlc_3f_inefficient,tlc_3f_incoherent,tlc_3f_impexpress
0,5195,lpop,SSD,1,0.969386,0.966725,0.96297,0.963208,0.964505,0.961151,...,2,2,3,0,0,0,0,2.278186,0.675166,0.544449
1,6798,aces,SSD,1,0.968258,0.963345,0.962945,0.963329,0.963908,0.963746,...,1,1,1,0,0,0,2,2.36556,1.094802,-0.313254
2,9202,aces,SSD,1,0.952411,0.941971,0.944559,0.941434,0.946136,0.940437,...,0,0,0,0,0,0,0,-0.528266,-0.136307,0.037839
3,9394,Remora,HC,0,0.982528,0.981971,0.980177,0.9796,0.980205,0.9791,...,0,0,0,0,0,0,0,-0.141396,-0.553826,-0.859843
4,10308,Remora,SSD,1,0.970987,0.968106,0.965707,0.966308,0.965072,0.964659,...,0,0,1,0,0,2,0,-0.406404,-0.069358,-0.018896


In [None]:
nosw = pd.read_csv('T5_word_LongText_emb_nosw.csv', index_col=0)
nosw.drop(['n_tokens', 'content', 't5_word_emb', 't5_word_k2', 't5_word_k3', 't5_word_k4', 't5_word_k5',
       't5_word_k6', 't5_word_k7', 't5_word_k8', 't5_word_k9', 't5_word_k10',
       't5_word_mv5', 't5_word_mv10'], axis=1, inplace=True)

nosw['grid'] = nosw['grid'].astype(str)
final_gt = pd.merge(nosw, tlc, on=['grid', 'group', 'SSDvHC'])
final_gt.to_csv('/Users/yancong/Desktop/4 clinical/02 projects_parsely/05 ssd-lm-stanglab/13 remora_lpop_aces/data_analysis/T5_LongText/T5_word_LongText_open_3df_nosw.csv')
final_gt.head()

Unnamed: 0,grid,study,group,SSDvHC,t5_word_sum_2,t5_word_sum_3,t5_word_sum_4,t5_word_sum_5,t5_word_sum_6,t5_word_sum_7,...,tlc_12circum,tlc_13lossgoal,tlc_14persev,tlc_15echo,tlc_16block,tlc_17stilt,tlc_18selfref,tlc_3f_inefficient,tlc_3f_incoherent,tlc_3f_impexpress
0,5195,lpop,SSD,1,0.964925,0.969756,0.967009,0.9645,0.965212,0.964595,...,2,2,3,0,0,0,0,2.278186,0.675166,0.544449
1,9202,aces,SSD,1,0.953958,0.955874,0.948047,0.953649,0.949531,0.951206,...,0,0,0,0,0,0,0,-0.528266,-0.136307,0.037839
2,9394,Remora,HC,0,0.981163,0.98125,0.980579,0.979361,0.980196,0.979537,...,0,0,0,0,0,0,0,-0.141396,-0.553826,-0.859843
3,10308,Remora,SSD,1,0.969674,0.968667,0.967226,0.96824,0.966768,0.963587,...,0,0,1,0,0,2,0,-0.406404,-0.069358,-0.018896
4,10311,Remora,HC,0,0.962105,0.961939,0.961017,0.957468,0.956625,0.958732,...,0,0,0,0,0,0,0,-0.713934,-0.463481,0.197262
