# Prep dataset

In [12]:
import pandas as pd
import numpy as np
import os
import scipy
import scipy.stats

from google.colab import drive
# Mount Google Drive so the input and output folders below are reachable.
drive.mount('/content/drive')

# Folder the condition CSVs are read from, and folder the results are written to.
data_foloder = '/content/drive/My Drive/simulation/Gradient_shuffle/'
result = '/content/drive/My Drive/simulation/Output_simul/BERT/word/'

# One frame per condition, read in a fixed order: the baseline file first,
# then the incoh and ineff files for the 10 / 20 / 50 variants.
(baseline,
 incoh10, incoh20, incoh50,
 ineff10, ineff20, ineff50) = [
    pd.read_csv(data_foloder + csv_name)
    for csv_name in (
        'simulation_HV_baseline_vb_response_deid_v3.csv',
        'simulation_HV_incoh_vb_response_deid_10v3.csv',
        'simulation_HV_incoh_vb_response_deid_20v3.csv',
        'simulation_HV_incoh_vb_response_deid_50v3.csv',
        'simulation_HV_ineff_vb_response_deid_10v3.csv',
        'simulation_HV_ineff_vb_response_deid_20v3.csv',
        'simulation_HV_ineff_vb_response_deid_50v3.csv',
    )
]

baseline.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,grid,content,n_words
0,10455,"I'm a young man , an en an en- an engineer by ...",421
1,11689,Sure . I'm thirty three years old . My name is...,159
2,12376,Alright . um I live in not especially cool Spr...,468
3,12630,um So I'm currently twenty-nine . I was born a...,966
4,13493,Mhm . I'm a thirty five year old man who uh um...,134


In [2]:
# Display the kernel's current working directory.
os.getcwd()

'/content'

# Install lib and dependencies

In [10]:
# PyTorch is the backend used with the Hugging Face transformers package,
# which gives us a PyTorch interface for working with BERT.

import torch
from collections import OrderedDict

In [None]:
# Shell escape: install the Hugging Face transformers package (no version pinned).
!pip install transformers

In [4]:
# Check whether a GPU instance is available.
# https://www.databricks.com/blog/2021/10/28/gpu-accelerated-sentiment-analysis-using-pytorch-and-huggingface-on-databricks.html
# 'cuda' means GPU; otherwise fall back to the CPU.
# This only creates a device handle; no model or tensor is moved onto it here.
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')
DEVICE

device(type='cuda')

In [5]:
# load the pre-trained BERT model and tokenizer
from transformers import BertModel, BertTokenizer

# Checkpoint shared by the model and its tokenizer
# (used for both word level and utterance level).
bert_checkpoint = 'bert-base-uncased'

# output_hidden_states=True makes the forward pass also return the hidden states.
# NOTE(review): nothing in this cell moves the model to DEVICE, and creating
# DEVICE does not relocate it -- confirm whether CPU inference is intended.
model = BertModel.from_pretrained(bert_checkpoint, output_hidden_states=True)
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

# Contextualized word embeddings

Put the input text into a specific format that BERT can read: add ```[CLS]``` to the beginning and ```[SEP]``` to the end of the input, then convert the tokenized BERT input to tensor format.

In [6]:
def bert_text_preparation(text, tokenizer, max_length=512):
  """
  Preprocesses text input in a way that BERT can interpret.

  The text is wrapped in "[CLS] ... [SEP]", tokenized with `tokenizer`, and
  converted to token ids. If the tokenized sequence is longer than
  `max_length`, it is truncated to `max_length` tokens while keeping the
  final token (the closing "[SEP]"), so an over-long response is shortened
  instead of overflowing the model's maximum input length.

  Args:
    text: the raw response string.
    tokenizer: object exposing `tokenize(str)` and `convert_tokens_to_ids(list)`.
    max_length: maximum number of tokens, special tokens included (expected
      to be >= 2). Pass None to disable truncation. Defaults to 512, the
      sub-word limit mentioned where this helper is called.

  Returns:
    (tokenized_text, tokens_tensor, segments_tensor): the token strings, a
    [1 x n_tokens] tensor of token ids, and a [1 x n_tokens] tensor of ones.
  """
  marked_text = "[CLS] " + text + " [SEP]" # add special tokens
  tokenized_text = tokenizer.tokenize(marked_text)

  # keep the head of the sequence plus the last token (the closing marker)
  if max_length is not None and len(tokenized_text) > max_length:
    tokenized_text = tokenized_text[:max_length - 1] + tokenized_text[-1:]

  indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) # find token IDs
  segments_ids = [1]*len(indexed_tokens) # one constant id per token [not really an ID]

  # convert inputs to tensors (leading batch dimension of size 1)
  tokens_tensor = torch.tensor([indexed_tokens])
  segments_tensor = torch.tensor([segments_ids])

  return tokenized_text, tokens_tensor, segments_tensor

To obtain the actual BERT embeddings, we take the preprocessed input text, which is now represented by tensors, and pass it through our pre-trained BERT model.

Which vector works best as a contextualized embedding depends on the task.

According to Devlin et al. (2019), the sum of the last four layers of the model worked well for NLP tasks.

In [7]:
def get_bert_embeddings(tokens_tensor, segments_tensor, model):
    """
    Obtains BERT embeddings for tokens, in context of the given response.

    Runs `model` on the two tensors with gradient tracking disabled, takes
    element 2 of its output (the hidden states), and returns one vector per
    token: the sum of that token's last four hidden-state layers.

    Returns:
        A list with one 1-D tensor per token position (special tokens included).
    """
    # NOTE(review): both tensors are passed positionally; in Hugging Face's
    # BertModel.forward the second positional parameter is presumably
    # attention_mask rather than token_type_ids -- confirm this is intended.
    with torch.no_grad():  # inference only, no gradients needed
        hidden_states = model(tokens_tensor, segments_tensor)[2]

    # Stack the per-layer tensors on a new leading axis, drop the batch axis
    # (dim 1), then reorder to [tokens x layers x hidden] so that iterating
    # walks over tokens. The layer axis holds every entry of `hidden_states`
    # (for bert-base with output_hidden_states this is presumably 13: the
    # embedding output plus 12 encoder layers -- not 12).
    per_token_layers = (
        torch.stack(hidden_states, dim=0)
        .squeeze(dim=1)
        .permute(1, 0, 2)
    )

    # For each token, add up the vectors of its last four layers.
    return [torch.sum(layers[-4:], dim=0) for layers in per_token_layers]

Create contextual embeddings for each response.

In [13]:
# Collect the condition frames in a fixed order; a frame's position in this
# list is the number used in its output file name.
dfs = [baseline, incoh10, incoh20, incoh50, ineff10, ineff20, ineff50]

# Give every frame an empty placeholder column for the embeddings and write
# it out as '<position>.csv' in the result folder.
for temp, df in enumerate(dfs):
  df["bert_emb"] = ''
  df.to_csv(result + str(temp) + '.csv')

df.head()

Unnamed: 0,grid,content,n_words,bert_emb
0,10455,"I'm a young man , an en an en- an engineer by ...",421,
1,11689,We have been using that opportunity to do more...,159,
2,12376,Alright . um I live in not especially cool Spr...,468,
3,12630,"My is things are fantastic . No , I mean My uh...",966,
4,13493,Mhm . I still get to play . And my stock inves...,134,


In [14]:
# Compute the contextual token embeddings for every response in every frame
# and store them in the 'bert_emb' column.
#
# Changes from the previous version of this cell:
#   * each frame is saved under its own position in `dfs`; before, the file
#     name used a counter left over from an earlier cell, so every frame was
#     written to the same file;
#   * cells are written with df.at[...] instead of chained indexing, which
#     triggered SettingWithCopyWarning;
#   * the per-token position lookup is replaced by a slice (see below).
for file_idx, df in enumerate(dfs):
  for i in df.index:
    if df.at[i, 'n_words'] > 4: # only process lines with response len bigger than 4
        # Keep at most the first 431 whitespace-separated words. BERT uses a
        # subword tokenizer (WordPiece), so the maximum length corresponds to
        # 512 subword tokens and the word budget has to stay below that.
        lst = df.at[i, 'content'].split(' ')[:431]
        sentence = ' '.join(lst)
        tokenized_text, tokens_tensor, segments_tensors = bert_text_preparation(sentence, tokenizer)
        list_token_embeddings = get_bert_embeddings(tokens_tensor, segments_tensors, model)

        # One embedding per token, in order, without the first and last
        # positions (the [CLS] / [SEP] markers). Walking tokenized_text[1:-1]
        # and looking up the n-th occurrence of each token in the full list
        # resolves to exactly these positions, so a slice is equivalent.
        context_embeddings = list_token_embeddings[1:-1]

        df.at[i, 'bert_emb'] = context_embeddings

    if i % 5 == 0:
        print('progress: ', i)

  df.to_csv(result + str(file_idx) + '.csv')

df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


progress:  0
progress:  0
progress:  0
progress:  0
progress:  0
progress:  0
progress:  0


Unnamed: 0,grid,content,n_words,bert_emb
0,10455,"I'm a young man , an en an en- an engineer by ...",421,"[[tensor(1.0818), tensor(1.0875), tensor(-0.18..."
1,11689,We have been using that opportunity to do more...,159,"[[tensor(3.7089), tensor(0.8028), tensor(-1.74..."
2,12376,Alright . um I live in not especially cool Spr...,468,"[[tensor(5.3717), tensor(1.6071), tensor(5.350..."
3,12630,"My is things are fantastic . No , I mean My uh...",966,"[[tensor(-2.2958), tensor(2.2740), tensor(1.50..."
4,13493,Mhm . I still get to play . And my stock inves...,134,"[[tensor(1.7302), tensor(3.9717), tensor(6.281..."


# Stats and similarities functions

In [15]:
# stats ignoring nan, apply to all LMs
from numpy import nanmedian
from numpy import nanquantile

import scipy


def _nan_quantile(x, level):
    """Quantile `level` (between 0 and 1) of x, skipping NaN entries."""
    values = np.array(x)
    return np.nanquantile(values, level)


def iqr(x):
  """Interquartile range of x, with NaN entries omitted."""
  values = np.array(x)
  return scipy.stats.iqr(values, nan_policy='omit')


def q5(x):
    """0.05 quantile of x, ignoring NaN entries."""
    return _nan_quantile(x, 0.05)


def q95(x):
    """0.95 quantile of x, ignoring NaN entries."""
    return _nan_quantile(x, 0.95)

In [16]:
# cosine_similarity, apply to all LMs
def cosine_similarity(a, b):
    """Cosine of the angle between vectors a and b: dot(a, b) / (|a| * |b|)."""
    dot_product = np.dot(a, b)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return dot_product / norm_product

# MV5/10

In [17]:
# Average semantic similarity of each word in 5- or 10- words window

def divide_chunks(l, n):
    """Yield consecutive slices of `l`, each of length `n` (the last may be shorter)."""
    yield from (l[start:start + n] for start in range(0, len(l), n))

# Quick check: split a 13-token example into chunks of 5 elements each.
test = ['Alex','broke','the','vase','accidentally','.','But','Kai','did','it','on','purpose','.']
chopped = list(divide_chunks(test, 5))
print(chopped)

[['Alex', 'broke', 'the', 'vase', 'accidentally'], ['.', 'But', 'Kai', 'did', 'it'], ['on', 'purpose', '.']]


In [18]:
def combinations(lst): # get w1, w2 combinations
    """
    Return every unordered pair of elements of `lst` as [w1, w2] lists.

    Pairs are formed by position: each element is paired with every element
    that comes after it, in order. An element is never paired with itself,
    but two equal values at different positions (e.g. a repeated word) still
    form a pair. Filtering by value instead would silently drop such pairs.

    Args:
        lst: a list of items (typically <= 5 or <= 10 tokens/vectors).

    Returns:
        A list of [w1, w2] lists; empty when `lst` has fewer than two items.
        The input is not modified.
    """
    items = list(lst)
    return [
        [items[left], items[right]]
        for left in range(len(items))
        for right in range(left + 1, len(items))
    ]

testing = ['Alex', 'broke', 'the', 'vase', 'accidentally']
test_result = combinations(testing)
print(test_result)

[['Alex', 'broke'], ['Alex', 'the'], ['Alex', 'vase'], ['Alex', 'accidentally'], ['broke', 'the'], ['broke', 'vase'], ['broke', 'accidentally'], ['the', 'vase'], ['the', 'accidentally'], ['vase', 'accidentally']]


In [19]:
# prep dataframe
# append columns to the embeddings df

mvs = ['5', '10']                             # window sizes, kept as strings for column names
stats = ['_median', '_iqr', '_q5', '_q95']    # suffix of each summary statistic

# Column names in creation order: all stats for window 5, then all for window 10.
mv_stat_columns = ['bert_word_mv' + window + suffix
                   for window in mvs
                   for suffix in stats]

# create new empty columns on every frame
for df in dfs:
    for column in mv_stat_columns:
        df[column] = ''
df.head()

Unnamed: 0,grid,content,n_words,bert_emb,bert_word_mv5_median,bert_word_mv5_iqr,bert_word_mv5_q5,bert_word_mv5_q95,bert_word_mv10_median,bert_word_mv10_iqr,bert_word_mv10_q5,bert_word_mv10_q95
0,10455,"I'm a young man , an en an en- an engineer by ...",421,"[[tensor(1.0818), tensor(1.0875), tensor(-0.18...",,,,,,,,
1,11689,We have been using that opportunity to do more...,159,"[[tensor(3.7089), tensor(0.8028), tensor(-1.74...",,,,,,,,
2,12376,Alright . um I live in not especially cool Spr...,468,"[[tensor(5.3717), tensor(1.6071), tensor(5.350...",,,,,,,,
3,12630,"My is things are fantastic . No , I mean My uh...",966,"[[tensor(-2.2958), tensor(2.2740), tensor(1.50...",,,,,,,,
4,13493,Mhm . I still get to play . And my stock inves...,134,"[[tensor(1.7302), tensor(3.9717), tensor(6.281...",,,,,,,,


In [24]:
len(df['bert_emb'][0]) # number of token embeddings stored for the first response (a list of 431 tensors in this run)

431

In [34]:
df['bert_emb'][0][1].shape # shape of one token embedding (each tensor is 768-dimensional in this run)

torch.Size([768])

In [51]:
# combinations 'alex broke the vase': 
# alex broke, alex the, alex vase; broke the, broke vase; the vase
# Parola et al is inspired by Pauselli et al (p76): 
# ''Coherence is the average similarity of each word to each of the other
# words in the list, regardless of order or proximity. ''
# w2 = word_embed[word_id+1] # adjacent neighbour only
#
# For every frame and every window size (5 / 10): cut each response's token
# embeddings into consecutive windows, average the pairwise cosine
# similarities inside each window, then summarise the per-window means.
#
# Changes from the previous version of this cell:
#   * the results are only written for rows that were actually processed;
#     before, a row with a missing (float/NaN) embedding received the
#     previous row's values because the writes sat outside the `if`;
#   * cells are written with df.at[...] instead of chained indexing;
#   * the scratch variable no longer reuses the name `temp`.
for temp_file, df in enumerate(dfs):
    print(temp_file)
    for mv in mvs:
        # print progress
        cur = 'bert_word_mv' + mv
        print('current: ', cur)
        df[cur + '_similarity'] = '' # save the cosine similarities; all stats are derived from there
        window = int(mv)

        # loop over each response
        for i in df.index:
            embeddings = df.at[i, 'bert_emb']
            if isinstance(embeddings, float):
                # missing embedding (NaN): leave this row's placeholders untouched
                continue

            # one mean similarity per 5/10-token chunk of the response
            chunk_means = []
            for word_embed in divide_chunks(embeddings, window):
                # add this for BERT to convert a list of tensors to a list of lists
                word_embed = [x.numpy().tolist() for x in word_embed]
                # similarity of every pair inside the chunk (5 or 10 window)
                pair_sims = [cosine_similarity(w1, w2) for w1, w2 in combinations(word_embed)]
                chunk_means.append(np.nanmean(pair_sims))

            # list of similarity means for that response; its len is the number
            # of chunks the response can be chopped into; kept for later reference
            df.at[i, cur + '_similarity'] = chunk_means

            # add other stats here
            df.at[i, cur + '_median'] = np.nanmedian(chunk_means)
            df.at[i, cur + '_q5'] = q5(chunk_means)
            df.at[i, cur + '_q95'] = q95(chunk_means)
            df.at[i, cur + '_iqr'] = iqr(chunk_means)
    df.to_csv(result + str(temp_file) + '.csv')
df.head()

0
current:  bert_word_mv5


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#r

current:  bert_word_mv10
1
current:  bert_word_mv5
current:  bert_word_mv10
2
current:  bert_word_mv5
current:  bert_word_mv10
3
current:  bert_word_mv5
current:  bert_word_mv10
4
current:  bert_word_mv5
current:  bert_word_mv10
5
current:  bert_word_mv5
current:  bert_word_mv10
6
current:  bert_word_mv5
current:  bert_word_mv10


Unnamed: 0,grid,content,n_words,bert_emb,bert_word_mv5_median,bert_word_mv5_iqr,bert_word_mv5_q5,bert_word_mv5_q95,bert_word_mv10_median,bert_word_mv10_iqr,bert_word_mv10_q5,bert_word_mv10_q95,bert_word_mv5_similarity,bert_word_mv10_similarity
0,10455,"I'm a young man , an en an en- an engineer by ...",421,"[[tensor(1.0818), tensor(1.0875), tensor(-0.18...",0.444363,0.178672,0.270066,0.609532,0.407414,0.089588,0.329988,0.482169,"[0.4030668692226545, 0.575975332456087, 0.4492...","[0.4353834378915009, 0.38135975338542855, 0.59..."
1,11689,We have been using that opportunity to do more...,159,"[[tensor(3.7089), tensor(0.8028), tensor(-1.74...",0.445333,0.156376,0.277829,0.611766,0.408117,0.111262,0.27509,0.516932,"[0.560723852896776, 0.6652568537423886, 0.2939...","[0.5731638521990964, 0.3416750962955687, 0.295..."
2,12376,Alright . um I live in not especially cool Spr...,468,"[[tensor(5.3717), tensor(1.6071), tensor(5.350...",0.428292,0.154923,0.305069,0.600318,0.371323,0.078519,0.310821,0.545604,"[0.4233996030097953, 0.5237559335731028, 0.438...","[0.40351062774445984, 0.40308428274939717, 0.4..."
3,12630,"My is things are fantastic . No , I mean My uh...",966,"[[tensor(-2.2958), tensor(2.2740), tensor(1.50...",0.432861,0.097303,0.305933,0.576322,0.388507,0.071858,0.300331,0.480308,"[0.5440019592228229, 0.5248212334141555, 0.432...","[0.456005437744915, 0.4696179784669962, 0.4740..."
4,13493,Mhm . I still get to play . And my stock inves...,134,"[[tensor(1.7302), tensor(3.9717), tensor(6.281...",0.439209,0.158458,0.280327,0.61143,0.387445,0.070586,0.279698,0.495055,"[0.43610278779597794, 0.4054440814917245, 0.52...","[0.40723352675116464, 0.4939310403243854, 0.42..."


In [52]:
# List the columns of `df` (the last frame processed by the preceding loop).
df.columns

Index(['grid', 'content', 'n_words', 'bert_emb', 'bert_word_mv5_median',
       'bert_word_mv5_iqr', 'bert_word_mv5_q5', 'bert_word_mv5_q95',
       'bert_word_mv10_median', 'bert_word_mv10_iqr', 'bert_word_mv10_q5',
       'bert_word_mv10_q95', 'bert_word_mv5_similarity',
       'bert_word_mv10_similarity'],
      dtype='object')

# K1:10

In [53]:
import ast # standard-library abstract syntax tree module; not used in this cell

ks = [str(k) for k in range(1, 11)]           # inter-token distances '1' .. '10'
stats = ['_median', '_iqr', '_q5', '_q95']    # suffix of each summary statistic

# Column names in creation order: all stats for k=1, then k=2, ... up to k=10.
k_stat_columns = ['bert_word_k' + distance + suffix
                  for distance in ks
                  for suffix in stats]

# create new empty columns on every frame
for df in dfs:
    for column in k_stat_columns:
        df[column] = ''
df.head()

Unnamed: 0,grid,content,n_words,bert_emb,bert_word_mv5_median,bert_word_mv5_iqr,bert_word_mv5_q5,bert_word_mv5_q95,bert_word_mv10_median,bert_word_mv10_iqr,...,bert_word_k8_q5,bert_word_k8_q95,bert_word_k9_median,bert_word_k9_iqr,bert_word_k9_q5,bert_word_k9_q95,bert_word_k10_median,bert_word_k10_iqr,bert_word_k10_q5,bert_word_k10_q95
0,10455,"I'm a young man , an en an en- an engineer by ...",421,"[[tensor(1.0818), tensor(1.0875), tensor(-0.18...",0.444363,0.178672,0.270066,0.609532,0.407414,0.089588,...,,,,,,,,,,
1,11689,We have been using that opportunity to do more...,159,"[[tensor(3.7089), tensor(0.8028), tensor(-1.74...",0.445333,0.156376,0.277829,0.611766,0.408117,0.111262,...,,,,,,,,,,
2,12376,Alright . um I live in not especially cool Spr...,468,"[[tensor(5.3717), tensor(1.6071), tensor(5.350...",0.428292,0.154923,0.305069,0.600318,0.371323,0.078519,...,,,,,,,,,,
3,12630,"My is things are fantastic . No , I mean My uh...",966,"[[tensor(-2.2958), tensor(2.2740), tensor(1.50...",0.432861,0.097303,0.305933,0.576322,0.388507,0.071858,...,,,,,,,,,,
4,13493,Mhm . I still get to play . And my stock inves...,134,"[[tensor(1.7302), tensor(3.9717), tensor(6.281...",0.439209,0.158458,0.280327,0.61143,0.387445,0.070586,...,,,,,,,,,,


In [55]:
ks=['1', '2', '3', '4', '5', '6', '7', '8', '9', '10']

# For every frame and every inter-token distance k: compute the cosine
# similarity between each token embedding and the one k positions later,
# then summarise the resulting list per response.
#
# Changes from the previous version of this cell:
#   * cells are written with df.at[...] instead of chained indexing
#     (df[col][i] = ...), which triggered SettingWithCopyWarning;
#   * pairs at distance k come from zip(seq, seq[k:]) instead of indexing
#     past the end and catching IndexError; same pairs, same order;
#   * the loop no longer shadows the builtin `id`.
for temp_file, df in enumerate(dfs):
    # loop through each k
    for k in ks:
        cur = 'bert_word_k' + k 
        print('Coherence k ', k, 'temp_file: ', temp_file) # progress
        df[cur + '_similarity'] = ''
        offset = int(k)
        # loop through each individual's response 
        for i in df.index:
            embeddings = df.at[i, 'bert_emb']
            if isinstance(embeddings, float):
                # missing embedding (NaN): nothing to compute for this row
                continue

            # similarity of word pairs at k inter-token distance
            sims = [cosine_similarity(w1, w2)
                    for w1, w2 in zip(embeddings, embeddings[offset:])]

            # intermediate df, save 
            df.at[i, cur + '_similarity'] = sims
            df.at[i, cur + '_iqr'] = iqr(sims) # add other stats here
            df.at[i, cur + '_median'] = np.nanmedian(sims)
            df.at[i, cur + '_q5'] = q5(sims)
            df.at[i, cur + '_q95'] = q95(sims)
    df.to_csv(result + str(temp_file) + '.csv')
df.head()

Coherence k  1 temp_file:  0
Coherence k  2 temp_file:  0
Coherence k  3 temp_file:  0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#r

Coherence k  4 temp_file:  0
Coherence k  5 temp_file:  0
Coherence k  6 temp_file:  0
Coherence k  7 temp_file:  0
Coherence k  8 temp_file:  0
Coherence k  9 temp_file:  0
Coherence k  10 temp_file:  0
Coherence k  1 temp_file:  1
Coherence k  2 temp_file:  1
Coherence k  3 temp_file:  1
Coherence k  4 temp_file:  1
Coherence k  5 temp_file:  1
Coherence k  6 temp_file:  1
Coherence k  7 temp_file:  1
Coherence k  8 temp_file:  1
Coherence k  9 temp_file:  1
Coherence k  10 temp_file:  1
Coherence k  1 temp_file:  2
Coherence k  2 temp_file:  2
Coherence k  3 temp_file:  2
Coherence k  4 temp_file:  2
Coherence k  5 temp_file:  2
Coherence k  6 temp_file:  2
Coherence k  7 temp_file:  2
Coherence k  8 temp_file:  2
Coherence k  9 temp_file:  2
Coherence k  10 temp_file:  2
Coherence k  1 temp_file:  3
Coherence k  2 temp_file:  3
Coherence k  3 temp_file:  3
Coherence k  4 temp_file:  3
Coherence k  5 temp_file:  3
Coherence k  6 temp_file:  3
Coherence k  7 temp_file:  3
Coherence k

Unnamed: 0,grid,content,n_words,bert_emb,bert_word_mv5_median,bert_word_mv5_iqr,bert_word_mv5_q5,bert_word_mv5_q95,bert_word_mv10_median,bert_word_mv10_iqr,...,bert_word_k1_similarity,bert_word_k2_similarity,bert_word_k3_similarity,bert_word_k4_similarity,bert_word_k5_similarity,bert_word_k6_similarity,bert_word_k7_similarity,bert_word_k8_similarity,bert_word_k9_similarity,bert_word_k10_similarity
0,10455,"I'm a young man , an en an en- an engineer by ...",421,"[[tensor(1.0818), tensor(1.0875), tensor(-0.18...",0.444363,0.178672,0.270066,0.609532,0.407414,0.089588,...,"[0.11699516, 0.15833694, 0.7050986, 0.6519621,...","[0.71463287, 0.120548256, 0.45634466, 0.583628...","[0.58883893, 0.07900833, 0.48695877, 0.5497451...","[0.43890297, 0.08526741, 0.51181865, 0.601614,...","[0.4982653, 0.118715934, 0.46696645, 0.3737738...","[0.4660052, 0.11239307, 0.34002537, 0.5239003,...","[0.44306654, 0.077830255, 0.4145039, 0.4377555...","[0.34704378, 0.0988978, 0.36672312, 0.38744497...","[0.40791243, 0.070459016, 0.3719444, 0.6565817...","[0.37299016, 0.041924503, 0.4759185, 0.3932944..."
1,11689,We have been using that opportunity to do more...,159,"[[tensor(3.7089), tensor(0.8028), tensor(-1.74...",0.445333,0.156376,0.277829,0.611766,0.408117,0.111262,...,"[0.5515479, 0.6505364, 0.69764054, 0.62689614,...","[0.5534375, 0.5728669, 0.51219124, 0.5966172, ...","[0.5125358, 0.4722217, 0.5306855, 0.6569906, 0...","[0.45736465, 0.44188166, 0.64849734, 0.5448049...","[0.46813637, 0.52411044, 0.5473028, 0.5799935,...","[0.55969113, 0.48314965, 0.5999355, 0.5002708,...","[0.5385768, 0.50476295, 0.49556985, 0.48057625...","[0.53392816, 0.4097621, 0.45947057, 0.13068396...","[0.4928571, 0.40582836, 0.13978846, 0.29789257...","[0.50191694, 0.15662588, 0.29720053, 0.0497375..."
2,12376,Alright . um I live in not especially cool Spr...,468,"[[tensor(5.3717), tensor(1.6071), tensor(5.350...",0.428292,0.154923,0.305069,0.600318,0.371323,0.078519,...,"[0.51181096, 0.40245765, 0.31970045, 0.6195469...","[0.5607296, 0.47527143, 0.22263032, 0.4783405,...","[0.42315328, 0.34736887, 0.17104891, 0.4792861...","[0.35132682, 0.3143299, 0.26811114, 0.40899155...","[0.30643082, 0.36325487, 0.21205339, 0.4000393...","[0.37929794, 0.27494228, 0.21570645, 0.335643,...","[0.34115946, 0.29785895, 0.252828, 0.4729747, ...","[0.34653705, 0.2767291, 0.1648226, 0.2563655, ...","[0.3231972, 0.408678, 0.6641225, 0.55769473, 0...","[0.3050225, 0.23216723, 0.30159816, 0.48772332..."
3,12630,"My is things are fantastic . No , I mean My uh...",966,"[[tensor(-2.2958), tensor(2.2740), tensor(1.50...",0.432861,0.097303,0.305933,0.576322,0.388507,0.071858,...,"[0.64131474, 0.5480628, 0.6396768, 0.6251538, ...","[0.48011062, 0.52021354, 0.5717638, 0.46825495...","[0.4479184, 0.47657725, 0.38942608, 0.4054251,...","[0.4892283, 0.41964006, 0.34207383, 0.36616522...","[0.4240641, 0.41798735, 0.2924404, 0.4722935, ...","[0.46419668, 0.39367008, 0.35573572, 0.3631486...","[0.37378308, 0.3696485, 0.35296106, 0.44296795...","[0.43028837, 0.3915435, 0.39460862, 0.11840748...","[0.40280768, 0.4156596, 0.1747571, 0.40403363,...","[0.5811956, 0.3142407, 0.41179627, 0.4817863, ..."
4,13493,Mhm . I still get to play . And my stock inves...,134,"[[tensor(1.7302), tensor(3.9717), tensor(6.281...",0.439209,0.158458,0.280327,0.61143,0.387445,0.070586,...,"[0.582798, 0.4852483, 0.46138352, 0.61389256, ...","[0.40011308, 0.40333024, 0.28970823, 0.5673752...","[0.46729577, 0.34406006, 0.28540775, 0.5900379...","[0.31319818, 0.32526517, 0.26612005, 0.4202023...","[0.28280032, 0.2593202, 0.24619617, 0.46284238...","[0.27548426, 0.27080727, 0.7501428, 0.5533116,...","[0.25338143, 0.3664747, 0.503086, 0.7536473, 0...","[0.369249, 0.37200704, 0.38204885, 0.29634637,...","[0.35037884, 0.30237693, 0.18081817, 0.3673495...","[0.37812087, 0.18694621, 0.19634564, 0.4736290..."


In [56]:
# List the columns of `df` (the last frame processed by the preceding loop).
df.columns

Index(['grid', 'content', 'n_words', 'bert_emb', 'bert_word_mv5_median',
       'bert_word_mv5_iqr', 'bert_word_mv5_q5', 'bert_word_mv5_q95',
       'bert_word_mv10_median', 'bert_word_mv10_iqr', 'bert_word_mv10_q5',
       'bert_word_mv10_q95', 'bert_word_mv5_similarity',
       'bert_word_mv10_similarity', 'bert_word_k1_median', 'bert_word_k1_iqr',
       'bert_word_k1_q5', 'bert_word_k1_q95', 'bert_word_k2_median',
       'bert_word_k2_iqr', 'bert_word_k2_q5', 'bert_word_k2_q95',
       'bert_word_k3_median', 'bert_word_k3_iqr', 'bert_word_k3_q5',
       'bert_word_k3_q95', 'bert_word_k4_median', 'bert_word_k4_iqr',
       'bert_word_k4_q5', 'bert_word_k4_q95', 'bert_word_k5_median',
       'bert_word_k5_iqr', 'bert_word_k5_q5', 'bert_word_k5_q95',
       'bert_word_k6_median', 'bert_word_k6_iqr', 'bert_word_k6_q5',
       'bert_word_k6_q95', 'bert_word_k7_median', 'bert_word_k7_iqr',
       'bert_word_k7_q5', 'bert_word_k7_q95', 'bert_word_k8_median',
       'bert_word_k8_iqr', 'be

# TLC merge

In [None]:
# Columns kept from the clinical table: the 'grid' key, the two group
# columns, the tlc_01 .. tlc_18 items and the three tlc_3f_* columns.
tlc_columns = [
    'grid', 'SSDvHC', 'group',
    'tlc_01povspeech', 'tlc_02povcontent', 'tlc_03pressure', 'tlc_04distract',
    'tlc_05tangent', 'tlc_06derail', 'tlc_07incoh', 'tlc_08illogic',
    'tlc_09clang', 'tlc_10neologism', 'tlc_11wordapprox', 'tlc_12circum',
    'tlc_13lossgoal', 'tlc_14persev', 'tlc_15echo', 'tlc_16block',
    'tlc_17stilt', 'tlc_18selfref',
    'tlc_3f_inefficient', 'tlc_3f_incoherent', 'tlc_3f_impexpress',
]

# NOTE(review): this is an absolute local path, unlike the mounted-drive
# folders used for the other files -- confirm it exists where this runs.
tlc = pd.read_csv('/Users/yancong/Desktop/4 clinical/00 Project Files/crossdx_clin.csv', index_col=0)
tlc = tlc[tlc_columns]
tlc.head()

In [None]:
# only keep the processed stats columns
temp = -1
for df in dfs:
    temp += 1
    df.drop(['content', 'n_words', 'bert_emb'], axis = 1, inplace=True)
    df['grid'] = df['grid'].astype(str)
    tlc['grid'] = tlc['grid'].astype(str)
    df = df.merge(tlc, on=['grid'])
    df.to_csv(result + str(temp) + '_GT.csv')
df.head()

In [None]:
# List the columns of `df` as left by the preceding cell.
df.columns