# Prep dataset

In [1]:
import pandas as pd # data wrangling
import numpy as np # math and data analytics
import os
import scipy
import scipy.stats

from google.colab import drive
drive.mount('/content/drive')
data_foloder = '/content/drive/My Drive/simulation/Gradient_shuffle/'
result = '/content/drive/My Drive/simulation/Output_simul/T5/sentence/'

baseline = pd.read_csv(data_foloder + 'simulation_HV_baseline_vb_response_deid_v3.csv')
incoh10 = pd.read_csv(data_foloder + 'simulation_HV_incoh_vb_response_deid_10v3.csv')
incoh20 = pd.read_csv(data_foloder + 'simulation_HV_incoh_vb_response_deid_20v3.csv')
incoh50 = pd.read_csv(data_foloder + 'simulation_HV_incoh_vb_response_deid_50v3.csv')
ineff10 = pd.read_csv(data_foloder + 'simulation_HV_ineff_vb_response_deid_10v3.csv')
ineff20 = pd.read_csv(data_foloder + 'simulation_HV_ineff_vb_response_deid_20v3.csv')
ineff50 = pd.read_csv(data_foloder + 'simulation_HV_ineff_vb_response_deid_50v3.csv')

baseline.head()

Mounted at /content/drive


Unnamed: 0,grid,content,n_words
0,10455,"I'm a young man , an en an en- an engineer by ...",421
1,11689,Sure . I'm thirty three years old . My name is...,159
2,12376,Alright . um I live in not especially cool Spr...,468
3,12630,um So I'm currently twenty-nine . I was born a...,966
4,13493,Mhm . I'm a thirty five year old man who uh um...,134


# Install lib and dependencies

In [2]:
# Install T5 sentence encoders from TensorFlow Hub
# https://tfhub.dev/google/sentence-t5/st5-large/1
# Sentence encoders for English built on top of T5 models.

# Use colab because M1 keeps having issues with tf [version incompatible?]; 
# faster with TPU; all deidentified; don't need API key

!pip install tensorflow_text

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow_text
  Downloading tensorflow_text-2.11.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 34.3 MB/s 
Collecting tensorflow<2.12,>=2.11.0
  Downloading tensorflow-2.11.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (588.3 MB)
[K     |████████████████████████████████| 588.3 MB 6.2 kB/s 
Collecting tensorboard<2.12,>=2.11
  Downloading tensorboard-2.11.0-py3-none-any.whl (6.0 MB)
[K     |████████████████████████████████| 6.0 MB 59.4 MB/s 
[?25hCollecting keras<2.12,>=2.11.0
  Downloading keras-2.11.0-py2.py3-none-any.whl (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 52.5 MB/s 
Collecting tensorflow-estimator<2.12,>=2.11.0
  Downloading tensorflow_estimator-2.11.0-py2.py3-none-any.whl (439 kB)
[K     |████████████████████████████████| 439 kB 70.4 MB/s 
Collecting flatbuffer

In [3]:
# encode sentences using tensorflow

import tensorflow_hub as hub
import tensorflow as tf
import tensorflow_text as text  # Registers the ops.

english_sentences = tf.constant(["dog", "Puppies are nice.", "I enjoy taking long walks along the beach with my dog."])

hub_url = "https://tfhub.dev/google/sentence-t5/st5-base/1" # the largest model we can do using cpu
encoder = hub.KerasLayer(hub_url)

english_embeds = encoder(english_sentences)

print (english_embeds) # 3 phrases; 768 dimension



[<tf.Tensor: shape=(3, 768), dtype=float32, numpy=
array([[-0.02498951, -0.01846411,  0.01713568, ..., -0.03794743,
        -0.06852311,  0.00769102],
       [-0.01642575, -0.01902249,  0.01045546, ..., -0.00347666,
        -0.0248219 , -0.02178053],
       [-0.01585932, -0.00118521,  0.01167279, ...,  0.02128684,
        -0.03940554,  0.01317421]], dtype=float32)>]


# Get contextualized sentence embeddings

In [6]:
# define a function
# get contextualized sentence embeddings
# using t5 sentence encoder
def get_sent_emb(s):
  result = encoder(s)[0].numpy()
  return result

dfs = [baseline, incoh10, incoh20, incoh50, ineff10, ineff20, ineff50]
temp = -1
for df in dfs:
  temp += 1
  df["t5_sent_emb"] = ''
  # apply the embedding function to the data frame
  df["t5_sent_emb"] = df['content'].apply(lambda x: get_sent_emb(x.split('.'))) 
  df.to_csv(result + str(temp) + '_sent.csv')
df.head()

Unnamed: 0,grid,content,n_words,t5_sent_emb
0,10455,"I'm a young man , an en an en- an engineer by ...",421,"[[-0.018183345, -0.033156883, -0.012168732, 0...."
1,11689,We have been using that opportunity to do more...,159,"[[0.0028181453, 0.026697509, -0.0026316773, 0...."
2,12376,Alright . um I live in not especially cool Spr...,468,"[[-0.052137572, -0.002461396, 0.022676356, -0...."
3,12630,"My is things are fantastic . No , I mean My uh...",966,"[[-0.020024542, -0.025058616, 0.019643957, 0.0..."
4,13493,Mhm . I still get to play . And my stock inves...,134,"[[-0.029634507, -0.02434324, 0.009210587, 0.01..."


In [None]:
len(baseline['content'][0].split('.')) # number of sentences in the response

14

In [None]:
len(baseline['t5_sent_emb'][0]) # number of sentence emb in the response vector

14

In [None]:
len(baseline['t5_sent_emb'][0][0]) # each sentence vector is 768 dimension

768

# Stats and similarities functions

In [15]:
# stats ignoring nan, apply to all LMs 
from numpy import nanmedian

import scipy
def iqr(x):
  return scipy.stats.iqr(np.array(x), nan_policy='omit')

from numpy import nanquantile
def q5(x):
    return np.nanquantile(np.array(x), 0.05)

def q95(x):
    return np.nanquantile(np.array(x), 0.95)

In [8]:
# cosine_similarity, apply to all LMs
def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

# FOC and SOC

In [9]:
cs=['foc', 'soc']
stats = ['_median', '_iqr', '_q5', '_q95']
for df in dfs:
    # create new empty columns
    for c in cs:
        for stat in stats:
            cur = 't5_' + c + stat
            df[cur] = ''
df.head()

Unnamed: 0,grid,content,n_words,t5_sent_emb,t5_foc_median,t5_foc_iqr,t5_foc_q5,t5_foc_q95,t5_soc_median,t5_soc_iqr,t5_soc_q5,t5_soc_q95
0,10455,"I'm a young man , an en an en- an engineer by ...",421,"[[-0.018183345, -0.033156883, -0.012168732, 0....",,,,,,,,
1,11689,We have been using that opportunity to do more...,159,"[[0.0028181453, 0.026697509, -0.0026316773, 0....",,,,,,,,
2,12376,Alright . um I live in not especially cool Spr...,468,"[[-0.052137572, -0.002461396, 0.022676356, -0....",,,,,,,,
3,12630,"My is things are fantastic . No , I mean My uh...",966,"[[-0.020024542, -0.025058616, 0.019643957, 0.0...",,,,,,,,
4,13493,Mhm . I still get to play . And my stock inves...,134,"[[-0.029634507, -0.02434324, 0.009210587, 0.01...",,,,,,,,


In [16]:
temp_file = -1 # keep track of file names
for df in dfs:
    temp_file += 1
    df['t5_foc_similarity'] = ''
    df['t5_soc_similarity'] = ''
    # loop over each response
    for i in df.index:
        temp_foc = [] # for each individual, store the list of cos similarity
        temp_soc = []
        # calculate average similarity for sentence pairs, either adjacent or with one intervening
        for idx, sent in enumerate(df['t5_sent_emb'][i]):
            try:
                temp_foc.append(cosine_similarity(sent, df['t5_sent_emb'][i][idx+1])) # get a list of similarities for that response
                
                df['t5_foc_similarity'][i] = temp_foc # record intermediate similarities 
                
                df['t5_foc_median'][i] = np.nanmedian(temp_foc) # add more stats here
                df['t5_foc_iqr'][i] = iqr(temp_foc)
                df['t5_foc_q5'][i] = q5(temp_foc)
                df['t5_foc_q95'][i] = q95(temp_foc)

                temp_soc.append(cosine_similarity(sent, df['t5_sent_emb'][i][idx+2]))

                df['t5_soc_similarity'][i] = temp_soc # record intermediate similarities 

                df['t5_soc_median'][i] = np.nanmedian(temp_soc)
                df['t5_soc_iqr'][i] = iqr(temp_soc)
                df['t5_soc_q5'][i] = q5(temp_soc)
                df['t5_soc_q95'][i] = q95(temp_soc)

            except IndexError:
                continue
    df.to_csv(result + str(temp_file) + '_sent.csv')
df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pand

Unnamed: 0,grid,content,n_words,t5_sent_emb,t5_foc_median,t5_foc_iqr,t5_foc_q5,t5_foc_q95,t5_soc_median,t5_soc_iqr,t5_soc_q5,t5_soc_q95,t5_foc_similarity,t5_soc_similarity
0,10455,"I'm a young man , an en an en- an engineer by ...",421,"[[-0.018183345, -0.033156883, -0.012168732, 0....",0.729684,0.046072,0.682548,0.791078,0.727438,0.052963,0.682374,0.777172,"[0.6837058, 0.6721271, 0.7188821, 0.78218544, ...","[0.7161523, 0.71497416, 0.7062463, 0.73367435,..."
1,11689,We have been using that opportunity to do more...,159,"[[0.0028181453, 0.026697509, -0.0026316773, 0....",0.708511,0.030569,0.678175,0.766187,0.717915,0.032804,0.685283,0.772528,"[0.67008454, 0.75791574, 0.722253, 0.7052737, ...","[0.67682105, 0.70451593, 0.72805715, 0.7179151..."
2,12376,Alright . um I live in not especially cool Spr...,468,"[[-0.052137572, -0.002461396, 0.022676356, -0....",0.72068,0.032059,0.65714,0.840767,0.724853,0.04153,0.672862,0.831958,"[0.7140091, 0.64988375, 0.7672888, 0.7436726, ...","[0.7878721, 0.73721933, 0.6830267, 0.78073, 0...."
3,12630,"My is things are fantastic . No , I mean My uh...",966,"[[-0.020024542, -0.025058616, 0.019643957, 0.0...",0.726292,0.047298,0.692184,0.836199,0.716972,0.056054,0.655936,0.816365,"[0.7125414, 0.80707777, 0.7225398, 0.72936887,...","[0.73643196, 0.6978459, 0.7162848, 0.68907344,..."
4,13493,Mhm . I still get to play . And my stock inves...,134,"[[-0.029634507, -0.02434324, 0.009210587, 0.01...",0.746641,0.030826,0.737419,0.793395,0.734013,0.055656,0.720703,0.804657,"[0.780653, 0.740466, 0.79764295, 0.74527013, 0...","[0.72583014, 0.73401254, 0.78148615, 0.8104502..."


# Clinical factors

In [None]:
Rfolder = '/Users/yancong/Desktop/4 clinical/02 projects_parsely/05 ssd-lm-stanglab/13 remora_lpop_aces/data_analysis/T5_LongText/'
tlc = pd.read_csv('/Users/yancong/Desktop/4 clinical/00 Project Files/crossdx_clin.csv', index_col=0)
tlc = tlc[['grid', 'SSDvHC', 'group', 'tlc_01povspeech', 'tlc_02povcontent',	'tlc_03pressure',	'tlc_04distract',
	'tlc_05tangent', 'tlc_06derail', 'tlc_07incoh',	'tlc_08illogic',	'tlc_09clang',	'tlc_10neologism',
    	'tlc_11wordapprox',	'tlc_12circum',	'tlc_13lossgoal',	'tlc_14persev',	'tlc_15echo',	'tlc_16block',
        	'tlc_17stilt',	'tlc_18selfref', 'tlc_3f_inefficient',	'tlc_3f_incoherent',	'tlc_3f_impexpress']]
tlc.head()

Unnamed: 0,grid,SSDvHC,group,tlc_01povspeech,tlc_02povcontent,tlc_03pressure,tlc_04distract,tlc_05tangent,tlc_06derail,tlc_07incoh,...,tlc_12circum,tlc_13lossgoal,tlc_14persev,tlc_15echo,tlc_16block,tlc_17stilt,tlc_18selfref,tlc_3f_inefficient,tlc_3f_incoherent,tlc_3f_impexpress
1,10308,1.0,SSD,0,1,0,0,0,0,0,...,0,0,1,0,0,2,0,-0.406404,-0.069358,-0.018896
2,10311,0.0,HC,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,-0.713934,-0.463481,0.197262
3,10316,1.0,SSD,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,-0.687947,-0.387319,-0.710348
4,10455,0.0,HC,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,-0.687947,-0.387319,-0.710348
5,10582,1.0,SSD,0,0,0,1,0,1,0,...,1,1,0,0,0,0,0,-0.059923,-0.107333,-0.458561


In [None]:
# only keep the processed stats columns
temp = -1
for df in dfs:
    temp += 1
    df.drop(['content', 'n_words', 't5_sent_embed'], axis = 1, inplace=True)
    df['grid'] = df['grid'].astype(str)
    tlc['grid'] = tlc['grid'].astype(str)
    df = df.merge(tlc, on=['grid'])
    df.to_csv(result + str(temp) + '_sent_GT.csv')
df.head()

Index(['grid', 'n_tokens', 'content', 'study', 'group', 'SSDvHC', 't5_foc',
       't5_soc', 't5_sent_emb'],
      dtype='object')

In [None]:
df.columns