# Prep dataset

In [None]:
import pandas as pd # data wrangling
import numpy as np # math and data analytics
import os
import scipy
import scipy.stats

from google.colab import drive
# Mount Google Drive inside the Colab runtime so the CSV files are reachable
# under /content/drive.
drive.mount('/content/drive')
# Input folder prefix. It is concatenated directly with the file names below
# (no os.path.join), so it must end with a path separator.
# NOTE(review): the name is misspelled ("foloder"); left unchanged because
# other cells may reference it. 'here' looks like a redacted placeholder.
data_foloder = 'here'
# Output path prefix; later cells build file names as result + <index> + '_sent.csv'.
result = 'here'

# One data frame per condition. Later cells read a 'content' text column from each.
# NOTE(review): presumably a baseline plus "incoh"/"ineff" manipulations at
# levels 10/20/50 -- confirm against the data documentation.
baseline = pd.read_csv(data_foloder + 'here.csv')
incoh10 = pd.read_csv(data_foloder + 'here.csv')
incoh20 = pd.read_csv(data_foloder + 'here.csv')
incoh50 = pd.read_csv(data_foloder + 'here.csv')
ineff10 = pd.read_csv(data_foloder + 'here.csv')
ineff20 = pd.read_csv(data_foloder + 'here.csv')
ineff50 = pd.read_csv(data_foloder + 'here.csv')

# Preview the first rows of the baseline frame.
baseline.head()

# Install libraries and dependencies

In [2]:
# Install tensorflow_text, which is imported alongside the Sentence-T5 encoder
# from TensorFlow Hub (importing it registers the ops the encoder needs).
# Model family page: https://tfhub.dev/google/sentence-t5/st5-large/1
# Sentence encoders for English built on top of T5 models.
# NOTE(review): the link points at st5-large, but the encoding cell loads st5-base.

# Why Colab: TensorFlow kept failing on an M1 machine (possibly a version
# incompatibility); Colab is faster with a TPU; the data are all de-identified;
# and no API key is needed.
# NOTE(review): the install is unpinned (the recorded output resolved to
# tensorflow_text 2.11.0) -- consider pinning the version for reproducibility.

!pip install tensorflow_text

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow_text
  Downloading tensorflow_text-2.11.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 34.3 MB/s 
Collecting tensorflow<2.12,>=2.11.0
  Downloading tensorflow-2.11.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (588.3 MB)
[K     |████████████████████████████████| 588.3 MB 6.2 kB/s 
Collecting tensorboard<2.12,>=2.11
  Downloading tensorboard-2.11.0-py3-none-any.whl (6.0 MB)
[K     |████████████████████████████████| 6.0 MB 59.4 MB/s 
[?25hCollecting keras<2.12,>=2.11.0
  Downloading keras-2.11.0-py2.py3-none-any.whl (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 52.5 MB/s 
Collecting tensorflow-estimator<2.12,>=2.11.0
  Downloading tensorflow_estimator-2.11.0-py2.py3-none-any.whl (439 kB)
[K     |████████████████████████████████| 439 kB 70.4 MB/s 
Collecting flatbuffer

In [3]:
# encode sentences using tensorflow

import tensorflow_hub as hub
import tensorflow as tf
import tensorflow_text as text  # Registers the ops.

# Load the Sentence-T5 encoder once; later cells reuse `encoder`.
hub_url = "https://tfhub.dev/google/sentence-t5/st5-base/1" # the largest model we can do using cpu
encoder = hub.KerasLayer(hub_url)

# Smoke test: embed a word, a short sentence and a longer sentence.
english_sentences = tf.constant([
    "dog",
    "Puppies are nice.",
    "I enjoy taking long walks along the beach with my dog.",
])
english_embeds = encoder(english_sentences)

# The encoder returns a list holding one tensor; the recorded output shows
# shape (3, 768): 3 inputs, 768 dimensions each.
print(english_embeds)



[<tf.Tensor: shape=(3, 768), dtype=float32, numpy=
array([[-0.02498951, -0.01846411,  0.01713568, ..., -0.03794743,
        -0.06852311,  0.00769102],
       [-0.01642575, -0.01902249,  0.01045546, ..., -0.00347666,
        -0.0248219 , -0.02178053],
       [-0.01585932, -0.00118521,  0.01167279, ...,  0.02128684,
        -0.03940554,  0.01317421]], dtype=float32)>]


# Get contextualized sentence embeddings

In [None]:
# define a function
# get contextualized sentence embeddings
# using t5 sentence encoder
def get_sent_emb(s):
  """Embed a batch of sentences with the T5 sentence encoder.

  Args:
    s: the sentences to encode; the caller passes the list produced by
      splitting a response on '.'.

  Returns:
    The first element of the encoder's output, converted to a NumPy array
    (one row per input sentence).
  """
  return encoder(s)[0].numpy()

# All condition frames, in the order used to number the output files.
dfs = [baseline, incoh10, incoh20, incoh50, ineff10, ineff20, ineff50]
for temp, df in enumerate(dfs):
  # Placeholder column; overwritten by the embeddings on the next line.
  df["t5_sent_emb"] = ''
  # Split each response on '.' and embed every fragment in one encoder call.
  # NOTE(review): split('.') keeps empty fragments (e.g. after a trailing
  # period), and those get embedded too -- confirm this is intended.
  df["t5_sent_emb"] = df['content'].apply(
      lambda response: get_sent_emb(response.split('.')))
  # Persist the frame as <result><position in dfs>_sent.csv.
  df.to_csv(result + str(temp) + '_sent.csv')
# `df` is the last frame in dfs at this point.
df.head()

In [None]:
# Count the '.'-separated fragments of the first baseline response.
# NOTE(review): split('.') also yields an empty trailing fragment when the
# text ends with a period, so this can overcount real sentences.
len(baseline['content'][0].split('.')) # number of '.'-separated fragments ("sentences") in the response

14

In [None]:
# Sanity check: there should be one embedding per fragment of the same response.
len(baseline['t5_sent_emb'][0]) # number of sentence embeddings stored for the response

14

In [None]:
# Length of the first sentence vector of the first baseline response.
len(baseline['t5_sent_emb'][0][0]) # dimensionality of one sentence vector (recorded output: 768)

768

# Statistics and similarity functions

In [15]:
# stats ignoring nan, apply to all LMs 
from numpy import nanmedian

import scipy
def iqr(x):
  """Return the interquartile range of x, ignoring NaN entries.

  Args:
    x: array-like of numeric values.

  Returns:
    The interquartile range as computed by scipy.stats.iqr with
    nan_policy='omit'.
  """
  values = np.array(x)
  return scipy.stats.iqr(values, nan_policy='omit')

from numpy import nanquantile
def q5(x):
    """Return the 0.05 quantile (5th percentile) of x, ignoring NaN entries.

    Args:
        x: array-like of numeric values.
    """
    values = np.array(x)
    return np.nanquantile(values, 0.05)

def q95(x):
    """Return the 0.95 quantile (95th percentile) of x, ignoring NaN entries.

    Args:
        x: array-like of numeric values.
    """
    values = np.array(x)
    return np.nanquantile(values, 0.95)

In [8]:
# cosine_similarity, apply to all LMs
def cosine_similarity(a, b):
    """Return the cosine of the angle between vectors a and b.

    Args:
        a: 1-D array-like vector.
        b: 1-D array-like vector of the same length as a.

    Returns:
        dot(a, b) divided by the product of the two Euclidean norms.
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return np.dot(a, b) / norm_product

# FOC and SOC

In [None]:
# Measures and the summary-statistic suffixes recorded for each of them.
cs = ['foc', 'soc']
stats = ['_median', '_iqr', '_q5', '_q95']

# Column names follow the pattern t5_<measure><suffix>, e.g. 't5_foc_median'.
stat_columns = ['t5_' + measure + suffix for measure in cs for suffix in stats]

for df in dfs:
    # Create every statistic column up front, filled with empty strings.
    for column in stat_columns:
        df[column] = ''
# `df` is the last frame in dfs at this point.
df.head()

In [None]:
# Compute per-response similarity lists and summary statistics, then save
# each data frame.
#   foc: cosine similarity between adjacent sentence embeddings (lag 1)
#   soc: cosine similarity between embeddings with one intervening sentence (lag 2)
# NOTE(review): presumably first-/second-order coherence -- confirm naming.
#
# Changes from the previous version of this cell:
#   * Writes use df.at[row, column] instead of chained indexing
#     (df[column][row] = ...). Chained assignment triggers
#     SettingWithCopyWarning and does not write through under pandas
#     Copy-on-Write.
#   * Similarities are built with explicit ranges instead of relying on an
#     IndexError to detect the end of the response, so unrelated indexing
#     errors are no longer swallowed.
#   * Statistics are computed once per response rather than after every
#     sentence pair.
pair_lags = {'foc': 1, 'soc': 2}
stat_funcs = {'_median': np.nanmedian, '_iqr': iqr, '_q5': q5, '_q95': q95}

for temp_file, df in enumerate(dfs):  # temp_file keeps track of file names
    # Object columns that will hold the raw list of similarities per response.
    df['t5_foc_similarity'] = ''
    df['t5_soc_similarity'] = ''
    # loop over each response
    for i in df.index:
        sent_embs = df.at[i, 't5_sent_emb']
        for measure, lag in pair_lags.items():
            # One similarity per sentence pair that is `lag` positions apart.
            sims = [
                cosine_similarity(sent_embs[k], sent_embs[k + lag])
                for k in range(len(sent_embs) - lag)
            ]
            if not sims:
                # Too few sentences for this lag: leave the '' placeholders
                # untouched, exactly as before.
                continue
            df.at[i, 't5_' + measure + '_similarity'] = sims  # record intermediate similarities
            for suffix, stat_func in stat_funcs.items():
                df.at[i, 't5_' + measure + suffix] = stat_func(sims)
    df.to_csv(result + str(temp_file) + '_sent.csv')
# `df` is the last frame in dfs at this point.
df.head()