# Note: this notebook requires access to the OpenAI API (an API key is needed).

# Prep dataset

In [None]:
import pandas as pd
import numpy as np
import os
import scipy
from torch import tensor

os.getcwd()

In [None]:
# reading in data by sample
# each sample is collapsed from the two open-ended prompts

data_foloder = 'your folder'

# os.path.join supplies the path separator when the folder name has no
# trailing slash; plain string concatenation would glue the two names together.
baseline = pd.read_csv(os.path.join(data_foloder, 'test.csv'))

baseline.head()

# Install lib and dependencies

In [None]:
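# One-time setup: there is no need to re-run this install every time the notebook is opened.
# Installs Hugging Face `transformers`, which provides the GPT-2 tokenizer
# loaded in the next cell (GPT-3 uses the same tokenizer).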
!pip install transformers 

In [3]:
# Load the pretrained "gpt2" fast tokenizer from Hugging Face transformers
# (original note: GPT-3 uses the same tokenizer).

import transformers
from transformers import GPT2TokenizerFast
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

In [4]:
# use openai API
# NOTE(review): `openai.embeddings_utils` appears to ship only with pre-1.0
# releases of the openai package - confirm the installed version.
from openai.embeddings_utils import get_embeddings, cosine_similarity # important
import openai

# Prefer the OPENAI_API_KEY environment variable so the secret does not have to
# be stored in the notebook; the placeholder is kept as a fallback so the cell
# behaves as before when the variable is not set.
openai.api_key = os.environ.get('OPENAI_API_KEY', 'your key')

# Get word embeddings from whole response embeddings GPT3 3df

In [None]:
buglist = {}  # bug log: first field of the row -> word list whose embedding request failed


def calc_gpt3(r):
    """Embed every whitespace-separated word of one response.

    Parameters
    ----------
    r : one row of the input data (as passed by ``df.apply(..., axis=1)``).
        The function reads ``r[0]`` (used as the progress / bug-log key),
        ``r['content']`` (the verbatim transcript) and ``r['n_words']``.

    Returns
    -------
    The value returned by ``get_embeddings`` for the cleaned word list, or
    ``np.nan`` when the content is not a string, ``n_words`` is 2048 or more
    (the limit given in the original note), no word survives the cleaning,
    or the embedding request fails (the failure is recorded in ``buglist``).
    """
    row_key = r[0]
    # keeping track of progress: prints whenever the key is odd
    # NOTE(review): the original comment said "even" - confirm which is wanted.
    if row_key % 2 != 0:
        print('current grid: ', row_key)

    if not (isinstance(r['content'], str) and r['n_words'] < 2048):
        return np.nan

    # Drop empty strings (from repeated spaces) and any word containing
    # '{', '}' or '#' (NSV markers and the restart symbol).
    words = [
        word
        for word in r['content'].split(' ')
        if word != '' and '{' not in word and '}' not in word and '#' not in word
    ]
    if not words:
        # nothing left to embed, so skip the API call
        return np.nan

    try:
        # take in the whole response as a list of words
        # https://beta.openai.com/docs/guides/embeddings/what-are-embeddings
        return get_embeddings(words, engine='your engine')
    except Exception:
        # Log the failing row and return the same placeholder as an unusable
        # row; previously this path fell through to an unbound `vec`.
        buglist[row_key] = words
        return np.nan

# double check your input df
# make sure there is no '  ' or '   ' (runs of consecutive spaces)

temp = 0  # numeric prefix of the CSV file written below
# Only select 1 individual for demo purposes. The explicit copy makes `df` an
# independent frame, so adding a column does not write into a slice of `baseline`.
df = baseline[baseline['grid'] == 'test000'].copy()
df["gpt3_embed"] = df.apply(calc_gpt3, axis=1)  # apply embedding function to every row
df.to_csv(str(temp) + '.csv')

df.head()

In [8]:
len(df['gpt3_embed'][1]) # number of embeddings stored for the response at index 1 (one per whitespace-separated word sent to get_embeddings)

177

In [9]:
# length of the first embedding vector of that response, i.e. the embedding
# dimension (2048 in the recorded output below)
len(list(df['gpt3_embed'][1])[0]) 

2048

In [10]:
buglist

{}

# MV 5/10: average semantic similarity within 5- and 10-word windows

In [6]:
# Average semantic similarity of each word in 5- or 10- words window

def divide_chunks(l, n):
    """Yield consecutive slices of ``l`` that hold ``n`` items each.

    ``n`` is how many elements each chunk should have; the last chunk is
    shorter when ``len(l)`` is not a multiple of ``n``.
    """
    yield from (l[start:start + n] for start in range(0, len(l), n))
  
# Demo: cut a 13-token example into chunks of (at most) 5 tokens each.
test = [
    'Alex', 'broke', 'the', 'vase', 'accidentally', '.',
    'But', 'Kai', 'did', 'it', 'on', 'purpose', '.',
]
chunk_size = 5  # n: how many elements each list should have
chopped = list(divide_chunks(test, chunk_size))
print(chopped)

[['Alex', 'broke', 'the', 'vase', 'accidentally'], ['.', 'But', 'Kai', 'did', 'it'], ['on', 'purpose', '.']]


In [7]:
def combinations(lst):  # get w1, w2 combinations
    """Return the pairs ``[w1, w2]`` in which ``w2`` follows ``w1`` in ``lst``.

    ``lst`` is one chunk of tokens (<= 5 or <= 10 items in this notebook).
    Every item is paired with itself and with each later item; a pair is kept
    only when its two members compare unequal, which removes the self pair
    (e.g. ['Alex', 'Alex']) and also any pair of equal-valued items.
    """
    return [
        [first, second]
        for position, first in enumerate(lst)
        for second in lst[position:]
        if second != first
    ]

# Demo: list every token pair inside one 5-token chunk.
testing = [
    'Alex',
    'broke',
    'the',
    'vase',
    'accidentally',
]
result = combinations(testing)
print(result)

[['Alex', 'broke'], ['Alex', 'the'], ['Alex', 'vase'], ['Alex', 'accidentally'], ['broke', 'the'], ['broke', 'vase'], ['broke', 'accidentally'], ['the', 'vase'], ['the', 'accidentally'], ['vase', 'accidentally']]


In [8]:
# stats ignoring nan
from numpy import nanmedian

import scipy
def iqr(x):
    """Return the interquartile range of ``x``, leaving NaN entries out."""
    # Import the submodule explicitly: a bare ``import scipy`` does not
    # guarantee that ``scipy.stats`` is loaded on every SciPy release.
    from scipy import stats as scipy_stats

    return scipy_stats.iqr(np.array(x), nan_policy='omit')

from numpy import quantile
def q5(x):
    """Return the 5th percentile (0.05 quantile) of ``x``, ignoring NaN.

    Uses ``np.nanquantile`` so that NaN entries are skipped, consistent with
    the other NaN-ignoring statistics of this section (``iqr``, ``nanmedian``).
    """
    return np.nanquantile(np.array(x), 0.05)

def q95(x):
    """Return the 95th percentile (0.95 quantile) of ``x``, ignoring NaN.

    Uses ``np.nanquantile`` so that NaN entries are skipped, consistent with
    the other NaN-ignoring statistics of this section (``iqr``, ``nanmedian``).
    """
    return np.nanquantile(np.array(x), 0.95)

In [None]:
temp_file = 0 # give df a temp name; the intermediate df will be discarded bcs otherwise it occupies too much space
mvs = ['5', '10']  # window sizes, kept as strings because they are spliced into column names
stats = ['_median', '_iqr', '_q5', '_q95'] # appended to the embeddings df
dfs = [df]
# Number the intermediate files from `temp_file` instead of incrementing the
# global `temp`, whose value depends on which cells ran before this one.
for file_id, df in enumerate(dfs, start=temp_file):
    # create new empty columns
    for mv in mvs:
        for stat in stats:
            cur = 'gpt3_word_mv' + mv + stat
            df[cur] = ''
    # write the intermediate file once per df, after all columns exist
    df.to_csv(str(file_id) + '.csv')
df.head()

In [None]:
# For every window size, cut each response's word embeddings into consecutive
# chunks of that size, average the pairwise cosine similarity inside each
# chunk, and summarise the per-chunk averages (median, q5, q95, iqr).
for mv in mvs:
    # print progress
    cur = 'gpt3_word_mv' + mv
    print('current: ', cur)
    df[cur + '_similarity'] = '' # save the cosine similarities; all stats are derived from there

    window = int(mv)
    # loop over each response
    for i in df.index:
        embeddings = df.at[i, 'gpt3_embed']
        if isinstance(embeddings, float):
            # NaN placeholder: this response has no embeddings. Skip the row so
            # that it does not receive the values of the previous response.
            continue

        # one mean similarity per 5/10-word chunk of the response
        chunk_means = []
        for chunk in divide_chunks(embeddings, window):
            pair_sims = [cosine_similarity(w1, w2) for w1, w2 in combinations(chunk)]
            # a chunk without any pair (e.g. a single trailing word) has no mean
            chunk_means.append(np.nanmean(pair_sims) if pair_sims else np.nan)

        # its len is the number of chunks that the response can be chopped into;
        # df.at writes into df itself (chained df[col][i] may write to a copy)
        df.at[i, cur + '_similarity'] = chunk_means # similarity mv 5 or 10; store it for later reference/stats
        if not chunk_means:
            continue  # nothing to summarise; keep the empty placeholders

        # add other stats here
        df.at[i, cur + '_median'] = np.nanmedian(chunk_means)
        df.at[i, cur + '_q5'] = q5(chunk_means)
        df.at[i, cur + '_q95'] = q95(chunk_means)
        df.at[i, cur + '_iqr'] = iqr(chunk_means)

df.to_csv(str(temp_file) + '.csv')
df.head()

# K1:10: similarity between words k positions apart, for k = 1 to 10

In [None]:
import ast # abstract-syntax-tree helpers (e.g. ast.literal_eval); not referenced in the code shown here

temp = -1  # file counter; the first df is written to '0.csv'
ks = [str(k) for k in range(1, 11)]  # inter-word distances '1'..'10', as strings for column names
stats = ['_median', '_iqr', '_q5', '_q95']
dfs = [df]
for df in dfs:
    temp += 1
    # create new empty columns
    for k in ks:
        for stat in stats:
            cur = 'gpt3_word_k' + k + stat
            df[cur] = ''
    # write the intermediate file once per df instead of once per new column
    df.to_csv(str(temp) + '.csv')
df.head()

In [None]:
# For every distance k, compute the cosine similarity between each word
# embedding and the one k positions later in the same response, then summarise
# those similarities per response (iqr, median, q5, q95).
temp_file = -1
ks = [str(k) for k in range(1, 11)]

for df in dfs:
    temp_file += 1
    # loop through each k
    for k in ks:
        cur = 'gpt3_word_k' + k
        print('Coherence k ', k) # progress
        df[cur + '_similarity'] = ''
        lag = int(k)
        # loop through each individual's response
        for i in df.index:
            embeddings = df.at[i, 'gpt3_embed']
            if isinstance(embeddings, float):
                continue  # NaN placeholder: this response has no embeddings

            # calculate similarity of word pairs at k inter-word distance;
            # zip stops at the last word that still has a partner k steps ahead
            sims = [
                cosine_similarity(w1, w2)
                for w1, w2 in zip(embeddings, embeddings[lag:])
            ]

            # intermediate df, save; df.at writes into df itself
            # (chained df[col][i] may write to a temporary copy)
            df.at[i, cur + '_similarity'] = sims
            if not sims:
                # response has k words or fewer: no pair exists, and the
                # quantile helpers cannot summarise an empty list
                continue
            df.at[i, cur + '_iqr'] = iqr(sims) # add other stats here
            df.at[i, cur + '_median'] = np.nanmedian(sims)
            df.at[i, cur + '_q5'] = q5(sims)
            df.at[i, cur + '_q95'] = q95(sims)
    df.to_csv(str(temp_file) + '.csv')
df.head()