# Prep dataset

In [None]:
import pandas as pd
import numpy as np

In [None]:
temp = pd.read_csv('sentence_clean.csv', index_col=0)
temp.head()

In [None]:
len(temp.grid.unique())

In [None]:
ssd = temp[temp.group.isin(['SSD'])]
len(ssd.grid.unique())

In [None]:
hc = temp[temp.group.isin(['HC'])]
len(hc.grid.unique())

In [None]:
!pip install transformers

In [None]:
import transformers

In [None]:
from transformers import GPT2TokenizerFast
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

# Get embeddings

In [None]:
!pip install plotly

In [None]:
from sklearn.manifold import TSNE
import matplotlib
import matplotlib.pyplot as plt

from openai.embeddings_utils import get_embedding

In [None]:
import os

import openai
# Read the key from the OPENAI_API_KEY environment variable so that a real key
# never has to be written into the notebook; the original placeholder is kept
# as the fallback value.
openai.api_key = os.environ.get('OPENAI_API_KEY', 'here')

# Word - Add embeddings to the dataframe

## load data and similarity comparison

In [None]:
# To load the data from a saved file, you can run the following:
import ast

df = pd.read_csv('sentence_clean.csv', index_col=0)
# The embedding columns are stored as text in the CSV. ast.literal_eval parses
# Python literals only, so a crafted cell cannot execute code the way eval could.
df['babbage_similarity'] = df.babbage_similarity.apply(ast.literal_eval).apply(np.array)
df['babbage_search'] = df.babbage_search.apply(ast.literal_eval).apply(np.array)

In [None]:
df.head()

In [None]:
# the embedding representation is very rich and information dense.
matrix = np.vstack(df.babbage_similarity.values)
matrix.shape

## Traditional way to get word embeddings

In [None]:
# for subset dataframe calculations, change here
# new directory
df_full = pd.read_csv('word_level_deident.csv', index_col=0)
df_full.speaker.unique()

In [None]:
df_full.task.unique()

In [None]:
# Restrict the word-level table to the task labels listed in `taskin` and to
# rows spoken by the subject, then expose the result as `df_word` as well.
taskin = ['AboutYourself', 'HowsItGoing', 'HowsItGoingx', ' HowsItGoing']
df_full = df_full[df_full.task.isin(taskin) & (df_full.speaker == "Subject")]
df_word = df_full
df_word.head()

In [None]:
def calc_gpt3(r):
  """Return the babbage similarity embedding of a row's word, or NaN.

  A row is embedded only when `n_words` is 1 and the `is_partial`,
  `is_speech_pause`, `is_punctuation` and `is_stopword` flags are all 0.
  Every other row gets `np.nan` and no embedding request is made.
  """
  exclusion_flags = ("is_partial", "is_speech_pause", "is_punctuation", "is_stopword")
  is_plain_word = r["n_words"] == 1 and all(r[flag] == 0 for flag in exclusion_flags)
  if not is_plain_word:
    return np.nan
  return get_embedding(r['word_lower'], engine = 'text-similarity-babbage-001')

# Row-wise embedding lookup; rows rejected by calc_gpt3 are stored as NaN.
df_full["gpt3_embed"] = df_full.apply(calc_gpt3, axis = 1)

In [None]:
# Filter the word-level dataframe: keep rows whose four exclusion flags are
# all 0, then keep only the columns used by the coherence computations.
df_word = df_full
flag_columns = ["is_speech_pause", "is_partial", "is_punctuation", "is_stopword"]
df_coh_word = df_word.loc[(df_word[flag_columns] == 0).all(axis=1)]
kept_columns = ["uid", "task", "word_lower", "sentence_id", "seg_id", "content", "sp.tokenized", "sp.lemma", "gpt3_embed"]
df_coh_word = df_coh_word[kept_columns]

### Word level: add min, max, mean

In [None]:
gt = pd.read_csv('sentence_clean.csv', index_col=0)
gt.head()

In [None]:
def mean_embedding_of_sentence(sentence_embeddings):
  """Average a stack of embeddings along axis 0.

  Prints the input shape, then returns the axis-0 mean of the values cast to
  float. Returns `np.nan` when the input has no rows.
  """
  print(sentence_embeddings.shape)
  if sentence_embeddings.shape[0] > 0:
    # `np.float` was removed in NumPy 1.24; the builtin float is the same dtype.
    return np.average(sentence_embeddings.astype(float), 0)
  else:
    # `np.NaN` was removed in NumPy 2.0; `np.nan` works on every version.
    return np.nan

def calc_response_cosine_similarity(sentence_embeddings):
  """Mean cosine similarity between each row and the row that follows it.

  `sentence_embeddings` is a 2-D array with one embedding per row. Returns
  `np.nan` when there are fewer than two rows, because there is no adjacent
  pair to compare (the previous version divided by zero in that case).
  """
  vectors = np.asarray(sentence_embeddings, dtype=float)
  n_pairs = vectors.shape[0] - 1
  if n_pairs < 1:
    return np.nan

  # Only consecutive pairs are needed, so normalise the rows once and take
  # row-wise dot products instead of building the full n x n matrix.
  norms = np.linalg.norm(vectors, axis=1)
  norms[norms == 0] = 1.0  # an all-zero row gives similarity 0, not NaN
  unit = vectors / norms[:, np.newaxis]
  adjacent_sims = np.einsum('ij,ij->i', unit[:-1], unit[1:])
  return adjacent_sims.sum() / n_pairs

document_group = "uid"
term_col = "word_lower"

def get_idf_stats(r, N):
  """Summarise the documents present in the frame `r`.

  Returns a Series holding the distinct `document_group` values
  (`doc_list`), how many there are (`doc_count`), and `N / doc_count`
  (`idf`).
  """
  documents = r[document_group].unique()
  n_documents = len(documents)
  stats = {
    'doc_list': documents,
    'doc_count': n_documents,
    "idf": N / n_documents,
  }
  return pd.Series(stats, index=['doc_list', 'doc_count', "idf"])

# number of distinct documents
N = len(df_coh_word[document_group].unique())

# document frequency
word_idf = df_coh_word.groupby([term_col]).apply(lambda x: get_idf_stats(x, N))
word_idf

# term frequency per document: occurrences of each word divided by the total
# word count of its document. `transform('sum')` keeps the
# (document, term) index unchanged on every pandas version; the previous
# `groupby(level=0).apply(...)` gains an extra index level under
# pandas >= 2.0, which breaks the `word_tf.loc[document, term]` lookup.
word_counts = df_coh_word.groupby([document_group, term_col]).agg({term_col: 'count'})
word_tf = (word_counts / word_counts.groupby(level=0).transform('sum')).rename(columns={term_col: "tf"})
word_tf

# tf(t, d) * log(idf)
def get_td_idf(term, document):
  """Look up the term's tf in `word_tf` and idf in `word_idf`; return tf * log(idf)."""
  term_frequency = word_tf.loc[document, term]["tf"]
  log_idf = np.log(word_idf.loc[term]["idf"])
  return term_frequency * log_idf

# embedding columns processed by the aggregation loops
embeddings = ["gpt3_embed"]

In [None]:
# convert columns type from list to numpy.array [only saved in memory]
# Only real sequences are converted. A missing embedding (NaN) is left as a
# plain float, because np.array(np.nan) would be a 0-d array whose
# `.shape[0]` raises IndexError in the aggregation loop.
df_coh_word['gpt3_embed'] = df_coh_word['gpt3_embed'].apply(
  lambda x: np.array(x) if isinstance(x, (list, tuple, np.ndarray)) else x
)


In [None]:
df_coh_word = df_coh_word.reset_index()

#### mean

In [None]:
grouping = ["uid", "task"]

# One result dict per group. The frame is built once at the end because
# DataFrame.append was removed in pandas 2.0.
group_rows = []

for idx, uid_df in df_coh_word.groupby(grouping):
  # the group key is a tuple for list groupers; normalise a scalar key too
  group_key = idx if isinstance(idx, tuple) else (idx,)
  embed_results = dict(zip(grouping, group_key))

  # loop through all embeddings
  for embed in embeddings:
    turn_coherences_mean = []
    turn_coherences_tf_idf = []

    # create sentence embeddings
    for _, sent_df in uid_df.groupby("sentence_id"):
      embedd_array = []
      tf_idf_weights = []
      for doc, term, vec in zip(sent_df[document_group], sent_df[term_col], sent_df[embed]):
        # skip missing embeddings (NaN floats, 0-d or empty arrays)
        if isinstance(vec, (list, tuple, np.ndarray)) and np.ndim(vec) > 0 and len(vec) > 0:
          embedd_array.append(vec)
          tf_idf_weights.append(get_td_idf(document = doc, term = term))

      # no embeddings for sentence
      if not embedd_array:
        continue

      # unweighted mean of the sentence's word vectors
      turn_coherences_mean.append(np.average(embedd_array, 0))

      # tf-idf weighted mean; np.average raises ZeroDivisionError when the
      # weights sum to zero, so such sentences are left out of the tf-idf series
      if np.sum(tf_idf_weights) != 0:
        turn_coherences_tf_idf.append(np.average(embedd_array, weights = tf_idf_weights, axis = 0))

    # end loop through sentences
    # adjacent-sentence cosine similarity needs at least two sentence vectors
    if len(turn_coherences_mean) > 1:
      mean_coh = calc_response_cosine_similarity(np.stack(turn_coherences_mean, axis=0))
    else:
      mean_coh = np.nan
    embed_results["mean_" + embed] = mean_coh

    if len(turn_coherences_tf_idf) > 1:
      tf_idf_coh = calc_response_cosine_similarity(np.stack(turn_coherences_tf_idf, axis=0))
    else:
      tf_idf_coh = np.nan
    embed_results["tf_idf_" + embed] = tf_idf_coh

  group_rows.append(embed_results)

# write final results (grouping columns first, then the per-embedding scores)
result_columns = grouping + [prefix + name for name in embeddings for prefix in ("mean_", "tf_idf_")]
df_coh_group = pd.DataFrame(group_rows, columns=result_columns)
# kept under its own name as well, because later cells rebind df_coh_group
df_coh_group_mean = df_coh_group

#### min, max, std

In [None]:
# Rebuild the filtered word-level frame from the source data.
df_coh_word = df_word.loc[(df_word.is_speech_pause == 0) & (df_word.is_partial == 0) & (df_word.is_punctuation == 0) & (df_word.is_stopword == 0)]
# "time" is kept because the next cell filters on df_coh_word.time; without it
# that filter fails with AttributeError.
# NOTE(review): assumes the word-level source has a "time" column - confirm.
df_coh_word = df_coh_word[["uid", "time", "task", "word_lower", "sentence_id", "seg_id", "content", "sp.tokenized", "sp.lemma", "gpt3_embed"]]
df_coh_word = df_coh_word.reset_index()
df_coh_word.head()

In [None]:
df_coh_word = df_coh_word.loc[(df_coh_word.time == 'BL')]
len(df_coh_word.uid.unique())

In [None]:
# if you load csv with embeddings, instead of use source dataframe, you need to
# convert columns type from list to numpy.array,
def convert(item):
    """Turn one stored embedding into a 1-D float array.

    Accepts the text form "[v1, v2, ...]" as read from a CSV, an in-memory
    list/tuple/array, or a missing value. Missing values (None, NaN, 0-d
    arrays) become an empty array, so callers can skip them by checking
    `shape[0] > 0`.
    """
    if isinstance(item, np.ndarray):
        # a 0-d array holds a scalar placeholder, not an embedding
        return item.astype(float) if item.ndim > 0 else np.array([])
    if isinstance(item, (list, tuple)):
        return np.asarray(item, dtype=float)
    if item is None or (isinstance(item, float) and np.isnan(item)):
        # previously "nan" was fed to np.fromstring, which only yielded an
        # empty array through its deprecated handling of unparsable text
        return np.array([])
    text = str(item).strip()[1:-1]  # drop surrounding spaces, then `[ ]`
    return np.fromstring(text, sep=',')  # convert string to `numpy.array`
df_coh_word['gpt3_embed'] = df_coh_word['gpt3_embed'].apply(convert)

In [None]:
df_coh_word['gpt3_embed'][9]

In [None]:
print("Array Dimension = ",len(df_coh_word['gpt3_embed'][22].shape))

In [None]:
df_coh_word.head()

In [None]:
# MIN
grouping = ["uid", "task"]

# One result dict per group. The frame is built once at the end because
# DataFrame.append was removed in pandas 2.0.
group_rows = []

for idx, uid_df in df_coh_word.groupby(grouping):
  # the group key is a tuple for list groupers; normalise a scalar key too
  group_key = idx if isinstance(idx, tuple) else (idx,)
  embed_results = dict(zip(grouping, group_key))

  # loop through all embeddings
  for embed in embeddings:
    turn_coherences_min = []

    # create sentence embeddings
    for _, sent_df in uid_df.groupby("sentence_id"):
      # Keep only non-empty vectors. The old check compared a type with the
      # string 'float' (always unequal) and relied on AttributeError to skip
      # missing values; it also appended to a tf-idf list this cell never
      # creates and never uses.
      embedd_array = [
        vec for vec in sent_df[embed]
        if isinstance(vec, np.ndarray) and vec.ndim > 0 and vec.shape[0] > 0
      ]

      # no embeddings for sentence
      if not embedd_array:
        continue

      # element-wise minimum over the sentence's word vectors
      turn_coherences_min.append(np.min(np.array(embedd_array).astype(float), 0))

    # end loop through sentences
    # adjacent-sentence cosine similarity needs at least two sentence vectors
    if len(turn_coherences_min) > 1:
      min_coh = calc_response_cosine_similarity(np.stack(turn_coherences_min, axis=0))
    else:
      min_coh = np.nan

    embed_results["min_" + embed] = min_coh

  group_rows.append(embed_results)

# write final results (grouping columns first, then one score per embedding)
df_coh_group = pd.DataFrame(group_rows, columns=grouping + ["min_" + name for name in embeddings])

In [None]:
embedd_array

In [None]:
df_coh_group_min = df_coh_group
df_coh_group_min.head()

In [None]:
# MAX

grouping = ["uid", "task"]

# One result dict per group. The frame is built once at the end because
# DataFrame.append was removed in pandas 2.0.
group_rows = []

for idx, uid_df in df_coh_word.groupby(grouping):
  # the group key is a tuple for list groupers; normalise a scalar key too
  group_key = idx if isinstance(idx, tuple) else (idx,)
  embed_results = dict(zip(grouping, group_key))

  # loop through all embeddings
  for embed in embeddings:
    turn_coherences_max = []

    # create sentence embeddings
    for _, sent_df in uid_df.groupby("sentence_id"):
      # Keep only non-empty vectors. The old check compared a type with the
      # string 'float' (always unequal) and relied on AttributeError to skip
      # missing values.
      embedd_array = [
        vec for vec in sent_df[embed]
        if isinstance(vec, np.ndarray) and vec.ndim > 0 and vec.shape[0] > 0
      ]

      # no embeddings for sentence
      if not embedd_array:
        continue

      # element-wise maximum over the sentence's word vectors
      turn_coherences_max.append(np.max(np.array(embedd_array).astype(float), 0))

    # end loop through sentences
    # adjacent-sentence cosine similarity needs at least two sentence vectors
    if len(turn_coherences_max) > 1:
      max_coh = calc_response_cosine_similarity(np.stack(turn_coherences_max, axis=0))
    else:
      max_coh = np.nan

    embed_results["max_" + embed] = max_coh

  group_rows.append(embed_results)

# write final results (grouping columns first, then one score per embedding)
df_coh_group = pd.DataFrame(group_rows, columns=grouping + ["max_" + name for name in embeddings])

In [None]:
df_coh_group_max = df_coh_group
df_coh_group_max.head()

In [None]:
# STD

grouping = ["uid", "task"]

# One result dict per group. The frame is built once at the end because
# DataFrame.append was removed in pandas 2.0.
group_rows = []

for idx, uid_df in df_coh_word.groupby(grouping):
  # the group key is a tuple for list groupers; normalise a scalar key too
  group_key = idx if isinstance(idx, tuple) else (idx,)
  embed_results = dict(zip(grouping, group_key))

  # loop through all embeddings
  for embed in embeddings:
    turn_coherences_sd = []

    # create sentence embeddings
    for _, sent_df in uid_df.groupby("sentence_id"):
      # Keep only non-empty vectors. The old check compared a type with the
      # string 'float' (always unequal) and relied on AttributeError to skip
      # missing values.
      embedd_array = [
        vec for vec in sent_df[embed]
        if isinstance(vec, np.ndarray) and vec.ndim > 0 and vec.shape[0] > 0
      ]

      # no embeddings for sentence
      if not embedd_array:
        continue

      # element-wise standard deviation over the sentence's word vectors
      turn_coherences_sd.append(np.std(np.array(embedd_array).astype(float), 0))

    # end loop through sentences
    # adjacent-sentence cosine similarity needs at least two sentence vectors
    if len(turn_coherences_sd) > 1:
      std_coh = calc_response_cosine_similarity(np.stack(turn_coherences_sd, axis=0))
    else:
      std_coh = np.nan

    embed_results["sd_" + embed] = std_coh

  group_rows.append(embed_results)

# write final results (grouping columns first, then one score per embedding)
df_coh_group = pd.DataFrame(group_rows, columns=grouping + ["sd_" + name for name in embeddings])

In [None]:
df_coh_group_sd = df_coh_group
df_coh_group_sd.head()

#### merge with gt and save

In [None]:
# Left-join the max and sd score tables and the `gt` table onto the min
# scores, one after another, matching rows on (uid, task).
final = df_coh_group_min
for right_table in (df_coh_group_max, df_coh_group_sd, gt):
  final = final.merge(right_table, how='left', on=['uid', 'task'])
final.head()

In [None]:
len(final.uid.unique())

## Combine with baseline embeddings

In [None]:
baseline = pd.read_csv('sentence_clean.csv', index_col=0)
baseline = baseline[baseline.task.isin(['HowsItGoing', 'AboutYourself'])]
baseline.head()

### add min, max, std

In [None]:
# Load the word-level table, using the first CSV column as the index.
# NOTE(review): an earlier cell reads 'word_level_deident.csv' while this one
# reads 'word_level_deindent.csv' - confirm whether these are two different
# files or one of the names is a typo.
source = pd.read_csv('word_level_deindent.csv', index_col=0)
source.head()

In [None]:
len(source.uid.unique())

In [None]:
baseline_emb = source

In [None]:
# Filter the word-level dataframe: keep rows whose four exclusion flags are
# all 0, keep the columns needed downstream, then restrict to subject speech
# at time 'BL' for the two listed tasks.
baseline_emb_word = baseline_emb
baseline_flag_columns = ["is_speech_pause", "is_partial", "is_punctuation", "is_stopword"]
baseline_coh_word = baseline_emb_word.loc[(baseline_emb_word[baseline_flag_columns] == 0).all(axis=1)]
baseline_coh_word = baseline_coh_word[[
  "uid", "grid", "time", "speaker", "task", "word_lower", "sentence_id", "seg_id",
  "content", "sp.tokenized", "sp.lemma", "lsa_embed","glove_embed", "w2v_embed",
]]
baseline_coh_word = baseline_coh_word[
  (baseline_coh_word.speaker == 'Subject')
  & (baseline_coh_word.time == 'BL')
  & baseline_coh_word.task.isin(['AboutYourself', 'HowsItGoing'])
]

baseline_coh_word = baseline_coh_word.reset_index()
baseline_coh_word.head()

In [None]:
len(baseline_coh_word.uid.unique())

In [None]:
baseline_coh_word['lsa_embed'].head()

In [None]:
baseline_coh_word['glove_embed'].head()

In [None]:
baseline_coh_word['w2v_embed'].head()

In [None]:
df_coh_word = baseline_coh_word

In [None]:
def convert_lsa(item):
    """Turn one stored LSA embedding into a 1-D float array.

    Accepts the text form "['v1' 'v2' ...]" (values optionally quoted,
    whitespace separated, possibly wrapped over several lines), an in-memory
    list/tuple/array, or a missing value. Missing values (None, NaN, 0-d
    arrays) become an empty array.
    """
    if isinstance(item, np.ndarray):
        # a 0-d array holds a scalar placeholder, not an embedding
        return item.astype(float) if item.ndim > 0 else np.array([])
    if isinstance(item, (list, tuple)):
        return np.asarray(item, dtype=float)
    if item is None or (isinstance(item, float) and np.isnan(item)):
        return np.array([])
    text = str(item).strip()[1:-1]  # drop surrounding spaces, then `[ ]`
    # Drop the quotes and collapse every run of whitespace (including line
    # breaks) to one space. Stripping quotes token by token missed a closing
    # quote that was followed by a newline.
    text = ' '.join(text.replace("'", " ").split())
    return np.fromstring(text, sep=' ')  # convert string to `numpy.array`

df_coh_word['lsa_embed'] = df_coh_word['lsa_embed'].apply(convert_lsa)

In [None]:
df_coh_word['lsa_embed'].head()

In [None]:
def convert_glove_w2v(item):
    """Turn one stored GloVe/word2vec embedding into a 1-D float array.

    Accepts the text form "[v1 v2 ...]" (whitespace separated, possibly
    wrapped over several lines), an in-memory list/tuple/array, or a missing
    value. Missing values (None, NaN, 0-d arrays) become an empty array.
    """
    if isinstance(item, np.ndarray):
        # a 0-d array holds a scalar placeholder, not an embedding
        return item.astype(float) if item.ndim > 0 else np.array([])
    if isinstance(item, (list, tuple)):
        return np.asarray(item, dtype=float)
    if item is None or (isinstance(item, float) and np.isnan(item)):
        # previously "nan" was fed to np.fromstring, which only yielded an
        # empty array through its deprecated handling of unparsable text
        return np.array([])
    text = str(item).strip()[1:-1]  # drop surrounding spaces, then `[ ]`
    text = ' '.join(text.split())   # collapse line breaks / repeated spaces
    return np.fromstring(text, sep=' ')  # convert string to `numpy.array`

df_coh_word['glove_embed'] = df_coh_word['glove_embed'].apply(convert_glove_w2v)
df_coh_word['w2v_embed'] = df_coh_word['w2v_embed'].apply(convert_glove_w2v) 

In [None]:
def mean_embedding_of_sentence(sentence_embeddings):
  """Average a stack of embeddings along axis 0.

  Prints the input shape, then returns the axis-0 mean of the values cast to
  float. Returns `np.nan` when the input has no rows.
  """
  print(sentence_embeddings.shape)
  if sentence_embeddings.shape[0] > 0:
    # `np.float` was removed in NumPy 1.24; the builtin float is the same dtype.
    return np.average(sentence_embeddings.astype(float), 0)
  else:
    # `np.NaN` was removed in NumPy 2.0; `np.nan` works on every version.
    return np.nan

def calc_response_cosine_similarity(sentence_embeddings):
  """Mean cosine similarity between each row and the row that follows it.

  `sentence_embeddings` is a 2-D array with one embedding per row. Returns
  `np.nan` when there are fewer than two rows, because there is no adjacent
  pair to compare (the previous version divided by zero in that case).
  """
  vectors = np.asarray(sentence_embeddings, dtype=float)
  n_pairs = vectors.shape[0] - 1
  if n_pairs < 1:
    return np.nan

  # Only consecutive pairs are needed, so normalise the rows once and take
  # row-wise dot products instead of building the full n x n matrix.
  norms = np.linalg.norm(vectors, axis=1)
  norms[norms == 0] = 1.0  # an all-zero row gives similarity 0, not NaN
  unit = vectors / norms[:, np.newaxis]
  adjacent_sims = np.einsum('ij,ij->i', unit[:-1], unit[1:])
  return adjacent_sims.sum() / n_pairs

document_group = "uid"
term_col = "word_lower"

def get_idf_stats(r, N):
  """Summarise the documents present in the frame `r`.

  Returns a Series holding the distinct `document_group` values
  (`doc_list`), how many there are (`doc_count`), and `N / doc_count`
  (`idf`).
  """
  documents = r[document_group].unique()
  n_documents = len(documents)
  stats = {
    'doc_list': documents,
    'doc_count': n_documents,
    "idf": N / n_documents,
  }
  return pd.Series(stats, index=['doc_list', 'doc_count', "idf"])

# number of distinct documents
N = len(df_coh_word[document_group].unique())

# document frequency
word_idf = df_coh_word.groupby([term_col]).apply(lambda x: get_idf_stats(x, N))
word_idf

# term frequency per document: occurrences of each word divided by the total
# word count of its document. `transform('sum')` keeps the
# (document, term) index unchanged on every pandas version; the previous
# `groupby(level=0).apply(...)` gains an extra index level under
# pandas >= 2.0, which breaks the `word_tf.loc[document, term]` lookup.
word_counts = df_coh_word.groupby([document_group, term_col]).agg({term_col: 'count'})
word_tf = (word_counts / word_counts.groupby(level=0).transform('sum')).rename(columns={term_col: "tf"})
word_tf

# tf(t, d) * log(idf)
def get_td_idf(term, document):
  """Look up the term's tf in `word_tf` and idf in `word_idf`; return tf * log(idf)."""
  term_frequency = word_tf.loc[document, term]["tf"]
  log_idf = np.log(word_idf.loc[term]["idf"])
  return term_frequency * log_idf

# embedding columns processed by the aggregation loops
embeddings = ['lsa_embed', 'glove_embed', 'w2v_embed']

#### min

In [None]:
grouping = ["uid", "task"]

# One result dict per group. The frame is built once at the end because
# DataFrame.append was removed in pandas 2.0.
group_rows = []

for idx, uid_df in df_coh_word.groupby(grouping):
  # the group key is a tuple for list groupers; normalise a scalar key too
  group_key = idx if isinstance(idx, tuple) else (idx,)
  embed_results = dict(zip(grouping, group_key))

  # loop through all embeddings
  for embed in embeddings:
    turn_coherences_min = []

    # create sentence embeddings
    for _, sent_df in uid_df.groupby("sentence_id"):
      # Keep only non-empty vectors. The old check compared a type with the
      # string 'float' (always unequal) and relied on AttributeError to skip
      # missing values; it also appended to a tf-idf list this cell never
      # creates and never uses.
      embedd_array = [
        vec for vec in sent_df[embed]
        if isinstance(vec, np.ndarray) and vec.ndim > 0 and vec.shape[0] > 0
      ]

      # no embeddings for sentence
      if not embedd_array:
        continue

      # element-wise minimum over the sentence's word vectors
      turn_coherences_min.append(np.min(np.array(embedd_array).astype(float), 0))

    # end loop through sentences
    # adjacent-sentence cosine similarity needs at least two sentence vectors
    if len(turn_coherences_min) > 1:
      min_coh = calc_response_cosine_similarity(np.stack(turn_coherences_min, axis=0))
    else:
      min_coh = np.nan

    embed_results["min_" + embed] = min_coh

  group_rows.append(embed_results)

# write final results (grouping columns first, then one score per embedding)
df_coh_group = pd.DataFrame(group_rows, columns=grouping + ["min_" + name for name in embeddings])

In [None]:
baseline_df_coh_group_min = df_coh_group
baseline_df_coh_group_min.head()

#### max

In [None]:
grouping = ["uid", "task"]

# One result dict per group. The frame is built once at the end because
# DataFrame.append was removed in pandas 2.0.
group_rows = []

for idx, uid_df in df_coh_word.groupby(grouping):
  # the group key is a tuple for list groupers; normalise a scalar key too
  group_key = idx if isinstance(idx, tuple) else (idx,)
  embed_results = dict(zip(grouping, group_key))

  # loop through all embeddings
  for embed in embeddings:
    turn_coherences_max = []

    # create sentence embeddings
    for _, sent_df in uid_df.groupby("sentence_id"):
      # Keep only non-empty vectors. The old check compared a type with the
      # string 'float' (always unequal) and relied on AttributeError to skip
      # missing values.
      embedd_array = [
        vec for vec in sent_df[embed]
        if isinstance(vec, np.ndarray) and vec.ndim > 0 and vec.shape[0] > 0
      ]

      # no embeddings for sentence
      if not embedd_array:
        continue

      # element-wise maximum over the sentence's word vectors
      turn_coherences_max.append(np.max(np.array(embedd_array).astype(float), 0))

    # end loop through sentences
    # adjacent-sentence cosine similarity needs at least two sentence vectors
    if len(turn_coherences_max) > 1:
      max_coh = calc_response_cosine_similarity(np.stack(turn_coherences_max, axis=0))
    else:
      max_coh = np.nan

    embed_results["max_" + embed] = max_coh

  group_rows.append(embed_results)

# write final results (grouping columns first, then one score per embedding)
df_coh_group = pd.DataFrame(group_rows, columns=grouping + ["max_" + name for name in embeddings])

In [None]:
baseline_df_coh_group_max = df_coh_group
baseline_df_coh_group_max.head()

#### std

In [None]:
# STD

grouping = ["uid", "task"]

# One result dict per group. The frame is built once at the end because
# DataFrame.append was removed in pandas 2.0.
group_rows = []

for idx, uid_df in df_coh_word.groupby(grouping):
  # the group key is a tuple for list groupers; normalise a scalar key too
  group_key = idx if isinstance(idx, tuple) else (idx,)
  embed_results = dict(zip(grouping, group_key))

  # loop through all embeddings
  for embed in embeddings:
    turn_coherences_sd = []

    # create sentence embeddings
    for _, sent_df in uid_df.groupby("sentence_id"):
      # Keep only non-empty vectors. The old check compared a type with the
      # string 'float' (always unequal) and relied on AttributeError to skip
      # missing values.
      embedd_array = [
        vec for vec in sent_df[embed]
        if isinstance(vec, np.ndarray) and vec.ndim > 0 and vec.shape[0] > 0
      ]

      # no embeddings for sentence
      if not embedd_array:
        continue

      # element-wise standard deviation over the sentence's word vectors
      turn_coherences_sd.append(np.std(np.array(embedd_array).astype(float), 0))

    # end loop through sentences
    # adjacent-sentence cosine similarity needs at least two sentence vectors
    if len(turn_coherences_sd) > 1:
      std_coh = calc_response_cosine_similarity(np.stack(turn_coherences_sd, axis=0))
    else:
      std_coh = np.nan

    embed_results["sd_" + embed] = std_coh

  group_rows.append(embed_results)

# write final results (grouping columns first, then one score per embedding)
df_coh_group = pd.DataFrame(group_rows, columns=grouping + ["sd_" + name for name in embeddings])

In [None]:
baseline_df_coh_group_sd = df_coh_group
baseline_df_coh_group_sd.head()

### new merge and save 

In [None]:
final.head()

In [None]:
len(final.uid.unique())

In [None]:
# Left-join the three baseline score tables onto `final`, one after another,
# matching rows on (uid, task).
finalfinal = final
for baseline_table in (baseline_df_coh_group_min, baseline_df_coh_group_max, baseline_df_coh_group_sd):
  finalfinal = finalfinal.merge(baseline_table, how='left', on=['uid', 'task'])
finalfinal.head()

In [None]:
len(finalfinal.uid.unique())

## Ground Truth clinical

In [None]:
clinical = pd.read_csv('features_table.csv', index_col=0)
clinical.head()

In [None]:
len(gt.uid.unique())

# Sentence similarity get_emb(sentence pair)

In [None]:
#import libraries
import numpy as np
import pandas as pd
import re
import string
import os
import math
import torch
import csv
import torch
import scipy
from torch import tensor

In [None]:
!pip install scipy

### run through clean sentence

In [None]:
df = pd.read_csv('sentence_clean.csv', index_col=0)
df = df[df.task.isin(['HowsItGoing', 'AboutYourself'])]
df = df.rename(columns={'roberta_similarity': 'gpt_similarity'})
df.head()

In [None]:
df.task.unique()

In [None]:
# `import scipy` alone does not guarantee that scipy.spatial is loaded
from scipy.spatial.distance import cosine as cosine_distance

buglst = {}           # rows without a following row: identifier -> sentence text
embedding_cache = {}  # sentence text -> embedding, so each sentence is requested once

# NOTE(review): `i + 1` is an index *label*, not a position. After the task
# filter the labelled "next" row may be absent or may belong to another
# speaker or participant - confirm this pairing is the intended one.
for i, r in df.iterrows():
  if r['speaker'] == 'Interviewer':
    continue

  # no row labelled i + 1: record the sentence and move on
  if (i + 1) not in df.index:
    buglst[r.get('Unnamed: 0.1', i)] = r['content']
    continue

  sentence1 = r['content']
  sentence2 = df.at[i + 1, 'content']

  # encode sentences to get their embeddings
  for sentence in (sentence1, sentence2):
    if sentence not in embedding_cache:
      embedding_cache[sentence] = get_embedding(sentence, engine = 'text-similarity-babbage-001')
  embedding1 = embedding_cache[sentence1]
  embedding2 = embedding_cache[sentence2]

  # compute similarity scores of two embeddings
  similarity_score = 1 - cosine_distance(embedding1, embedding2)
  print("Sentence 1:", sentence1)
  print("Sentence 2:", sentence2)
  print("Similarity score:", similarity_score)

  # .at writes into df itself; the chained form df[col][i+1] = ... may write
  # to a temporary copy and is rejected under pandas copy-on-write
  df.at[i + 1, 'gpt_similarity'] = similarity_score

### Merge word, sentence, and clinical

In [None]:
word = pd.read_csv('data_analysis.csv', index_col=0)
word = word[["grid", "task",  "group",
"tlc_3f_psy", "tlc_3f_nonsp", "tlc_3f_negative", 
'mean_gpt3_embed', 'mean_lsa_embed', 'mean_glove_embed', 'mean_w2v_embed',
'min_gpt3_embed', 'min_lsa_embed', 'min_glove_embed', 'min_w2v_embed', 
'max_gpt3_embed', 'max_lsa_embed', 'max_glove_embed', 'max_w2v_embed', 
'sd_gpt3_embed', 'sd_lsa_embed', 'sd_glove_embed', 'sd_w2v_embed'
]]
word.head()

In [None]:
len(word.grid.unique())

In [None]:
# Load the sentence-level similarity table, keep the identifier, clinical and
# similarity columns listed below, then restrict to the two interview tasks.
# NOTE(review): the path below is an absolute path on one user's machine; the
# notebook will not run elsewhere until it points at a shared location.
sentence = pd.read_csv('/Users/yancong/Desktop/5 zili research/clinicalNLP/gpt3_embeddings/stats/remora_sentence_clean_similarity_gt.csv', index_col=0)
sentence = sentence[["grid", "task",  "group",
"tlc_3f_psy", "tlc_3f_nonsp", "tlc_3f_negative", 
'similarity_mean_roberta', 'similarity_mean_t5', 'gpt_similarity_mean', 
'roberta_similarity_min', 't5small_similarity_min', 'gpt_similarity_min', 
'roberta_similarity_max', 't5small_similarity_max', 'gpt_similarity_max', 
'roberta_similarity_std', 't5small_similarity_std', 'gpt_similarity_std'
]]
sentence = sentence[sentence.task.isin(['AboutYourself', 'HowsItGoing'])]
sentence.head()

In [None]:
len(sentence.grid.unique())

In [None]:
fdf = pd.merge(word, sentence, on=['grid', 'group', 'task', "tlc_3f_psy", "tlc_3f_nonsp", "tlc_3f_negative", ], how='left')
fdf.head()

In [None]:
fdf.columns

In [None]:
len(fdf.grid.unique())

In [None]:
# Write the merged word + sentence table to disk.
# NOTE(review): 'data_analysis.csv' is also the file the `word` frame was read
# from earlier in this notebook, so this overwrites that input - confirm that
# is intended.
fdf.to_csv('data_analysis.csv')