# Mount google drive

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive/.
from google.colab import drive
drive.mount('/content/drive/')

In [None]:
#import libraries
import numpy as np
import pandas as pd
import re
import string
import os
import math
import torch
import csv
import torch
import tensorflow as tf

from torch import tensor

In [None]:
#set variables for folder names:
# NOTE(review): 'here' looks like a placeholder - replace with the real directories before running.
# Cells below build file paths as `<dir> + 'file.csv'`, so each directory should end with '/'.
datain = 'here'  # directory that sentence_clean.csv is read from
dataout = 'here' #make sure dir starts with /
clinical = 'here'  # directory prefix for data_analysis.csv; NOTE(review): this name is later rebound to a DataFrame

# Check GPU availability

In [None]:
# Report whether a CUDA GPU is visible and pick the torch device accordingly.
gpu_ready = torch.cuda.is_available()
print(torch.cuda.device_count())

# Prefer the GPU when one is available; otherwise fall back to the CPU.
if gpu_ready:
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Using device:', device)
print()

# Extra diagnostics that only apply when running on a CUDA device.
if device.type == 'cuda':
    bytes_per_gb = 1024**3
    allocated_gb = round(torch.cuda.memory_allocated(0) / bytes_per_gb, 1)
    reserved_gb = round(torch.cuda.memory_reserved(0) / bytes_per_gb, 1)
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', allocated_gb, 'GB')
    print('Cached:   ', reserved_gb, 'GB')

# Launch transformers

In [None]:
!pip install transformers

In [None]:
!pip install sentence-transformers

In [None]:
# Load the pretrained 'stsb-roberta-large' sentence-embedding model; `model` and `util`
# are used by the similarity loop further down.
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('stsb-roberta-large')
# Display the class of the loaded model as the cell output.
type(model) 

# Adjacent sentence similarity

In [None]:
# Load the cleaned sentence-level table.
# os.path.join builds a correct path whether or not `datain` ends with a path
# separator (plain string concatenation silently produced e.g. 'dirsentence_clean.csv').
df_sent = pd.read_csv(os.path.join(datain, 'sentence_clean.csv'))
df_sent.head(5)

In [None]:
# Remove punctuation that is separated from the preceding word by whitespace,
# e.g. "hello . world" -> "hello world".
_SPACED_PUNCT = re.compile(r'\s([?.!,"](?:\s|$))')


def _strip_spaced_punct(text):
  """Replace `whitespace + one of ?.!," + (whitespace or end of string)` with one space.

  Non-string values (e.g. NaN from an empty CSV cell) are returned unchanged
  instead of crashing the regex substitution.
  """
  if not isinstance(text, str):
    return text
  return _SPACED_PUNCT.sub(r' ', text)


# Assign the whole column at once. The previous row-by-row write through chained
# indexing (df_sent['content'][i] = ...) can end up modifying a temporary copy.
df_sent['content'] = df_sent['content'].map(_strip_spaced_punct)

In [None]:
# Score how similar each non-interviewer sentence is to the sentence on the next row.
# The score is written to the FOLLOWING row (i+1) in the 'roberta_similarity' column.
# NOTE(review): the next row is taken as-is; nothing here checks that it belongs to the
# same participant/task or who the speaker is - confirm that is intended.

# Create the output column up front. Without it the first write raises KeyError, which
# the old `except KeyError` silently swallowed for every row.
if 'roberta_similarity' not in df_sent.columns:
  df_sent['roberta_similarity'] = np.nan

buglst = {}  # rows that could not be scored because no following row exists
for i, r in df_sent.iterrows():
  if r['speaker'] == 'Interviewer':
    continue

  nxt = i + 1  # relies on the default 0..n-1 integer index produced by read_csv
  if nxt not in df_sent.index:
    # Explicit bounds check replaces the former try/except KeyError.
    buglst[r.get('Unnamed: 0', i)] = r['content']
    continue

  sentence1 = r['content']
  sentence2 = df_sent.at[nxt, 'content']

  # encode sentences to get their embeddings
  embedding1 = model.encode(sentence1, convert_to_tensor=True)
  embedding2 = model.encode(sentence2, convert_to_tensor=True)
  # compute similarity scores of two embeddings
  cosine_scores = util.pytorch_cos_sim(embedding1, embedding2)
  print("Sentence 1:", sentence1)
  print("Sentence 2:", sentence2)
  print("Similarity score:", cosine_scores.item())

  # .at writes straight into df_sent (chained indexing may write to a copy).
  df_sent.at[nxt, 'roberta_similarity'] = cosine_scores.item()

# Negative scores are expected: cosine similarity ranges over [-1, 1], and a value
# below 0 means the two embeddings point in broadly opposite directions.

## add min, max, std

In [None]:
# NOTE(review): these duplicate the imports in the first code cell.
import numpy as np
import pandas as pd

# Reads sentence_clean.csv from the current working directory (not from `datain`),
# using the first CSV column as the index.
# NOTE(review): later cells select a 'roberta_similarity' column from this table -
# confirm the scored sentences were saved to this file.
roberta = pd.read_csv('sentence_clean.csv', index_col=0)
roberta.head()

In [None]:
# For every run of consecutive rows that share the same 'task', store the run length on
# the LAST row of that run; all other rows keep the '' placeholder.
#
# Computed with shifts instead of a row loop. The old loop looked up row i+1 inside a
# bare `except`, so the final run never received its count and genuine errors were hidden.
# NOTE(review): runs are split on 'task' only - adjacent rows from different participants
# with the same task count as one run; confirm that is intended.
task = roberta['task']
run_id = task.ne(task.shift()).cumsum()         # increments whenever the task changes
run_length = run_id.map(run_id.value_counts())  # size of the run each row belongs to
is_run_end = task.ne(task.shift(-1))            # True on the last row of every run

# Object dtype keeps integer counts next to the '' placeholder, as before.
roberta['sentence_count'] = run_length.astype(object).where(is_run_end, '')

buglst = {}  # kept for compatibility; the vectorised version has no per-row failures

In [None]:
# Keep only the columns used by the filtering and aggregation cells below.
keep_cols = ['uid', 'grid', 'timepoint', 'speaker', 'task', 'roberta_similarity', 'sentence_count']
roberta = roberta[keep_cols]

In [None]:
# Display the column labels of `roberta` as the cell output.
roberta.columns

In [None]:
# Keep rows whose timepoint is 'BL' and whose task is one of the two listed tasks.
is_bl = roberta['timepoint'] == 'BL'
is_selected_task = roberta['task'].isin(['AboutYourself', 'HowsItGoing'])
roberta = roberta[is_bl & is_selected_task]

In [None]:
# Renumber the rows 0..n-1 after filtering, then show how many distinct uids remain
# (a missing uid counts as one value, matching len(unique())).
roberta = roberta.reset_index(drop=True)
roberta['uid'].nunique(dropna=False)

In [None]:
# Reads data_analysis.csv from the current working directory, using the first CSV
# column as the index.
# NOTE(review): this rebinds `clinical`, which the configuration cell defines as a folder
# path string; from here on `clinical` is a DataFrame.
clinical = pd.read_csv('data_analysis.csv', index_col=0)
clinical.head()

In [None]:
# Make sentence_count numeric; values that cannot be parsed become NaN.
roberta['sentence_count'] = pd.to_numeric(roberta['sentence_count'], errors='coerce')

# Per-`grid` minimum, maximum and standard deviation of the similarity scores.
# The result has two-level column labels: ('roberta_similarity', 'min' | 'max' | 'std').
summary_stats = ['min', 'max', 'std']
similarity_sentence_clean_gt = (
    roberta
    .groupby(['grid'])
    .agg({"roberta_similarity": summary_stats})
    .sort_values(by='grid', ascending=True)
)
similarity_sentence_clean_gt.head()

In [None]:
# Left-join the per-grid similarity statistics with the clinical table on 'grid'.
#
# The statistics have two-level column labels while `clinical` has single-level ones.
# Recent pandas versions raise MergeError when the two sides have a different number
# of column levels, so the labels are flattened to '<column>_<stat>' first. This is
# done on a copy, leaving `similarity_sentence_clean_gt` itself unchanged.
similarity_flat = similarity_sentence_clean_gt.copy()
if isinstance(similarity_flat.columns, pd.MultiIndex):
    similarity_flat.columns = [
        '_'.join(str(part) for part in label) for label in similarity_flat.columns
    ]

final = pd.merge(similarity_flat, clinical, on='grid', how='left')
final.head()

In [None]:
# Give the three similarity statistics flat, consistent names
# (roberta_similarity_min / _max / _sd).
#
# Labels produced by groupby().agg() are tuples, not the string form of a tuple, so
# string-only keys such as "('roberta_similarity', 'min')" do not match them. Every
# spelling the label can take is mapped here; keys that are absent are ignored by rename.
# The target for 'min' previously read 'roberat_similarity_min' (typo) - corrected.
stat_suffix = {'min': 'min', 'max': 'max', 'std': 'sd'}
rename_map = {}
for stat, suffix in stat_suffix.items():
    target = 'roberta_similarity_' + suffix
    rename_map[('roberta_similarity', stat)] = target       # tuple label
    rename_map[str(('roberta_similarity', stat))] = target  # string form, e.g. after a CSV round-trip
    rename_map['roberta_similarity_' + stat] = target       # underscore-joined label

final = final.rename(columns=rename_map)

## add mean

In [None]:
# `clinical` is defined as a folder path in the configuration cell but is rebound to the
# loaded clinical DataFrame in an earlier cell. `DataFrame + 'data_analysis.csv'` is not
# a file path, so both cases are handled explicitly.
if isinstance(clinical, pd.DataFrame):
    # NOTE(review): that table was loaded with index_col=0, unlike the read below.
    df_clinical = clinical
else:
    df_clinical = pd.read_csv(os.path.join(clinical, 'data_analysis.csv'))

# NOTE(review): no cell visible in this notebook creates 'sentence_count' or
# 'roberta_similarity_sum' on df_sent - confirm where these columns come from.
df_sent['sentence_count'] = pd.to_numeric(df_sent['sentence_count'], errors='coerce')

# Per (grid, task): total similarity and total sentence count, ignoring NaN.
similarity_sentence_clean_gt = df_sent.groupby(['grid', 'task']).agg({
    "roberta_similarity_sum": lambda x: x.sum(skipna=True),
    'sentence_count': lambda x: x.sum(skipna=True)
})

# Mean similarity = summed similarity / (sentence_count / 2).
# NOTE(review): the /2 denominator is kept from the original - confirm it matches the
# number of scored sentence pairs. A zero count yields inf/NaN here.
similarity_sentence_clean_gt['similarity_mean'] = (
    similarity_sentence_clean_gt['roberta_similarity_sum']
    / (similarity_sentence_clean_gt['sentence_count'] / 2)
)

# Reassign instead of inplace=True so re-running the cell is side-effect free.
similarity_sentence_clean_gt = similarity_sentence_clean_gt.sort_values(by='grid', ascending=True)
