# data-driven approach

In [1]:
# Requires the sentence-transformers package:
# !pip install sentence-transformers
from sentence_transformers import SentenceTransformer, util
# Initialize the sentence-embedding model from a path relative to the notebook.
# Alternative kept from the original cell: SentenceTransformer('all-MiniLM-L6-v2'),
# which passes the model name instead of a local path.
NLPmodel = SentenceTransformer('./sentence-transformers_all-MiniLM-L6-v2')

In [2]:
import ast

import numpy as np
import pandas as pd

from nlp_utils import *

In [3]:
# Two short example sentences of different lengths, used to try out the
# word-embedding extraction defined in the next cell.
sentences = [
    'This is how it looks.',
    'let me see',
]

In [30]:
import os, sys
import numpy as np
import pandas as pd
import regex as re
import torch

def load_mlm_model(model_name, cache_dir=None):
	'''
	Load a HuggingFace tokenizer/model pair and put the model in eval mode.

	Parameters
	----------
	model_name : str
		Model identifier or local path, forwarded unchanged to
		AutoTokenizer.from_pretrained and AutoModel.from_pretrained.
	cache_dir : str, optional
		Directory in which downloaded model files are cached. When given it
		is passed explicitly to both from_pretrained calls.

	Returns
	-------
	(tokenizer, model)
		The fast tokenizer and the model, with model.eval() already called.
	'''
	# Kept for backward compatibility with code that reads this variable.
	# On its own it is not reliable: it only has an effect when set before
	# transformers is first loaded, and in this notebook sentence_transformers
	# is imported in an earlier cell.
	if cache_dir:
		os.environ['TRANSFORMERS_CACHE'] = cache_dir

	from transformers import AutoTokenizer, AutoModel

	# Passing cache_dir directly works regardless of import order.
	# A falsy value (None / '') falls back to the library default location.
	explicit_cache_dir = cache_dir or None

	# Load tokenizer and model from the HuggingFace Hub or a local path
	tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True, cache_dir=explicit_cache_dir)
	model = AutoModel.from_pretrained(model_name, cache_dir=explicit_cache_dir)

	# inference only: switch the model to evaluation mode
	model.eval()

	return tokenizer, model

import re

def subwords_to_words(sentence, tokenizer):
	'''
	Pair every word of a sentence with its sub-word token ids and character span.

	Returns a list of (word, tokens, char_idxs) tuples, in order of appearance:
	  word      - the matched text
	  tokens    - tokenizer.encode(word, add_special_tokens=False)
	  char_idxs - (first, last) character index of the word in the sentence,
	              with the last index inclusive
	'''
	# Because the final alternative is \S, this pattern matches every run of
	# non-whitespace characters, so trailing punctuation stays attached to its
	# word (e.g. 'looks.').
	# An alternative pattern left in the original: r"[\w]+[''.-:]?[\w]*"
	regex_split_pattern = r'(\w|\.\w|\:\w|\'\w|\'\w|\-\w|\S)+'

	return [
		(
			match.group(0),
			tokenizer.encode(match.group(0), add_special_tokens=False),
			(match.start(), match.end()-1),
		)
		for match in re.finditer(regex_split_pattern, sentence)
	]

def extract_word_embeddings(sentences, tokenizer, model, verbose=True):
	'''
	Extract per-word embeddings from every hidden-state layer of a model.

	The sentences are tokenized and passed through the model as one padded
	batch. Each sentence is then split into words by subwords_to_words, and
	for every word the hidden states of the tokens covering it are summed,
	separately for each entry of the model's hidden_states output.

	Parameters
	----------
	sentences : str or list of str
		A single string is treated as a one-element list.
	tokenizer
		Called on the list of sentences; its output must provide
		char_to_token. It is also passed on to subwords_to_words.
	model
		Called as model(**encoded_inputs, output_hidden_states=True); its
		output must expose 'hidden_states'.
	verbose : bool, default True
		Print the same progress/debug information as before. Pass False to
		silence it.

	Returns
	-------
	list of torch.Tensor
		One tensor per sentence, of shape
		(n_words, n_hidden_states, hidden_dim). The tensors are not stacked
		because sentences may contain different numbers of words. A sentence
		without any words yields a tensor with n_words == 0. An empty input
		returns [].

	Raises
	------
	ValueError
		If a word cannot be mapped back to tokens, e.g. because the
		tokenizer truncated the sentence.
	AssertionError
		If the number of extracted words differs from len(sent.split()).
	'''

	if isinstance(sentences, str):
		sentences = [sentences]

	if not sentences:
		return []

	# no-op logger when verbose is off, so the call sites stay unconditional
	log = print if verbose else (lambda *args, **kwargs: None)

	# get the full sentences tokenized as one padded batch
	encoded_inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")

	# get the embeddings
	with torch.no_grad():
		model_output = model(**encoded_inputs, output_hidden_states=True)

	hidden_states = model_output['hidden_states']
	log("len(model_output['hidden_states']", len(hidden_states))

	all_embeddings = []

	# bring together the current sentence, its tokens, and its embeddings
	for i, sent in enumerate(sentences):
		# pair subwords into words for the current sentence
		subword_word_pairs = subwords_to_words(sent, tokenizer)
		log(sent, subword_word_pairs)

		embeddings = []

		for (word, tokens, char_span) in subword_word_pairs:
			# map the first and last character of the word to token indices
			start_token = encoded_inputs.char_to_token(batch_or_char_index=i, char_index=char_span[0])
			end_token = encoded_inputs.char_to_token(batch_or_char_index=i, char_index=char_span[-1])

			# char_to_token returns None when a character has no token.
			# A None start would silently slice from token 0, and a None
			# end would fail with an opaque TypeError, so stop here.
			if start_token is None or end_token is None:
				raise ValueError(
					f"could not map word {word!r} (characters {char_span}) of sentence {i} "
					"to tokens; the sentence may have been truncated by the tokenizer"
				)

			# sum the sub-word token vectors of this word, once per layer
			word_embed = torch.stack([layer[i, start_token:end_token+1, :].sum(0) for layer in hidden_states])
			log(word_embed.shape)
			embeddings.append(word_embed)

		log(len(embeddings))
		if embeddings:
			# (n_words, n_hidden_states, hidden_dim)
			embeddings = torch.stack(embeddings)
		else:
			# torch.stack rejects an empty list; return an empty tensor instead
			embeddings = hidden_states[0].new_zeros((0, len(hidden_states), hidden_states[0].shape[-1]))
		log(len(embeddings))

		# make sure the mapping happened correctly: one embedding per
		# whitespace-separated word
		expected_words = len(sent.split())
		if expected_words != embeddings.shape[0]:
			raise AssertionError(
				f"word/embedding mismatch for sentence {i}: got {len(subword_word_pairs)} words "
				f"{subword_word_pairs} and embeddings of shape {tuple(embeddings.shape)}, "
				f"expected {expected_words} words"
			)

		log(len(embeddings))
		all_embeddings.append(embeddings)

	return all_embeddings

In [31]:
# Load tokenizer + model from the same relative path used for NLPmodel above,
# then extract per-word, per-layer embeddings for the example sentences.
tokenizer, model = load_mlm_model('./sentence-transformers_all-MiniLM-L6-v2')
# all_embeddings is a list with one tensor per sentence
all_embeddings = extract_word_embeddings(sentences, tokenizer, model)

len(model_output['hidden_states'] 7
This is how it looks. [('This', [2023], (0, 3)), ('is', [2003], (5, 6)), ('how', [2129], (8, 10)), ('it', [2009], (12, 13)), ('looks.', [3504, 1012], (15, 20))]
torch.Size([7, 384])
torch.Size([7, 384])
torch.Size([7, 384])
torch.Size([7, 384])
torch.Size([7, 384])
5
5
5
let me see [('let', [2292], (0, 2)), ('me', [2033], (4, 5)), ('see', [2156], (7, 9))]
torch.Size([7, 384])
torch.Size([7, 384])
torch.Size([7, 384])
3
3
3


In [19]:
# Shape for the first sentence: (words, hidden-state layers, hidden size)
all_embeddings[0].shape

torch.Size([5, 7, 384])

In [12]:
# Shape for the second sentence: (words, hidden-state layers, hidden size)
all_embeddings[1].shape

torch.Size([3, 7, 384])

In [7]:
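# Run only if needed, i.e. when the cached verbs CSV read further below has to be regenerated.
# Builds a DataFrame of all WordNet verbs (without '-' or '_') and their sentence embeddings.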

# import nltk
# # nltk.download('wordnet')

# from nltk.corpus import wordnet as wn

# # Retrieve all English lemmas in WordNet
# lemmas = wn.all_lemma_names(pos=wn.VERB)#NOUN)

# # Convert lemmas to a list
# english_words = list(lemmas)

# print(english_words[:100]), len(english_words)

# english_words = [i for i in english_words if '-' not in i]
# english_words = [i for i in english_words if '_' not in i]
# english_words_embedding = NLPmodel.encode(english_words)

# print(english_words_embedding.shape)

# df_words = pd.DataFrame({'words':english_words, 'word_len':[len(i) for i in english_words], 'embedding':english_words_embedding.tolist()})
# df_words

In [8]:
# df_words.to_csv('../../data/text_responses/text_responses_all_verbs.csv')

In [16]:
# Load the cached verb table; the first CSV column is used as the index.
df_words = pd.read_csv('../../data/text_responses/text_responses_all_verbs.csv',index_col=[0])
# Preview the first rows
df_words.head()

Unnamed: 0,words,word_len,embedding
0,absent,6,"[0.019467102363705635, 0.05805996060371399, -0..."
1,abstract,8,"[-0.07030732184648514, 0.08017940074205399, 0...."
2,ace,3,"[-0.12659801542758942, 0.01382419653236866, -0..."
3,acuminate,9,"[-0.06811799108982086, 0.09903689473867416, -0..."
4,adulterate,10,"[0.03436809778213501, 0.04313870146870613, -0...."


In [17]:
# Pull the word list and the embedding column out of the verb table.
english_words = df_words['words'].values
# NOTE(review): after the CSV round-trip each embedding appears to be a
# string-encoded list (a later cell calls eval() on every entry) - confirm.
english_words_embedding = df_words['embedding'].values

In [10]:
# Experiment configuration: which parameter and which experiment folder to analyse.
param_name = 'subtlety'
expt_type = 'subtlety/without_cover_story/'
# Folder holding this experiment's text responses. expt_type already ends in
# '/', so paths built as f'{rootfile_loc}/...' contain a double slash.
rootfile_loc = f'../../data/text_responses/{expt_type}'

In [12]:
# Load the sorted trial-level responses; the first CSV column is used as the index.
df = pd.read_csv(f'{rootfile_loc}/data_triallevel_sorted.csv',index_col=[0])
# Preview the first rows
df.head()

Unnamed: 0,subID,stimset_rows,subtlety,responses,movie,trial_num
0,15000,0,0,grey dot following black dot,07cdb9d7-6577-428d-9498-8a7243527554,0
6,15001,1,0,A grey dot appeared to chase or follow a black...,53613c0d-b07e-4b22-b6ec-56dda26edc8f,6
0,15002,2,0,the black dot seemed to be running away from t...,19c65966-3d88-4851-afe4-ae34c498748c,0
2,15003,3,0,Black dot was following the grey dot,c188e383-94d7-475e-9da5-6d1f0a1184bb,2
3,15004,6,0,Black moved from right to left chasing the grey,49e5ac90-6e7f-4243-9552-8c4c10545895,3


In [13]:
# Lower-case every entry of the 'responses' column and display the result.
words_to_compare = [response.lower() for response in df['responses'].to_list()]
words_to_compare

['grey dot following black dot',
 'a grey dot appeared to chase or follow a black dot right and then up.',
 'the black dot seemed to be running away from the gray one',
 'black dot was following the grey dot',
 'black moved from right to left chasing the grey',
 'black dot was being chased by the grey one',
 'it seemed like the grey dot was trying to chase the black dot',
 'lighter dot is being pressured by dark one',
 'grey sot following black dot',
 'the black dot was following the grey dot in a beeline.',
 'black dot being chased by white dot',
 'again the black dot moved toward the grey dot which moved to get away from the black dot',
 'the grey dot approached the black dot and both curved in a downwards sweep to the bottom of the box',
 'the grey dot was chasing the black dot',
 'the grey dot was moving closer to the black dot, as they reached a distance, the dot did not move any closer but mimicked the moves of the other one ',
 'black chasseing dray dot',
 'the white dot is chas

In [29]:
# Words dropped from the ranked verb list (exact matches only).
excluded_words_list = [
    'dot',
    'dots',
    'grey',
    'gray',
    'black',
]

In [31]:
# Word embedding per subtlety level - comparison to the whole verbs list.
# For every subtlety level: embed all (lower-cased) responses, take the
# element-wise median across responses, and rank every verb by its cosine
# similarity to that median embedding.
mean_embedding = []  # NOTE(review): never filled in this cell; kept in case a later cell relies on the name

# Parse the reference embeddings once instead of once per level and per word.
# ast.literal_eval only accepts Python literals, unlike eval(), which would
# execute whatever the CSV contains. Entries that are already sequences
# (table built in memory rather than read from CSV) are used as they are.
# float32 matches the dtype torch gives to tensors built from Python floats.
english_words_matrix = np.array(
    [ast.literal_eval(emb) if isinstance(emb, str) else emb for emb in english_words_embedding],
    dtype=np.float32,
)

for subt in np.unique(df['subtlety']):
    print('subt:', subt)
    df_subt = df.loc[df['subtlety']==subt,:]
    words_to_compare = [s.lower() for s in df_subt['responses'].to_list()]

    # despite the name, this is the element-wise (nan-)median over responses
    mean_embedding_curr = np.nanmedian(NLPmodel.encode(words_to_compare), axis=0)

    # one batched call: (1, n_words) similarity matrix -> flat float64 array
    similarity_with_each_word = (
        util.pytorch_cos_sim(mean_embedding_curr, english_words_matrix)
        .flatten()
        .cpu()
        .numpy()
        .astype(np.float64)
    )

    df_temp = pd.DataFrame({'english_words':english_words, 'similarity_with_each_word':similarity_with_each_word})
    df_temp = df_temp[~df_temp['english_words'].isin(excluded_words_list)]
    df_temp = df_temp.reset_index()
    df_temp = df_temp.sort_values(by='similarity_with_each_word', ascending=False)

    # take top 10 words; select the column by name rather than by position
    print(f"most representative words: {df_temp['english_words'].iloc[:10].values}")

subt: 0
most representative words: ['blackball' 'blackleg' 'blacktop' 'focus' 'gaze' 'dusk' 'circle'
 'encircle' 'clatter' 'motion']
subt: 30
most representative words: ['blackball' 'blackleg' 'gaze' 'clatter' 'focus' 'motion' 'circle' 'dusk'
 'misalign' 'blacktop']
subt: 60
most representative words: ['blackball' 'blackleg' 'focus' 'gaze' 'blacktop' 'clatter' 'dusk' 'sight'
 'foreground' 'circle']
subt: 90
most representative words: ['misalign' 'motion' 'clatter' 'focus' 'blackball' 'gaze' 'foreground'
 'flicker' 'shift' 'circle']
subt: 120
most representative words: ['motion' 'misalign' 'clatter' 'focus' 'circle' 'shift' 'gaze' 'flicker'
 'encircle' 'foreground']
subt: 150
most representative words: ['clatter' 'motion' 'misalign' 'circle' 'circularise' 'scatter' 'encircle'
 'disorganize' 'circularize' 'smatter']
subt: 180
most representative words: ['clatter' 'motion' 'circle' 'circularize' 'circularise' 'misalign'
 'encircle' 'scatter' 'smatter' 'move']
