In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import sent_tokenize
import torch
from transformers import AutoTokenizer, AutoModel
import spacy
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
sp = spacy.load('en_core_web_sm')

In [2]:
hudsoncsv = pd.read_csv("RayHudsonSanitized.csv")

In [3]:
texts = hudsoncsv['punctuated transcripts']

In [4]:
tf_idf_df = pd.read_csv("hudson_word_freqs.csv")

In [5]:
tf_idf_df.head()

Unnamed: 0.1,Unnamed: 0,000,10,100,101,10th,11,110,12,128,...,zombie,zomboni,zone,zoo,zoom,zooming,zorro,zubizarreta,zuzu,álvarez
0,0,0.000599,0.003444,0.000599,0.00015,0.00015,0.003743,0.00015,0.001348,0.00015,...,0.00015,0.00015,0.001797,0.00015,0.000599,0.00015,0.000299,0.00015,0.00015,0.00015


In [6]:
def tag_pos(line):
    sent_pos_arr = []
    sent_tag_arr = []
    sp_out = sp(line)
    tokenized = []
    for token in sp_out:
        tokenized.append(token)
        sent_pos_arr.append(token.pos_)
        sent_tag_arr.append(token.tag_)
    text_pos_dict = {"text": tokenized, "pos": sent_pos_arr, "tag": sent_tag_arr}
    return text_pos_dict

In [9]:
print(tag_pos('Fred is a gem'))

{'text': [Fred, is, a, gem], 'pos': ['PROPN', 'AUX', 'DET', 'NOUN'], 'tag': ['NNP', 'VBZ', 'DT', 'NN']}


In [32]:
def filter_hudson_df(text_in):
    metaphor_noun_list = []
    metaphor_sentence_list = []
    sentence_list = sent_tokenize(text_in)
    for idx in range(len(sentence_list)):
        index_memory = []
        is_idx = []
        verb_idx = []
        start_noun = False
        start_verb = False
        sentence_noun_list = []
        sentence_dict = tag_pos(sentence_list[idx])
        for i in range(len(sentence_dict['text'])):
            if start_verb:
                if sentence_dict['pos'][i] == 'VERB':
                    verb_idx.append(i)
            if start_noun:
                if sentence_dict['text'][i].text == 'is' or \
                    sentence_dict['text'][i].text == 'was' or \
                        sentence_dict['text'][i].text == '\'s':
                            is_idx.append(i)
                            start_is = True
            if sentence_dict['pos'][i] in ['NOUN', 'PRON', 'PROPN']:
                index_memory.append(i)
                sentence_noun_list.append(sentence_dict['text'][i].text)
                start_noun = True
        if len(sentence_noun_list) >= 2 and is_idx < index_memory[-1]:
            metaphor_noun_list.append(sentence_noun_list)
            metaphor_sentence_list.append(sentence_list[idx])
    return metaphor_noun_list, metaphor_sentence_list


In [33]:
filter_hudson_df("He ran like the wind. He was a monster.")

([['He', 'monster']], ['He was a monster.'])

In [41]:
metaphor_df = texts.to_frame()
metaphor_df['sentences'], metaphor_df['nouns'] = zip(*metaphor_df['punctuated transcripts'].apply(filter_hudson_df))
# metaphor_df['metaphor_sents'], metaphor_df['metaphor_nouns'] = texts.apply(filter_hudson_df)

  return array(a, dtype, copy=False, order=order)


In [42]:
metaphor_df.head()

Unnamed: 0,punctuated transcripts,sentences,nouns
0,this is a wonderful show of class from hetafa ...,"[[this, show, class, hetafa, they, quote, del,...",[this is a wonderful show of class from hetafa...
1,"[Music] vasquez, bursting up the right wing, s...","[[it, run, ball, benzema, peacock, his, hat], ...","[it's a wonderful run, passed by verdi, beauti..."
2,"casius di stefano ramos, raul up against barce...","[[rebound-, he, who], [that, chase, years, age...","[mengesa off the rebound- and he's who., that ..."
3,"well as you might have just grabbed a peek, th...","[[it, beneficiary, hesitancy, frankie, deyoung...","[well, it was the beneficiary of some hesitanc..."
4,up shorts in honor of the man that used to wea...,"[[shorts, honor, man, that, number, club, who,...",[up shorts in honor of the man that used to we...


In [43]:
metaphor_df.to_csv("RayHudsonMetaphorBroad.csv")