In [94]:
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import sent_tokenize
import torch
from nltk.stem import PorterStemmer
from transformers import AutoTokenizer, AutoModel
import spacy
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
sp = spacy.load('en_core_web_sm')

In [2]:
hudsoncsv = pd.read_csv("RayHudsonSanitized.csv")

In [3]:
texts = hudsoncsv['punctuated transcripts']

In [96]:
tf_idf_df = pd.read_csv("hudson_word_freqs_stemmed.csv")
ps = PorterStemmer()

In [5]:
tf_idf_df.head()

Unnamed: 0.1,Unnamed: 0,000,10,100,101,10th,11,110,12,128,...,zombie,zomboni,zone,zoo,zoom,zooming,zorro,zubizarreta,zuzu,álvarez
0,0,0.000599,0.003444,0.000599,0.00015,0.00015,0.003743,0.00015,0.001348,0.00015,...,0.00015,0.00015,0.001797,0.00015,0.000599,0.00015,0.000299,0.00015,0.00015,0.00015


In [6]:
def tag_pos(line):
    sent_pos_arr = []
    sent_tag_arr = []
    sp_out = sp(line)
    tokenized = []
    for token in sp_out:
        tokenized.append(token)
        sent_pos_arr.append(token.pos_)
        sent_tag_arr.append(token.tag_)
    text_pos_dict = {"text": tokenized, "pos": sent_pos_arr, "tag": sent_tag_arr}
    return text_pos_dict

In [9]:
print(tag_pos('Fred is a gem'))

{'text': [Fred, is, a, gem], 'pos': ['PROPN', 'AUX', 'DET', 'NOUN'], 'tag': ['NNP', 'VBZ', 'DT', 'NN']}


In [57]:
def between(l1,low,high):
    for i in l1:
        if(i > low and i < high):
            return True
    return False

In [58]:
def gen_noun_pairs(noun_idx_list):
    return zip(noun_idx_list, noun_idx_list[1:])

In [59]:
def get_succ_noun_pairs(is_idx, verb_idx, noun_idx):
    succ_noun_pairs = []
    if len(noun_idx) < 2:
        return []
    noun_pairs = gen_noun_pairs(noun_idx)
    for noun_pair in noun_pairs:
        if between(is_idx, noun_pair[0], noun_pair[1]) and not between(verb_idx, noun_pair[0], noun_pair[1]):
            succ_noun_pairs.append(noun_pair)

    return succ_noun_pairs

In [69]:
def filter_hudson_df(text_in):
    metaphor_noun_list = []
    metaphor_sentence_list = []
    sentence_list = sent_tokenize(text_in)

    for idx in range(len(sentence_list)):
        noun_idx = []
        is_idx = []
        verb_idx = []
        start_noun = False
        start_verb = False
        sentence_noun_list = []
        sentence_dict = tag_pos(sentence_list[idx])
        for i in range(len(sentence_dict['text'])):
            if start_verb:
                if sentence_dict['pos'][i] == 'VERB':
                    verb_idx.append(i)
            if start_noun:
                if sentence_dict['text'][i].text == 'is' or \
                    sentence_dict['text'][i].text == 'was' or \
                        sentence_dict['text'][i].text == '\'s':
                            is_idx.append(i)
                            start_is = True
            if sentence_dict['pos'][i] in ['NOUN', 'PRON', 'PROPN']:
                noun_idx.append(i)
                sentence_noun_list.append(sentence_dict['text'][i].text)
                start_noun = True
        
        succ_noun_pairs = get_succ_noun_pairs(is_idx, verb_idx, noun_idx)
        final_noun_list = []
        final_noun_sentence = []
        
        for pair in succ_noun_pairs:
            metaphor_noun_list.append((sentence_dict['text'][pair[0]].text, sentence_dict['text'][pair[1]].text))
            metaphor_sentence_list.append(sentence_dict['text'][pair[0]:pair[1] + 1])
    return metaphor_noun_list, metaphor_sentence_list


In [70]:
filter_hudson_df("He ran like the wind. He was a monster. He was a demon.")

([('He', 'monster'), ('He', 'demon')],
 [[He, was, a, monster], [He, was, a, demon]])

In [103]:
metaphor_df = texts.to_frame()
metaphor_df['nounpairs'], metaphor_df['sentences'] = zip(*metaphor_df['punctuated transcripts'].apply(filter_hudson_df))
# metaphor_df['metaphor_sents'], metaphor_df['metaphor_nouns'] = texts.apply(filter_hudson_df)

  return array(a, dtype, copy=False, order=order)


In [104]:
metaphor_df.head()

Unnamed: 0,punctuated transcripts,nounpairs,sentences
0,this is a wonderful show of class from hetafa ...,"[(this, show), (it, setup), (who, bit), (he, i...","[[this, is, a, wonderful, show], [it, 's, a, w..."
1,"[Music] vasquez, bursting up the right wing, s...","[(it, run), (it, finish), (it, run), (there, d...","[[it, 's, a, wonderful, run], [it, 's, magic, ..."
2,"casius di stefano ramos, raul up against barce...","[(he, who), (that, chase), (drought, real), (m...","[[he, 's, who], [that, is, giving, chase], [dr..."
3,"well as you might have just grabbed a peek, th...","[(it, beneficiary), (it, it), (midfield, abrac...","[[it, was, the, beneficiary], [it, 's, not, ju..."
4,up shorts in honor of the man that used to wea...,"[(who, man), (that, we), (there, lot), (fact, ...","[[who, is, being, recognized, alongside, that,..."


In [150]:
metaphor_df.to_csv("RayHudsonMetaphorBroad.csv")

In [106]:
print(tf_idf_df['who'])

0    0.043074
Name: who, dtype: float64


In [120]:
def get_noun_tf_idf(noun_list_in):
    pair_mags = []
    for noun_pair in noun_list_in:
        print(noun_pair)
        try:
            mag = abs(tf_idf_df[ps.stem(noun_pair[0])] - tf_idf_df[ps.stem(noun_pair[1])])[0]
        except:
            mag = -1
        pair_mags.append(mag)
    return pair_mags

In [121]:
get_noun_tf_idf([["he", "peacock"], ["this", "show"]])

['he', 'peacock']
['this', 'show']


[0.1233024937109498, 0.09843476388689269]

In [122]:
metaphor_df['noun_dists'] = metaphor_df['nounpairs'].apply(get_noun_tf_idf)

('this', 'show')
('it', 'setup')
('who', 'bit')
('he', 'it')
('he', 'side')
('that', 'side')
('he', 'busquets')
('he', 'it')
('oscar', 'gap')
('it', 'hit')
('it', 'cucarella')
('who', 'phil')
('he', 'point')
('that', 'goal')
('who', 'bit')
('who', 'his')
('he', 'he')
('he', 'phil')
('it', 'bit')
('he', 'barcelona')
('it', 'things')
('it', 'chaka')
('it', 'nothing')
('it', 'you')
('it', 'direction')
('this', 'goal')
('it', 'touch')
('it', 'upright')
('there', 'shot')
('angle', 'halo')
('tonight', 'shark')
('that', 'fish')
('he', 'game')
('it', 'touch')
('it', "anna's-")
('var', 'it')
('it', 'look')
('it', 'what')
('it', 'balls')
('there', 'ball')
('it', 'ideas')
('box', 'jeopardy')
('barcelona', 'minders')
('barcelona', 'lead')
('it', 'penalty')
('it', 'header')
('it', 'piece')
('it', 'header')
('it', 'place')
('it', 'dynamic')
('dynamic', 'it')
('it', 'right')
('that', 'griezmann')
('it', 'referee')
('witch', 'nose')
('it', 'strike')
('it', 'run')
('it', 'finish')
('it', 'run')
('there

In [114]:
metaphor_df.head()

Unnamed: 0,punctuated transcripts,nounpairs,sentences,noun_dists
0,this is a wonderful show of class from hetafa ...,"[(this, show), (it, setup), (who, bit), (he, i...","[[this, is, a, wonderful, show], [it, 's, a, w...","[[0.09843476388689269], [0.2389078329525485], ..."
1,"[Music] vasquez, bursting up the right wing, s...","[(it, run), (it, finish), (it, run), (there, d...","[[it, 's, a, wonderful, run], [it, 's, magic, ...","[[0.2279541900538567], [0.228250234456524], [0..."
2,"casius di stefano ramos, raul up against barce...","[(he, who), (that, chase), (drought, real), (m...","[[he, 's, who], [that, is, giving, chase], [dr...","[[0.0808201219281855], [0.16430464348037718], ..."
3,"well as you might have just grabbed a peek, th...","[(it, beneficiary), (it, it), (midfield, abrac...","[[it, was, the, beneficiary], [it, 's, not, ju...","[[0.23905585515388222], [0.0], [0.004144621637..."
4,up shorts in honor of the man that used to wea...,"[(who, man), (that, we), (there, lot), (fact, ...","[[who, is, being, recognized, alongside, that,...","[[0.0233875078107204], [0.11604940584559979], ..."


In [147]:
def threshold_scores(sentences, scores, pairs):
    output_sent = []
    output_pairs = []
    output_score = []
    for sent, score, pair in zip(sentences, scores, pairs):
        if score > 0.2:
            output_sent.append(sent)
            output_pairs.append(pair)
            output_score.append(score)
    return output_sent, output_pairs, output_score

In [148]:
metaphor_df['sentence_thresh'], metaphor_df['nounpair_thresh'], metaphor_df['score_thresh'] = zip(*metaphor_df.apply(lambda x: threshold_scores(x.sentences, x.noun_dists, x.nounpairs), axis=1))

  return array(a, dtype, copy=False, order=order)


In [149]:
metaphor_df.head()

Unnamed: 0,punctuated transcripts,nounpairs,sentences,noun_dists,sentence_thresh,nounpair_thresh,score_thresh
0,this is a wonderful show of class from hetafa ...,"[(this, show), (it, setup), (who, bit), (he, i...","[[this, is, a, wonderful, show], [it, 's, a, w...","[0.09843476388689269, 0.2389078329525485, 0.03...","[[it, 's, a, wonderful, setup], [it, 's, a, fo...","[(it, setup), (it, hit), (it, cucarella), (it,...","[0.2389078329525485, 0.2298784786711944, 0.239..."
1,"[Music] vasquez, bursting up the right wing, s...","[(it, run), (it, finish), (it, run), (there, d...","[[it, 's, a, wonderful, run], [it, 's, magic, ...","[0.2279541900538567, 0.228250234456524, 0.2279...","[[it, 's, a, wonderful, run], [it, 's, magic, ...","[(it, run), (it, finish), (it, run), (it, run)...","[0.2279541900538567, 0.228250234456524, 0.2279..."
2,"casius di stefano ramos, raul up against barce...","[(he, who), (that, chase), (drought, real), (m...","[[he, 's, who], [that, is, giving, chase], [dr...","[0.0808201219281855, 0.16430464348037718, 0.02...","[[it, 's, a, wonderful, run], [it, 's, magic, ...","[(it, run), (it, finish), (it, wrinkle), (it, ...","[0.2279541900538567, 0.228250234456524, 0.2381..."
3,"well as you might have just grabbed a peek, th...","[(it, beneficiary), (it, it), (midfield, abrac...","[[it, was, the, beneficiary], [it, 's, not, ju...","[0.23905585515388222, 0.0, 0.00414462163734279...","[[it, was, the, beneficiary], [it, 's, the, co...","[(it, beneficiary), (it, connection), (crossba...","[0.23905585515388222, 0.23624343332854242, 0.2..."
4,up shorts in honor of the man that used to wea...,"[(who, man), (that, we), (there, lot), (fact, ...","[[who, is, being, recognized, alongside, that,...","[0.0233875078107204, 0.11604940584559979, 0.04...","[[it, 's, messy, ,, messy, ,, drags], [it, 's,...","[(it, drags), (it, antoine), (it, knife), (it,...","[0.23757563314054542, 0.23535530012054032, 0.2..."


In [151]:
all_nouns = metaphor_df['nounpair_thresh'].tolist()

In [153]:
second_nouns = []
for trans in all_nouns:
    for nounpair in trans:
        second_nouns.append(nounpair[-1])

In [154]:
print(second_nouns)

['setup', 'hit', 'cucarella', 'bit', 'things', 'chaka', 'nothing', 'direction', 'touch', 'upright', 'touch', 'it', 'what', 'balls', 'ideas', 'penalty', 'header', 'piece', 'header', 'place', 'dynamic', 'it', 'right', 'referee', 'strike', 'run', 'finish', 'run', 'run', 'box', 'passenger', 'run', 'finish', 'wrinkle', 'team', 'diamond', 'grace', 'nacho', 'vinicius', 'mendy', 'maldrich', 'either', 'shirts', 'passenger', 'card', 'Applause', 'degrees', 'casamino', 'beneficiary', 'connection', 'it', 'giveaway', 'target', 'ball', 'choir', 'frankie', 'twix', 'sergino', 'run', 'ricochet', 'pyrotechnic', 'heckle', 'hit', 'drags', 'antoine', 'knife', 'hell', 'contact', 'things', 'vein', 'line', 'cherry', 'effort', 'ningiza', 'time', 'lcs', 'tom', 'pass', 'side', 'head', 'ball', 'captain', 'pass', 'lot', 'couch', 'foot', 'it', 'youngster', 'youngsters', 'youngsters', 'pacheco', 'save', 'griezmann', 'nasa', 'break', 'it', 'giveaway', 'slip', 'man', 'finish', 'it', 'song', 'psg', 'edward', 'arbia', 'c