In [54]:
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import sent_tokenize
import torch
from transformers import AutoTokenizer, AutoModel
import spacy
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
# Load the 'en_core_web_sm' spaCy pipeline once at module level; get_POS
# calls `sp` on every line of text it is given.
sp = spacy.load('en_core_web_sm')

In [3]:
# Read the transcript table; the relative path means the CSV must sit in the
# kernel's working directory.
# NOTE(review): the preview shows 'Unnamed: 0' and 'Unnamed: 0.1' columns,
# which look like index columns written by earlier to_csv calls -- confirm
# whether they are still needed.
hudsoncsv = pd.read_csv("RayHudsonSanitized.csv")

In [4]:
# Preview the first rows to sanity-check the load.
hudsoncsv.head()

Unnamed: 0.1,Unnamed: 0,Video Name,Thumbnail URL,Video ID,length,date published,transcript,punctuated transcripts
0,2,Barcelona vs Getafe | LA LIGA HIGHLIGHTS | 4/2...,https://i.ytimg.com/vi/P94-YkBNOII/hqdefault.j...,P94-YkBNOII,"13 minutes, 9 seconds",1 year ago,this is a wonderful show of class from hetafa...,this is a wonderful show of class from hetafa ...
1,3,Ray Hudson's Best Reactions Vol. 2: El Clásico,https://i.ytimg.com/vi/Q383qiSYFTA/hqdefault.j...,Q383qiSYFTA,"5 minutes, 7 seconds",1 year ago,[Music] vasquez bursting up the right wing sn...,"[Music] vasquez, bursting up the right wing, s..."
2,4,Real Madrid vs Barcelona | LALIGA HIGHLIGHTS |...,https://i.ytimg.com/vi/tyOX9p2BWZk/hqdefault.j...,tyOX9p2BWZk,"19 minutes, 39 seconds",1 year ago,casius di stefano ramos raul up against barce...,"casius di stefano ramos, raul up against barce..."
3,5,Real Sociedad vs Barcelona | LALIGA HIGHLIGHTS...,https://i.ytimg.com/vi/iPgkQE-M8SU/hqdefault.j...,iPgkQE-M8SU,"12 minutes, 5 seconds",1 year ago,well as you might have just grabbed a peek th...,"well as you might have just grabbed a peek, th..."
4,6,Barcelona vs Huesca | LALIGA HIGHLIGHTS | 3/15...,https://i.ytimg.com/vi/vPiKCMo7trw/hqdefault.j...,vPiKCMo7trw,"12 minutes, 29 seconds",1 year ago,up shorts in honor of the man that used to we...,up shorts in honor of the man that used to wea...


In [5]:
# Select the 'punctuated transcripts' column as a Series; this is the text
# that gets split into sentences further down.
texts = hudsoncsv['punctuated transcripts']

In [6]:
# Preview the first few transcripts.
texts.head()

0    this is a wonderful show of class from hetafa ...
1    [Music] vasquez, bursting up the right wing, s...
2    casius di stefano ramos, raul up against barce...
3    well as you might have just grabbed a peek, th...
4    up shorts in honor of the man that used to wea...
Name: punctuated transcripts, dtype: object

In [9]:
# The tokenizer and the model weights should come from the same checkpoint,
# so name it once instead of repeating the string literal in both calls.
BERT_CHECKPOINT = 'sentence-transformers/bert-base-nli-mean-tokens'
tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT)
model = AutoModel.from_pretrained(BERT_CHECKPOINT)

In [11]:
def series_to_sents(series_in):
    """Split every transcript in a Series into its sentences.

    Parameters
    ----------
    series_in : pandas.Series
        One transcript per element.

    Returns
    -------
    list of list
        One inner list per element of ``series_in``, in the same order,
        holding whatever ``sent_tokenize`` returns for that transcript.
        Missing entries (NaN / None, e.g. an empty CSV cell) yield an empty
        inner list so the result stays aligned with the input rows instead
        of crashing inside the tokenizer.
    """
    # No local is named `list` here, so the builtin is not shadowed.
    return [
        [] if pd.isna(transcript) else sent_tokenize(transcript)
        for transcript in series_in.tolist()
    ]

In [59]:
# One list of sentences per transcript, in row order.
sentences = series_to_sents(texts)
# Flatten the per-transcript lists into a single list of sentences.
all_sents = [
    sentence
    for transcript_sentences in sentences
    for sentence in transcript_sentences
]

In [67]:
# Concatenate every sentence, each followed by a single space, into one
# document. str.join builds the result in one pass instead of re-copying the
# growing string on every `+=`, and no loop variable is left behind in the
# notebook namespace.
master_string = "".join(sent + " " for sent in all_sents)

In [68]:
# Fit TF-IDF on the whole corpus treated as ONE document, giving a single row
# of per-term weights.
# NOTE(review): with a single document every term has the same document
# frequency, so the IDF factor cannot distinguish terms -- confirm this is the
# intended "word frequency" table.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform([master_string])
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installs.
# list(...) keeps `feature_names` a plain list as before.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()
dense = X.todense()
denselist = dense.tolist()
# One column per vocabulary term, one row for the single fitted document.
tfidf_df = pd.DataFrame(denselist, columns=feature_names)

In [70]:
# Save first, preview last: a notebook cell only renders its final
# expression, so head() must come after to_csv() to be displayed.
tfidf_df.to_csv("hudson_word_freqs.csv")
tfidf_df.head()

In [20]:
def get_POS(text_list):
    """Run the module-level spaCy pipeline ``sp`` over each line of text.

    Parameters
    ----------
    text_list : iterable
        Lines of text; each one is passed to ``sp`` on its own.

    Returns
    -------
    list of dict
        One dict per input line with three parallel lists:
        ``"text"`` (the token objects yielded by ``sp``), ``"pos"`` (each
        token's ``pos_``) and ``"tag"`` (each token's ``tag_``).
    """
    tagged_lines = []
    for line in text_list:
        # Materialise the tokens once so all three lists share one ordering.
        line_tokens = list(sp(line))
        tagged_lines.append({
            "text": line_tokens,
            "pos": [tok.pos_ for tok in line_tokens],
            "tag": [tok.tag_ for tok in line_tokens],
        })
    return tagged_lines

In [34]:
def gen_nouns_prn(simile_w_pos):
    """Collect, as strings, every token tagged NOUN or PRON.

    Parameters
    ----------
    simile_w_pos : iterable of dict
        Entries with parallel ``"text"`` and ``"pos"`` lists (the shape that
        ``get_POS`` produces).

    Returns
    -------
    list of str
        ``str()`` of each matching ``"text"`` item, flattened across all
        entries in input order.
    """
    wanted_pos = ('NOUN', 'PRON')
    return [
        str(entry['text'][position])
        for entry in simile_w_pos
        for position, pos_label in enumerate(entry['pos'])
        if pos_label in wanted_pos
    ]

In [43]:
def text_to_nouns_prn(text_list):
    """Return the nouns and pronouns found in ``text_list`` as one string.

    The lines are tagged with ``get_POS``, filtered with ``gen_nouns_prn``,
    and every kept word is emitted followed by a single space, so a non-empty
    result ends with a trailing space.
    """
    kept_words = gen_nouns_prn(get_POS(text_list))
    return "".join(word + ' ' for word in kept_words)


In [71]:
def gen_noun_pairs(text_list):
    """Return consecutive (word, next_word) tuples of nouns and pronouns.

    The lines are tagged with ``get_POS`` and reduced to a flat list by
    ``gen_nouns_prn``. Because that list is flat, a pair can span the
    boundary between two input lines. Fewer than two words gives ``[]``.
    """
    flat_words = gen_nouns_prn(get_POS(text_list))
    # Zipping the list against itself shifted by one yields adjacent pairs.
    return list(zip(flat_words, flat_words[1:]))

