In [None]:
import nltk
import numpy as np
import re
import pandas as pd
import re
from gensim.summarization import summarize
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse.linalg import svds

In [None]:
# Raw text to summarize, kept as one multi-line string literal.
# Later cells rebind `dialogue` to a cleaned, lowercased version of this text.

dialogue = """
In an attempt of reprising the role of “himself”, Joe Pesci plays a psycho mobster named Tommy, who’s as crazy as he’s funny, although Pesci’s character is in denial of those aspects of his personality.

Martin Scorsese structures this dialogue scene wonderfully, using two camera setups that allow us to see each character’s reactions.

Tommy is telling a story to a group of other mobsters in a restaurant, about how he refused to be interrogated by cops when they approached him as he was resting in a park. Everyone is laughing due to the actor’s hilarious way of telling the tale and Henry Hill (played by Ray Liotta) instinctively says to Tommy that he’s funny, who replies, “funny how?” Henry doesn’t know how to answer and starts mumbling, so Tommy questions him again with a bit of anger in his voice, as silence takes over the table.

Soon enough, Henry finds out that Tommy is only messing with him and everybody starts laughing again. However, we also see how much of a lunatic Tommy is, which makes us wonder if Henry is safe by his side.

This scene is based on a real experience Joe Pesci had when working on a restaurant. He told a mobster that he was funny and things went downhill from there, as the guy didn’t take the compliment too fondly. The director didn’t add this bit to the filming schedule, only he and Joe knew about it, so the other actors improvised around Pesci’s great performance, and their reactions are real and priceless.

Scorsese portrays the relationship between these two in a single masterful dialogue that, on top of being amusing, establishes the movie’s whole rhythm, which is dynamic and unexpected.
"""

In [None]:
# Normalize the dialogue into a single lowercase line of single-spaced text.
# Line breaks are turned into spaces *first* and runs of spaces are collapsed
# afterwards; doing it the other way round leaves double spaces wherever the
# raw text had a blank line between paragraphs.

dialogue = re.sub(r'\n|\r', ' ', dialogue)
dialogue = re.sub(r' +', ' ', dialogue)
dialogue = dialogue.strip()
dialogue = dialogue.lower()

In [None]:
# Summarize the cleaned dialogue with gensim's `summarize`, requesting
# ratio=0.2 of the text and a single string back (split=False).
# NOTE(review): `gensim.summarization` is reportedly absent from gensim >= 4.0;
# confirm the installed gensim version before re-running.

ratio_summary = summarize(dialogue, ratio=0.2, split=False)
print(ratio_summary)

in an attempt of reprising the role of “himself”, joe pesci plays a psycho mobster named tommy, who’s as crazy as he’s funny, although pesci’s character is in denial of those aspects of his personality.
everyone is laughing due to the actor’s hilarious way of telling the tale and henry hill (played by ray liotta) instinctively says to tommy that he’s funny, who replies, “funny how?” henry doesn’t know how to answer and starts mumbling, so tommy questions him again with a bit of anger in his voice, as silence takes over the table.


In [None]:
# Summarize the dialogue again, this time bounded by word_count=50 rather than a ratio.
word_limited_summary = summarize(dialogue, word_count=50, split=False)
print(word_limited_summary)

in an attempt of reprising the role of “himself”, joe pesci plays a psycho mobster named tommy, who’s as crazy as he’s funny, although pesci’s character is in denial of those aspects of his personality.


In [None]:
# Split the cleaned dialogue into sentences and show how many there are

sentences = nltk.sent_tokenize(dialogue)
len(sentences)

10

In [None]:
# More cleaning: load NLTK's English stopword list, used to filter tokens
# when documents are normalized.

stop_words = nltk.corpus.stopwords.words('english')

def normalize_document(doc):
    """Strip non-letter characters and stopwords from a single document.

    Parameters
    ----------
    doc : str
        Text to normalize. Casing is left untouched and the stopword check
        is an exact match, so callers should lowercase the text beforehand.

    Returns
    -------
    str
        The tokens that are not in the module-level ``stop_words`` list,
        joined by single spaces.
    """
    # The flags must be passed by keyword: the fourth positional parameter of
    # re.sub is `count`, so passing `re.I|re.A` positionally capped the number
    # of replacements at 258 and never applied the flags.
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, flags=re.I|re.A)
    doc = doc.strip()
    tokens = nltk.word_tokenize(doc)
    filtered_tokens = [token for token in tokens if token not in stop_words]
    return ' '.join(filtered_tokens)


# Element-wise wrapper so a whole list/array of documents is normalized in one call.
normalize_corpus = np.vectorize(normalize_document)

In [None]:
# Normalize every sentence and preview the first five results
norm_sentences = normalize_corpus(sentences)
norm_sentences[:5]

array(['attempt reprising role joe pesci plays psycho mobster named tommy whos crazy hes funny although pescis character denial aspects personality',
       'martin scorsese structures dialogue scene wonderfully using two camera setups allow us see characters reactions',
       'tommy telling story group mobsters restaurant refused interrogated cops approached resting park',
       'everyone laughing due actors hilarious way telling tale henry hill played ray liotta instinctively says tommy hes funny replies funny henry doesnt know answer starts mumbling tommy questions bit anger voice silence takes table',
       'soon enough henry finds tommy messing everybody starts laughing'],
      dtype='<U226')

In [None]:
# Use TF-IDF to see how each word is weighted in each normalized sentence

tv = TfidfVectorizer(min_df=0., max_df=1., use_idf=True)
dt_matrix = tv.fit_transform(norm_sentences)
dt_matrix = dt_matrix.toarray()

# `get_feature_names()` was removed in scikit-learn 1.2; prefer its
# replacement and fall back only on older releases that lack it.
if hasattr(tv, 'get_feature_names_out'):
    vocab = tv.get_feature_names_out()
else:
    vocab = tv.get_feature_names()

# Transpose so rows are terms and columns are sentences.
td_matrix = dt_matrix.T
print(td_matrix.shape)
pd.DataFrame(np.round(td_matrix, 2), index=vocab).head(8)

(121, 10)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
actors,0.0,0.0,0.0,0.15,0.0,0.0,0.0,0.0,0.22,0.0
add,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.26,0.0
allow,0.0,0.28,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
also,0.0,0.0,0.0,0.0,0.0,0.31,0.0,0.0,0.0,0.0
although,0.24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
amusing,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.27
anger,0.0,0.0,0.0,0.18,0.0,0.0,0.0,0.0,0.0,0.0
answer,0.0,0.0,0.0,0.18,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
def low_rank_svd(matrix, singular_count=2):
    """Run a truncated SVD on ``matrix`` using ``svds``.

    Parameters
    ----------
    matrix : array-like
        Matrix to decompose; passed straight through to ``svds``.
    singular_count : int, default 2
        Number of singular values/vectors requested (the ``k`` argument).

    Returns
    -------
    tuple
        The ``(u, s, vt)`` triple exactly as produced by ``svds``.
    """
    return svds(matrix, k=singular_count)

In [None]:
# num_sentences: how many sentences the final summary keeps.
# num_topics: how many singular values/vectors are requested from the SVD.
num_sentences = 5
num_topics = 3

# Decompose the term-document matrix, keeping both the descriptive names and
# the short u/s/vt aliases bound to the same arrays.
term_topic_mat, singular_values, topic_document_mat = low_rank_svd(
    td_matrix, singular_count=num_topics
)
u, s, vt = term_topic_mat, singular_values, topic_document_mat
print(u.shape, s.shape, vt.shape)

(121, 3) (3,) (3, 10)


In [None]:
# Zero out, in place, every singular value smaller than sv_threshold times the largest one
sv_threshold = 0.5
min_sigma_value = sv_threshold * max(singular_values)
np.putmask(singular_values, singular_values < min_sigma_value, 0)

In [None]:
# Score each sentence: square root of the squared singular values dotted
# with the squared topic-document matrix.

scores = np.sqrt(
    np.square(singular_values).dot(np.square(topic_document_mat))
)
scores

array([0.61098278, 0.78386234, 0.29721354, 0.72143761, 0.70387536,
       0.60644032, 0.65151713, 0.39440139, 0.59435592, 0.63449575])

In [None]:
# Take the num_sentences highest-scoring indices, then put them back in ascending order
top_sentence_indices = np.sort(np.argsort(-scores)[:num_sentences])

In [None]:
# Print the selected sentences, one per line, as the summarized dialogue

print('\n'.join(sentences[idx] for idx in top_sentence_indices))

martin scorsese structures this dialogue scene wonderfully, using two camera setups that allow us to see each character’s reactions.
everyone is laughing due to the actor’s hilarious way of telling the tale and henry hill (played by ray liotta) instinctively says to tommy that he’s funny, who replies, “funny how?” henry doesn’t know how to answer and starts mumbling, so tommy questions him again with a bit of anger in his voice, as silence takes over the table.
soon enough, henry finds out that tommy is only messing with him and everybody starts laughing again.
this scene is based on a real experience joe pesci had when working on a restaurant.
scorsese portrays the relationship between these two in a single masterful dialogue that, on top of being amusing, establishes the movie’s whole rhythm, which is dynamic and unexpected.
