### Importing all the necessary Libraries

In [1]:
import youtube_transcript_api
from youtube_transcript_api import YouTubeTranscriptApi
import nltk
import re
from nltk.corpus import stopwords
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

### Get the subtitles of the video Using Unique ID

In [2]:
link = "https://www.youtube.com/watch?v=7x5M4lxK-dw" 
unique_id = link.split("=")[-1]
sub = YouTubeTranscriptApi.get_transcript(unique_id)  
subtitle = " ".join([x['text'] for x in sub])

### Summarization using TF-IDF vectorizer

In [3]:
### TF-IDF or term frequency-inverse document frequency is a vectorizer that converts the text into a vector.
### It has 2 terms term frequency and inverse document frequency.
### Term frequency is the number of repetitions of words in a sentence by the total number of words in that sentence.
### Inverse document frequency is the log of no of sentences by the number of sentences containing the given word.

In [4]:
#using sentence_tokenizer of nltk library for tokenization.

from nltk.tokenize import sent_tokenize
subtitle = subtitle.replace("n","")
sentences = sent_tokenize(subtitle)

In [5]:
# organizing the tokenized sentences into the dictionary with the sentence as the key and corresponding index to its value.

In [6]:
organized_sent = {k:v for v,k in enumerate(sentences)}

In [7]:
# using the tf-idf vectorizer,will get the scores of each sentence that we created during tokenization.

In [8]:
import numpy as np
tf_idf = TfidfVectorizer(min_df=1, 
                                    strip_accents='unicode',
                                    max_features=None,
                                    lowercase = True,
                                    token_pattern=r'w{1,}',
                                    ngram_range=(1, 3), 
                                    use_idf=1,
                                    smooth_idf=1,
                                    sublinear_tf=1,
                                    stop_words = 'english')
 

sentence_vectors = tf_idf.fit_transform(sentences)
sent_scores = np.array(sentence_vectors.sum(axis=1)).ravel()



In [9]:
#find out the top N sentences that have a larger score

In [10]:
N = 3
top_n_sentences = [sentences[index] for index in np.argsort(sent_scores, axis=0)[::-1][:N]]

In [11]:
#let’s order the top sentences based on the order in the subtitles

In [12]:
# mapping the scored sentences with their indexes as in the subtitle
mapped_sentences = [(sentence,organized_sent[sentence]) for sentence in top_n_sentences]
# Ordering the top-n sentences in their original order
mapped_sentences = sorted(mapped_sentences, key = lambda x: x[1])
ordered_sentences = [element[0] for element in mapped_sentences]
# joining the ordered sentence
summary = " ".join(ordered_sentences)
 

### Summarization using BART

In [13]:
import transformers
from transformers import BartTokenizer, BartForConditionalGeneration

In [14]:
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')

In [15]:
# encode this subtitle using the Bart Tokenizer.
input_tensor = tokenizer.encode( subtitle, return_tensors="pt", max_length=512)


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [16]:
outputs_tensor = model.generate(input_tensor, max_length=1000, min_length=500,length_penalty=2.0, num_beams=4, early_stopping=True)
outputs_tensor

tensor([[    2,     0, 25093,  4134,   324,    16,    65,     9,     5,   144,
          5372,  5494,   710,  3958,     9,    70,    86,     4,  1614, 13448,
          1090,    16,     5,  1482,     9,     5,    10,  1549,   329,  8591,
             4, 20863,    16,     5,  2730,     9,     5,  1040,    22, 25093,
           298,    18,  5972,     9,  2032, 15409,   113,     8,  1029,    12,
         11438,     9,     5,   939, 20697,  8591,     4,    20,  1040,    16,
           716,    15,     5,  1040,     9,     5,   276,   766,    30,  3259,
         20269,     8,  1575,     5,   173,     9,  3259, 20269,     6,  3259,
          1745,     6,     8,   643,     4,    20,  8591,    16,   577,    15,
         14734,     8,    34,    10,   316,    12,  8596,  7425,     9,    70,
             5,  7614,    24,    34,  2913,    98,   444,     4,    85,    16,
            67,   577,    15,   312, 36519,     8,     5, 27879,  1121,  1553,
            13,     5,  2733,     8,  9481,     4,  

In [17]:
# outputs will be a tensor in order to get text out of it, we need to decode it using the same Bart Tokenizer model.
print(tokenizer.decode(outputs_tensor[0]))

</s><s>Stephanie is one of the most requested retur guests of all time. Adreesse is the host of the a16z podcast. Steph is the author of the book "Steph's Book of Thigs" and co-host of the iReport podcast. The book is based on the book of the same name by Stephen Colbert and features the work of Stephen Colbert, Stephen King, and others. The podcast is available on iTunes and has a 12-page log of all the topics it has covered so far. It is also available on Stitcher and the TuneIn app for the iPhone and iPad. For more information on the podcast, visit a16Z.com or go to www.a16z.com. For the full interview with Steph, visit her book of thigs, Steph’s Book of Thigs and iReport on  iTunes and Stitcher. For the full transcript of the interview with Stephanie, visit: http://www.stitcher.com/s/Steph-Steph “Steph is back.”. For the full version of this article, please go to: ‘www.steph-steph.com’.  ’www.steven-stephanie-’Steven’ ”Steven.’’ ‘Steven,’ on the web’: ‘Steph, I’m here to talk to yo