<a href="https://colab.research.google.com/github/yeedas/Abstractive_Summary_of_Transcriptions/blob/master/Summarization_using_Latent_Semantic_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive

In [2]:
# Mount Google Drive at /content/drive/ so the input document can be read from it.
drive.mount('/content/drive/')

Mounted at /content/drive/


Latent Semantic Analysis

In [3]:
#import modules
import os.path
import nltk
from gensim import corpora
from gensim.models import LsiModel
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
nltk.download('stopwords')
nltk.download('punkt')
from gensim.models.coherencemodel import CoherenceModel
import matplotlib.pyplot as plt

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Sample news article kept inline for reference.
# NOTE(review): `Doc` is not read by any visible cell; the pipeline below loads its
# text from a file on the mounted Drive instead.
Doc = """Birdwatchers have descended on a salt marsh to see a bird not seen in Britain for 40 years.
The rufous bush chat was spotted at Stiffkey, north Norfolk, prompting up to 100 birdwatchers to go to see it.
Native to southern Spain, Africa and the Balkans the bird, also known as the rufous warbler and rufous bush robin, is rarely seen in northern Europe.

Dick Filby, of Rare Bird Alert, said it "would have been heading for a tropical climate and went the wrong way."

He said the last time the bird was spotted in Britain was at Prawle Point in Devon in 1980.
"In 1998, one was seen in Jersey (part of the British Isles but not classed as part of Britain)."

Mr Filby said he hoped birdwatchers would be wearing masks and keeping socially-distanced as they enjoyed the view.
Ch Supt Chris Balmer said: "People may arrive on their own but some have started to gather in groups larger than six to be able to see the bird. This is a breach of the law.

In the first instance officers will engage, explain and encourage people to leave but enforcement is an option and we will be issuing fixed penalty notices should people not comply."""

Load the document

In [4]:
def load_data(path, file_name, title_length=5):
    """
    Input  : path         - directory holding the file ("" when file_name is
                            already a complete path)
             file_name    - name of the text file to read
             title_length - number of leading characters of each document
                            kept as its title (default 5)
    Purpose: loading text file, one document per line
    Output : list of paragraphs/documents (whitespace-stripped lines) and
             list of titles, one per document
    """
    documents_list = []
    titles = []
    with open(os.path.join(path, file_name), "r") as fin:
        for line in fin:
            text = line.strip()
            documents_list.append(text)
            # Build the title inside the loop: doing it after the loop only
            # titled the last line and crashed on an empty file, where
            # `text` was never assigned.
            titles.append(text[:title_length])
    print("Total Number of Documents:", len(documents_list))
    return documents_list, titles

Preprocess the document

In [5]:
def preprocess_data(doc_set):
    """
    Input  : document list (iterable of raw text strings)
    Purpose: lowercase each document, split it into word tokens, drop English
             stop words and stem what is left with the Porter stemmer
    Output : list holding one list of stemmed tokens per input document
    """
    word_tokenizer = RegexpTokenizer(r'\w+')
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    def _clean(document):
        # tokenize the lower-cased text, then filter and stem in a single pass
        words = word_tokenizer.tokenize(document.lower())
        return [stemmer.stem(word) for word in words if word not in stop_words]

    return [_clean(document) for document in doc_set]

Prepare the term frequency matrix for the document

In [6]:
def prepare_corpus(doc_clean):
    """
    Input  : clean document (list of token lists)
    Purpose: build the term dictionary of the corpus and convert the list of
             documents into a Document Term Matrix
    Output : term dictionary and Document Term Matrix
    """
    # Term dictionary: every unique term in the corpus is assigned an index.
    term_dictionary = corpora.Dictionary(doc_clean)
    # One bag-of-words row per document, encoded with the dictionary above.
    bow_rows = list(map(term_dictionary.doc2bow, doc_clean))
    return term_dictionary, bow_rows

Create a gensim latent semantic analysis model

In [7]:
def create_gensim_lsa_model(doc_clean, number_of_topics, words):
    """
    Input  : clean document, number of topics and number of words associated with each topic
    Purpose: train a gensim LSA model (LsiModel) on the corpus and print its topics
    Output : the trained LSA model
    """
    term_dictionary, bow_corpus = prepare_corpus(doc_clean)
    # train model
    lsa = LsiModel(bow_corpus, id2word=term_dictionary, num_topics=number_of_topics)
    topic_summary = lsa.print_topics(num_topics=number_of_topics, num_words=words)
    print(topic_summary)
    return lsa

In [None]:
def compute_coherence_values(dictionary, doc_term_matrix, doc_clean, stop, start=2, step=3):
    """
    Input   : dictionary : Gensim dictionary
              doc_term_matrix : Gensim corpus (bag-of-words documents)
              doc_clean : List of tokenized input texts
              stop : Upper bound (exclusive) on the number of topics
              start : Smallest number of topics to try
              step : Increment between successive topic counts
    purpose : Compute c_v coherence for various number of topics
    Output  : model_list : List of LSA topic models, one per topic count tried
              coherence_values : Coherence values corresponding to the LSA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, stop, step):
        # Train with the loop's own topic count. The previous code passed the
        # global `number_of_topics`, so every model had the same size (or the
        # call failed with NameError when that global was not defined yet).
        model = LsiModel(doc_term_matrix, num_topics=num_topics, id2word=dictionary)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=doc_clean, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
    return model_list, coherence_values

In [None]:
def plot_graph(doc_clean, start, stop, step):
    """
    Input  : doc_clean : list of tokenized documents
             start, stop, step : range of topic counts to evaluate
                                 (same meaning as in range(start, stop, step))
    Purpose: build the corpus, compute the coherence score for every topic
             count in the range and plot score against number of topics
    Output : None (the figure is shown with plt.show())
    """
    dictionary, doc_term_matrix = prepare_corpus(doc_clean)
    model_list, coherence_values = compute_coherence_values(
        dictionary, doc_term_matrix, doc_clean, stop=stop, start=start, step=step
    )
    # Show graph
    x = range(start, stop, step)
    # Label the line itself. The old `plt.legend(("coherence_values"), ...)`
    # passed a plain string (parentheses without a comma are not a tuple), so
    # matplotlib iterated it and labelled the line with the single letter "c".
    plt.plot(x, coherence_values, label="coherence_values")
    plt.xlabel("Number of Topics")
    plt.ylabel("Coherence score")
    plt.legend(loc='best')
    plt.show()

In [None]:
# Plot coherence for every topic count in range(start, stop, step).
# NOTE(review): `clean_text` is only assigned in a later cell (the one that calls
# preprocess_data), so on a fresh kernel this cell raises NameError - run that cell first.
start,stop,step=2,12,1
plot_graph(clean_text,start,stop,step)

NameError: ignored

Select the number of topics the documents have to be divided into

In [9]:
# Configuration: number of LSA topics, and how many words to print per topic.
number_of_topics=2
words=20
# path is "" because file_name is already an absolute path on the mounted Drive.
document_list,titles=load_data("","/content/drive/My Drive/Summarization/document.txt")
clean_text=preprocess_data(document_list)
dict1,doc_term_matrix=prepare_corpus(clean_text)
# create_gensim_lsa_model calls prepare_corpus(clean_text) again internally and prints the topics.
model=create_gensim_lsa_model(clean_text,number_of_topics,words)
# Apply the trained model to every bag-of-words document; used below as
# per-line lists of (topic id, weight) pairs.
corpus_lsi = model[doc_term_matrix]

Total Number of Documents: 13
[(0, '0.396*"bird" + 0.291*"rufou" + 0.263*"said" + 0.218*"see" + 0.199*"seen" + 0.184*"rare" + 0.174*"peopl" + 0.173*"bush" + 0.155*"birdwatch" + 0.131*"britain" + 0.118*"known" + 0.118*"balkan" + 0.118*"warbler" + 0.118*"robin" + 0.118*"southern" + 0.118*"africa" + 0.118*"northern" + 0.118*"nativ" + 0.118*"also" + 0.118*"europ"'), (1, '0.419*"peopl" + -0.272*"rufou" + 0.156*"instanc" + 0.156*"engag" + 0.156*"notic" + 0.156*"offic" + 0.156*"explain" + 0.156*"issu" + 0.156*"option" + 0.156*"penalti" + 0.156*"compli" + 0.156*"fix" + 0.156*"enforc" + 0.156*"first" + 0.156*"leav" + 0.156*"encourag" + -0.154*"bush" + -0.150*"seen" + -0.119*"rare" + -0.118*"also"')]


In [None]:
# Show each line's LSA topic vector next to its text (blank lines print an empty vector).
for doc, as_text in zip(corpus_lsi, document_list):
    print(doc, as_text)

[(0, 1.388037033073856), (1, -0.24678646167363777), (2, 0.29428281639113124)] Birdwatchers have descended on a salt marsh to see a bird not seen in Britain for 40 years.
[(0, 1.3291838568349637), (1, -0.6893231653479431), (2, -0.3471039479313534)] The rufous bush chat was spotted at Stiffkey, north Norfolk, prompting up to 100 birdwatchers to go to see it.
[(0, 2.8279258274939716), (1, -2.292508817125084), (2, -1.9553680485437746)] Native to southern Spain, Africa and the Balkans the bird, also known as the rufous warbler and rufous bush robin, is rarely seen in northern Europe.
[] 
[(0, 1.5880513449629072), (1, -0.010356534623308919), (2, 1.2672991803787974)] Dick Filby, of Rare Bird Alert, said it "would have been heading for a tropical climate and went the wrong way."
[] 
[(0, 1.1924827024869298), (1, 0.047335264909172364), (2, 0.7085704198706022)] He said the last time the bird was spotted in Britain was at Prawle Point in Devon in 1980.
[(0, 0.5650999093994775), (1, -0.37176567429

In [10]:
def takenext(elem):
    """
    Sort key: return the item at index 1 of ``elem``
    (the score of a (sentence index, score) pair).
    """
    second = elem[1]
    return second

Rank the sentences under each topic in descending order of absolute weight

In [12]:
# Collect (sentence index, |weight|) pairs per topic, then rank each topic's
# sentences from strongest to weakest.
# One bucket per configured topic: the previous hard-coded range(2) raised
# IndexError as soon as the model was trained with more than two topics.
vecsSort = [[] for _ in range(number_of_topics)]
for i, docv in enumerate(corpus_lsi):
    for topic_id, weight in docv:
        # abs(): a strongly negative weight is as informative as a positive one
        vecsSort[topic_id].append((i, abs(weight)))
vecsSort = [sorted(topic_scores, key=takenext, reverse=True) for topic_scores in vecsSort]

In [None]:
# Inspect the per-topic rankings: one list of (sentence index, |weight|) per topic.
print(vecsSort)

[[(2, 2.8279258274939716), (10, 2.518802236823936), (4, 1.5880513449629072), (0, 1.388037033073856), (1, 1.3291838568349637), (6, 1.1924827024869298), (9, 1.0150188799376756), (12, 0.835833090818561), (7, 0.5650999093994775)], [(12, 3.0224718340699583), (2, 2.292508817125084), (10, 2.065651178593418), (1, 0.6893231653479431), (7, 0.37176567429176577), (0, 0.24678646167363777), (9, 0.17996419353124002), (6, 0.047335264909172364), (4, 0.010356534623308919)], [(12, 2.5886702332072806), (2, 1.9553680485437746), (9, 1.4807452838604747), (10, 1.3740914565198534), (4, 1.2672991803787974), (6, 0.7085704198706022), (1, 0.3471039479313534), (0, 0.29428281639113124), (7, 0.13328904117071882)]]


In [None]:
# NOTE(review): this module-level set is not read by any visible cell; the
# selectTopSent definitions below keep their own local sets.
sentIndexes = set()

In [13]:
def selectTopSent(summSize, numTopics, sortedVec):
    """
    Pick up to ``summSize`` distinct sentence indices for the summary.

    The per-topic rankings are walked rank by rank (the best sentence of every
    topic first, then every topic's second best, ...), skipping sentences that
    were already chosen.

    Input  : summSize  - maximum number of sentences to select
             numTopics - number of leading topic lists of sortedVec to use
             sortedVec - one list per topic of (sentence index, score) pairs,
                         sorted by descending score
    Output : list of selected sentence indices in selection order; shorter than
             summSize when the rankings do not hold enough distinct sentences
    """
    sent_no = []
    seen = set()
    for rank in range(summSize):
        for topic in range(numTopics):
            ranked = sortedVec[topic]
            # A topic may rank fewer sentences than summSize; skip it instead
            # of raising IndexError.
            if rank >= len(ranked):
                continue
            si = ranked[rank][0]
            if si in seen:
                continue
            seen.add(si)
            sent_no.append(si)
            if len(sent_no) == summSize:
                return sent_no
    # Always return a list: the original fell off the end and returned None
    # when fewer than summSize sentences could be collected.
    return sent_no

Select the sentences for the summary

In [None]:
def selectTopSent(summSize, numTopics, vecsSort, verbose=False):
    """
    Pick up to ``summSize`` distinct sentence indices for the summary, walking
    the per-topic rankings rank by rank and skipping sentences already chosen.

    Input  : summSize  - maximum number of sentences to select
             numTopics - number of leading topic lists of vecsSort to use
             vecsSort  - one list per topic of (sentence index, score) pairs,
                         sorted by descending score
             verbose   - when True, print every selected pair and its index
    Output : list of selected sentence indices in selection order; shorter than
             summSize when the rankings do not hold enough distinct sentences
    """
    sent_no = []
    sentIndexes = set()
    for i in range(summSize):
        for j in range(numTopics):
            vecs = vecsSort[j]
            # A topic may rank fewer sentences than summSize.
            if i >= len(vecs):
                continue
            si = vecs[i][0]
            if si not in sentIndexes:
                sent_no.append(si)
                sentIndexes.add(si)
                if verbose:
                    print("vecs", vecs[i])
                    print("index", si)
                if len(sent_no) == summSize:
                    return sent_no
    # Return after all ranks were visited; the earlier draft returned from
    # inside the outer loop, i.e. after looking at rank 0 only.
    return sent_no
		 			
					
		 

					
		 
		 							
					

In [14]:
# Select 8 sentences for the summary. The topic count comes from the
# configuration value used to train the model rather than a second hard-coded 2,
# so the two cannot drift apart.
topSentences = selectTopSent(8, number_of_topics, vecsSort)

In [15]:
# First print: selection order. Second print: ascending sentence index (document order).
print(topSentences)
# In-place sort: later cells see the sorted list.
topSentences.sort()
print(topSentences)

[2, 12, 10, 4, 0, 1, 7, 6]
[0, 1, 2, 4, 6, 7, 10, 12]


In [None]:
# Inspect the loaded lines; blank lines of the file appear as empty strings.
print(document_list)

['Birdwatchers have descended on a salt marsh to see a bird not seen in Britain for 40 years.', 'The rufous bush chat was spotted at Stiffkey, north Norfolk, prompting up to 100 birdwatchers to go to see it.', 'Native to southern Spain, Africa and the Balkans the bird, also known as the rufous warbler and rufous bush robin, is rarely seen in northern Europe.', '', 'Dick Filby, of Rare Bird Alert, said it "would have been heading for a tropical climate and went the wrong way."', '', 'He said the last time the bird was spotted in Britain was at Prawle Point in Devon in 1980.', '"In 1998, one was seen in Jersey (part of the British Isles but not classed as part of Britain)."', '', 'Mr Filby said he hoped birdwatchers would be wearing masks and keeping socially-distanced as they enjoyed the view.', 'Ch Supt Chris Balmer said: "People may arrive on their own but some have started to gather in groups larger than six to be able to see the bird. This is a breach of the law.', '', 'In the first

In [16]:
# Assemble the summary from the selected sentences, kept in document order.
# A set gives constant-time membership checks, and enumerate replaces the
# hand-maintained counter.
selected = set(topSentences)
summary = " ".join(
    sentence for idx, sentence in enumerate(document_list) if idx in selected
)
# Full text, joined the same way, for a side-by-side comparison.
doc = " ".join(document_list)
print("\n")
print("Original:")
print(doc)
print("Summary:")
print(summary)



Original:
Birdwatchers have descended on a salt marsh to see a bird not seen in Britain for 40 years. The rufous bush chat was spotted at Stiffkey, north Norfolk, prompting up to 100 birdwatchers to go to see it. Native to southern Spain, Africa and the Balkans the bird, also known as the rufous warbler and rufous bush robin, is rarely seen in northern Europe.  Dick Filby, of Rare Bird Alert, said it "would have been heading for a tropical climate and went the wrong way."  He said the last time the bird was spotted in Britain was at Prawle Point in Devon in 1980. "In 1998, one was seen in Jersey (part of the British Isles but not classed as part of Britain)."  Mr Filby said he hoped birdwatchers would be wearing masks and keeping socially-distanced as they enjoyed the view. Ch Supt Chris Balmer said: "People may arrive on their own but some have started to gather in groups larger than six to be able to see the bird. This is a breach of the law.  In the first instance officers will en

In [None]:
# Debug dump of per-topic ranked lists.
# NOTE(review): `sortedVecs` is not assigned in any visible cell (the ranked lists
# built earlier are named `vecsSort`), and `i` is unused, so the same three lists
# are printed 70 times - confirm whether this cell is still needed.
for i in range(70):
  for j in range(3):
	  print(sortedVecs[j])

    

[(107, 2.224644847514939), (119, 2.1630113250419787), (132, 1.4879574440447345), (52, 1.452032979669921), (62, 1.445215182565852), (33, 1.4331974361523336), (92, 1.429777859656594), (142, 1.4254896334776241), (87, 1.4221022534516128), (10, 1.419895062869752), (86, 1.4172213631582744), (156, 1.4169304568585974), (117, 1.4092898118463746), (53, 1.4081402198528086), (5, 1.4046969870309334), (91, 1.4009660562765727), (8, 1.398797355093169), (78, 1.398715845928212), (57, 1.3966205122131925), (15, 1.3932022367078667), (61, 1.3927278295249776), (44, 1.3838568004299727), (157, 1.3835601825264157), (48, 1.379400461678904), (88, 1.378959239008933), (71, 1.3755719753376978), (47, 1.3743880364524026), (65, 1.369783029702494), (41, 1.3678025954510458), (38, 1.3678023616234747), (161, 1.3615332503807587), (163, 1.359804782020478), (114, 1.358334475826402), (125, 1.3549372462301108), (73, 1.3515712232179793), (158, 1.3503990744845362), (19, 1.3486822531281293), (51, 1.3484305707849136), (17, 1.348292

In [None]:
# Debug: print (sentence index, |weight|) for every topic weight of every sentence.
for i,dv in enumerate(corpus_lsi):
  #print(dv)
  for sc in dv:
    # sc is a (topic id, weight) pair; only the magnitude of the weight is kept
    isc = (i, abs(sc[1]))
    print(isc)

(0, 0.008642614898942073)
(0, 0.02233378937176901)
(0, 0.03002613250662172)
(1, 1.3139652217048523)
(1, 0.4779347411437951)
(1, 0.08487061648074805)
(2, 1.3199048363117418)
(2, 0.4866553435204607)
(2, 0.08387313226382201)
(3, 0.896733887648004)
(3, 0.24852388263886868)
(3, 0.21560219932470795)
(4, 0.9140291715710056)
(4, 0.25279289361718255)
(4, 0.23208735208735232)
(5, 1.4046969870309334)
(5, 0.5100756366921344)
(5, 0.07797256211993459)
(6, 0.8907908101464642)
(6, 0.2571990902123248)
(6, 0.21496112703273362)
(7, 1.3402342507391973)
(7, 0.49239717223806495)
(7, 0.004890732384955979)
(8, 1.398797355093169)
(8, 0.5199036788591097)
(8, 0.03159607621838589)
(9, 0.9003831926528703)
(9, 0.2530458286315556)
(9, 0.21872953867149836)
(10, 1.419895062869752)
(10, 0.5519589399607714)
(10, 0.16786009760697743)
(11, 1.324045317287255)
(11, 0.47031055373245817)
(11, 0.09538326201020657)
(12, 1.319570787351262)
(12, 0.4861019998968051)
(12, 0.08681982842224027)
(13, 0.895258431532318)
(13, 0.27292369

In [None]:
# Toy 19x3 binary matrix for the SVD demo below.
# NOTE(review): appears to be the d1-d3 columns of the term/document table at the
# end of the notebook (one row per term, one column per document) - confirm.
A =[[1,1,1], [1,0,0], [1,0,0], [1,0,0], [1,0,0], [1,0,0], [0,1,1], [0,1,0], [0,1,1],[0,1,0], [0,1,0], [0,1,0], [0,1,0], [0,0,1], [0,0,1], [0,0,1], [0,0,1], [0,0,1], [0,0,1] ]

In [None]:
# Echo the toy matrix before decomposing it.
print(A)

[[1, 1, 1], [1, 0, 0], [1, 0, 0], [1, 0, 0], [1, 0, 0], [1, 0, 0], [0, 1, 1], [0, 1, 0], [0, 1, 1], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 0, 1], [0, 0, 1], [0, 0, 1], [0, 0, 1], [0, 0, 1], [0, 0, 1]]


In [None]:
from numpy import array
from scipy.linalg import svd
# Singular value decomposition of the toy matrix: A = U @ diag(s) @ VT.
# s holds the singular values; with scipy's default full_matrices=True,
# U and VT are square.
U, s, VT = svd(A)
print("U is equal to")
print(U)
print("S is ")
print(s)
print("VT is ")
print(VT)

U is equal to
[[-0.46544161  0.26469889  0.06965444 -0.28009166 -0.28009166 -0.28009166
  -0.31391835 -0.14556423 -0.31391835 -0.14556423 -0.14556423 -0.14556423
  -0.14556423 -0.16835412 -0.16835412 -0.16835412 -0.16835412 -0.16835412
  -0.16835412]
 [-0.06765767  0.37138094  0.17076523 -0.22310355 -0.22310355 -0.22310355
   0.2565919   0.32280826  0.2565919   0.32280826  0.32280826  0.32280826
   0.32280826 -0.06621636 -0.06621636 -0.06621636 -0.06621636 -0.06621636
  -0.06621636]
 [-0.06765767  0.37138094  0.17076523 -0.18064447 -0.18064447 -0.18064447
   0.15258884 -0.15425156  0.15258884 -0.15425156 -0.15425156 -0.15425156
  -0.15425156  0.3068404   0.3068404   0.3068404   0.3068404   0.3068404
   0.3068404 ]
 [-0.06765767  0.37138094  0.17076523  0.89461323 -0.10538677 -0.10538677
  -0.03175413 -0.00766416 -0.03175413 -0.00766416 -0.00766416 -0.00766416
  -0.00766416 -0.02408997 -0.02408997 -0.02408997 -0.02408997 -0.02408997
  -0.02408997]
 [-0.06765767  0.37138094  0.17076523 -

D1 The rose is a beautiful flower
D2 The stock exchange has risen by hundred points
D3 The price of the BST stock has fallen down
D4 There were a lot of flowers in the garden
              d1      d2      d3       d4
The           1       1        1       1
rose          1       0        0       0
is            1       0        0       0     
a             1       0        0       1
beautiful     1       0        0       0
flower        1       0        0       1
stock         0       1        1       0
exchange      0       1        0       0
has           0       1        1       0
risen         0       1        0       0
by            0       1        0       0
hundred       0       1        0       0
points        0       1        0       0
price         0       0        1       0
of            0       0        1       1
BST           0       0        1       0
has           0       0        1       0
fallen        0       0        1       0
down          0       0        1       0
There         0       0        0       1
were          0       0        0       1
lot           0       0        0       1
in            0       0        0       1
garden        0       0        0       1
