In [214]:
import spacy
from spacy.lang.pt.stop_words import STOP_WORDS
from sklearn.feature_extraction.text import CountVectorizer
import en_core_web_sm

In [215]:
def summarize(text=None, filename=None, file_prepend='', tokenized=True, sentences=3, clean=False):
  nlp = en_core_web_sm.load()
  
  if filename != None:
    with open(file_prepend+filename, 'r', encoding='utf-8') as f:
      text = ' '.join(f.readlines())
  elif text == '':
    return ''
  
  doc = nlp(text)

  corpus = [sent.text.lower() for sent in doc.sents]
  cv = CountVectorizer(stop_words=list(STOP_WORDS))
  cv_fit = cv.fit_transform(corpus)

  word_list = cv.get_feature_names()
  count_list = cv_fit.toarray().sum(axis=0)

  word_frequency = dict(zip(word_list,count_list))

  val = sorted(word_frequency.values())
  higher_word_frequencies = [word for word, freq in word_frequency.items() if freq in val[-3:]]

  higher_frequency = val[-1]
  for word in word_frequency.keys():  
    word_frequency[word] = (word_frequency[word]/higher_frequency)
  
  sentence_rank={}
  for sent in doc.sents:
    for word in sent :       
      if word.text.lower() in word_frequency.keys():            
        if sent in sentence_rank.keys():
          sentence_rank[sent]+=word_frequency[word.text.lower()]
        else:
          sentence_rank[sent]=word_frequency[word.text.lower()]

  top_sentences = (sorted(sentence_rank.values())[::-1])
  top_sent = top_sentences[:sentences]

  summary = []
  for sent,strength in sentence_rank.items():  
    if strength in top_sent:
      a = str(sent)
      if clean:
        a = a.replace('\n','').replace('\t','')
      if a[-1] not in ['.','!','?']:
        a = a.strip() + '.'
      summary.append(a)
    else:
      continue
  
  if tokenized:
    return summary
  return ' '.join(summary)

In [216]:
from pptx import Presentation
import PyPDF2

import glob
import pandas as pd

In [217]:
pres_text = {}

for f in glob.glob('drive/My Drive/SD/*.pptx'):
  pres = Presentation(f)
  temp = []
  
  for slide in pres.slides:
    for shape in slide.shapes:
      if hasattr(shape, 'text'):
        a = str(shape.text)
        a = a.replace('\n','').replace('\t','')

        if a == '':
          continue
        temp.append(a)
  pres_text[f] = ' '.join(temp)

In [218]:
pdf_text = {}

for f in glob.glob('drive/My Drive/SD/*.pdf'):
  pdf = open(f, 'rb')
  reader = PyPDF2.PdfFileReader(pdf)

  temp = []
  for i in range(reader.numPages):
    a = str(reader.getPage(i).extractText())
    a = a.replace('\n','').replace('\t','')

    if a == '':
      continue
    temp.append(a)

  pdf_text[f] = ' '.join(temp)

In [219]:
from rake_nltk import Metric, Rake

r = Rake()

ranked_phrases = []
for t in list(pres_text.values())+list(pdf_text.values()):
  r.extract_keywords_from_text(t)
  a = r.get_ranked_phrases()
  ranked_phrases.append(' '.join(a))

In [220]:
lrk = list(pres_text.keys())+list(pdf_text.keys())
lrv = list(pres_text.values())+list(pdf_text.values())

df = pd.DataFrame({
    'filename': [l.split('drive/My Drive/SD/')[1] for l in lrk],
    'summary': [summarize(text=t, tokenized=False, sentences=5, clean=True) for t in lrv],
    'keywords': ranked_phrases,
    'text': lrv })

In [221]:
df

Unnamed: 0,filename,summary,keywords,text
0,XI_Media_Presentation_AI_AR.pptx,Who?XI Media LLCXI Media is active in developm...,computer engineering – software engineering pm...,FLAIRR Dr. Reza Aria XI Media LLC August...
1,Senior Design Pitch MAJ RG.pptx,Senior Design Pitch Joe Remesz-Guerrette About...,active duty army officerboard certified nuclea...,Senior Design Pitch Joe Remesz-Guerrette About...
2,Safeplan SD Presentation.pptx,Safeplan Teen Suicide Prevention and Safety Pl...,” features fall 2020complete initial interacti...,Safeplan Teen Suicide Prevention and Safety Pl...
3,"Galajda,Jacob-SeniorDesignPresentation.pptx",Feature arrays for each instrument are concate...,spacesegment barrier retrieval algorithm impro...,Project:Deep Mozart Jacob GalajdaFaculty Spon...
4,SeniorDesign_SwarmingDrone.pptx,Lockheed Martin Missiles and Fire Control Appl...,lockheed martin senior design swarm project © ...,Lockheed Martin Senior Design Swarm Project © ...
5,RealityFlowPitchF2020.pptx,"AR/VR/XRFounded Seebright in 2012, and have wo...",variables via visual scripting “ compile ” con...,Reality Flow Real-Time XR Content Prototyping ...
6,UCF Student Project Pitch Deck Fall 2020.pptx,Advanced Programs & Special Programs(J. Reynol...,reinforcement learning based ai open source to...,"Rebecca BroadwayProduct Management, Senior Sta..."
7,JML-project.pptx,has bugsBugs can lead to failuresDeathInjuryMo...,correct programs verifyincorrect programs give...,Improving Java Verification with OpenJML Gary ...
8,2020 Fall CS SEE.pptx,"US, Canada, Brazil, UK, Germany, France, Italy...",florida space institute computer science senio...,Florida Space InstituteComputer Science Senio...
9,Database presentation - Fall 2020.pptx,"Current database is old, not relationalUnable ...",ucf marine turtle research group four main pro...,UCF Marine Turtle research group Photos: UC...


In [222]:
df['filename'] = [f.replace('.pptx','').replace('.pdf','') for f in df.loc[:,'filename']]

In [223]:
df.to_json('foo.json')