In [6]:
# Load Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [7]:
# Load Dataset
dataset = pd.read_excel("Dataset/Catalent BWI.xlsx")
dataset.head()

Unnamed: 0,S. No.,Spec #,Spec Name,Para,Sub Section Heading,Submittal Type,Submittal Description,Target Date,Subcontractor
0,44,24119,SELECTIVE DEMOLITION,1.10-A,WARRANTY,Warranty,"Existing Warranties : Remove, replace, patch, ...",,
1,45,24119,SELECTIVE DEMOLITION,1.10-B,WARRANTY,Warranty,Notify warrantor on completion of selective de...,,
2,36,24119,SELECTIVE DEMOLITION,1.5-A,PREINSTALLATION MEETINGS,Meetings,Predemolition Conference : Conduct conference ...,,
3,37,24119,SELECTIVE DEMOLITION,1.6-A,INFORMATIONAL SUBMITTALS,Measurements,"Proposed Protection Measures : Submit report, ...",,
4,38,24119,SELECTIVE DEMOLITION,1.6-B,INFORMATIONAL SUBMITTALS,Schedules,Schedule of Selective Demolition Activities : ...,,


In [8]:
# Column Name and Index of Column
column_index_map = dict(zip(list(dataset.columns), range(len(dataset.columns))))
pd.DataFrame.from_dict(column_index_map, orient = 'index')

Unnamed: 0,0
S. No.,0
Spec #,1
Spec Name,2
Para,3
Sub Section Heading,4
Submittal Type,5
Submittal Description,6
Target Date,7
Subcontractor,8


In [9]:
# Clean Dataset
# Remove Unwanted Features from Frame
#
dataset.drop(columns = dataset.columns[[0, 1, 2, 3, 4, 5, 7, 8]], 
            axis = 1,
            inplace = True)

In [10]:
dataset['Submittal Description'][0]

'Existing Warranties : Remove, replace, patch, and repair materials and surfaces cut or \ndamaged during selective demolition, by methods and with materials and using approved\ncontractors so as not to void existing warranties. Notify warrantor before proceeding. Existing\nwarranties include the following:\n1. TPO Roofing System'

#### Text Rank 

In [47]:
# Text Rank 
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer 
from sumy.summarizers.text_rank import TextRankSummarizer
import nltk
# nltk.download('punkt')

# Creating text parser using tokenization
parser = PlaintextParser.from_string(dataset['Submittal Description'][0], Tokenizer("english"))

# Summarize using sumy TextRank
summarizer = TextRankSummarizer()
summary =summarizer(parser.document, 1)

text_summary=""
for sentence in summary:
    text_summary+=str(sentence)

print(text_summary)

Existing Warranties : Remove, replace, patch, and repair materials and surfaces cut or damaged during selective demolition, by methods and with materials and using approved contractors so as not to void existing warranties.


#### Lex Rank

In [49]:
# Lex Rank
from sumy.summarizers.lex_rank import LexRankSummarizer
summarizer_lex = LexRankSummarizer()
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer 
import nltk
# nltk.download('punkt')

# Creating text parser using tokenization
parser = PlaintextParser.from_string(dataset['Submittal Description'][0], Tokenizer("english"))

# Summarize using sumy TextRank
summarizer = LexRankSummarizer()
summary =summarizer(parser.document, 1)

text_summary=""
for sentence in summary:
    text_summary+=str(sentence)

print(text_summary)


Existing Warranties : Remove, replace, patch, and repair materials and surfaces cut or damaged during selective demolition, by methods and with materials and using approved contractors so as not to void existing warranties.


#### LSA Summarizer

In [51]:
# LSA Summarizer
from sumy.summarizers.lsa import LsaSummarizer
summarizer_lex = LsaSummarizer()
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer 
import nltk
# nltk.download('punkt')

# Creating text parser using tokenization
parser = PlaintextParser.from_string(dataset['Submittal Description'][0], Tokenizer("english"))

# Summarize using sumy TextRank
summarizer = LsaSummarizer()
summary =summarizer(parser.document, 1)

text_summary=""
for sentence in summary:
    text_summary+=str(sentence)

print(text_summary)


Existing Warranties : Remove, replace, patch, and repair materials and surfaces cut or damaged during selective demolition, by methods and with materials and using approved contractors so as not to void existing warranties.


#### NLTK with Frequency Count

In [62]:
# NLTK with Frequency Count
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
# nltk.download('stopwords')

data = dataset['Submittal Description'][0]

stopwords = set(stopwords.words("english"))
words = word_tokenize(data)

freq_table = dict()
for word in words:
    word = word.lower()
    if word in stopwords:
        continue
    if word in freq_table:
        freq_table[word] += 1
    else:
        freq_table[word] = 1

sentences = sent_tokenize(data)
sentence_value = dict()

for sentence in sentences:
    for word, freq in freq_table.items():
        if word in sentence.lower():
            if sentence in sentence_value:
                sentence_value[sentence] += freq
            else:
                sentence_value[sentence] = freq

sum_value = 0
for sentence in sentence_value:
    sum_value += sentence_value[sentence]

average = int(sum_value / len(sentence_value))

summary = ""

for sentence in sentences:
    if(sentence in sentence_value) and (sentence_value[sentence] > (1.7 * average)):
        summary = summary + " " + sentence

print("Summary")
print(summary)

Summary
 Existing Warranties : Remove, replace, patch, and repair materials and surfaces cut or 
damaged during selective demolition, by methods and with materials and using approved
contractors so as not to void existing warranties.


#### NLTK Cosime Similarities and Glove Embedding

In [63]:
# Cosine Similarities and Glove Embedding

import pandas as pd
import numpy as np
import nltk
# nltk.download('punkt')
import re
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize

## Data
#
data = dataset['Submittal Description'][0]

## Sentence tokenization
#
sentences = sent_tokenize(data)

## Word representation
# 
word_embeddings = {}
f = open('glove.6B.100d.txt', encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    word_embeddings[word] = coefs
f.close()

clean_sentences = pd.Series(sentences).str.replace("[^a-zA-Z]", " ")
clean_sentences = [s.lower() for s in clean_sentences]
stop_words = stopwords.words('english')
def remove_stopwords(sen):
    sen_new = " ".join([i for i in sen if i not in stop_words])
    return sen_new
clean_sentences = [remove_stopwords(r.split()) for r in clean_sentences]

## Sentence vectors
#
sentence_vectors = []
for i in clean_sentences:
  if len(i) != 0:
    v = sum([word_embeddings.get(w, np.zeros((100,))) for w in i.split()])/(len(i.split())+0.001)
  else:
    v = np.zeros((100,))
  sentence_vectors.append(v)

## Find similarities between sentences
#
sim_mat = np.zeros([len(sentences), len(sentences)])
from sklearn.metrics.pairwise import cosine_similarity
for i in range(len(sentences)):
  for j in range(len(sentences)):
    if i != j:
      sim_mat[i][j] = cosine_similarity(sentence_vectors[i].reshape(1,100), sentence_vectors[j].reshape(1,100))[0,0]

## Convert matrix into graph
#
import networkx as nx
nx_graph = nx.from_numpy_array(sim_mat)
scores = nx.pagerank(nx_graph)

## Final
#
ranked_sentences = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)
print("ARTICLE:")
print(data)
print('\n')
print("SUMMARY:")
print(ranked_sentences[0][1])
print('\n')

  clean_sentences = pd.Series(sentences).str.replace("[^a-zA-Z]", " ")


ARTICLE:
Existing Warranties : Remove, replace, patch, and repair materials and surfaces cut or 
damaged during selective demolition, by methods and with materials and using approved
contractors so as not to void existing warranties. Notify warrantor before proceeding. Existing
warranties include the following:
1. TPO Roofing System


SUMMARY:
Existing Warranties : Remove, replace, patch, and repair materials and surfaces cut or 
damaged during selective demolition, by methods and with materials and using approved
contractors so as not to void existing warranties.




### Big Gun - 

#### Google's T5

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelWithLMHead

In [2]:
tokenizer = AutoTokenizer.from_pretrained('t5-base')
model = AutoModelWithLMHead.from_pretrained('t5-base', return_dict=True)



In [3]:
data = """Polishing Schedule : Submit plan showing polished concrete surfaces and schedule of polishing 
operations for each area of polished concrete before start of polishing operations. Include
locations of all joints, including construction joints.
"""

In [29]:
data = """Samples for Initial Selection : For each type of product requiring color selection.
"""

In [13]:
data = """Mix Designs : For each type of mortar and grout. Include description of type and proportions of 
ingredients.
"""

In [8]:
data = """
 Coordination Owners continuing occupancy portions existing building Owners partial occupancy completed Work"""

In [9]:
inputs = tokenizer.encode("summarize: " + data,
                          return_tensors='pt',
                          max_length=512,
                          truncation=True)

In [10]:
summary_ids = model.generate(inputs, max_length = 4, min_length = 2, length_penalty=5., num_beams=2)
summary = tokenizer.decode(summary_ids[0])


In [11]:
print(summary.replace('<pad>', ''))

 Work completed on


#### Spacy NLargest

In [34]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from heapq import nlargest

In [35]:
def summarize(text, per):
    nlp = spacy.load('en_core_web_sm')
    doc= nlp(text)
    tokens=[token.text for token in doc]
    word_frequencies={}
    for word in doc:
        if word.text.lower() not in list(STOP_WORDS):
            if word.text.lower() not in punctuation:
                if word.text not in word_frequencies.keys():
                    word_frequencies[word.text] = 1
                else:
                    word_frequencies[word.text] += 1
    max_frequency=max(word_frequencies.values())
    for word in word_frequencies.keys():
        word_frequencies[word]=word_frequencies[word]/max_frequency
    sentence_tokens= [sent for sent in doc.sents]
    sentence_scores = {}
    for sent in sentence_tokens:
        for word in sent:
            if word.text.lower() in word_frequencies.keys():
                if sent not in sentence_scores.keys():                            
                    sentence_scores[sent]=word_frequencies[word.text.lower()]
                else:
                    sentence_scores[sent]+=word_frequencies[word.text.lower()]
    select_length=int(len(sentence_tokens)*per)
    summary=nlargest(select_length, sentence_scores,key=sentence_scores.get)
    final_summary=[word.text for word in summary]
    summary=''.join(final_summary)
    return summary

In [50]:
data = """Samples for Initial Selection : For each type of product requiring color selection.
"""
summarize(data, 0.09)


''

#### Py Summarization

In [51]:
from pysummarization.nlpbase.auto_abstractor import AutoAbstractor
from pysummarization.tokenizabledoc.simple_tokenizer import SimpleTokenizer
from pysummarization.abstractabledoc.top_n_rank_abstractor import TopNRankAbstractor

In [54]:
document = "Samples for Initial Selection : For each type of product requiring color selection."

In [55]:
# Object of automatic summarization.
auto_abstractor = AutoAbstractor()
# Set tokenizer.
auto_abstractor.tokenizable_doc = SimpleTokenizer()
# Set delimiter for making a list of sentence.
auto_abstractor.delimiter_list = [".", "\n"]
# Object of abstracting and filtering document.
abstractable_doc = TopNRankAbstractor()
# Summarize document.
result_dict = auto_abstractor.summarize(document, abstractable_doc)

# Output result.
for sentence in result_dict["summarize_result"]:
    print(sentence)

Samples for Initial Selection : For each type of product requiring color selection.



#### Custom Deep Learning Based Text Summarization

In [57]:
from transformers import pipeline
summarization = pipeline("summarization")


No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 (https://huggingface.co/sshleifer/distilbart-cnn-12-6)
Downloading: 100%|██████████| 1.76k/1.76k [00:00<?, ?B/s]
Downloading: 100%|██████████| 1.14G/1.14G [01:47<00:00, 11.4MB/s]  
Downloading: 100%|██████████| 26.0/26.0 [00:00<00:00, 26.0kB/s]
Downloading: 100%|██████████| 878k/878k [00:01<00:00, 568kB/s]  
Downloading: 100%|██████████| 446k/446k [00:00<00:00, 457kB/s] 


In [58]:
document = "Samples for Initial Selection : For each type of product requiring color selection."

In [62]:
import re
from string import punctuation

clean_text = re.sub(f"[{re.escape(punctuation)}]", "", document)

In [65]:
clean_text = clean_text.lower()
clean_text

'samples for initial selection  for each type of product requiring color selection'

In [18]:
import spacy
import en_core_web_sm
nlp = en_core_web_sm.load()

document = """Notify warrantor on completion of selective demolition, and obtain documentation verifying
that existing system has been inspected and warranty remains in effect. Submit
documentation at Project closeout."""
doc = nlp(document)
doc.ents

(Notify, Project)

'samples initial selection   type product requiring color selection'

In [71]:
last = re.sub(' +', ' ', ' '.join(clean_token))

In [72]:
summary_text = summarization(last, max_length = 5, min_length = 3)[0]['summary_text']
print("Summary:", summary_text)

Summary:  samples initial


In [31]:
# pip install yake
import yake

document = """No"""

kw_extractor = yake.KeywordExtractor()
language = "en"
max_ngram_size = 3
deduplication_threshold = 0.9
numOfKeywords = 1
custom_kw_extractor = yake.KeywordExtractor(lan = language, 
                n = max_ngram_size, 
                dedupLim = deduplication_threshold, 
                top = numOfKeywords, 
                features = None)
keywords = custom_kw_extractor.extract_keywords(document)
# keywords[0][0]
# for kw in keywords:
    # print(kw[0])
keywords[0][0]

IndexError: list index out of range