In [30]:
# Load Libraries
import re
import string
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import yake
from gensim.parsing.preprocessing import remove_stopwords
from transformers import AutoTokenizer, AutoModelWithLMHead, AutoModelForSeq2SeqLM

warnings.filterwarnings("ignore")

In [2]:
# Read the dataset CSV (described by the original note as prepared by the
# production model) and preview its first five rows.
dataset = pd.read_csv(filepath_or_buffer="Dataset/Splitted_Catalent_Dataset.csv")
dataset.head(n=5)

Unnamed: 0,Spec Number,Spec Name,Para,Sub Section Heading,Submittal Type,Submittal Description
0,24119,SELECTIVE DEMOLITION,1.10-A-1,WARRANTY,Warranty,1. TPO Roofing System
1,24119,SELECTIVE DEMOLITION,1.10-B,WARRANTY,Warranty,Notify warrantor on completion of selective de...
2,24119,SELECTIVE DEMOLITION,1.5-A-1,PREINSTALLATION MEETINGS,Meetings,1. Inspect and discuss condition of constructi...
3,24119,SELECTIVE DEMOLITION,1.5-A-2,PREINSTALLATION MEETINGS,Meetings,2. Review structural load limitations of exist...
4,24119,SELECTIVE DEMOLITION,1.5-A-3,PREINSTALLATION MEETINGS,Meetings,3. Review and finalize selective demolition sc...


In [16]:
# By Google's T5 (base checkpoint): tokenizer plus the model used by summary_predictor.
# AutoModelWithLMHead is deprecated in transformers; T5 is an encoder-decoder
# model, so it is loaded through the dedicated seq2seq auto class instead.
tokenizer = AutoTokenizer.from_pretrained('t5-base')
model = AutoModelForSeq2SeqLM.from_pretrained('t5-base', return_dict=True)

def summary_predictor(row):
    """Produce a very short T5 summary of one row's submittal description.

    The text is cleaned (stop words, digits and punctuation removed), prefixed
    with the ``summarize: `` task tag and passed to the module-level
    ``tokenizer`` / ``model`` pair.

    Parameters
    ----------
    row : pandas.Series or sequence
        One dataset row. For a Series the text is read from the
        ``"Submittal Description"`` label when it exists, otherwise from
        position 5; plain sequences are indexed with ``row[5]``.

    Returns
    -------
    str
        The decoded summary with special tokens and surrounding whitespace
        removed, or ``""`` when the description is missing (None/NaN).
    """
    # Look the column up by label: an integer key on a string-labelled Series
    # relies on a positional fallback that pandas has deprecated.
    if isinstance(row, pd.Series):
        if "Submittal Description" in row.index:
            description = row["Submittal Description"]
        else:
            description = row.iloc[5]
    else:
        description = row[5]

    # Missing cells arrive as None/NaN; the text cleaning below needs a string.
    if not isinstance(description, str):
        if description is None or pd.isna(description):
            return ""
        description = str(description)

    para = remove_stopwords(description)
    para = re.sub(r'[0-9]', '', para)
    para = para.translate(str.maketrans('', '', string.punctuation))

    inputs = tokenizer.encode("summarize: " + para,
                              return_tensors='pt',
                              max_length=512,
                              truncation=True)

    summary_ids = model.generate(inputs, max_length=4, min_length=2, length_penalty=5., num_beams=2)

    # skip_special_tokens drops every special token (not only "<pad>"), so no
    # manual string replacement is needed afterwards.
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    return summary.strip()


# Run the T5 summariser once per row (axis="columns" hands each row to the
# function as a Series) and store the returned text in a new column.
dataset['Submittal Summary'] = dataset.apply(summary_predictor, axis="columns", result_type="expand")

In [17]:
# Preview the first five rows, now including the 'Submittal Summary' column.
dataset.head(n=5)

Unnamed: 0,Spec Number,Spec Name,Para,Sub Section Heading,Submittal Type,Submittal Description,Submittal Summary
0,24119,SELECTIVE DEMOLITION,1.10-A-1,WARRANTY,Warranty,1. TPO Roofing System,TPO
1,24119,SELECTIVE DEMOLITION,1.10-B,WARRANTY,Warranty,Notify warrantor on completion of selective de...,warrantor completion
2,24119,SELECTIVE DEMOLITION,1.5-A-1,PREINSTALLATION MEETINGS,Meetings,1. Inspect and discuss condition of constructi...,Inspect discuss
3,24119,SELECTIVE DEMOLITION,1.5-A-2,PREINSTALLATION MEETINGS,Meetings,2. Review structural load limitations of exist...,Review structural load
4,24119,SELECTIVE DEMOLITION,1.5-A-3,PREINSTALLATION MEETINGS,Meetings,3. Review and finalize selective demolition sc...,Finalize selective


In [19]:
# Write the T5-base results back to CSV; index=False keeps the row index
# out of the file.
dataset.to_csv(path_or_buf="Google_Catalent.csv", index=False)

In [32]:
# Read the dataset CSV again (described by the original note as prepared by
# the production model); this rebinds `dataset` to a fresh frame from disk.
dataset = pd.read_csv(filepath_or_buffer="Dataset/Splitted_Catalent_Dataset.csv")
dataset.head(n=5)

Unnamed: 0,Spec Number,Spec Name,Para,Sub Section Heading,Submittal Type,Submittal Description
0,24119,SELECTIVE DEMOLITION,1.10-A-1,WARRANTY,Warranty,1. TPO Roofing System
1,24119,SELECTIVE DEMOLITION,1.10-B,WARRANTY,Warranty,Notify warrantor on completion of selective de...
2,24119,SELECTIVE DEMOLITION,1.5-A-1,PREINSTALLATION MEETINGS,Meetings,1. Inspect and discuss condition of constructi...
3,24119,SELECTIVE DEMOLITION,1.5-A-2,PREINSTALLATION MEETINGS,Meetings,2. Review structural load limitations of exist...
4,24119,SELECTIVE DEMOLITION,1.5-A-3,PREINSTALLATION MEETINGS,Meetings,3. Review and finalize selective demolition sc...


In [34]:
# Yake
# Extractor with library defaults (not referenced by the cells visible here).
kw_extractor = yake.KeywordExtractor()

# Settings for the customised extractor: language, longest n-gram,
# de-duplication threshold and how many keywords to return per text.
language, max_ngram_size = "en", 3
deduplication_threshold, numOfKeywords = 0.1, 1

custom_kw_extractor = yake.KeywordExtractor(
    lan=language,
    n=max_ngram_size,
    dedupLim=deduplication_threshold,
    top=numOfKeywords,
    features=None,
)

def summary_predictor(row):
    """Return the top YAKE keyword for one row's submittal description.

    Uses the module-level ``custom_kw_extractor``.

    Parameters
    ----------
    row : pandas.Series or sequence
        One dataset row. For a Series the text is read from the
        ``"Submittal Description"`` label when it exists, otherwise from
        position 5; plain sequences are indexed with ``row[5]``.

    Returns
    -------
    str
        The first extracted keyword, or ``"Not Found"`` when the description
        is missing, blank, not a string, or yields no keywords.
    """
    # Look the column up by label: an integer key on a string-labelled Series
    # relies on a positional fallback that pandas has deprecated.
    if isinstance(row, pd.Series):
        if "Submittal Description" in row.index:
            text = row["Submittal Description"]
        else:
            text = row.iloc[5]
    else:
        text = row[5]

    # NaN/None cells and blank strings cannot produce a keyword.
    if not isinstance(text, str) or not text.strip():
        return "Not Found"

    keywords = custom_kw_extractor.extract_keywords(text)

    # The original fallback string was evaluated but never returned, so rows
    # without keywords silently became None; return the sentinel explicitly.
    if not keywords:
        return "Not Found"
    return keywords[0][0]


# Run the YAKE keyword extractor once per row (axis="columns" hands each row
# to the function as a Series) and store the result in a new column.
dataset['Submittal Summary'] = dataset.apply(summary_predictor, axis="columns", result_type="expand")


In [35]:
# Write the YAKE results to CSV. index=False matches the T5 export and avoids
# writing the row index as an extra unnamed column.
dataset.to_csv("Yake_Catalent.csv", index=False)