### Load Libraries

In [1]:
import pdfplumber
import itertools
import json
import re
import spacy
from os import path
import csv
import pickle
import pandas as pd
import numpy as np

### Load Sample

In [2]:
# Hard-coded sample of one specification section. It is the only input to the
# cells below: the section header is read from the text before "PART 1" and the
# body from the last "PART 1" onward.
# NOTE(review): presumably this stands in for text extracted from a PDF
# (pdfplumber is imported but not used in the visible cells) - confirm.
pre_data = """
SECTION 200553
TAGGING AND IDENTIFICATION

PART 1 - GENERAL
 1.4. SUBMITTALS 
Submit the following items for Owner approval: 
1. Product Data: Manufacturer's catalog cut sheets and other published technical data for each of the 
following: 
a. Nameplates, instructions plates, signs and labels. 
b. Fasteners. 
2. Samples: Provide samples of each color, lettering style, and other graphic representation required 
for identification materials. Provide samples of labels and signs. No material is to be ordered 
without this approval. 
3. Provide a listing of proposed names, abbreviations and other designations used in identification. 
Provide an electronic copy of the schedule of proposed tags, nameplates and engraving for Owner 
approval. No material is to be ordered without this approval. 
4. Provide a final and complete, electronic listing of all applied tags, nameplates and engravings. 
5. Provide a Hand Valve schedule as an electronic version in Microsoft Excel. Mark valves which are 
intended for emergency shut-off and similar special uses, by special flags, in margin of schedule. 
Furnish additional copies of schedule for Maintenance Manuals. Valve Schedule shall include the 
following information: 
a. Piping system 
b. System abbreviation (“CW”, “CHWS”, “RO”, “WFI”, etc.) 
c. Valve identification number 
d. Location of valve (room or space) 
e. Type of valve (gate, ball, automatic control, etc.)
f. Manufacturer and manufacturer's catalog number 
g. Indicate valves intended for emergency shutoff or other special use

1.5 NEXT SAMPLE

PART 2 - PRODUCT DATA
PART 3 - EXECUTION
 END OF SECTION
"""


### Preprocessing for Section

In [3]:
# Collapse every run of spaces into a single space; newlines are left untouched.
pre_data = re.compile(' +').sub(' ', pre_data)

In [4]:
# Locate the section header: it runs from the first "SECTION"/"DOCUMENT"
# keyword up to the first "PART 1" marker.
section_marker = re.search(r'SECTION|DOCUMENT', pre_data)
if section_marker is None:
    # Previously this surfaced as an opaque AttributeError on `None.start()`.
    raise ValueError("No 'SECTION' or 'DOCUMENT' keyword found in pre_data")
if "PART 1" not in pre_data:
    raise ValueError("No 'PART 1' marker found in pre_data")

start_index = section_marker.start()
# NOTE: despite its name, `end_index` is the offset of the LAST "PART 1"
# occurrence; the body-extraction cell slices pre_data from this offset onward.
end_index = pre_data.rindex("PART 1")
# Offset of the FIRST "PART 1": the header text ends here.
first_part_index = pre_data.index("PART 1")

section_details = pre_data[start_index:first_part_index]
section_details

'SECTION 200553\nTAGGING AND IDENTIFICATION\n\n'

In [5]:
# Non-empty, whitespace-trimmed lines of the section header. Body lines that
# start with any of these are dropped when the body text is cleaned up.
section_details_to_skip = [
    trimmed
    for trimmed in map(str.strip, section_details.split("\n"))
    if trimmed
]

In [6]:
# Flatten the header onto one line: newlines become spaces, then runs of
# spaces are collapsed to a single space.
section_details = section_details.replace("\n", " ")
section_details = re.sub(' +', ' ', section_details)
section_details

'SECTION 200553 TAGGING AND IDENTIFICATION '

In [7]:
# Load the custom spaCy NER pipeline from disk and run it on the one-line header.
nlp2 = spacy.load("Spacy Custom NER Dump/")
section_data = nlp2(section_details)

# Section name: text of the entity labelled 'section_name'. If several
# entities carry that label, the last one wins.
spec_name = ""
flag1 = False
for sent in section_data.ents:
    if sent.label_ == 'section_name':
        spec_name = str(sent)
        flag1 = True

if not flag1:
    spec_name = "Not Found"

# Section number: `spec_number` used to be initialised to "" and never assigned
# again, so the SECTION column was always empty. Take the digit groups that
# follow the SECTION/DOCUMENT keyword (e.g. "200553", "27 05 29", "01 33 00.13").
# It stays "" when nothing matches, as before.
spec_number = ""
flag2 = False
number_match = re.search(
    r'\b(?:SECTION|DOCUMENT)\s+(\d+(?:[ .\-]\d+)*)\b', section_details
)
if number_match:
    spec_number = number_match.group(1)
    flag2 = True

print("Section Name - {}".format(spec_name))

Section Name - TAGGING AND IDENTIFICATION


### Data Preprocessing For Mapping

In [88]:
# Keep only the text from the last occurrence of "PART 1" onward
# (`end_index` holds that offset).
data = pre_data[end_index:len(pre_data)]

In [89]:
# Drop end-of-section/document markers, blank lines and lines that repeat the
# section header, then rebuild the survivors as one string in which every kept
# line is trimmed and terminated by a newline.
final_data = ""
head_flag = True  # not read by any visible cell
final_data = "".join(
    text + "\n"
    for text in (raw.strip() for raw in data.splitlines())
    if text
    and "END OF SECTION" not in text
    and "END OF DOCUMENT" not in text
    and not any(text.startswith(skipped) for skipped in section_details_to_skip)
)

In [90]:
# Regroup the cleaned text into outline entries (`final_lines`): every line that
# opens with an outline pointer starts a new entry and wrapped text is folded
# back into the entry it belongs to.
_pointer_patterns = (
    r"^[0-9]\.[0-9]",   # "1.4"
    r"^[A-Za-z]\.",     # "A." / "a."
    r"^[0-9]+\.",       # "12."
    r"^[0-9]+\)",       # "12)"
    r"^[a-z]+\)",       # "a)"
)

final_lines = []
for raw_line in final_data.splitlines():
    line = raw_line.strip()

    # PART headings and pointer-prefixed lines always open a new entry.
    if line.startswith("PART") or any(re.search(pattern, line) for pattern in _pointer_patterns):
        final_lines.append(line)
        continue

    previous = final_lines[-1].strip()
    if previous.startswith("PART"):
        # Free text directly under a PART heading is discarded.
        continue

    if re.search(r"^[0-9]+\.[0-9]", previous) and not line.isupper():
        # First mixed-case free-text line after an "N.N" heading becomes its own entry.
        final_lines.append(line)
    else:
        # Everything else (including all-caps lines) continues the previous entry.
        final_lines[-1] = final_lines[-1] + " " + line

# Position of the last entry (-1 when there is none); this is the value the
# running counter held at the end of the loop.
index = len(final_lines) - 1
            

In [8]:
# Collect headings: every "PART..." entry and every entry starting with "N.N".
# Entries starting with a capital letter and a dot ("A.") are also collected,
# but only until the first PART / "N.N" heading has been seen.
heading = []
flag = True
for entry in final_lines:
    text = entry.strip()
    if text.startswith("PART") or re.search(r"^[0-9]+\.[0-9]", text):
        heading.append(entry)
        flag = False
        continue
    if flag and re.search(r"^[A-Z]\.", text):
        heading.append(entry)

print(heading)

NameError: name 'final_lines' is not defined

In [92]:
# Heading pairs: walk consecutive [heading, next heading] pairs and build
# `heading_list`, which mixes two kinds of items:
#   * str  - a PART marker
#   * list - [submittal heading, heading that follows it]
res = list(map(list, zip(heading, heading[1:])))
index_data = []  # not read by any visible cell
heading_list = []

if res:
    # The first part is recorded under this fixed label rather than the text
    # found in the document. Nothing is added when there are no pairs at all.
    heading_list.append("PART 1 - GENERAL")

# The loop variable used to be called `data`, which overwrote the `data` string
# holding the section body and made the earlier cells impossible to re-run.
for pair in res:
    current_heading, next_heading = pair
    if "SUBMITTAL" in current_heading:
        heading_list.append(pair)
    # Only a real PART heading opens a new part. The previous substring test
    # ("PART" in ...) also fired for subsection titles such as "2.3 PART NUMBERS".
    if next_heading.strip().startswith("PART"):
        heading_list.append(next_heading)

heading_list

['PART 1 - GENERAL',
 ['1.4. SUBMITTALS', '1.5 NEXT SAMPLE'],
 'PART 2 - PRODUCT DATA',
 'PART 3 - EXECUTION']

In [93]:
# Submittal line ranges: turn every [submittal heading, next heading] pair into
# a (start, stop) index pair into `final_lines`; PART markers pass through as
# plain strings so the part context is preserved in order.
data_lines = []

# `heading_list` is in document order, so each lookup resumes where the previous
# range ended. A plain `final_lines.index(text)` always returns the FIRST
# occurrence, which yields wrong (or reversed) ranges when the same heading text
# appears more than once.
search_from = 0
for item in heading_list:
    if isinstance(item, str):
        # Strings are PART markers. They are tested separately from the pairs
        # because `in` means "substring" for a str but "element" for a list.
        if "PART " in item:
            data_lines.append(item)
        continue

    first_heading, following_heading = item[0], item[1]
    if "SUBMITTAL" in first_heading:
        x = final_lines.index(first_heading, search_from)
        y = final_lines.index(following_heading, x + 1)
        data_lines.append((x, y))
        search_from = y

# An empty `heading_list` simply leaves `data_lines` empty; `final_lines` is no
# longer wiped in that case, so the cell can be re-run safely.
print(data_lines)

['PART 1 - GENERAL', (1, 17), 'PART 2 - PRODUCT DATA', 'PART 3 - EXECUTION']


In [94]:
# Submittal records for mapping: PART markers are copied as they are; every
# (start, stop) pair is expanded into the `final_lines` entries it covers
# (stop excluded).
dataset = []
for marker in data_lines:
    if "PART" not in marker:
        first, stop = marker[0], marker[1]
        dataset.extend(final_lines[first:stop])
    else:
        dataset.append(marker)

print(dataset)

['PART 1 - GENERAL', '1.4. SUBMITTALS', 'Submit the following items for Owner approval:', "1. Product Data: Manufacturer's catalog cut sheets and other published technical data for each of the following:", 'a. Nameplates, instructions plates, signs and labels.', 'b. Fasteners.', '2. Samples: Provide samples of each color, lettering style, and other graphic representation required for identification materials. Provide samples of labels and signs. No material is to be ordered without this approval.', '3. Provide a listing of proposed names, abbreviations and other designations used in identification. Provide an electronic copy of the schedule of proposed tags, nameplates and engraving for Owner approval. No material is to be ordered without this approval.', '4. Provide a final and complete, electronic listing of all applied tags, nameplates and engravings.', '5. Provide a Hand Valve schedule as an electronic version in Microsoft Excel. Mark valves which are intended for emergency shut-of

In [103]:
# Map the selected records to a DataFrame with one row per description.
#   * "PART ..." lines set the current part.
#   * "N.N ..." lines set the current subsection number and name.
#   * "A." style lines start a new row.
#   * Any other line continues the latest row, except the first such line after
#     a subsection heading, which starts the subsection's own row.
# Rows are collected in a plain list and the frame is built once at the end
# instead of growing it row by row with `.loc`.
mapp_columns = ["SECTION", "SECTION NAME", "PART", "SUBSECTION", "SUBSECTION NAME", "DESCRIPTION"]
mapp_rows = []  # one [SECTION, ..., DESCRIPTION] list per output row
subsection_flag = False
subsection = "Not Found"
subsection1 = "Not Found"
subsection_name = "Not Found"
part_name = "Not Found"
heading_flag = False
# `index` is kept as the loop variable because later cells may read the
# leftover global.
for index, line in enumerate(dataset):
    text = line.strip()
    if text.startswith('PART'):
        part_name = text
    elif re.search(r"^[0-9]+\.[0-9]+", text):
        tokens = line.split()
        subsection = tokens[0]
        subsection_name = " ".join(tokens[1:])
        subsection_flag = True
        # Fix: forget the lettered item of the previous subsection. Without this
        # reset the intro text of a new subsection was appended to the last
        # "A." row of the previous one.
        heading_flag = False
    elif re.search(r"^[A-Z]\.", text):
        heading_flag = True
        subsection1 = line.split()[0]
        mapp_rows.append([spec_number, spec_name, part_name, subsection + subsection1, subsection_name, text])
    elif mapp_rows and (heading_flag or not subsection_flag):
        # Continuation text: extend the DESCRIPTION of the latest row.
        mapp_rows[-1][-1] = mapp_rows[-1][-1] + " " + text
    else:
        # First free-text line of a subsection (or text with no row to extend
        # yet): start a row under the current context.
        mapp_rows.append([spec_number, spec_name, part_name, subsection, subsection_name, text])
        subsection_flag = False

mapp_dataset = pd.DataFrame(mapp_rows, columns=mapp_columns, dtype=str)

mapp_dataset

Unnamed: 0,SECTION,SECTION NAME,PART,SUBSECTION,SUBSECTION NAME,DESCRIPTION
0,200553,TAGGING AND IDENTIFICATION,PART 1 - GENERAL,1.4.,SUBMITTALS,Submit the following items for Owner approval:...


In [102]:
# mapp_dataset.loc[len(mapp_dataset) - 1, "DESCRIPTION"]

"Submit the following items for Owner approval: 1. Product Data: Manufacturer's catalog cut sheets and other published technical data for each of the following: a. Nameplates, instructions plates, signs and labels. b. Fasteners. 2. Samples: Provide samples of each color, lettering style, and other graphic representation required for identification materials. Provide samples of labels and signs. No material is to be ordered without this approval. 3. Provide a listing of proposed names, abbreviations and other designations used in identification. Provide an electronic copy of the schedule of proposed tags, nameplates and engraving for Owner approval. No material is to be ordered without this approval. 4. Provide a final and complete, electronic listing of all applied tags, nameplates and engravings. 5. Provide a Hand Valve schedule as an electronic version in Microsoft Excel. Mark valves which are intended for emergency shut-off and similar special uses, by special flags, in margin of sc

In [51]:
# Handle leading zeros of the section number: store the value as ="<value>".
# NOTE(review): presumably so that spreadsheet tools opening the CSV keep the
# value as text - confirm with the consumers of the file.
def _wrap_section(value):
    """Return value formatted as ="value"; already wrapped values are returned unchanged."""
    text = str(value)
    if len(text) >= 3 and text.startswith('="') and text.endswith('"'):
        # Already wrapped: re-running this cell must not wrap it a second time.
        return text
    return '="{}"'.format(text)


mapp_dataset["SECTION"] = mapp_dataset["SECTION"].map(_wrap_section)

In [52]:
# Generate the CSV, or append to it when the file already exists.
big_spec_name = "XOXOXO"
big_spec_name = big_spec_name + ".csv"
file_status = path.exists(big_spec_name)

if file_status:
    # Read everything as text so existing values are written back unchanged.
    existing_rows = pd.read_csv(big_spec_name, dtype=str)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # replacement. The old call also passed a stray loop variable as
    # `ignore_index`. Separate names are used here so the `dataset` list built
    # by the earlier cell is not overwritten with a DataFrame.
    combined_rows = pd.concat([existing_rows, mapp_dataset], ignore_index=True)
    combined_rows.to_csv(big_spec_name, index=False)
else:
    mapp_dataset.to_csv(big_spec_name, index=False)

In [98]:
## Load the saved vectorizer, label encoder and classifier
#
# SECURITY NOTE: pickle.load can execute arbitrary code contained in the file;
# only load pickles that come from a trusted source.
# The files are opened read-only ('rb'): nothing is written to them, and the
# previous 'rb+' mode fails on read-only files.
with open("ML Model/vectorizer.pickle", 'rb') as file:
    vectorizer_saved = pickle.load(file)

with open("ML Model/label_encoder.pickle", 'rb') as file:
    encorder_saved = pickle.load(file)

with open("ML Model/type_classifier.pickle", 'rb') as file:
    classifier_saved = pickle.load(file)

In [100]:
# Smoke test of the loaded objects: vectorize the single string "NA" and run
# the classifier on the result; the prediction is shown as the cell output.
da = vectorizer_saved.transform(["NA"])
classifier_saved.predict(da)

array([4])

In [18]:
## Load the prepared data
#
# Every column is read as text (dtype=str), matching how the export cell reads
# its CSV. Without it pandas infers numeric types and a section number such as
# "012345" would lose its leading zero.
new_dataset = pd.read_csv("YYY.csv", dtype=str)
new_dataset.head()

Unnamed: 0,SECTION,SECTION_NAME,PART,SUB SECTION,SUB SECTION HEADING,DECRIPTION
0,27 05 29,HANGERS AND SUPPORTS FOR COMMUNICATIONS SYSTEMS,PART 1 - GENERAL,1.03 A,SUBMITTALS,A. Refer to Section 27 05 00 for requirements ...
1,27 05 29,HANGERS AND SUPPORTS FOR COMMUNICATIONS SYSTEMS,PART 2 - PRODUCTS,2.02 A,STRUCTURAL SUPPORT SYSTEMS SUBMITTALS,A. Slotted strut supports \n1. Acceptable manu...


In [19]:
# Classify every description, attach the decoded label as a TYPE column placed
# just before the description, and write the result to a new CSV.
description_vector = vectorizer_saved.transform(new_dataset['DECRIPTION'])
predictions = classifier_saved.predict(description_vector)
new_dataset = new_dataset.assign(TYPE=encorder_saved.inverse_transform(predictions))[
    ['SECTION', 'SECTION_NAME', 'PART', 'SUB SECTION', 'SUB SECTION HEADING', 'TYPE', 'DECRIPTION']
]
new_dataset.to_csv("YYY_Updated.csv", index = False)

In [2]:
# !jupyter nbconvert --to script "submittal_extraction_v9.ipynb"

[NbConvertApp] Converting notebook submittal_extraction_v8.ipynb to script
[NbConvertApp] Writing 10120 bytes to submittal_extraction_v8.py


In [43]:
# !jupyter nbconvert --to PDFviaHTML "submittal_extraction_v9.ipynb"


[NbConvertApp] Converting notebook submittal_extraction_v9.ipynb to PDFviaHTML
[NbConvertApp] Writing 189718 bytes to submittal_extraction_v9.pdf


In [104]:
# Placeholder cell: nothing is executed here.
if True:
    pass