In [1]:
import pdfplumber # Go to https://github.com/jsvine/pdfplumber for installation instructions and explanation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
import os

In [3]:
# For making a reresentation of a symbol
def create_symbol_sig(fp,idx, n_parts):
    ppoint = find_distances(fp.curves[idx:idx+n_parts])
    full_dict = {}
    full_dict['distances'] = ppoint
    full_dict['total_points'] = len(ppoint)
    return full_dict

In [5]:
#takes in two sets of coordinates and returns a floating point value
def euc_dist(a, b):    
    d = np.sqrt((a[0] - b[0])**2 + (a[1] - b[1])**2)
    return float(d)

# find the distance from the first point to all other points in the curve set and save them
# accepts a list of coord tuples and returns a list of floating point values 
def find_distances(c_list):
    d_list = []
    base = c_list[0]['points'][0]
    for c in c_list:
        points = c['points']
        for i in range(0, len(points)):
            d_list.append(euc_dist(base, points[i]))
        
    return d_list

# matches the template to the tested curves by using abs. differences
# l1 will be the template and l2 the tested
# accepts two lists and returns a list of values
def compare_distances(l1, l2):
    assert len(l1) == len(l2)
    diff = []
    for i in range(0, len(l1)):
        diff.append(np.abs(l2[i] - l1[i]))
    
    return diff

In [6]:
# functions for symbol matching

def match_symbols(file, sym, idx):
    # compare distances to get a difference val
    # l1 and l2 are lists of equal size
    l1 = sym['distances']
    l2 = find_distances(file.curves[idx : idx + sym['components']]) 
    diff = compare_distances(l1, l2)
    # discriminate base on that val 
    return diff

# find the first point for those that match the symbol
def get_matches(file, sym, tol = 1):
    matches = {}
    for i in range(0, len(file.curves) - sym['components']):
        try:
            scores = match_symbols(file, sym, i)
        except:
            print("out of range error for idx {}".format(i))
            print(len(file.curves[i]['pts']))
            
        # add the idx val if it is a match and skip, otherwise do next 
        if len(scores) > 0 and sum(scores) < tol:
            matches[i] = scores
            
    return matches

In [8]:
def get_max(symbols):
    mx = 0
    for s in symbols:
        for n in symbols[s]:
            if symbols[s][n]['components'] > mx:
                mx = symbols[s][n]['components']
    return mx

In [81]:
# recieve an index val and the pdf file
# return the mean x,y vals for the given curve
def get_xy(key, file):
    x = float(file.curves[key]["x1"] + file.curves[key]["x0"])/2
    y = float(file.curves[key]["y1"] + file.curves[key]["y0"])/2
    
    return x, y

# recieive a dictionary of the starting keys and the distance list for each match 
# output a table of schema <id, type, x, y, distances>
def get_outputs(matches, a_type, file):
    table = pd.DataFrame(columns = ["id", "type", "x_coord", "y_coord", "total_diff", "mean_diff", "max_diff"])
    
    for m in matches:
        x, y = get_xy(m, file)
        table = table.append({"id": m, "type": a_type, "x_coord": x, "y_coord": y, "total_diff": sum(matches[m]),
                              "mean_diff": np.mean(matches[m]), "max_diff": np.max(matches[m])}, ignore_index = True)
    return table

In [110]:
# Loading in the outlets file
with open("../symbols/symbols.json", "r") as infile:
    symbols = json.load(infile)


In [127]:
print(fps)

['A1492010025.pdf', 'A1492010029.pdf', 'DataSet', 'DC2782010014.pdf', 'E1102007016.pdf', 'E1102007020.pdf', 'E1422010011.pdf', 'E1422010014.pdf', 'E1602007002.pdf', 'E1662010012.pdf', 'E1662010013.pdf', 'E1712007005.pdf', 'E1712007007.pdf', 'E1712007010.pdf', 'E1822010019.pdf', 'E1822010022.pdf', 'E1892007005.pdf', 'E1892007007.pdf', 'full fp.pdf', 'PNG_Dataset', 'simple fp.pdf']


In [287]:
#loading files
path = r'../floor_plans/'
fps = os.listdir(path)
x = 16
f = fps[x]
file_1 = pdfplumber.open(path +f).pages[0]
    
# setting up the visualisation tool
im = file_1.to_image(resolution = 144)

In [289]:
tol = 10
tot_diffs = {}

for c in range(0, len(file_1.curves)): #len(file_1.curves)
    # get the total points for 1 to n sequential
    diffs = {}

    n_points = [len(file_1.curves[c]['pts'])]
    try:
        for i in range(1, get_max(symbols)):
            n_points.append(n_points[i-1] + len(file_1.curves[c + i]['pts']))

        for types in symbols:
            for num in symbols[types]:
                #check if the number of points for the components is matching
                if n_points[symbols[types][num]['components']-1] == symbols[types][num]['total_points']:
                    #run matching algorithm
                    diffs[str(types + "_" + num)] = match_symbols(file = file_1, sym = symbols[types][num], idx = c)
    
    except:
        print("error on index {}".format(c))
           
    if len(diffs) > 0:
        mn = sum(diffs[list(diffs.keys())[0]])
        key = list(diffs.keys())[0]
        
        for i in range(1, len(diffs)):
            if sum(diffs[list(diffs.keys())[i]]) < mn:
                mn = sum(diffs[list(diffs.keys())[i]])
                key = list(diffs.keys())[i]
        
        if mn < tol:
            tot_diffs[c] = diffs[key]

print(len(tot_diffs))

error on index 91130
61


In [None]:
# visualising on the floorplan
im.reset()

for r in tot_diffs:
    if sum(tot_diffs[r]) / len(tot_diffs[r]) < 0.5:
        im.draw_circle(file_1.curves[r], stroke='#66FF66', radius = 10)
im

In [None]:
tbl = get_outputs(tot_diffs, "outlet", file_1)
tbl.head(5)

In [None]:
#exporting table and saving in the outputs folder
tbl.loc[tbl['mean_diff'] < 0.5].to_csv(r"../outputs/{}_assets.csv".format(x))