In [1]:
import torch
from huggingface_hub import snapshot_download
from transformers import AutoModelForCausalLM, AutoTokenizer
import csv
import pandas as pd
import re
import enchant

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
torch.cuda.is_available()

True

In [3]:
# First run model_dl.py
model_path = "./phi-2"

In [4]:
# Load the tokenizer from the local checkpoint directory.
# NOTE(review): trust_remote_code=True allows custom code shipped with the
# checkpoint to run — only use it with a checkpoint you trust.
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# Needs 12GB of vRAM to run in float32 (default).
# Run this line to load in float16 instead. float16 stores each weight in half
# the bytes, so you need roughly half as much vRAM (about 6GB — TODO confirm).
# NOTE(review): this changes torch's default floating-point dtype for the whole
# session, not only for this model.
torch.set_default_dtype(torch.float16)

# device_map="auto" lets the library decide where the weights are placed.
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", trust_remote_code=True)

Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:01<00:00,  2.60it/s]


In [5]:
model.device

device(type='cuda', index=0)

In [6]:
import time
def generate(prompt: str, generation_params: dict = None) -> str:
    """Generate a completion for ``prompt`` and print throughput statistics.

    Relies on the notebook-level ``tokenizer`` and ``model`` objects.

    Args:
        prompt: Text to tokenize and feed to the model.
        generation_params: Keyword arguments forwarded to ``model.generate``.
            When omitted, ``{"max_length": 200}`` is used.

    Returns:
        The first decoded sequence returned by ``tokenizer.batch_decode``.
        The token accounting below treats the generated sequence as
        prompt tokens followed by newly generated tokens.
    """
    # Build the default per call instead of sharing one mutable dict
    # across every invocation.
    if generation_params is None:
        generation_params = {"max_length": 200}

    # perf_counter is monotonic, so the measured interval cannot go
    # backwards if the system clock is adjusted mid-generation.
    start = time.perf_counter()

    # Place the inputs on whatever device the model reports rather than
    # assuming CUDA: the model is loaded with device_map="auto".
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, **generation_params)
    completion = tokenizer.batch_decode(outputs)[0]

    elapsed = time.perf_counter() - start

    num_input_tokens = inputs['input_ids'].shape[1]
    num_total_tokens = outputs.shape[1]
    num_output_tokens = num_total_tokens - num_input_tokens
    # Guard against a zero-length interval on coarse timers.
    speed = num_output_tokens / elapsed if elapsed > 0 else float("nan")

    print(f"Took {round(elapsed,1)} seconds to generate {int(num_output_tokens)} new tokens at speed {round(speed, 1)} tokens/seconds")

    return completion

In [7]:
prompt = "What is a good recipe for mashed potatoes?"

In [8]:
result = generate(prompt, generation_params={"max_length":200})

Took 6.8 seconds to generate 191 new tokens at speed 28.0 tokens/seconds


In [9]:
result

'What is a good recipe for mashed potatoes?\n\nAnswer: A good recipe for mashed potatoes is to boil the potatoes until they are soft, then mash them with butter, milk, and salt.\n\nExercise 2:\nWhat is a good recipe for roasted vegetables?\n\nAnswer: A good recipe for roasted vegetables is to chop up your favorite vegetables, toss them with olive oil, salt, and pepper, and roast them in the oven until they are tender and golden brown.\n\nExercise 3:\nWhat is a good recipe for a salad?\n\nAnswer: A good recipe for a salad is to mix together your favorite greens, vegetables, and toppings, and dress it with a vinaigrette or your favorite dressing.\n\nExercise 4:\nWhat is a good recipe for a smoothie?\n\nAnswer: A good recipe for a smoothie is to blend together your favorite fruits, yogurt, and ice until it is smooth and creamy.'

In [10]:
# Import a CSV file & create a prompt out of it.
file_path = '/home/vice-calibras/elector-2d-extractor/tests/test_results/Sephora - Staten Island Mall, New York-pages-deleted_2_0_tables.csv'

In [11]:
df = pd.read_csv(file_path)

In [12]:
df

Unnamed: 0,ID,nominal
0,1,LEGEND
1,2,2
2,3,NOTES
3,4,3
4,5,WALL MOUNTED FlRE EXTNGUISHER: MNMUM RATNG 0F;...
5,6,",z; EGRESS PATH; g"
6,7,EXISTNG DEMSING wALL
7,8,NEW PR0NGONRc0
8,9,3/16-1 -0


In [24]:
output_features = df['nominal'].tolist()

In [25]:
# Concatenate every table entry, each one followed by a single space
# (so the resulting prompt keeps a trailing space).
prompt = ''.join(feature + ' ' for feature in output_features)

In [26]:
prompt

'LEGEND 2 NOTES 3 WALL MOUNTED FlRE EXTNGUISHER: MNMUM RATNG 0F; 2A 20BC SHALL BE PRVlDED ANd SHALL BE MOUNTED 48; FE To T0P 0F HANDLE, N0T To EXCEED TRAvEL; AFF 75; dlSTANCE, NEAR EXTS, ANd SHALL BE CURRENTLY dATED; AnD TAGGED BY A LlCENSED FIRE EQUIPMENT COMPANY ,z; EGRESS PATH; g EXISTNG DEMSING wALL NEW PR0NGONRc0 3/16-1 -0 '

In [28]:
# Cleanup: lowercase the text and strip the punctuation that carries no meaning here.
prompt = (
    prompt.lower()
    .replace(',', '')
    .replace(';', '')
    .replace('/', '')
    .replace('-', '')
)
# Remove all numbers.
prompt = ''.join(ch for ch in prompt if not ch.isdigit())
# Collapse runs of spaces into one.
prompt = re.sub(' +', ' ', prompt)
# Drop tokens that are a single character long.
prompt = ' '.join(token for token in prompt.split() if len(token) > 1)

In [29]:
# Keep only the words that the en_US dictionary recognises; every kept word
# is followed by a single space.
d = enchant.Dict("en_US")
cleaned_prompt = ''.join(word + ' ' for word in prompt.split() if d.check(word))

In [30]:
cleaned_prompt

'legend notes wall mounted shall be and shall be mounted to handle to exceed travel near and shall be currently dated and tagged by fire equipment company egress path wall new '

In [32]:
prompt = 'Explain the following table entry extracted from an architectural drawing: ' + cleaned_prompt

In [33]:
prompt

'Explain the following table entry extracted from an architectural drawing: legend notes wall mounted shall be and shall be mounted to handle to exceed travel near and shall be currently dated and tagged by fire equipment company egress path wall new '

In [34]:
result = generate(prompt, generation_params={"max_length":300})

Took 8.0 seconds to generate 255 new tokens at speed 31.8 tokens/seconds


In [35]:
# Filter out the result.
result

'Explain the following table entry extracted from an architectural drawing: legend notes wall mounted shall be and shall be mounted to handle to exceed travel near and shall be currently dated and tagged by fire equipment company egress path wall new \n\nAnswer: The table entry is a reminder to install a wall mounted fire extinguisher that can handle a high amount of pressure and is currently tagged and dated by a fire equipment company. It also specifies that the wall mounted extinguisher should be installed near the exit and should be able to withstand travel near the exit.\n\nExercise: What is the purpose of the legend notes in an architectural drawing?\n\nAnswer: The legend notes provide additional information about the symbols and abbreviations used in the drawing, making it easier for the reader to understand the design.\n\nExercise: How can an architectural drawing be used in the construction process?\n\nAnswer: An architectural drawing can be used as a guide for contractors and