In [5]:
!python --version

Python 3.9.18


In [6]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
import pandas as pd
import json

In [7]:
# Load all services
with open('../data/nested_services.json', 'r') as f:
    services = json.load(f)
    
services = pd.Series(child_obj['cName'] for obj in services['data'] for child_obj in obj['childServices'])
print('Number of services:', len(services))
    
# Load all timelines
with open('../data/timelines.json', 'r') as f:
    timelines = json.load(f)

timelines = pd.Series(obj['title'] for obj in timelines['data'])
print('Number of timelines:', len(services))


Number of services: 531
Number of timelines: 531


In [8]:
# Load the dataset

df = pd.read_csv('../data/data-gen/service_descriptions.csv', names=['service', 'timeline', 'description'])
df

Unnamed: 0,service,timeline,description
0,Install/Repair HVAC,Around 1 day,"""Hello, I am in need of an HVAC technician to ..."
1,Install/Repair HVAC,Around 5 days,"Hello,\n\nI am writing to request an HVAC inst..."
2,Install/Repair AC,Around 1 day,"Hello, \nI am in need of your assistance with ..."
3,Install/Repair AC,Around 5 days,"Hi there,\n\nI am in need of a professional to..."


In [None]:
# Preprocessing using nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [None]:
# Split the data in training and testing

In [9]:
model_name = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [11]:
# Tokenize the data
input_ids = []
attention_masks = []

for description in df['description']:
    encoded_dict = tokenizer.encode_plus(
        description,
        add_special_tokens=True,
        max_length=64,
        pad_to_max_length=True,
        return_attention_mask=True,
        return_tensors='tf'
    )
    
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])
    
input_ids = tf.concat(input_ids, axis=0)
attention_masks = tf.concat(attention_masks, axis=0)
labels = tf.constant(df[['service', 'timeline']])

print(input_ids)
print(attention_masks)

tf.Tensor(
[[  101  1000  7592  1010  1045  2572  1999  2342  1997  2019  1044 24887
  16661  2000 16500  2030  7192  2026 10808  1998 11520  2291  1012  2009
   3849  2000  2022 15451 11263 27989  2075  1998  1045  2572  4039  2000
  15176  1996  4860  1999  2026  2188  1012  1045  2052  6551  9120  2065
   2023  2071  2022 10395  1999  1037 23259  5450  2004  1045  2031  2235
   2336  1999  1996   102]
 [  101  7592  1010  1045  2572  3015  2000  5227  2019  1044 24887  8272
   1013  7192  2326  2005  2026  2188  1012  2256  2783  1044 24887  2291
   3849  2000  2022 15451 11263 27989  2075  1998  2057  2024 13417  8190
   2007  4860  2491  1998  2250  4834  1012  2057  2052  6551  9120  2065
   2619  2071  2272  1998  2202  1037  2298  2012  2009  2004  2574  2004
   2825  1012 28946   102]
 [  101  7592  1010  1045  2572  1999  2342  1997  2115  5375  2007  2019
   9353  8272  1013  7192  1012  1996  2783  9353  3131  1999  2026  2188
   2003  2025 12285  7919  1998  1045  2572  19

