## Preprocessing

In [1]:
import os

# Bug-tracker project whose normalized dump this notebook works on.
DOMAIN = 'firefox'

# The CSV sits in a per-domain folder and is named after the domain itself.
DATASET = os.path.join('data/normalized/' + DOMAIN, DOMAIN + '.csv')

In [2]:
import os

# Folder holding the pre-trained BERT checkpoint files.
pretrained_path = 'uncased_L-12_H-768_A-12'

# Config, weights and vocabulary all live directly inside that folder.
config_path, model_path, vocab_path = (
    os.path.join(pretrained_path, file_name)
    for file_name in ('bert_config.json', 'bert_model.ckpt', 'vocab.txt')
)

In [3]:
from keras_bert import load_vocabulary

# Load the BERT vocabulary file into `token_dict` (presumably a token -> id
# mapping -- TODO confirm); it is handed to the Tokenizer further down and its
# size is reported in the next cell.
token_dict = load_vocabulary(vocab_path)

Using TensorFlow backend.


In [4]:
# Report how many entries the loaded vocabulary holds.
"Total vocabulary: %d" % len(token_dict)

'Total vocabulary: 30522'

In [5]:
import pandas as pd

In [6]:
# DATASET is already a plain path string, so it can be passed straight through.
df_train = pd.read_csv(DATASET)

In [7]:
# Preview the first five rows to check the columns that were loaded.
df_train.head(n=5)

Unnamed: 0,bug_id,priority,component,dup_id,short_desc,description,bug_status,resolution,version,creation_ts,delta_ts
0,10954,P3,Preferences,[],Dialup properties needs to be exposed in prefs,The dialup properties of the profile should be...,RESOLVED,WONTFIX,Trunk,1999-07-30 15:55:51 -0700,2008-05-14 11:44:15 -0700
1,14871,--,General,269442,[Find] Find whole word only,Please add Match Whole Word Only option to bro...,RESOLVED,DUPLICATE,Trunk,1999-09-24 14:49:34 -0700,2011-10-05 16:35:31 -0700
2,19118,--,Preferences,[],Plug-In Manager (ui for choosing mimetype-plug...,I would really like a plug-in manager for my b...,RESOLVED,WONTFIX,Trunk,1999-11-17 14:58:26 -0800,2013-01-29 11:48:39 -0800
3,54746,P3,Preferences,[],Language encodings in font prefs dialog not so...,Language encodings are listed in a seemingly r...,RESOLVED,WORKSFORME,Trunk,2000-09-29 14:12:11 -0700,2013-02-27 15:47:29 -0800
4,56892,P3,General,[],Synaptics touchpad scrolling not working,From Bugzilla Helper:; User-Agent: Mozilla/5.0...,RESOLVED,WORKSFORME,unspecified,2000-10-16 14:48:15 -0700,2009-10-14 11:38:29 -0700


In [8]:
# All bug identifiers of the dataset as an array (sampled from further down).
bug_ids = df_train.loc[:, 'bug_id'].values

# Number of bug reports available.
bug_ids.shape[0]

115814

In [60]:
# Scratch check: lower-casing leaves the '; ; ' separator untouched.
str.lower("Test ; ; Test")

'test ; ; test'

In [124]:
# `idx` is only assigned by the sampling cell further down, so on a fresh
# kernel (Restart & Run All) a bare `idx` here raises NameError and stops the
# run. Look it up defensively: the value is still displayed when it exists,
# and nothing is shown (None) when it does not.
globals().get('idx')

829674

In [200]:
import numpy as np
# Pick one bug id at random to eyeball its raw text. To revisit a specific
# report, pin `idx` to a fixed id instead (944459 and 829674 were looked at).
idx = np.random.choice(bug_ids, 1)[0]
bug_df = df_train.loc[df_train['bug_id'] == idx]

# Raw description of the selected bug, coerced to str.
desc_reg = str(bug_df['description'].iloc[0])

def first_regex(text):
    """Drop the leading user-agent header from a bug description.

    When the description mentions ``user-agent:`` (case-insensitive), everything
    up to and including the first ``'; ; '`` separator is removed.

    Args:
        text: Raw description string.

    Returns:
        The text following the first ``'; ; '`` when a user-agent header is
        present and the separator exists; otherwise ``text`` unchanged.
    """
    separator = '; ; '
    if 'user-agent:' not in text.lower():
        return text

    cut = text.find(separator)
    if cut == -1:
        # No separator after the header: keep the text as is. Slicing from
        # ``-1 + len(separator)`` would silently chop the first three
        # characters off the description.
        return text
    return text[cut + len(separator):]

def second_regex(text):
    """Keep only what follows the "steps to reproduce" marker.

    The marker is matched case-insensitively. The two characters right after
    it are skipped as well (':;' in the sampled description shown below).

    Args:
        text: Description string, typically the output of ``first_regex``.

    Returns:
        The text after the marker and the two following characters, or
        ``text`` unchanged when the marker is absent.
    """
    import re  # local import: the notebook has no shared import cell for it

    # Search the original string rather than a lower-cased copy: lower-casing
    # can change the string length (e.g. 'İ' becomes two code points), which
    # would shift an offset computed on the copy.
    match = re.search('steps to reproduce', text, flags=re.IGNORECASE)
    if match is None:
        return text
    return text[match.end() + 2:]

# Cleaned view of the sampled bug: both stripping passes are applied to the
# description, while the title is taken verbatim from `short_desc`.
bug = dict(
    description=second_regex(first_regex(desc_reg)),
    title=bug_df['short_desc'].values[0],
)

bug

{'description': ' 1.; 2.; 3.',
 'title': 'The buttons are not able to be clicked that is in the area of flash menu.'}

### REGEX

Text clean-up rules used to process the Firefox dataset descriptions (see `first_regex` and `second_regex` in the cell above).

In [213]:
# Maximum token lengths passed to the tokenizer as `max_len`. Both are kept at
# 20 here; 40 (for _T) and 200 (for _D) were noted as the alternative values
# (presumably title / description -- TODO confirm).
MAX_SEQUENCE_LENGTH_T = MAX_SEQUENCE_LENGTH_D = 20

In [203]:
from keras_bert import Tokenizer
# Tokenizer built on the vocabulary loaded earlier; every tokenize / encode /
# decode call below goes through this instance.
tokenizer = Tokenizer(token_dict)

In [201]:
# Show the title next to a label (displayed as a 2-tuple).
("Title ", bug['title'])

('Title ',
 'The buttons are not able to be clicked that is in the area of flash menu.')

In [231]:
# Probe: tokenize text that already contains a literal "[CLS]" marker. The
# output below shows the marker is not kept as one special token but split
# into '[', 'cl', '##s', ']', while the tokenizer adds its own [CLS]/[SEP].
tokenizer.tokenize("[CLS] 1 .")

['[CLS]', '[', 'cl', '##s', ']', '1', '.', '[SEP]']

In [230]:
# Same probe through encode(): the output below is a pair of lists, the first
# holding the ids (zero-padded to MAX_SEQUENCE_LENGTH_T entries), the second
# all zeros here (presumably segment ids -- TODO confirm).
tokenizer.encode("[CLS] 1 .", max_len=MAX_SEQUENCE_LENGTH_T)

([101,
  1031,
  18856,
  2015,
  1033,
  1015,
  1012,
  102,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [224]:
import _pickle as pickle

# Ids of the processed bugs whose stored `description_bert` is inspected here.
fixes = ['351892', '556144']

for bug_id in fixes:
    bug_path = os.path.join('data/processed/firefox', 'bugs', '{}.pkl'.format(bug_id))
    # NOTE(review): pickle.load can execute arbitrary code from the file; only
    # point this at pickles produced by this project.
    # The context manager closes the handle, which a bare open() never did.
    with open(bug_path, 'rb') as bug_file:
        # Use a dedicated name: rebinding `bug` here would clobber the sampled
        # bug dict that later cells still read via bug['title'].
        fixed_bug = pickle.load(bug_file)
    print(fixed_bug['description_bert'])
    # NOTE(review): the printed text already carries [CLS]/[SEP] and '##'
    # word-piece marks, and the ids printed below start with 101, 1031, 18856,
    # 2015, 1033 -- i.e. the literal "[CLS]" is split again on re-encoding.
    print(tokenizer.encode(fixed_bug['description_bert'], max_len=MAX_SEQUENCE_LENGTH_T))

[CLS] after an update loaded automatically this afternoon ; all my book ##marks disappeared . its as if the slate has been wiped clean . ; ; rep ##rod ##ucible : didn ##t try [SEP]
([101, 1031, 18856, 2015, 1033, 2044, 2019, 10651, 8209, 8073, 2023, 5027, 1025, 2035, 2026, 2338, 1001, 1001, 6017, 102], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
[CLS] 1 . select book ##marks ; 2 . hove ##r mouse over one of the links ( not folder ##s ) ; 3 . black box appears on top of book ##mark link icon . ; actual results : ; black box appeared . ; ; expected results : ; not shown the black box . [SEP]
([101, 1031, 18856, 2015, 1033, 1015, 1012, 7276, 2338, 1001, 1001, 6017, 1025, 1016, 1012, 25215, 1001, 1001, 1054, 102], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])


In [204]:
# Tokenize bug['title'] and display the resulting token list; the output below
# shows the tokens lower-cased and wrapped in [CLS] ... [SEP].
text = tokenizer.tokenize(bug['title'], max_len=MAX_SEQUENCE_LENGTH_T)
text

['[CLS]',
 'the',
 'buttons',
 'are',
 'not',
 'able',
 'to',
 'be',
 'clicked',
 'that',
 'is',
 'in',
 'the',
 'area',
 'of',
 'flash',
 'menu',
 '.',
 '[SEP]']

In [207]:
# Encode bug['title']; encode() returns a pair that is unpacked into `ids`
# (inspected in the next cell) and `segments` (presumably segment ids -- TODO
# confirm).
ids, segments = tokenizer.encode(bug['title'], max_len=MAX_SEQUENCE_LENGTH_T)

In [209]:
# Display the encoded ids; the output below has MAX_SEQUENCE_LENGTH_T (20)
# entries, starting with 101, ending the sentence with 102, then a 0 pad.
ids

[101,
 1996,
 11287,
 2024,
 2025,
 2583,
 2000,
 2022,
 13886,
 2008,
 2003,
 1999,
 1996,
 2181,
 1997,
 5956,
 12183,
 1012,
 102,
 0]

In [210]:
# Map the ids back to tokens; the output below contains only the word tokens
# (no [CLS], [SEP] or padding entries).
tokenizer.decode(ids)

['the',
 'buttons',
 'are',
 'not',
 'able',
 'to',
 'be',
 'clicked',
 'that',
 'is',
 'in',
 'the',
 'area',
 'of',
 'flash',
 'menu',
 '.']