# Baseline 

https://github.com/tqtg/DuplicateBugFinder

In [1]:
from __future__ import print_function, division

In [2]:
import numpy as np
import pandas as pd
import os
import random
import json
import re
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from tqdm import tqdm
from collections import defaultdict

nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))
import matplotlib.pyplot as plt
%matplotlib inline

[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>
[nltk_data] Error loading punkt: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


In [3]:
from keras.layers import Conv1D, Input, Add, Activation, Dropout, Embedding, MaxPooling1D, GlobalMaxPool1D, Flatten, Dense, Concatenate, BatchNormalization
from keras.models import Sequential, Model
from keras.regularizers import l2
from keras.initializers import TruncatedNormal
from keras.layers.advanced_activations import LeakyReLU, ELU
from keras import optimizers
import _pickle as pickle

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [4]:
DIR = ''

## Dataset

In [5]:
#from google.colab import drive
#drive.mount('/content/drive')

### Bugs train

In [6]:
df_train = pd.read_csv('mozilla_firefox.csv')

In [7]:
df_train.head(3)

Unnamed: 0,Issue_id,Priority,Component,Duplicated_issue,Title,Description,Status,Resolution,Version,Created_time,Resolved_time
0,10954,P3,Preferences,,Dialup properties needs to be exposed in prefs,The dialup properties of the profile should be...,RESOLVED,WONTFIX,Trunk,1999-07-30 15:55:51 -0700,2008-05-14 11:44:15 -0700
1,14871,--,General,269442.0,[Find] Find whole word only,Please add Match Whole Word Only option to bro...,RESOLVED,DUPLICATE,Trunk,1999-09-24 14:49:34 -0700,2011-10-05 16:35:31 -0700
2,19118,--,Preferences,,Plug-In Manager (ui for choosing mimetype-plug...,I would really like a plug-in manager for my b...,RESOLVED,WONTFIX,Trunk,1999-11-17 14:58:26 -0800,2013-01-29 11:48:39 -0800


In [8]:
df_train.columns = ['issue_id', 'priority', 'component', 'duplicated_issue', 'title', 'description', 'status', 'resolution', 'version', 'creation_ts', 'delta_ts']

### Pairs train

In [9]:
df_train_pair = pd.read_csv('train_mozilla_firefox.csv')

In [10]:
df_train_pair.shape

(92651, 2)

In [11]:
df_train_pair[df_train_pair['Duplicate'].isna()].shape

(62625, 2)

In [12]:
df_train_pair.head()

Unnamed: 0,Issue_id,Duplicate
0,10954,
1,14871,243500;410103;505684;515027;528678
2,19118,326494;328227;414070;436576;443686;457861;475975
3,54746,
4,56892,191258;281233;290692;300719;307581;310641;3117...


### Reading pairs

In [13]:
def read_pairs(df):
  bug_pairs = []
  bug_ids = set()
  for row in df.iterrows():
    duplicates = row[1]['Duplicate']
    bug1 = row[1]['Issue_id']
    duplicates = [] if (type(duplicates) == float) else np.array(duplicates.split(';'), dtype=np.float)
    if len(duplicates) == 0: # No duplicate
      bug_ids.add(int(bug1))
    else: # duplicate
      bug_ids.add(int(bug1))
      for bug2 in duplicates:
        bug_pairs.append((int(bug1), int(bug2)))
        bug_ids.add(int(bug2))
  with open(os.path.join(DIR, 'bug_pairs.txt'), 'w') as f:
    for pair in bug_pairs:
      f.write("%d %d\n" % pair)
  bug_ids = sorted(bug_ids)
  with open(os.path.join(DIR, 'bug_ids.txt'), 'w') as f:
    for bug_id in bug_ids:
      f.write("%d\n" % bug_id)
  return bug_pairs, bug_ids

In [14]:
bug_pairs, bug_ids = read_pairs(df_train_pair)
print("Number of bugs: {}".format(len(bug_ids)))
print("Number of pairs: {}".format(len(bug_pairs)))

Number of bugs: 92651
Number of pairs: 310378


## Split into train and test

In [15]:
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 300
VALIDATION_SPLIT = 0.8

In [16]:
def split_train_test(bug_pairs):
  random.shuffle(bug_pairs)
  split_idx = int(len(bug_pairs) * VALIDATION_SPLIT)
  with open(os.path.join(DIR, 'train.txt'), 'w') as f:
    for pair in bug_pairs[:split_idx]:
      f.write("%d %d\n" % pair)
  test_data = {}
  for pair in bug_pairs[split_idx:]:
    bug1 = int(pair[0])
    bug2 = int(pair[1])
    if bug1 not in test_data:
      test_data[bug1] = set()
    test_data[bug1].add(bug2)
  with open(os.path.join(DIR, 'test.txt'), 'w') as f:
    for bug in test_data.keys():
      f.write("{} {}\n".format(bug, ' '.join([str(x) for x in test_data[bug]])))
  print('Train and test created')

In [17]:
split_train_test(bug_pairs)

Train and test created


## Pre-processing

#### Normalize text (lower, remove simbols and other caracter and tokenizer)

In [18]:
def func_name_tokenize(text):
  s = []
  for i, c in enumerate(text):
    if c.isupper() and i > 0 and text[i-1].islower():
      s.append(' ')
    s.append(c)
  return ''.join(s).strip()

def normalize_text(text):
  try:
    tokens = re.compile(r'[\W_]+', re.UNICODE).split(text)
    text = ' '.join([func_name_tokenize(token) for token in tokens])
    text = re.sub(r'\d+((\s\d+)+)?', 'number', text)
  except:
    return 'description'
  return ' '.join([word.lower() for word in nltk.word_tokenize(text)])

In [19]:
def normalized_data(bug_ids, df):
  products = set()
  bug_severities = set()
  priorities = set()
  versions = set()
  components = set()
  bug_statuses = set()
  text = []
  normalized_bugs = open(os.path.join(DIR, 'normalized_bugs.json'), 'w')
  for row in df.iterrows():
      
      bug = row[1]
      
      bug['description'] = normalize_text(bug['description'])
      if 'title' in bug:
        bug['title'] = normalize_text(bug['title'])
      else:
        bug['title'] = ''
      
      normalized_bugs.write('{}\n'.format(bug.to_json()))

      text.append(bug['description'])
      text.append(bug['title'])
  return text

In [20]:
%%time 

text = normalized_data(bug_ids, df_train)

Wall time: 10min 9s


#### Visualizing texts

In [21]:
idx = np.random.choice(range(len(text))) + 2
text[idx-2:idx]

['web pages that open new windows have browsers with default toolbars',
 'user agent mozilla number windows u windows nt number en us rv numbera gecko number phoenix number build identifier mozilla number windows u windows nt number en us rv numbera gecko number phoenix number the get new themes link in the themes section of the new options preferences dialog has the appearance of normal text but should actually look like a hyperlink reproducible always steps to reproduce number go to tools options themes number hover mouse over get new themes number left click on get new themes actual results get new themes has the appearance of normal text and when hovering the mouse appears as a text selection cursor however the link to the themes page still works expected results get new themes should be highlighted and underlined like a normal hyperlink and when hover the mouse cursor should appear as a pointing hand']

### Building vocabulary

In [22]:
def build_vocabulary(train_text):
  word_freq = build_freq_dict(train_text)
  print('word vocabulary')
  word_vocab = save_vocab(word_freq, MAX_NB_WORDS, 'word_vocab.pkl')
  return word_vocab

In [23]:
def build_freq_dict(train_text):
  print('building frequency dictionaries')
  word_freq = defaultdict(int)
  for text in tqdm(train_text):
    for word in text.split():
      word_freq[word] += 1
  return word_freq

In [24]:
def save_vocab(freq_dict, vocab_size, filename):
  top_tokens = sorted(freq_dict.items(), key=lambda x: -x[1])[:vocab_size - 2]
  print('most common token is %s which appears %d times' % (top_tokens[0][0], top_tokens[0][1]))
  print('less common token is %s which appears %d times' % (top_tokens[-1][0], top_tokens[-1][1]))
  vocab = {}
  i = 2  # 0-index is for padding, 1-index is for UNKNOWN
  for j in range(len(top_tokens)):
    vocab[top_tokens[j][0]] = i
    i += 1
  with open(os.path.join(DIR, filename), 'wb') as f:
    pickle.dump(vocab, f)
  return vocab

In [25]:
word_vocab = build_vocabulary(text)

building frequency dictionaries


100%|███████████████████████████████| 231628/231628 [00:07<00:00, 31271.69it/s]


word vocabulary
most common token is number which appears 1290496 times
less common token is anoyng which appears 10 times


In [8]:
def load_vocab(filename):
    with open(os.path.join(DIR, filename), 'rb') as f:
        return pickle.load(f)

word_vocab = load_vocab('word_vocab.pkl')
"word_vocab loaded!"

'word_vocab loaded!'

In [10]:
len(word_vocab)

19998

### Saving the vocabulary

In [5]:
from IPython.display import clear_output

UNK = 1
num_lines =  len(open(os.path.join(DIR, 'normalized_bugs.json'), 'r').read().splitlines()) * 2
total = num_lines // 2

def dump_bugs(word_vocab):
    bug_dir = os.path.join(DIR, 'bugs')
    if not os.path.exists(bug_dir):
        os.mkdir(bug_dir)
    bugs = []
    cont = 1
    print("Reading the normalized_bugs.json ...")
    with open(os.path.join(DIR, 'normalized_bugs.json'), 'r') as f:
        loop = tqdm(f)
        for line in f:
            loop.set_description('Data dumping {}/{}'.format(cont, total))
            bugs.append(json.loads(line))
            cont += 1

    return bugs

In [6]:
from multiprocessing import Pool
import multiprocessing

def dump_vocabulary(bugs, bug_dir):
    cont=0
    total = len(bugs)
    print("Starting the dump ...")
    loop = tqdm()
    for bug in bugs:
        #bug = json.loads(line)
        #print(bug)
        if cont % 100 == 0:
            clear_output()
            loop.set_description('Data dumping {}/{}'.format(cont, total))
        cont+=1
        bug['description_word'] = [word_vocab.get(w, UNK) for w in bug['description'].split()]
        if len(bug['title']) == 0:
            bug['title'] = bug['description'][:10]
        bug['title_word'] = [word_vocab.get(w, UNK) for w in bug['title'].split()]
        #bug.pop('description')
        #bug.pop('title')
        with open(os.path.join(bug_dir, str(bug['issue_id']) + '.pkl'), 'wb') as f:
            pickle.dump(bug, f)

def processing_dump(bugs):
    #clear_output()
    cpu = 2
    pool = Pool(processes=cpu) # start 4 worker processes
    bug_dir = os.path.join(DIR, 'bugs')
    print("Starting the slice ...")
    works = []
    n = len(bugs) // cpu
    n = 1 if n == 0 else n
    sliced = []
    pos_end = n
    end = len(bugs)
    for i in range(cpu):
        pos_end = end if pos_end>=end else pos_end
        pos_end = end if (i+1) == cpu and pos_end < end else pos_end
        sliced.append(bugs[i*n:pos_end])
        pos_end += n

    print("Slicing done!")
    for s in sliced:
        if len(s) > 0:
            #works.append(pool.apply_async(dump_vocabulary, (s, bug_dir, )))
            dump_vocabulary(s, bug_dir)

    #print("Executing the works...")
    #res = [w.get() for w in works]

    print("All done!")

In [9]:
bugs = dump_bugs(word_vocab)

Reading the normalized_bugs.json ...


Data dumping 115814/115814: : 0it [05:03, ?it/s]

In [10]:
len(bugs)

115814

In [11]:
processing_dump(bugs)
print("Saved!")



Data dumping 57900/57907: : 0it [01:43, ?it/s]

All done!
Saved!


### Test case to parallel processing

In [3]:
# a = [1, 2, 3, 4, 5, 6]
# a = [1, 2, 3, 4]
# a = [1, 2, 3]
# a = [1]
a = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

cpu = multiprocessing.cpu_count()
n = len(a) // cpu

print("N=", n)

n = 1 if n == 0 else n

sliced = []
pos_end = n
end = len(a)
for i in range(cpu):
    pos_end = end if pos_end>=end else pos_end
    pos_end = end if (i+1) == cpu and pos_end < end else pos_end
    sliced.append(a[i*n:pos_end])
    pos_end += n
sliced

N= 3


[[1, 2, 3], [4, 5, 6], [7, 8, 9, 10]]