## Analysing textual features

In [1]:
import os
import re
import sys
from collections import Counter

import keras
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm
# Put the parent of the current working directory on the import path so the
# project-local imports in the next cell can be resolved from there.
nb_dir = os.path.dirname(os.getcwd())
if sys.path.count(nb_dir) == 0:
    sys.path.append(nb_dir)
    
%matplotlib inline

Using TensorFlow backend.


In [2]:
from methods.baseline import Baseline
from methods.experiments import Experiment
from methods.evaluation import Evaluation
from methods.retrieval import Retrieval

In [3]:
# Length caps for titles (T) and descriptions (D); both are passed to Baseline
# and reused when the corpora are truncated further down. The trailing numbers
# look like previously used values — TODO confirm.
MAX_SEQUENCE_LENGTH_T = 100 # 40
MAX_SEQUENCE_LENGTH_D = 500 # 200
# NOTE(review): EMBEDDING_DIM and MAX_NB_WORDS are not referenced by any cell
# visible in this notebook.
EMBEDDING_DIM = 300
MAX_NB_WORDS = 2000

In [4]:
# Dataset under analysis and the relative paths derived from it.
DOMAIN = 'netbeans'
# Directory of the BERT-preprocessed data for this dataset.
DIR = 'data/processed/{}/{}'.format(DOMAIN, 'bert')
# Directory holding the normalized CSV files (pairs file and full dataset).
DIR_PAIRS = 'data/normalized/{}'.format(DOMAIN)
# The full dataset CSV lives in the same normalized directory.
DATASET = os.path.join(DIR_PAIRS, '{}.csv'.format(DOMAIN))

In [5]:
import os

# Directory of the pretrained BERT checkpoint. Known directory names:
#   uncased_L-12_H-768_A-12
#   multi_cased_L-12_H-768_A-12
pretrained_path = 'uncased_L-12_H-768_A-12'
# Config, checkpoint and vocabulary files all sit directly inside that directory.
config_path, model_path, vocab_path = (
    os.path.join(pretrained_path, file_name)
    for file_name in ('bert_config.json', 'bert_model.ckpt', 'vocab.txt')
)

In [6]:
from keras_bert import load_vocabulary

# Load the BERT vocabulary file; token_dict is indexed by token string below
# (the '[CLS]' and '[SEP]' entries are handed to Baseline).
token_dict = load_vocabulary(vocab_path)

In [7]:
# Pairs file for the selected dataset.
# NOTE(review): df_train_pair is not used by any cell visible in this notebook.
df_train_pair = pd.read_csv(os.path.join(DIR_PAIRS, '{}_pairs.csv'.format(DOMAIN)))
# Baseline receives the dataset locations, the title/description length caps and
# the vocabulary entries for the '[CLS]' and '[SEP]' tokens.
baseline = Baseline(DOMAIN, DIR, DATASET, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D, token_dict['[CLS]'], token_dict['[SEP]'])
evaluation = Evaluation(verbose=0)
retrieval = Retrieval()
# The experiment object drives the loading steps in the following cells.
experiment = Experiment(baseline, evaluation)

In [8]:
# Register the retrieval helper with the experiment (behaviour is defined in
# methods.experiments, which is not shown here).
experiment.set_retrieval(retrieval, baseline, DOMAIN)

In [9]:
# Read the bug ids; baseline.bug_ids is inspected in the next cell.
experiment.load_ids()

Reading bug ids


In [10]:
# Number of bug ids available on the baseline.
len(baseline.bug_ids)

216715

In [None]:
# Load the bug records; later cells read them through baseline.bug_set
# (presumably populated by this call — TODO confirm).
experiment.load_bugs()

HBox(children=(IntProgress(value=0, max=216715), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

In [None]:
# Bucket assignment for the loaded bugs; consumed by prepare_dataset below.
issues_by_buckets = experiment.get_buckets_for_bugs()

In [None]:
%%time

# Prepare the dataset from the buckets using the chronological train/test files
# (whether these are read or written is decided inside Baseline — TODO confirm).
baseline.prepare_dataset(issues_by_buckets, path_train='train_chronological', path_test='test_chronological')

In [None]:
# Build one string per bug: cut the field to the configured maximum length, then
# drop the first and last 5 characters.
# NOTE(review): the cap is applied before the [5:-5] slice, so for fields longer
# than the cap the trailing 5 characters removed come from the text itself —
# confirm this is intended.
title_corpus = [
    baseline.bug_set[bug_id]['title'][:MAX_SEQUENCE_LENGTH_T][5:-5]
    for bug_id in tqdm(baseline.bug_ids)
]
description_corpus = [
    baseline.bug_set[bug_id]['description'][:MAX_SEQUENCE_LENGTH_D][5:-5]
    for bug_id in tqdm(baseline.bug_ids)
]

In [None]:
# Number of space-separated tokens per bug, measured on the untruncated field
# after dropping its first and last 5 characters.
title_length = [
    len(baseline.bug_set[bug_id]['title'][5:-5].split(' '))
    for bug_id in tqdm(baseline.bug_ids)
]
description_length = [
    len(baseline.bug_set[bug_id]['description'][5:-5].split(' '))
    for bug_id in tqdm(baseline.bug_ids)
]

In [None]:
# Side-by-side histograms of the per-bug token counts: titles on the left,
# descriptions on the right.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(17, 8))
# Title
df_title = pd.DataFrame(title_length, columns=['short_desc'])
# Summary statistics are printed for the full, unfiltered column.
print(df_title.describe())
ax = df_title.plot.hist(ax=axes[0])
ax.set_title('Frequência de tokens para títulos (short_desc)')
ax.set_ylabel('frequência')
ax.set_xlabel('número de tokens')
# Description
df_desc = pd.DataFrame(description_length, columns=['description'])
# Statistics are printed before the filter below, so they still include the
# descriptions longer than 600 tokens that the histogram leaves out.
print(df_desc.describe())
# Only descriptions with at most 600 tokens are plotted.
df_desc = df_desc[df_desc['description'] <= 600]
ax = df_desc.plot.hist(color='g', ax=axes[1])
ax.set_title('Frequência de tokens para descrição (description)')
ax.set_ylabel('frequência')
ax.set_xlabel('número de tokens')

In [None]:
def filter_words(corpus, callback):
    """Return every token of `corpus` whose length satisfies `callback`.

    Each row of `corpus` is split on single spaces. `callback` is called with
    the character length of each token (not the token itself), and the token
    is kept when the result is truthy. The result is one flat list, in corpus
    order.
    """
    return [
        token
        for document in corpus
        for token in document.split(' ')
        if callback(len(token))
    ]

### Word cloud title

In [None]:
%%time

from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Word cloud over all title strings joined into a single text.
cloud_builder = WordCloud(max_font_size=100, width=1520, height=535)
wordcloud = cloud_builder.generate(" ".join(title_corpus))
plt.figure(figsize=(16, 9))
plt.imshow(wordcloud)
plt.axis("off")  # the image has no meaningful axes
plt.show()

### Word cloud description

In [None]:
%%time

# Word cloud over all description strings joined into a single text.
cloud_builder = WordCloud(max_font_size=100, width=1520, height=535)
wordcloud = cloud_builder.generate(" ".join(description_corpus))
plt.figure(figsize=(16, 9))
plt.imshow(wordcloud)
plt.axis("off")  # the image has no meaningful axes
plt.show()

### Word cloud 2 words in title

In [None]:
# Keep only the 2-character tokens of the titles. This reuses the notebook's
# filter_words helper instead of repeating its split-and-filter loop, and no
# longer leaves the intermediate list of token lists alive in the kernel.
title_corpus_2_words = filter_words(title_corpus, lambda token_size: token_size == 2)

In [None]:
%%time

# Word cloud restricted to the 2-character tokens found in titles.
cloud_builder = WordCloud(max_font_size=100, width=1520, height=535)
wordcloud = cloud_builder.generate(' '.join(title_corpus_2_words))
plt.figure(figsize=(16, 9))
plt.imshow(wordcloud)
plt.axis("off")  # the image has no meaningful axes
plt.show()

### Word cloud 2 words in description

In [None]:
# Keep only the 2-character tokens of the descriptions. This reuses the
# notebook's filter_words helper instead of repeating its split-and-filter loop,
# and no longer leaves the intermediate list of token lists alive in the kernel.
desc_corpus_2_words = filter_words(description_corpus, lambda token_size: token_size == 2)

In [None]:
%%time

# Word cloud restricted to the 2-character tokens found in descriptions.
cloud_builder = WordCloud(max_font_size=100, width=1520, height=535)
wordcloud = cloud_builder.generate(' '.join(desc_corpus_2_words))
plt.figure(figsize=(16, 9))
plt.imshow(wordcloud)
plt.axis("off")  # the image has no meaningful axes
plt.show()

### Word cloud 1 word in title

In [None]:
# Keep only the single-character tokens of the titles. This reuses the
# notebook's filter_words helper instead of repeating its split-and-filter loop,
# and no longer leaves the intermediate list of token lists alive in the kernel.
title_corpus_1_word = filter_words(title_corpus, lambda token_size: token_size == 1)

In [None]:
# Count how often each single-character token occurs in the titles. Counter is a
# dict subclass, so the resulting mapping is used exactly like the plain dict.
freq_words = Counter(title_corpus_1_word)

In [None]:
MEDIUM_SIZE = 16
SMALL_SIZE = 14

# Base font size for this and every later matplotlib figure.
plt.rc('font', size=SMALL_SIZE)

# One row per token (index) with its occurrence count in the 'freq' column.
# Built directly from the mapping rather than by broadcasting it into an
# n-by-n frame and slicing out a single column.
df = pd.Series(freq_words, name='freq').to_frame()
# Horizontal bars ordered by ascending frequency.
ax = df.sort_values('freq', ascending=True).plot.barh(figsize=(12, 8))
ax.set_title('Frequency of tokens in title')
ax.set_ylabel('tokens')
ax.set_xlabel('frequency')

### Word cloud 1 word in description

In [None]:
# Keep only the single-character tokens of the descriptions. This reuses the
# notebook's filter_words helper instead of repeating its split-and-filter loop,
# and no longer leaves the intermediate list of token lists alive in the kernel.
desc_corpus_1_word = filter_words(description_corpus, lambda token_size: token_size == 1)

In [None]:
# Count how often each single-character token occurs in the descriptions.
# Counter is a dict subclass, so the resulting mapping is used exactly like the
# plain dict.
freq_words = Counter(desc_corpus_1_word)

In [None]:
MEDIUM_SIZE = 16
SMALL_SIZE = 14

# Base font size for this and every later matplotlib figure.
plt.rc('font', size=SMALL_SIZE)

# One row per token (index) with its occurrence count in the 'freq' column.
# Built directly from the mapping rather than by broadcasting it into an
# n-by-n frame and slicing out a single column.
df = pd.Series(freq_words, name='freq').to_frame()
# Horizontal bars ordered by ascending frequency.
ax = df.sort_values('freq', ascending=True).plot.barh(figsize=(12, 8))
ax.set_title('Frequency of tokens in description')
ax.set_ylabel('tokens')
ax.set_xlabel('frequency')

### Distribution of words in title

In [None]:
# Tokenise every title on single spaces, then record the character length of
# each token in one flat list.
dist_word = [row.split(' ') for row in title_corpus]
size_tokens = [len(token) for tokens in dist_word for token in tokens]

In [None]:
from collections import Counter
# Map each token length to the number of title tokens having that length.
freq_tokens = Counter(size_tokens)

In [None]:
# One row per token length (index) with the number of title tokens of that
# length. Built directly from the mapping rather than by broadcasting it into an
# n-by-n frame and slicing out a single column.
# NOTE(review): the column is named 'token_size' although it holds counts (the
# lengths are the index); the name is kept because the plot legend shows it.
df = pd.Series(freq_tokens, name='token_size').to_frame()
# Bars are ordered by ascending count, not by token length.
ax = df.sort_values('token_size', ascending=True).plot.bar(figsize=(18, 8))
ax.set_title('Size of tokens in title')
ax.set_xlabel('token_size')

In [None]:
def filter_by_words_title(token_size):
    """Predicate for filter_words: accept tokens that are exactly 6 characters long.

    NOTE(review): a later cell defines a function with this same name and a
    different rule, so the active rule depends on which cell ran last.
    """
    has_target_length = token_size == 6
    return has_target_length

# Collect every title token accepted by the predicate and display how many
# there are.
corpus_filtered = filter_words(title_corpus, filter_by_words_title)
len(corpus_filtered)

In [None]:
%%time

# Word cloud of the tokens currently held in corpus_filtered.
cloud_builder = WordCloud(max_font_size=100, width=1520, height=535)
wordcloud = cloud_builder.generate(' '.join(corpus_filtered))
plt.figure(figsize=(16, 9))
plt.imshow(wordcloud)
plt.axis("off")  # the image has no meaningful axes
plt.show()

In [None]:
def filter_by_words_title(token_size):
    """Predicate for filter_words: accept tokens longer than 8 characters.

    NOTE(review): this reuses the name of an earlier predicate in the notebook
    (exact length 6) and replaces it once this cell has run.
    """
    is_long_token = token_size > 8
    return is_long_token

# Collect every title token accepted by the predicate and display how many
# there are.
corpus_filtered = filter_words(title_corpus, filter_by_words_title)
len(corpus_filtered)

In [None]:
%%time

# Word cloud of the tokens currently held in corpus_filtered.
cloud_builder = WordCloud(max_font_size=100, width=1520, height=535)
wordcloud = cloud_builder.generate(' '.join(corpus_filtered))
plt.figure(figsize=(16, 9))
plt.imshow(wordcloud)
plt.axis("off")  # the image has no meaningful axes
plt.show()

### Distribution of words in description

In [None]:
# Tokenise every description on single spaces, then record the character length
# of each token in one flat list.
dist_word = [row.split(' ') for row in description_corpus]
size_tokens = [len(token) for tokens in dist_word for token in tokens]

In [None]:
from collections import Counter
# Map each token length to the number of description tokens having that length.
freq_tokens = Counter(size_tokens)

In [None]:
# One row per token length (index) with the number of description tokens of
# that length. Built directly from the mapping rather than by broadcasting it
# into an n-by-n frame and slicing out a single column.
# NOTE(review): the column is named 'token_size' although it holds counts (the
# lengths are the index); the name is kept because the plot legend shows it.
df = pd.Series(freq_tokens, name='token_size').to_frame()
# Bars are ordered by ascending count, not by token length.
ax = df.sort_values('token_size', ascending=True).plot.bar(figsize=(18, 8))
ax.set_title('Size of tokens in description')
ax.set_xlabel('token_size')

In [None]:
def filter_by_words_desc(token_size):
    """Predicate for filter_words: accept tokens that are exactly 3 characters long.

    NOTE(review): a later cell defines a function with this same name and a
    different rule, so the active rule depends on which cell ran last.
    """
    has_target_length = token_size == 3
    return has_target_length

# Collect every description token accepted by the predicate and display how
# many there are.
corpus_filtered = filter_words(description_corpus, filter_by_words_desc)
len(corpus_filtered)

In [None]:
%%time

# Word cloud of the tokens currently held in corpus_filtered.
cloud_builder = WordCloud(max_font_size=100, width=1520, height=535)
wordcloud = cloud_builder.generate(' '.join(corpus_filtered))
plt.figure(figsize=(16, 9))
plt.imshow(wordcloud)
plt.axis("off")  # the image has no meaningful axes
plt.show()

In [None]:
def filter_by_words_desc(token_size):
    """Predicate for filter_words: accept tokens longer than 8 characters.

    NOTE(review): this reuses the name of an earlier predicate in the notebook
    (exact length 3) and replaces it once this cell has run.
    """
    is_long_token = token_size > 8
    return is_long_token

# Collect every description token accepted by the predicate and display how
# many there are.
corpus_filtered = filter_words(description_corpus, filter_by_words_desc)
len(corpus_filtered)

In [None]:
%%time

# Word cloud of the tokens currently held in corpus_filtered.
cloud_builder = WordCloud(max_font_size=100, width=1520, height=535)
wordcloud = cloud_builder.generate(' '.join(corpus_filtered))
plt.figure(figsize=(16, 9))
plt.imshow(wordcloud)
plt.axis("off")  # the image has no meaningful axes
plt.show()

### Visualize a random bug

In [None]:
# Draw one bug id at random. No seed is set in the visible cells, so a different
# bug may be shown on every run.
bug_selected = np.random.choice(baseline.bug_ids, 1)[0]

bug = baseline.bug_set[bug_selected]

# Display the selected record.
bug

#### Total missing values after preprocessing

In [None]:
# Ids of the bugs whose description / title is the empty string.
# NOTE(review): other cells strip 5 characters from each end of these fields,
# which suggests fixed-width wrappers around the text; if such wrappers are
# present here, an "empty" field would not compare equal to '' — confirm.
bugs_empty_desc = [
    bug_id for bug_id in baseline.bug_set
    if baseline.bug_set[bug_id]['description'] == ''
]
bugs_empty_title = [
    bug_id for bug_id in baseline.bug_set
    if baseline.bug_set[bug_id]['title'] == ''
]

In [None]:
# (bugs with an empty title, bugs with an empty description)
len(bugs_empty_title), len(bugs_empty_desc)

In [None]:
# Look up the dataset CSV rows of the bugs whose processed description is empty.
df = pd.read_csv(DATASET)
has_empty_description = df['bug_id'].isin(bugs_empty_desc)
df.loc[has_empty_description]

In [None]:
# Percentage of loaded bugs whose description is empty.
len(bugs_empty_desc) / len(baseline.bug_set) * 100.0