# Switch topics preprocessed

- https://medium.com/@yanlinc/how-to-build-a-lda-topic-model-using-from-text-601cdcbfd3a6

In [1]:
# from __future__ import print_function, division
import re
import numpy as np
import pandas as pd

import os
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
import sys
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

from methods.baseline import Baseline
from methods.experiments import Experiment
from methods.evaluation import Evaluation
from methods.retrieval import Retrieval

import random

import _pickle as pickle

from keras_bert import Tokenizer
from keras_bert import load_vocabulary

import re, nltk, spacy, gensim
# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint
# Plotting tools
import pyLDAvis
import pyLDAvis.sklearn

from sklearn.manifold import TSNE
import time
import seaborn as sns

    
# %matplotlib inline

Using TensorFlow backend.


In [2]:
%env base openoffice

env: base=openoffice


In [3]:
# https://stackoverflow.com/questions/40115043/no-space-left-on-device-error-while-fitting-sklearn-model
%env JOBLIB_TEMP_FOLDER=/hd/tmp

env: JOBLIB_TEMP_FOLDER=/hd/tmp


In [4]:
MAX_SEQUENCE_LENGTH_T = 20 # 100
MAX_SEQUENCE_LENGTH_D = 20 # 500
EMBEDDING_DIM = 300
MAX_NB_WORDS = 20000
SELECT_TOPICS = '30'

In [5]:
# Domain to use
DOMAIN = os.environ['base']
PREPROCESSING = 'bert'
TOKEN = 'bert'
# Dataset paths
DIR = 'data/processed/{}/{}'.format(DOMAIN, PREPROCESSING)
DATASET = 'data/normalized/{}/{}.csv'.format(DOMAIN, DOMAIN)

In [6]:
df = pd.read_csv(DATASET)

In [7]:
df.head()

Unnamed: 0,bug_id,bug_severity,bug_status,component,creation_ts,delta_ts,description,dup_id,priority,product,resolution,short_desc,version
0,14,trivial,CLOSED,Website general issues,2000-10-17 19:40:00 +0000,2006-02-07 22:23:55 +0000,it would be nice if the combination of OpenOff...,[],P3,Infrastructure,FIXED,openoffice.org issuezillla URL's display a hor...,current
1,15,trivial,CLOSED,Bugzilla,2000-10-19 11:49:00 +0000,2003-12-27 10:23:17 +0000,this task is just a test\nthis task is just at...,[],P5,Infrastructure,FIXED,test task,current
2,16,trivial,CLOSED,_openoffice.org administrative interface (obso...,2000-10-20 15:06:00 +0000,2003-12-06 14:52:32 +0000,Using the ADMIN link on the main page leads to...,[],P1,Infrastructure,FIXED,Administrative Interface not available,current
3,17,trivial,CLOSED,_openoffice.org CVS (obsolete),2000-10-20 18:46:00 +0000,2003-12-06 14:52:32 +0000,Entered into Issuezilla for tracking purposes....,[],P3,Infrastructure,FIXED,update modules file.,current
4,19,trivial,CLOSED,definition,2000-10-20 22:54:00 +0000,2007-09-22 22:15:55 +0000,<text:quo-vadis></text:quo-vadis>\n <text:er...,[],P2,xml,FIXED,"text:quo-vadis, text:ergo-sum",605


In [8]:
df_textual = df[['bug_id', 'short_desc', 'description']]
df_textual.loc[:, 'text'] = df['short_desc'] + ". " + df['description']
df_textual.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Unnamed: 0,bug_id,short_desc,description,text
0,14,openoffice.org issuezillla URL's display a hor...,it would be nice if the combination of OpenOff...,openoffice.org issuezillla URL's display a hor...
1,15,test task,this task is just a test\nthis task is just at...,test task. this task is just a test\nthis task...
2,16,Administrative Interface not available,Using the ADMIN link on the main page leads to...,Administrative Interface not available. Using ...
3,17,update modules file.,Entered into Issuezilla for tracking purposes....,update modules file.. Entered into Issuezilla ...
4,19,"text:quo-vadis, text:ergo-sum",<text:quo-vadis></text:quo-vadis>\n <text:er...,"text:quo-vadis, text:ergo-sum. <text:quo-vadis..."


In [9]:
data = df_textual['text'].values

In [10]:
print("Total of data ", len(data))

Total of data  72234


In [11]:
data[:-2]

array(['openoffice.org issuezillla URL\'s display a horribly ugly heading gif.. it would be nice if the combination of OpenOffice.org gif and "IssueZilla:" \ntext could be artfully combined with better matching alignment, color and font.\nI tried playing with vertical alignment and it didn\'t help. \nIt seemed bogus to spend much time trying to align the OOo gif with the text\nsince it would still look bad anyways in some other browser.\n\nI\'m hoping to get some graphics assistance to make this look better.\n\nNote, see http://www.openoffice.org/issues/editparams.cgi \'bannerhtml\'\nand \'blurbhtml\' parameters for where this information is set. It\'s not like we\ncan\'t easily set the HTML different, so anybody has a suggestion for',
       'test task. this task is just a test\nthis task is just athis task is just a test\n test\nathisathistthis task is just a test\nthis task is just athis task is just a test\n testthis task is justhis task is just a test\nthis task is just athis task

## Bert tokens

In [13]:
pretrained_path = 'uncased_L-12_H-768_A-12'
config_path = os.path.join(pretrained_path, 'bert_config.json')
model_path = os.path.join(pretrained_path, 'bert_model.ckpt')
vocab_path = os.path.join(pretrained_path, 'vocab.txt')
token_dict = load_vocabulary(vocab_path)
tokenizer = Tokenizer(token_dict)

### Load bugs

In [14]:
baseline = Baseline(DOMAIN, DIR, DATASET, MAX_SEQUENCE_LENGTH_T, MAX_SEQUENCE_LENGTH_D,
                   token_dict['[CLS]'], token_dict['[SEP]'])
evaluation = Evaluation(verbose=0)
retrieval = Retrieval()
experiment = Experiment(baseline, evaluation)

In [15]:
experiment.set_retrieval(retrieval, baseline, DOMAIN)

In [16]:
experiment.load_ids()
len(baseline.bug_ids)

Reading bug ids


98070

In [17]:
%%time

experiment.load_bugs(TOKEN)
len(baseline.sentence_dict)

HBox(children=(IntProgress(value=0, max=98070), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


CPU times: user 10.9 s, sys: 1.27 s, total: 12.2 s
Wall time: 12.6 s


In [18]:
ids = []
data = []
for bug_id in tqdm(baseline.bug_set):
    bug = baseline.bug_set[bug_id]
    ids.append(bug_id)
    title = bug['title']
    desc = bug['description']
    data.append("{}\n{}\n".format(title, desc))

HBox(children=(IntProgress(value=0, max=98070), HTML(value='')))




## Switch topics for each bug

In [23]:
for i in tqdm(range(len(ids))):
    filename = 'bugs/{}.pkl'.format(ids[i])
    try:
        with open(os.path.join(DIR, filename), 'rb') as f:
            bug = pickle.load(f)
            
            # If exist preliminar topics, then rename to remain old sizes
            if 'topics' in bug:
                size_of_topic = len(bug['topics'])
                old_topic_name = 'topics_{}'.format(size_of_topic)
                old_topic_label_name = 'topic_{}'.format(size_of_topic)
                old_topic_index_name = 'topic_index_{}'.format(size_of_topic)
                bug[old_topic_name] = bug['topics']
                bug[old_topic_label_name] = bug['topic']
                bug[old_topic_index_name] = bug['topic_index']
            
            # Update bugs with topics
            if 'topics_{}'.format(SELECT_TOPICS) in bug:
                bug['topics'] = bug['topics_{}'.format(SELECT_TOPICS)]
                bug['topic'] = bug['topic_{}'.format(SELECT_TOPICS)]
                bug['topic_index'] = bug['topic_index_{}'.format(SELECT_TOPICS)]
    except:
        bug = {}
    if('topic' not in bug):
        print("Bug of index {} was not saved".format(i))
        continue
    bug_dir = os.path.join(DIR, 'bugs')
    with open(os.path.join(bug_dir, str(ids[i]) + '.pkl'), 'wb') as f:
        pickle.dump(bug, f)

HBox(children=(IntProgress(value=0, max=98070), HTML(value='')))




In [24]:
filename = 'bugs/{}.pkl'.format(ids[0])
with open(os.path.join(DIR, filename), 'rb') as f:
    print(pickle.load(f))

{'priority': '4\n', 'topic': 20, 'topic_30': 20, 'issue_id': 13, 'delta_ts': '2003-12-06 14:52:32 +0000', 'resolution': 'NOT_AN_ISSUE', 'version': '50\n', 'title_segment': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'bug_status': '1\n', 'topic_index_30': 19, 'dup_id': '[]', 'topic_50': 33, 'topics_50': array([0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.12, 0.  , 0.  ,
       0.  , 0.  , 0.16, 0.15, 0.  , 0.  , 0.  , 0.  , 0.  , 0.15, 0.  ,
       0.17, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.13]), 'bug_severity': '2\n', 'title_token': [101, 3231, 11829, 1024, 3526, 3609, 2003, 3308, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'title': '[CLS] test bug : cell col