# Bug Prediction

In [1]:
import string
import re
import nltk
nltk.download(["stopwords"])
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

def preprocess(text):
    cleaned_text = text.translate(str.maketrans('', '', '!"#$%&\'()*+,.<=>?@[]^`{|}~' + u'\xa0'))
    cleaned_text = cleaned_text.lower()
    cleaned_text = cleaned_text.translate(str.maketrans(string.whitespace, ' ' * len(string.whitespace), ''))
    cleaned_text = ' '.join(['_variable_with_underscore' if '_' in t else t for t in cleaned_text.split()])
    cleaned_text = ' '.join(['_variable_with_dash' if '-' in t else t for t in cleaned_text.split()])
    cleaned_text = ' '.join(['_long_variable_name' if len(t) > 15 and t[0] != '#' else t for t in cleaned_text.split()])
    cleaned_text = ' '.join(['_weburl' if t.startswith('http') and '/' in t else t for t in cleaned_text.split()])
    cleaned_text = ' '.join(['_number' if re.sub('[\\/;:_-]', '', t).isdigit() else t for t in cleaned_text.split()])
    cleaned_text = ' '.join(['_variable_with_address' if re.match('.*0x[0-9a-f].*', t) else t for t in cleaned_text.split()])
    cleaned_text = ' '.join(['_name_with_number' if re.match('.*[a-f]*:[0-9]*', t) else t for t in cleaned_text.split()])
    cleaned_text = ' '.join(['_number_starts_with_one_character' if re.match('[a-f][0-9].*', t) else t for t in cleaned_text.split()])
    cleaned_text = ' '.join(['_number_starts_with_three_characters' if re.match('[a-f]{3}[0-9].*', t) else t for t in cleaned_text.split()])
    cleaned_text = ' '.join(['_version' if any(i.isdigit() for i in t) and t.startswith('v') else t for t in cleaned_text.split()])
    cleaned_text = ' '.join(['_localpath' if ('\\' in t or '/' in t) and ':' not in t else t for t in cleaned_text.split()])
    cleaned_text = ' '.join(['_image_size' if t.endswith('px') else t for t in cleaned_text.split()])
    tokenized_text = word_tokenize(cleaned_text)
    sw_removed_text = [word for word in tokenized_text if word not in stopword_set]
    sw_removed_text = [word for word in sw_removed_text if len(word) > 2]
    stemmed_text = ' '.join([stemmer.stem(w) for w in sw_removed_text])
    return stemmed_text

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/thanatornkanthala/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
from multiprocessing.pool import ThreadPool as Pool
import pandas as pd

def initialize_pool(stopwords, ps):
    global stopword_set
    global stemmer
    stopword_set = stopwords
    stemmer = ps

dataset = pd.read_json('data/embold_train.json')
dataset.loc[dataset['label'] > 0, 'label'] = -1
dataset.loc[dataset['label'] == 0, 'label'] = 1
dataset.loc[dataset['label'] == -1, 'label'] = 0
stopwords = set(stopwords.words('english'))
ps = PorterStemmer()
pool = Pool(8, initializer=initialize_pool, initargs=(stopwords, ps, ))

cleaned_title = pool.map(preprocess, dataset.title)
cleaned_body = pool.map(preprocess, dataset.body)

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

data_texts = pd.DataFrame([cleaned_title, cleaned_body], index=["title", "body"]).T
y = dataset["label"]
data_fit, data_blindtest, y_fit, y_blindtest = train_test_split(data_texts, y, test_size=0.1)

tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,1))
tfidf_vectorizer.fit(cleaned_title + cleaned_body)

X_tfidf_fit = tfidf_vectorizer.transform(data_fit["title"])
X_tfidf_blindtest = tfidf_vectorizer.transform(data_blindtest["title"])

In [4]:
from sklearn import model_selection
import lightgbm as lgb

gbm_model = lgb.LGBMClassifier()

precision_cv_score = model_selection.cross_val_score(gbm_model, X_tfidf_fit, y_fit, cv=5, n_jobs=-2, scoring='precision_macro').mean()
recall_cv_score = model_selection.cross_val_score(gbm_model, X_tfidf_fit, y_fit, cv=5, n_jobs=-2, scoring='recall_macro').mean()
f1_cv_score = model_selection.cross_val_score(gbm_model, X_tfidf_fit, y_fit, cv=5, n_jobs=-2, scoring='f1_macro').mean()

print('CV: p:{0:.4f} r:{1:.4f} f:{2:.4f}'.format(precision_cv_score, recall_cv_score, f1_cv_score))

[LightGBM] [Info] Number of positive: 48004, number of negative: 59996
[LightGBM] [Info] Number of positive: 48004, number of negative: 59996
[LightGBM] [Info] Number of positive: 48004, number of negative: 59996
[LightGBM] [Info] Number of positive: 48004, number of negative: 59996
[LightGBM] [Info] Number of positive: 48004, number of negative: 59996
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.886622 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 118085
[LightGBM] [Info] Number of data points in the train set: 108000, number of used features: 2310
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.444481 -> initscore=-0.222994
[LightGBM] [Info] Start training from score -0.222994
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.730052 seconds.
You can set `force_col_wise=true` to remove the o

In [5]:
from sklearn import model_selection, metrics

data_fit_train, data_fit_test, y_fit_train, y_fit_test = model_selection.train_test_split(data_fit, y_fit, test_size=0.3)

X_tfidf_fit_train = tfidf_vectorizer.transform(data_fit_train['title'])
X_tfidf_fit_test = tfidf_vectorizer.transform(data_fit_test['title'])
X_tfidf_blindtest = tfidf_vectorizer.transform(data_blindtest['title'])

gbm_model.fit(X_tfidf_fit_train, y_fit_train, eval_set=[(X_tfidf_fit_test, y_fit_test)], eval_metric='AUC')

precision_test_score = metrics.precision_score(gbm_model.predict(X_tfidf_blindtest), y_blindtest, average='macro')
recall_test_score = metrics.recall_score(gbm_model.predict(X_tfidf_blindtest), y_blindtest, average='macro')
f1_test_score = metrics.f1_score(gbm_model.predict(X_tfidf_blindtest), y_blindtest, average='macro')

print('test: p:{0:.4f} r:{1:.4f} f:{2:.4f}'.format(precision_test_score, recall_test_score, f1_test_score))

[LightGBM] [Info] Number of positive: 41992, number of negative: 52508
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.170045 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 105672
[LightGBM] [Info] Number of data points in the train set: 94500, number of used features: 2146
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.444360 -> initscore=-0.223486
[LightGBM] [Info] Start training from score -0.223486
test: p:0.7440 r:0.7678 f:0.7458


In [6]:
import pickle

pickle.dump(tfidf_vectorizer, open('data/github_bug_prediction_tfidf_vectorizer.pkl', 'wb'))
pickle.dump(gbm_model, open('data/github_bug_prediction_basic_model.pkl', 'wb'))

# Topic Modeling

In [7]:
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import hstack

lsa = TruncatedSVD(n_components=500, n_iter=100, random_state=0)
lsa.fit(X_tfidf_fit)
X_lsa_fit = lsa.transform(X_tfidf_fit)

gbm_model_with_lsa = lgb.LGBMClassifier()

precision_cv_score = model_selection.cross_val_score(gbm_model_with_lsa, X_lsa_fit, y_fit, cv=5, n_jobs=-2, scoring='precision_macro').mean()
recall_cv_score = model_selection.cross_val_score(gbm_model_with_lsa, X_lsa_fit, y_fit, cv=5, n_jobs=-2, scoring='recall_macro').mean()
f1_cv_score = model_selection.cross_val_score(gbm_model_with_lsa, X_lsa_fit, y_fit, cv=5, n_jobs=-2, scoring='f1_macro').mean()

print('fit: p:{0:.4f} r:{1:.4f} f:{2:.4f}'.format(precision_cv_score, recall_cv_score, f1_cv_score))

X_fit_with_lsa = hstack([X_tfidf_fit, X_lsa_fit]).tocsr()

precision_cv_score = model_selection.cross_val_score(gbm_model_with_lsa, X_fit_with_lsa, y_fit, cv=5, n_jobs=-2, scoring='precision_macro').mean()
recall_cv_score = model_selection.cross_val_score(gbm_model_with_lsa, X_fit_with_lsa, y_fit, cv=5, n_jobs=-2, scoring='recall_macro').mean()
f1_cv_score = model_selection.cross_val_score(gbm_model_with_lsa, X_fit_with_lsa, y_fit, cv=5, n_jobs=-2, scoring='f1_macro').mean()

print('fit: p:{0:.4f} r:{1:.4f} f:{2:.4f}'.format(precision_cv_score, recall_cv_score, f1_cv_score))

[LightGBM] [Info] Number of positive: 48004, number of negative: 59996
[LightGBM] [Info] Number of positive: 48004, number of negative: 59996
[LightGBM] [Info] Number of positive: 48004, number of negative: 59996
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.143999 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 127500
[LightGBM] [Info] Number of data points in the train set: 108000, number of used features: 500
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.183237 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.444481 -> initscore=-0.222994
[LightGBM] [Info] Start training from score -0.222994
[LightGBM] [Info] Total Bins 127500
[LightGBM] [Info] Number of data points in the train set: 108000, number of used features: 500
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.444481 -> initsco

In [8]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer(ngram_range=(1,1))
count_vectorizer.fit(cleaned_title + cleaned_body)
X_tf_fit = count_vectorizer.transform(data_fit['title'])
X_tf_blindtest = count_vectorizer.transform(data_blindtest['title'])
lda = LatentDirichletAllocation(n_components=500, random_state=0)
lda.fit(X_tf_fit)
X_lda_fit = lda.transform(X_tf_fit)
gbm_model_with_lda = lgb.LGBMClassifier()

precision_cv_score = model_selection.cross_val_score(gbm_model_with_lda, X_lda_fit, y_fit, cv=5, n_jobs=-2, scoring='precision_macro').mean()
recall_cv_score = model_selection.cross_val_score(gbm_model_with_lda, X_lda_fit, y_fit, cv=5, n_jobs=-2, scoring='recall_macro').mean()
f1_cv_score = model_selection.cross_val_score(gbm_model_with_lda, X_lda_fit, y_fit, cv=5, n_jobs=-2, scoring='f1_macro').mean()

print('fit: p:{0:.4f} r:{1:.4f} f:{2:.4f}'.format(precision_cv_score, recall_cv_score, f1_cv_score))

X_fit_with_lda = hstack([X_tfidf_fit, X_lda_fit]).tocsr()

precision_cv_score = model_selection.cross_val_score(gbm_model_with_lda, X_fit_with_lda, y_fit, cv=5, n_jobs=-2, scoring='precision_macro').mean()
recall_cv_score = model_selection.cross_val_score(gbm_model_with_lda, X_fit_with_lda, y_fit, cv=5, n_jobs=-2, scoring='recall_macro').mean()
f1_cv_score = model_selection.cross_val_score(gbm_model_with_lda, X_fit_with_lda, y_fit, cv=5, n_jobs=-2, scoring='f1_macro').mean()

print('fit: p:{0:.4f} r:{1:.4f} f:{2:.4f}'.format(precision_cv_score, recall_cv_score, f1_cv_score))

[LightGBM] [Info] Number of positive: 48004, number of negative: 59996
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.033406 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 127462
[LightGBM] [Info] Number of data points in the train set: 108000, number of used features: 500
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.444481 -> initscore=-0.222994
[LightGBM] [Info] Start training from score -0.222994
[LightGBM] [Info] Number of positive: 48004, number of negative: 59996
[LightGBM] [Info] Number of positive: 48004, number of negative: 59996
[LightGBM] [Info] Number of positive: 48004, number of negative: 59996
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.025061 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.