In [51]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from gensim.parsing.preprocessing import remove_stopwords
from gensim.utils import simple_preprocess
from gensim.parsing.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix,f1_score, precision_score, recall_score

In [3]:
data = pd.read_csv("eclipse_bug_report_data.csv")
data = data.dropna()
data['creation_date_date']=pd.to_datetime(data['creation_date'])
data = data.sort_values('creation_date_date')
data.head()

Unnamed: 0,bug_id,creation_date,component_name,product_name,short_description,long_description,assignee_name,reporter_name,resolution_category,resolution_code,status_category,status_code,update_date,quantity_of_votes,quantity_of_comments,resolution_date,bug_fix_time,severity_category,severity_code,creation_date_date
4602,JDT-4167,2001-10-10,UI,JDT,Cannot delete then save a file of same name in...,This could be a core problem - not sure.\nIf I...,kai-uwe_maetzel,Karice_McIntyre,fixed,1,resolved,4,2002-04-30,0,5,2002-04-30,202,major,4,2001-10-10
7876,JDT-4133,2001-10-10,UI,JDT,EC DCR: Add line numbers to Java editor (1GIV594),From Eclipse Corner: Feature request - line n...,kai-uwe_maetzel,carolynmacleod4,fixed,1,resolved,4,2002-05-30,0,12,2002-05-09,211,normal,2,2001-10-10
3539,PLATFORM-81,2001-10-10,Team,PLATFORM,Cannot interrupt large file release (1GE6RUE),Platform: JDK 108 W2K\n\nIf you import a large...,jeff_brown,t.p.ellison,fixed,1,resolved,4,2002-04-10,0,3,2002-04-10,182,normal,2,2001-10-10
1095,PLATFORM-2863,2001-10-10,UI,PLATFORM,platform lazy loading thwarted by perspectives...,The lazy platform loading feature suffers from...,Kevin_Haaland,john.arthorne,fixed,1,resolved,4,2002-05-30,0,3,2002-05-30,232,normal,2,2001-10-10
8072,JDT-3299,2001-10-10,Core,JDT,Autobuild produces errors when renaming source...,1) split JUnit project into two source folders...,kent_johnson,kai-uwe_maetzel,fixed,1,resolved,4,2002-02-04,0,6,2002-02-04,117,normal,2,2001-10-10


In [4]:
data_informed = data[['long_description', 'severity_category', 'creation_date_date']].copy()

In [5]:
# Preprocess data
porter_stemmer = PorterStemmer()
data_informed['ld_tokenized'] = [simple_preprocess(line, deacc=True) for line in data_informed['long_description']]
data_informed['ld_t_stemmed'] = [[porter_stemmer.stem(word) for word in tokens] for tokens in data_informed['ld_tokenized'] ]

In [6]:
def map_sentiment(label):
    if label == 'normal':
        return 0
    elif label == 'blocker':
        return 1
    elif label == 'trivial':
        return 2
    elif label == 'minor':
        return 3
    elif label == 'major':
        return 4
    elif label == 'critical':
        return 5
    else:
        return 6

In [7]:
data_informed['label'] = [ map_sentiment(x) for x in data_informed['severity_category']]

In [8]:
data_informed['label'].head()

4602    4
7876    0
3539    0
1095    0
8072    0
Name: label, dtype: int64

In [9]:
encoder = OneHotEncoder(sparse=False)
# transform data
labels = encoder.fit_transform(np.array(data_informed['severity_category']).reshape(-1,1))

In [10]:
train_x, test_x, train_y, test_y = train_test_split(data_informed[['ld_t_stemmed', 'creation_date_date']], data_informed[['label']], test_size=0.1, shuffle=False)

In [11]:
train_x.head()

Unnamed: 0,ld_t_stemmed,creation_date_date
4602,"[thi, could, be, core, problem, not, sure, if,...",2001-10-10
7876,"[from, eclips, corner, featur, request, line, ...",2001-10-10
3539,"[platform, jdk, if, you, import, larg, file, i...",2001-10-10
1095,"[the, lazi, platform, load, featur, suffer, fr...",2001-10-10
8072,"[split, junit, project, into, two, sourc, fold...",2001-10-10


In [12]:
# Skip-gram model (sg = 1)
size = 1000
window = 3
min_count = 1
workers = 3
sg = 1
stemmed_tokens = pd.Series(data_informed['ld_t_stemmed']).values
word2vec_model_file = 'word2vec_' + str(size) + '.model'
# Train the Word2Vec Model
w2v_model = Word2Vec(stemmed_tokens, min_count = min_count, vector_size = size, workers = workers, window = window, sg = sg)

In [13]:
w2v_model.save(word2vec_model_file)

In [14]:
# Load the model from the model file
sg_w2v_model = Word2Vec.load(word2vec_model_file)

In [15]:
# Store the vectors for train data in following file
word2vec_filename = 'train_review_word2vec.csv'
with open(word2vec_filename, 'w+') as word2vec_file:
    for index, row in train_x.iterrows():
        model_vector = (np.mean([sg_w2v_model.wv[token] for token in row['ld_t_stemmed']], axis=0)).tolist()
        if index == 0:
            header = ",".join(str(ele) for ele in range(1000))
            word2vec_file.write(header)
            word2vec_file.write("\n")
        # Check if the line exists else it is vector of zeros
        if type(model_vector) is list:  
            line1 = ",".join( [str(vector_element) for vector_element in model_vector] )
        else:
            line1 = ",".join([str(0) for i in range(1000)])
        word2vec_file.write(line1)
        word2vec_file.write('\n')

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [16]:
# Load from the filename
word2vec_df = pd.read_csv(word2vec_filename)
#Initialize the model
clf_decision_word2vec = DecisionTreeClassifier()

clf_decision_word2vec.fit(word2vec_df, train_y)

DecisionTreeClassifier()

In [17]:
test_features_word2vec = []
for index, row in test_x.iterrows():
    model_vector = np.mean([sg_w2v_model.wv[token] for token in row['ld_t_stemmed']], axis=0)
    if type(model_vector) is list:
        test_features_word2vec.append(model_vector)
    else:
        test_features_word2vec.append(np.array([0 for i in range(1000)]))
test_predictions_word2vec = clf_decision_word2vec.predict(test_features_word2vec)
print(accuracy_score(test_y['label'],test_predictions_word2vec))


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


0.8494288681204569




In [41]:
confusionmat = (confusion_matrix(test_y['label'],test_predictions_word2vec))
confusionmat.diagonal()/confusionmat.sum(axis=1)

print(confusionmat)

[[818   0   0   0   0   0]
 [ 24   0   0   0   0   0]
 [ 14   0   0   0   0   0]
 [ 26   0   0   0   0   0]
 [ 54   0   0   0   0   0]
 [ 27   0   0   0   0   0]]


In [49]:
print(
    f1_score(test_y['label'],test_predictions_word2vec, average='macro'),
    f1_score(test_y['label'],test_predictions_word2vec, average='micro'),
    f1_score(test_y['label'],test_predictions_word2vec, average='weighted')
    )

print(
    accuracy_score(test_y['label'],test_predictions_word2vec),
    accuracy_score(test_y['label'],test_predictions_word2vec),
    accuracy_score(test_y['label'],test_predictions_word2vec)
    )



0.15309751076174435 0.8494288681204569 0.7802726716704478
0.8494288681204569 0.8494288681204569 0.8494288681204569


In [54]:
print(precision_score(test_y['label'], test_predictions_word2vec, average='weighted'))

# test_predictions_word2vec.to_csv('predictions.csv')
np.savetxt("predictions.csv", test_y, delimiter=",")



0.7215294019964006


  _warn_prf(average, modifier, msg_start, len(result))
