Imports: classifier models, data preprocessors, training testing splitters, and reporting entities

In [1]:
!pip install -U gensim



In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from gensim.parsing.preprocessing import remove_stopwords
from gensim.utils import simple_preprocess
from gensim.parsing.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix,f1_score, precision_score, recall_score

Read Data and sort by date

In [3]:
data = pd.read_csv("eclipse_bug_report_data.csv")
data = data.dropna()
data['creation_date_date']=pd.to_datetime(data['creation_date'])
data = data.sort_values('creation_date_date')
data.head()

Unnamed: 0,bug_id,creation_date,component_name,product_name,short_description,long_description,assignee_name,reporter_name,resolution_category,resolution_code,status_category,status_code,update_date,quantity_of_votes,quantity_of_comments,resolution_date,bug_fix_time,severity_category,severity_code,creation_date_date
4602,JDT-4167,2001-10-10,UI,JDT,Cannot delete then save a file of same name in...,This could be a core problem - not sure.\nIf I...,kai-uwe_maetzel,Karice_McIntyre,fixed,1,resolved,4,2002-04-30,0,5,2002-04-30,202,major,4,2001-10-10
7876,JDT-4133,2001-10-10,UI,JDT,EC DCR: Add line numbers to Java editor (1GIV594),From Eclipse Corner: Feature request - line n...,kai-uwe_maetzel,carolynmacleod4,fixed,1,resolved,4,2002-05-30,0,12,2002-05-09,211,normal,2,2001-10-10
3539,PLATFORM-81,2001-10-10,Team,PLATFORM,Cannot interrupt large file release (1GE6RUE),Platform: JDK 108 W2K\n\nIf you import a large...,jeff_brown,t.p.ellison,fixed,1,resolved,4,2002-04-10,0,3,2002-04-10,182,normal,2,2001-10-10
1095,PLATFORM-2863,2001-10-10,UI,PLATFORM,platform lazy loading thwarted by perspectives...,The lazy platform loading feature suffers from...,Kevin_Haaland,john.arthorne,fixed,1,resolved,4,2002-05-30,0,3,2002-05-30,232,normal,2,2001-10-10
8072,JDT-3299,2001-10-10,Core,JDT,Autobuild produces errors when renaming source...,1) split JUnit project into two source folders...,kent_johnson,kai-uwe_maetzel,fixed,1,resolved,4,2002-02-04,0,6,2002-02-04,117,normal,2,2001-10-10


In [4]:
# retaining pertinent information from the imported dataset
data_informed = data[['long_description', 'severity_category', 'creation_date_date']].copy()

In [5]:
# checking for im/balance in dataset.
data_informed['severity_category'].value_counts()

normal      7579
major        952
minor        379
critical     359
blocker      204
trivial      154
Name: severity_category, dtype: int64

In [6]:
# Preprocess data, tokenizing and stemming 
porter_stemmer = PorterStemmer()
data_informed['ld_tokenized'] = [simple_preprocess(line, deacc=True) for line in data_informed['long_description']]
data_informed['ld_t_stemmed'] = [[porter_stemmer.stem(word) for word in tokens] for tokens in data_informed['ld_tokenized'] ]

In [7]:
# confused here
tot_lis = []
for i, v in data_informed[['ld_t_stemmed']].iterrows():
    for c in v:
        for e in c:
            tot_lis.append(e)

In [8]:
tot_lis_set = list(set(tot_lis))
len(tot_lis_set)

23668

In [9]:
# Assigning numeric labels to the bug priority classes. 
def map_sentiment(label):
    if label == 'normal':
        return 0
    elif label == 'blocker':
        return 1
    elif label == 'trivial':
        return 2
    elif label == 'minor':
        return 3
    elif label == 'major':
        return 4
    elif label == 'critical':
        return 5
    else:
        return 6

In [10]:
# adding the label column on to data_informed for pertinent data + labels
data_informed['label'] = [ map_sentiment(x) for x in data_informed['severity_category']]
# data_informed['label'].head()

In [11]:
train_x, test_x, train_y, test_y = train_test_split(data_informed[['ld_t_stemmed', 'creation_date_date']], data_informed[['label']], test_size=0.1, random_state=22, shuffle=False)

In [12]:
train_x.head()

Unnamed: 0,ld_t_stemmed,creation_date_date
4602,"[thi, could, be, core, problem, not, sure, if,...",2001-10-10
7876,"[from, eclips, corner, featur, request, line, ...",2001-10-10
3539,"[platform, jdk, if, you, import, larg, file, i...",2001-10-10
1095,"[the, lazi, platform, load, featur, suffer, fr...",2001-10-10
8072,"[split, junit, project, into, two, sourc, fold...",2001-10-10


In [13]:
# Skip-gram model (sg = 1)
size = 1000
window = 5
min_count = 1
workers = 3
sg = 1
stemmed_tokens = pd.Series(data_informed['ld_t_stemmed']).values
word2vec_model_file = 'word2vec_' + str(size) + '.model'
# Train the Word2Vec Model
w2v_model = Word2Vec(stemmed_tokens, min_count = min_count, vector_size = size, workers = workers, window = window, sg = sg)

In [14]:
w2v_model.save(word2vec_model_file)

In [15]:
# Load the model from the model file
sg_w2v_model = Word2Vec.load(word2vec_model_file)

In [16]:
# Store the vectors for train data in following file
word2vec_filename = 'train_review_word2vec.csv'
with open(word2vec_filename, 'w+') as word2vec_file:
    for index, row in train_x.iterrows():
        # lis = []
        # for tokein in row['ld_t_stemmed']:
        #   try:
        #       lis.apend(sg_w2v_model.wv[token])
        #   except:
        #       lis.append(0)
        # model_vector = (np.mean(lis, axis=0)).tolist()
        model_vector = (np.mean([sg_w2v_model.wv[token] for token in row['ld_t_stemmed']], axis=0)).tolist()
        if index == 0:
            header = ",".join(str(ele) for ele in range(1000))
            word2vec_file.write(header)
            word2vec_file.write("\n")
        # Check if the line exists else it is vector of zeros
        if type(model_vector) is list:  
            line1 = ",".join( [str(vector_element) for vector_element in model_vector] )
        else:
            line1 = ",".join([str(0) for i in range(1000)])
        word2vec_file.write(line1)
        word2vec_file.write('\n')

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [17]:
# Load from the filename
word2vec_df = pd.read_csv(word2vec_filename)
#Initialize the model
clf_decision_word2vec = DecisionTreeClassifier()

clf_decision_word2vec.fit(word2vec_df, train_y)

In [18]:
test_features_word2vec = []
for index, row in test_x.iterrows():
    model_vector = np.mean([sg_w2v_model.wv[token] for token in row['ld_t_stemmed']], axis=0)
    if type(model_vector) is list:
        test_features_word2vec.append(model_vector)
    else:
        test_features_word2vec.append(np.array([0 for i in range(1000)]))
test_predictions_word2vec = clf_decision_word2vec.predict(test_features_word2vec)
print(accuracy_score(test_y['label'],test_predictions_word2vec))
print(classification_report(test_y['label'],y_pred=test_predictions_word2vec))

0.8494288681204569
              precision    recall  f1-score   support

           0       0.85      1.00      0.92       818
           1       0.00      0.00      0.00        24
           2       0.00      0.00      0.00        14
           3       0.00      0.00      0.00        26
           4       0.00      0.00      0.00        54
           5       0.00      0.00      0.00        27

    accuracy                           0.85       963
   macro avg       0.14      0.17      0.15       963
weighted avg       0.72      0.85      0.78       963



  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [19]:
confusionmat = (confusion_matrix(test_y['label'],test_predictions_word2vec))
confusionmat.diagonal()/confusionmat.sum(axis=1)

print(confusionmat)
print(classification_report(test_y['label'],y_pred=test_predictions_word2vec))

[[818   0   0   0   0   0]
 [ 24   0   0   0   0   0]
 [ 14   0   0   0   0   0]
 [ 26   0   0   0   0   0]
 [ 54   0   0   0   0   0]
 [ 27   0   0   0   0   0]]
              precision    recall  f1-score   support

           0       0.85      1.00      0.92       818
           1       0.00      0.00      0.00        24
           2       0.00      0.00      0.00        14
           3       0.00      0.00      0.00        26
           4       0.00      0.00      0.00        54
           5       0.00      0.00      0.00        27

    accuracy                           0.85       963
   macro avg       0.14      0.17      0.15       963
weighted avg       0.72      0.85      0.78       963



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
print(
    f1_score(test_y['label'],test_predictions_word2vec, average='macro'),
    f1_score(test_y['label'],test_predictions_word2vec, average='micro'),
    f1_score(test_y['label'],test_predictions_word2vec, average='weighted')
    )



0.15309751076174435 0.8494288681204569 0.7802726716704478


In [21]:
print(precision_score(test_y['label'], test_predictions_word2vec, average='weighted'))

# test_predictions_word2vec.to_csv('predictions.csv')
np.savetxt("predictions.csv", test_y, delimiter=",")



0.7215294019964006


  _warn_prf(average, modifier, msg_start, len(result))


In [22]:
from sklearn.svm import SVC

In [23]:
# Load from the filename
word2vec_df = pd.read_csv(word2vec_filename)
#Initialize the model
clf_svc =  SVC(gamma='auto')

clf_svc.fit(word2vec_df, train_y)

  y = column_or_1d(y, warn=True)


In [24]:
test_predictions_svc = clf_svc.predict(test_features_word2vec)
print(accuracy_score(test_y['label'],test_predictions_svc))
print(classification_report(test_y['label'],y_pred=test_predictions_svc))



0.8494288681204569
              precision    recall  f1-score   support

           0       0.85      1.00      0.92       818
           1       0.00      0.00      0.00        24
           2       0.00      0.00      0.00        14
           3       0.00      0.00      0.00        26
           4       0.00      0.00      0.00        54
           5       0.00      0.00      0.00        27

    accuracy                           0.85       963
   macro avg       0.14      0.17      0.15       963
weighted avg       0.72      0.85      0.78       963



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [25]:
clf_svc_balanced = SVC(gamma='scale', class_weight='balanced')
clf_svc_balanced.fit(word2vec_df, train_y)

  y = column_or_1d(y, warn=True)


In [26]:
test_predictions_svc = clf_svc_balanced.predict(test_features_word2vec)
print(accuracy_score(test_y['label'],test_predictions_svc))
print(classification_report(test_y['label'],y_pred=test_predictions_svc))



0.056074766355140186
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       818
           1       0.00      0.00      0.00        24
           2       0.00      0.00      0.00        14
           3       0.00      0.00      0.00        26
           4       0.06      1.00      0.11        54
           5       0.00      0.00      0.00        27

    accuracy                           0.06       963
   macro avg       0.01      0.17      0.02       963
weighted avg       0.00      0.06      0.01       963



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [27]:
from sklearn.ensemble import RandomForestClassifier
weights = {
    0:10.0,
    1:50.0,
    2:50.0,
    3:50.0,
    4:50.0,
    5:50.0
}
model = RandomForestClassifier(n_estimators=10, class_weight=weights)
model.fit(word2vec_df, train_y)
test_predictions = model.predict(test_features_word2vec)
print(accuracy_score(test_y['label'],test_predictions))
print(classification_report(test_y['label'],y_pred=test_predictions))

  model.fit(word2vec_df, train_y)


0.056074766355140186
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       818
           1       0.00      0.00      0.00        24
           2       0.00      0.00      0.00        14
           3       0.00      0.00      0.00        26
           4       0.06      1.00      0.11        54
           5       0.00      0.00      0.00        27

    accuracy                           0.06       963
   macro avg       0.01      0.17      0.02       963
weighted avg       0.00      0.06      0.01       963



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [28]:
!pip install imblearn



In [29]:
from imblearn.over_sampling import RandomOverSampler
oversample = RandomOverSampler()

In [30]:
X_over, y_over = oversample.fit_resample(train_x, train_y)

In [31]:
print(train_x.shape)

print(X_over.shape)

(8664, 2)
(40566, 2)


In [32]:
# Store the vectors for train data in following file
word2vec_filename = 'train_review_word2vec_new.csv'
with open(word2vec_filename, 'w+') as word2vec_file:
    for index, row in X_over.iterrows():
        model_vector = (np.mean([sg_w2v_model.wv[token] for token in row['ld_t_stemmed']], axis=0)).tolist()
        if index == 0:
            header = ",".join(str(ele) for ele in range(1000))
            word2vec_file.write(header)
            word2vec_file.write("\n")
        # Check if the line exists else it is vector of zeros
        if type(model_vector) is list:  
            line1 = ",".join( [str(vector_element) for vector_element in model_vector] )
        else:
            line1 = ",".join([str(0) for i in range(1000)])
        word2vec_file.write(line1)
        word2vec_file.write('\n')

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [33]:
# Load from the filename
word2vec_df = pd.read_csv(word2vec_filename)
model = RandomForestClassifier(n_estimators=10)
model.fit(word2vec_df, y_over)
test_predictions = model.predict(test_features_word2vec)
print(accuracy_score(test_y['label'],test_predictions))
print(classification_report(test_y['label'],y_pred=test_predictions))

  model.fit(word2vec_df, y_over)


0.028037383177570093
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       818
           1       0.00      0.00      0.00        24
           2       0.00      0.00      0.00        14
           3       0.00      0.00      0.00        26
           4       0.00      0.00      0.00        54
           5       0.03      1.00      0.05        27

    accuracy                           0.03       963
   macro avg       0.00      0.17      0.01       963
weighted avg       0.00      0.03      0.00       963



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer

stemmed_tokens = data_informed['ld_t_stemmed'].values
corpus = []
for each in stemmed_tokens:
    corpus.append(' '.join(each))
vectorizer = TfidfVectorizer()
vectorizer.fit(corpus)

In [35]:
train_x['final'] = train_x['ld_t_stemmed'].apply(lambda x: vectorizer.transform([' '.join(x)]).toarray()[0])

In [36]:
test_x['final'] = test_x['ld_t_stemmed'].apply(lambda x: vectorizer.transform([' '.join(x)]).toarray()[0])

In [37]:
clf_decision_word2vec = DecisionTreeClassifier()

clf_decision_word2vec.fit(list(train_x['final'].to_numpy()), train_y)

In [38]:
test_x['final'] = test_x['ld_t_stemmed'].apply(lambda x: vectorizer.transform([' '.join(x)]).toarray()[0])
pred_y = clf_decision_word2vec.predict(list(test_x['final'].to_numpy()))
print(accuracy_score(test_y['label'],pred_y))
print(classification_report(test_y['label'],y_pred=pred_y))


0.7445482866043613
              precision    recall  f1-score   support

           0       0.86      0.86      0.86       818
           1       0.17      0.08      0.11        24
           2       0.11      0.07      0.09        14
           3       0.04      0.04      0.04        26
           4       0.09      0.13      0.11        54
           5       0.11      0.07      0.09        27

    accuracy                           0.74       963
   macro avg       0.23      0.21      0.22       963
weighted avg       0.74      0.74      0.74       963



In [39]:
clf_svc =  SVC()
clf_svc.fit(list(train_x['final'].to_numpy()), train_y)

  y = column_or_1d(y, warn=True)


Error: Canceled future for execute_request message before replies were done

In [None]:
pred_y = clf_svc.predict(list(test_x['final'].to_numpy()))
print(accuracy_score(test_y['label'],pred_y))
print(classification_report(test_y['label'],y_pred=pred_y))

0.8494288681204569
              precision    recall  f1-score   support

           0       0.85      1.00      0.92       818
           1       0.00      0.00      0.00        24
           2       0.00      0.00      0.00        14
           3       0.00      0.00      0.00        26
           4       0.00      0.00      0.00        54
           5       0.00      0.00      0.00        27

    accuracy                           0.85       963
   macro avg       0.14      0.17      0.15       963
weighted avg       0.72      0.85      0.78       963



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
clf_svc =  RandomForestClassifier(n_estimators=1)
clf_svc.fit(list(train_x['final'].to_numpy()), train_y)
pred_y = clf_svc.predict(list(test_x['final'].to_numpy()))
print(accuracy_score(test_y['label'],pred_y))
print(classification_report(test_y['label'],y_pred=pred_y))

  clf_svc.fit(list(train_x['final'].to_numpy()), train_y)


0.7185877466251298
              precision    recall  f1-score   support

           0       0.85      0.83      0.84       818
           1       0.00      0.00      0.00        24
           2       0.20      0.14      0.17        14
           3       0.03      0.04      0.03        26
           4       0.08      0.11      0.09        54
           5       0.00      0.00      0.00        27

    accuracy                           0.72       963
   macro avg       0.19      0.19      0.19       963
weighted avg       0.73      0.72      0.72       963



In [None]:
X_over, y_over = oversample.fit_resample(train_x, train_y)

In [None]:
clf_svc =  RandomForestClassifier(n_estimators=1)
clf_svc.fit(list(X_over['final'].to_numpy()), y_over)
pred_y = clf_svc.predict(list(test_x['final'].to_numpy()))
print(accuracy_score(test_y['label'],pred_y))
print(classification_report(test_y['label'],y_pred=pred_y))

  clf_svc.fit(list(X_over['final'].to_numpy()), y_over)


0.6386292834890965
              precision    recall  f1-score   support

           0       0.85      0.74      0.79       818
           1       0.00      0.00      0.00        24
           2       0.00      0.00      0.00        14
           3       0.05      0.08      0.06        26
           4       0.06      0.13      0.08        54
           5       0.00      0.00      0.00        27

    accuracy                           0.64       963
   macro avg       0.16      0.16      0.16       963
weighted avg       0.73      0.64      0.68       963



In [None]:
clf_svc =  SVC(class_weight='balanced')
clf_svc.fit(list(train_x['final'].to_numpy()), train_y)
pred_y = clf_svc.predict(list(test_x['final'].to_numpy()))
print(accuracy_score(test_y['label'],pred_y))
print(classification_report(test_y['label'],y_pred=pred_y))

  y = column_or_1d(y, warn=True)


0.7798546209761164
              precision    recall  f1-score   support

           0       0.86      0.90      0.88       818
           1       0.00      0.00      0.00        24
           2       0.00      0.00      0.00        14
           3       0.00      0.00      0.00        26
           4       0.14      0.24      0.17        54
           5       0.00      0.00      0.00        27

    accuracy                           0.78       963
   macro avg       0.17      0.19      0.18       963
weighted avg       0.74      0.78      0.76       963



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(list(train_x['final'].to_numpy()), train_y)
pred_y = clf.predict(list(test_x['final'].to_numpy()))
print(accuracy_score(test_y['label'],pred_y))
print(classification_report(test_y['label'],y_pred=pred_y))

  y = column_or_1d(y, warn=True)


0.8494288681204569
              precision    recall  f1-score   support

           0       0.85      1.00      0.92       818
           1       0.00      0.00      0.00        24
           2       0.00      0.00      0.00        14
           3       0.00      0.00      0.00        26
           4       0.00      0.00      0.00        54
           5       0.00      0.00      0.00        27

    accuracy                           0.85       963
   macro avg       0.14      0.17      0.15       963
weighted avg       0.72      0.85      0.78       963



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
from sklearn.preprocessing import MinMaxScaler
min_max_scaler = MinMaxScaler()
word2vec_df_min_max = min_max_scaler.fit_transform(word2vec_df)

In [None]:
word2vec_df_min_max.shape

(8664, 1000)

In [None]:
clf = MultinomialNB()
clf.fit(word2vec_df_min_max, train_y)

  y = column_or_1d(y, warn=True)


MultinomialNB()

In [None]:
test_features_word2vec = []
for index, row in test_x.iterrows():
    model_vector = np.mean([sg_w2v_model.wv[token] for token in row['ld_t_stemmed']], axis=0)
    if type(model_vector) is list:
        test_features_word2vec.append(model_vector)
    else:
        test_features_word2vec.append(np.array([0 for i in range(1000)]))
test_predictions_word2vec = clf_decision_word2vec.predict(test_features_word2vec)
print(accuracy_score(test_y['label'],test_predictions_word2vec))
print(classification_report(test_y['label'],y_pred=test_predictions_word2vec))

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


0.8494288681204569
              precision    recall  f1-score   support

           0       0.85      1.00      0.92       818
           1       0.00      0.00      0.00        24
           2       0.00      0.00      0.00        14
           3       0.00      0.00      0.00        26
           4       0.00      0.00      0.00        54
           5       0.00      0.00      0.00        27

    accuracy                           0.85       963
   macro avg       0.14      0.17      0.15       963
weighted avg       0.72      0.85      0.78       963



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## 