# Similarity comparison based on TF-IDF and Word2Vec
This experiment is based on my summer research report.

## Part I -- Cosine similarity calculation before and after change points

## Import dataset and convert it into dictionary

In [2]:
import re
import sys
import pandas as pd
import numpy as np
import datetime
from skmultiflow.drift_detection import ANGLE
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
from scipy import spatial
import gensim.downloader as api
from nltk.corpus import stopwords
import nltk

# Read the occurrence log; only the timestamp and bug columns are needed.
df = pd.read_csv("OF.csv")
dataset = df[["timestamp", "bug"]]

# Map each bug number to the list of its timestamps, in CSV row order.
bug_dic = {}
for _, record in dataset.iterrows():
    bug_dic.setdefault(record["bug"], []).append(record["timestamp"])

## Generate hour-based stream for each bug 

In [3]:
def stream_generation(time_list, start, end):
    """Bucket event timestamps into consecutive one-hour counts.

    Args:
        time_list: iterable of epoch timestamps, in seconds.
        start: epoch second at which the first one-hour bucket begins.
        end: epoch second at which the window ends (exclusive).

    Returns:
        A list of ``(end - start) // 3600`` integers; element ``i`` is the
        number of timestamps ``t`` for which ``(t - start) // 3600 == i``.

    Raises:
        IndexError: if a timestamp falls outside the bucketed window.
            Timestamps earlier than ``start`` used to produce a negative
            index and were silently counted in a bucket at the end of the
            list; they are now rejected like timestamps past the end.
    """
    hours = (end - start) // 3600
    stream = [0] * hours

    for timestamp in time_list:
        index = (timestamp - start) // 3600
        # Explicit bounds check: a negative index would otherwise wrap around.
        if not 0 <= index < hours:
            raise IndexError(
                "timestamp %r is outside the window [%r, %r)" % (timestamp, start, end)
            )
        stream[index] += 1
    return stream

## Process each bug and generate change-point reports

In [74]:
# Bugs with fewer recorded occurrences than this are skipped.
MIN_OCCURRENCES = 100

# bug number -> [0, change point, ..., stream length], all expressed as
# hour indices relative to the start of that bug's stream.
change_info = {}
for key in bug_dic:
    bug_list = bug_dic[key]
    if len(bug_list) < MIN_OCCURRENCES:
        continue

    # Sorted in place, so bug_dic[key] stays sorted for later cells.
    bug_list.sort()
    # Window: first timestamp floored, last timestamp ceiled, to a multiple
    # of 86400 seconds.
    # NOTE(review): the fromtimestamp()/timestamp() round trip looks like a
    # no-op for integer epoch seconds -- confirm before simplifying it away.
    start_time = int(datetime.datetime.fromtimestamp(bug_list[0] // 86400 * 86400).timestamp())
    end_time = int(datetime.datetime.fromtimestamp((bug_list[-1] // 86400 + 1) * 86400).timestamp())
    data_stream = stream_generation(bug_list, start_time, end_time)

    angle = ANGLE()
    change_info[key] = [0]
    index = 0  # stays 0 when the stream is empty
    for index, data in enumerate(data_stream, start=1):
        angle.add_element(data)
        if angle.detected_change():
            # Presumably the number of steps back at which the drift began
            # (integer) -- confirm against the ANGLE implementation.
            drift_offset = angle.get_drift_location()
            # The earlier form tested `index - (index - offset) > 0`, which is
            # this same check, and computed the change point twice.
            if drift_offset > 0:
                change_info[key].append(index - drift_offset)
    change_info[key].append(index)
len(change_info)

1169

## Import bug comments dataset and convert it into dictionary

In [98]:
df = pd.read_csv("intermittent_bug_comments.csv", engine='python')

# bug number -> list of [timestamp, token, token, ...], one entry per comment.
bugs_dict = {}

for index, row in df.iterrows():
    comment = row["Processed_comment"]
    # A comment that is not a string cannot be tokenised (presumably an empty
    # cell read back as NaN -- confirm). Skip just that row explicitly rather
    # than hiding every possible error behind a bare `except`.
    if not isinstance(comment, str):
        continue
    entry = [row["timestamp"]] + comment.split()
    bugs_dict.setdefault(row["Bug"], []).append(entry)
df.head()

Unnamed: 0,Bug,Comment_author,comment_type,timestamp,Comment_content,Processed_comment
0,332722,Bob Clary [:bc:],\nDescription\n,1144174304,My test run on 2006-04-01 showed a number of d...,test run showed number date related errors bel...
1,332722,Bob Clary [:bc:],\nComment 2\n,1144619799,I am unable to reproduce (so far) the other cr...,unable reproduce far crashes ecma date js ecma...
2,332722,Bob Clary [:bc:],\nComment 3\n,1145337493,Filed bug 334427 on the ecma_3/Date/15.9.5.5.j...,filed bug ecma date js ecma date js windows js...
3,332722,Bob Clary [:bc:],\nComment 4\n,1145640762,This test appears to have problems around midn...,test appears problems around midnight test ecm...
4,332722,Bob Clary [:bc:],\nComment 5\n,1257165472,re-enable ecma_3/Date/15.9.5.5.js which only f...,enable ecma date js fails linux dst


## Process the dictionary to get selected bug comments

In [99]:
# Bugs to drop: those without a change-point record, and those whose record
# holds only its two boundary entries (i.e. no change point was detected).
# Collected first because a dict cannot shrink while it is being iterated.
keys = [
    key
    for key in bugs_dict
    if key not in change_info or len(change_info[key]) == 2
]
for key in keys:
    del bugs_dict[key]
len(bugs_dict)

380

## Load several publicly available pre-trained word-embedding models and our own Word2Vec model

In [97]:
# api.load() returns the word vectors themselves (a KeyedVectors object), so
# similarity() is called on it directly. Going through `.wv` is deprecated on
# KeyedVectors in gensim 3.x and no longer exists in gensim 4.x.
pretrain_model = api.load('word2vec-google-news-300')
print(pretrain_model.similarity("python", "js"))

0.15475279


  


In [7]:
# api.load() returns the word vectors themselves (a KeyedVectors object), so
# similarity() is called on it directly. Going through `.wv` is deprecated on
# KeyedVectors in gensim 3.x and no longer exists in gensim 4.x.
pretrain_model2 = api.load('fasttext-wiki-news-subwords-300')
print(pretrain_model2.similarity("python", "js"))

0.31644663


  


In [8]:
# api.load() returns the word vectors themselves (a KeyedVectors object), so
# similarity() is called on it directly. Going through `.wv` is deprecated on
# KeyedVectors in gensim 3.x and no longer exists in gensim 4.x.
pretrain_model3 = api.load('glove-wiki-gigaword-300')
print(pretrain_model3.similarity("python", "js"))

-0.030526154


  


In [9]:
# Load a saved Word2Vec model from a local file (presumably the model trained
# on the bug comments -- confirm where 'bug_comments_model.bin' comes from).
pretrain_model4 = Word2Vec.load('bug_comments_model.bin')
# Sanity check: similarity of two tokens, read from the model's word vectors.
print(pretrain_model4.wv.similarity("python", "js"))

-0.02326997


## TF-IDF and Word2Vec combined cosine similarity calculation

In [110]:
from sklearn.feature_extraction.text import TfidfVectorizer
def cosine_sim(list1, list2, doc1, doc2, model):
    """Combined Word2Vec + TF-IDF cosine similarity of two groups of comments.

    Args:
        list1: first group, a list of token lists.
        list2: second group, a list of token lists.
        doc1: the first group as one space-separated string (TF-IDF input).
        doc2: the second group as one space-separated string (TF-IDF input).
        model: a word -> vector lookup. Either an object indexable by word
            that raises ``KeyError`` for unknown words, or an object exposing
            such a lookup as ``model.wv`` (e.g. a full Word2Vec model).

    Returns:
        ``(cos_sim, extra)``. ``cos_sim`` is the sum of the cosine similarity
        of the two mean word vectors and the cosine similarity of the two
        TF-IDF vectors. ``extra`` maps every token that had no vector to the
        number of times it was seen. The Word2Vec term, and therefore
        ``cos_sim``, is NaN when either group has no token with a vector.

    Raises:
        Whatever ``TfidfVectorizer.fit_transform`` raises for the given
        documents is propagated unchanged.
    """
    # Full Word2Vec models keep their vectors under `.wv`; plain vector
    # containers are indexed directly.
    vectors = getattr(model, "wv", model)
    extra = {}

    def _mean_vector(token_lists):
        """Average the vectors of all known tokens; None if there are none."""
        total = None
        found = 0
        for tokens in token_lists:
            for word in tokens:
                vector = None
                # Only strings are looked up; anything else counts as unknown.
                if isinstance(word, str):
                    try:
                        vector = vectors[word]
                    except KeyError:
                        vector = None
                if vector is None:
                    extra[word] = extra.get(word, 0) + 1
                    continue
                vector = np.asarray(vector, dtype='float32')
                # The accumulator takes its size from the first vector found,
                # so models of any dimensionality work. A fixed 100-d
                # accumulator made every addition fail for 300-d models.
                total = vector.copy() if total is None else total + vector
                found += 1
        return None if found == 0 else total / found

    vec1 = _mean_vector(list1)
    vec2 = _mean_vector(list2)

    if vec1 is None or vec2 is None:
        word2vec_sim = float('nan')
    else:
        word2vec_sim = 1 - spatial.distance.cosine(vec1, vec2)

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([doc1, doc2])
    denselist = tfidf_matrix.todense().tolist()
    tfidf_sim = 1 - spatial.distance.cosine(denselist[0], denselist[1])

    cos_sim = word2vec_sim + tfidf_sim

    return cos_sim, extra

## Helper function: generate a space-separated document from a list of token lists

In [11]:
def doc_generate(words):
    """Flatten a list of token lists into one space-separated string."""
    return " ".join(token for tokens in words for token in tokens)

## Process two datasets and pick out the expected change points

In [111]:
# bug number -> list of (start, change, end, pre_len, post_len, pre_cos,
# post_cos, inter_cos, extra) tuples for the change points that were kept.
report_info = {}
# (bug number, segment index, repr(error)) for every segment whose
# similarities could not be computed; these used to be dropped silently.
segment_errors = []
bugs = list(bugs_dict)
total = 0
counter = 0
for bug_no in bugs:
    if bug_no not in change_info:
        continue

    # Same start-of-window computation as the change-point detection cell, so
    # hour indices in change_info map back to the same epoch seconds. Only
    # the window start is needed here; the hourly stream itself is not.
    # NOTE(review): the fromtimestamp()/timestamp() round trip looks like a
    # no-op for integer epoch seconds -- confirm before simplifying it away.
    first_seen = min(bug_dic[bug_no])
    start_time = int(datetime.datetime.fromtimestamp(first_seen // 86400 * 86400).timestamp())

    report_info[bug_no] = []
    time_serious = change_info[bug_no]
    # Slide over consecutive (start, change, end) boundary triples.
    for i in range(len(time_serious) - 2):
        start = start_time + time_serious[i] * 3600
        change = start_time + time_serious[i + 1] * 3600
        end = start_time + time_serious[i + 2] * 3600

        pre_change = []
        post_change = []
        for words_list in bugs_dict[bug_no]:
            try:
                stamp = int(words_list[0])
            except (TypeError, ValueError, OverflowError):
                # Timestamp is not convertible to an integer: skip the comment.
                continue
            if start <= stamp <= change:
                pre_change.append(words_list[1:])
            elif change <= stamp <= end:
                post_change.append(words_list[1:])

        pre_len = len(pre_change)
        post_len = len(post_change)
        # Each side needs at least two comments so it can be split in half.
        if pre_len <= 1 or post_len <= 1:
            continue
        total += 1

        pre_first, pre_second = pre_change[:pre_len // 2], pre_change[pre_len // 2:]
        post_first, post_second = post_change[:post_len // 2], post_change[post_len // 2:]
        try:
            # Similarity across the change point ...
            inter_cos, extra = cosine_sim(
                pre_change, post_change,
                doc_generate(pre_change), doc_generate(post_change),
                pretrain_model4)
            # ... versus similarity between the two halves of each side.
            pre_cos, pre_extra = cosine_sim(
                pre_first, pre_second,
                doc_generate(pre_first), doc_generate(pre_second),
                pretrain_model4)
            post_cos, post_extra = cosine_sim(
                post_first, post_second,
                doc_generate(post_first), doc_generate(post_second),
                pretrain_model4)
        except Exception as error:
            # Best effort, as before: the segment stays counted in `total` but
            # is not reported. The failure is now recorded, not discarded.
            segment_errors.append((bug_no, i, repr(error)))
            continue

        if inter_cos <= pre_cos and inter_cos <= post_cos:
            counter += 1
            report_info[bug_no].append((start, change, end, pre_len, post_len, pre_cos, post_cos, inter_cos, extra))

# Share of comparable change points where the comments differ more across the
# change than within either side; NaN when nothing was comparable.
print(counter / total if total else float("nan"))

  if __name__ == '__main__':


0.23618090452261306


## Write outputs into csv files

In [113]:
import csv
# The `with` block closes (and therefore flushes) the file when writing ends;
# the handle used to be left open. newline='' is what the csv module asks for
# so that it controls line endings itself.
with open('word2vec_tfidf_large.csv', 'w', newline='') as csv_file:
    output = csv.writer(csv_file)
    output.writerow(['Bug No', 'Start point', 'Change point', 'End point', 'prechange comments', 'postchange comments', 'pre cosine distance', 'post cosine distance', 'inter cosine distance', 'extra words'])

    for key, value in report_info.items():
        for info in value:
            # A malformed entry now raises instead of silently losing the row.
            start, change, end, pre_len, post_len, pre_cos, post_cos, inter_cos, extra = info
            output.writerow([key, start, change, end, pre_len, post_len, pre_cos, post_cos, inter_cos, extra])

## Part II -- Average similarity between different bug reports

## Several functions for text pre-processing

In [117]:
# Lookup table mapping English contractions to their expanded forms.
# Keys are matched exactly as written (e.g. both "I'd" and "i'd" are listed).
contraction_dict = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", 
"we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have"}
def _get_contractions(contraction_dict):
    contraction_re = re.compile('(%s)' % '|'.join(contraction_dict.keys()))
    return contraction_dict, contraction_re

# Build the lookup table and its matching pattern once, at definition time.
contractions, contractions_re = _get_contractions(contraction_dict)

def replace_contractions(text):
    """Return ``text`` with every matched contraction replaced by its expansion."""
    return contractions_re.sub(lambda found: contractions[found.group(0)], text)

# Quick demonstration on a sample sentence.
replace_contractions("this's a text with contraction")

'this is a text with contraction'

In [118]:
# Fetch the NLTK stop-word corpus so that stopwords.words() can read it.
nltk.download('stopwords')
# English stop words; preprocessing() removes these tokens from each document.
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mr.nothing/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [121]:
def preprocessing(doc):
    """Turn a raw document into a list of lower-case, stop-word-free tokens.

    Steps: remove URLs, expand contractions, replace every character that is
    not an ASCII letter or whitespace with a space, lower-case, split on
    whitespace, and drop tokens found in ``stop_words``.

    Args:
        doc: the raw text of one document.

    Returns:
        The list of remaining tokens, in their original order.
    """
    # Remove only the URL itself (up to the next whitespace). The previous
    # pattern used `.*`, which also deleted the rest of the line after it.
    doc = re.sub(r'https?://\S+', ' ', doc)
    doc = replace_contractions(doc)
    doc = re.sub(r'[^a-zA-Z\s]', ' ', doc)
    # split() with no argument already collapses whitespace runs (newlines
    # included) and ignores leading/trailing whitespace.
    tokenized_words = doc.lower().split()
    # Set membership is constant-time; the stop-word list is scanned linearly.
    stop_set = set(stop_words)
    return [word for word in tokenized_words if word not in stop_set]

## Cosine similarity calculation only based on Word2Vec 

In [164]:
def cosine_sim2(list1, list2, model):
    """Cosine similarity of the mean word vectors of two groups of token lists.

    Args:
        list1: first group, a list of token lists.
        list2: second group, a list of token lists.
        model: a word -> vector lookup. Either an object indexable by word
            that raises ``KeyError`` for unknown words, or an object exposing
            such a lookup as ``model.wv`` (e.g. a full Word2Vec model).

    Returns:
        ``(cos_sim, extra)``. ``cos_sim`` is the cosine similarity of the two
        mean vectors, or NaN when either group has no token with a vector.
        ``extra`` maps every token that had no vector to the number of times
        it was seen.
    """
    # Full Word2Vec models keep their vectors under `.wv`; plain vector
    # containers are indexed directly.
    vectors = getattr(model, "wv", model)
    extra = {}

    def _mean_vector(token_lists):
        """Average the vectors of all known tokens; None if there are none."""
        total = None
        found = 0
        for tokens in token_lists:
            for word in tokens:
                vector = None
                # Only strings are looked up; anything else counts as unknown.
                if isinstance(word, str):
                    try:
                        vector = vectors[word]
                    except KeyError:
                        vector = None
                if vector is None:
                    extra[word] = extra.get(word, 0) + 1
                    continue
                vector = np.asarray(vector, dtype='float32')
                # The accumulator takes its size from the first vector found,
                # so models of any dimensionality work. A fixed 100-d
                # accumulator made every addition fail for 300-d models.
                total = vector.copy() if total is None else total + vector
                found += 1
        return None if found == 0 else total / found

    vec1 = _mean_vector(list1)
    vec2 = _mean_vector(list2)

    if vec1 is None or vec2 is None:
        cos_sim = float('nan')
    else:
        cos_sim = 1 - spatial.distance.cosine(vec1, vec2)

    return cos_sim, extra

## Average similarity for different individual bug reports

In [72]:
selected_bugs = list(bugs_dict)

# Similarity of every unordered pair of selected bugs.
pair_sims = []
for i in range(len(selected_bugs) - 1):
    for j in range(i + 1, len(selected_bugs)):
        cos_sim, extra = cosine_sim2(bugs_dict[selected_bugs[i]], bugs_dict[selected_bugs[j]], pretrain_model)
        pair_sims.append(cos_sim)

counter = len(pair_sims)
# NaN results (no usable vectors) are left out of the average; NaN != NaN.
valid_sims = [sim for sim in pair_sims if sim == sim]
total = sum(valid_sims)
# The threshold is the average of this very run. It used to be a constant
# copied from an earlier run, which had to be edited by hand for every model.
threashold = total / len(valid_sims) if valid_sims else float("nan")
counter2 = sum(1 for sim in valid_sims if sim <= threashold)
print("The average similarity among the bugs is", threashold)
print("The rate under the average is", counter2 / counter if counter else float("nan"))
        
        

The rate under the average is 0.4166666666666667


| pre-trained model | average similarity | rate under avg |
| --- | --- | --- |
| word2vec-google-news-300 | 0.8570 | 0.4167 |
| fasttext-wiki-news-subwords-300 | 0.8767 | 0.3478 |
| glove-wiki-gigaword-300 | 0.8034 | 0.4529 |
| bug_comment_model | 0.7470 | 0.4420 |

## Baseline generation from the 20 Newsgroups dataset

In [142]:
from sklearn.datasets import fetch_20newsgroups
# Newsgroup categories used for the baseline.
target_names = [
    'alt.atheism',
    'comp.windows.x',
    'misc.forsale',
    'rec.autos',
    'sci.space',
    'soc.religion.christian',
    'talk.politics.misc',
]

# category name -> list of pre-processed (tokenised) training documents.
datasets = {}
for target in target_names:
    training_set = fetch_20newsgroups(subset='train', categories=[target])
    datasets[target] = [preprocessing(sentence) for sentence in training_set.data]


In [152]:
# Similarity of every unordered pair of newsgroup categories.
pair_sims = []
for i in range(len(target_names) - 1):
    for j in range(i + 1, len(target_names)):
        cos_sim, extra = cosine_sim2(datasets[target_names[i]], datasets[target_names[j]], pretrain_model4)
        pair_sims.append(cos_sim)

counter = len(pair_sims)
# NaN results (no usable vectors) are left out of the average; NaN != NaN.
valid_sims = [sim for sim in pair_sims if sim == sim]
total = sum(valid_sims)
# The threshold is the average of this very run. It used to be a rounded
# constant copied from an earlier run, edited by hand for every model.
threashold = total / len(valid_sims) if valid_sims else float("nan")
counter2 = sum(1 for sim in valid_sims if sim <= threashold)
print("The average similarity among the categories is", threashold)
print("The rate under the average is", counter2 / counter if counter else float("nan"))

  


The rate under the average is 0.3333333333333333


| pre-trained model | average similarity | rate under avg |
| --- | --- | --- |
| word2vec-google-news-300 | 0.8993 | 0.4761 |
| fasttext-wiki-news-subwords-300 | 0.9482 | 0.3333|
| glove-wiki-gigaword-300 | 0.8965 | 0.4761 |
| bug_comment_model | 0.9353 | 0.3333 |