# Set Variables

In [1]:
# -------- dataset
# Project whose dataset is processed. Uncomment the one to use; if several
# assignments are active, the last one wins.
# software_name = "camel"
# software_name = "cloudstack"
software_name = "geode"
# software_name = "hbase"

# -------- bad smell
# Code-smell label. It is joined with software_name to build the sub-folder
# name in the next cell.
# bad_smell = "CC" # Cyclomatic Complexity
bad_smell = "DE" # Design
# bad_smell = "NC" # Npath Complexity

# Google Colab

In [2]:
# Folder name for the selected project / bad-smell pair, e.g. "geode_DE".
sub_folder = f"{software_name}_{bad_smell}"

In [3]:
# Base name (without extension) of the dataset file for each supported project.
_DATASET_FILE_NAMES = {
    "camel": "camel_DE - v.02",
    "cloudstack": "cloudstack_DE - v.01",
    "geode": "geode_DE - v.01",
    "hbase": "hbase_DE - v.01",
}

# Fail fast on an unknown or misspelled project name instead of silently
# falling back to the hbase dataset, which would only surface later as a
# confusing file-not-found error.
if software_name not in _DATASET_FILE_NAMES:
    raise ValueError(
        "Unknown software_name {!r}; expected one of {}".format(
            software_name, sorted(_DATASET_FILE_NAMES)
        )
    )

dataset_file_name = _DATASET_FILE_NAMES[software_name]

In [4]:
# Imported here because this cell runs before the notebook's main import cell.
import os

# Root of the project tree. Defaults to the original local checkout; set the
# PAPER_CODE_ROOT environment variable to run the notebook on another machine.
tempPre = os.environ.get("PAPER_CODE_ROOT") or "E:/darsy/00/02- arshad/10- paper code/"
# Normalise to exactly one trailing separator so the concatenations below
# stay valid whether or not the override ends with a slash.
tempPre = tempPre.rstrip("/\\") + "/"

# Locations of the input datasets and of the generated outputs, relative to the root.
tempData =   "00- My Data/Datasets/Direct Method/"
tempOutput = "01- Jupyter Notebook/Direct Method/00. Output/"

# <root>/<data dir>/<project>/<project>_<smell>/
pre_path_data   = tempPre + tempData   + software_name + "/" + sub_folder + "/"
# <root>/<output dir>/<project>/<project>_<smell>/<dataset file name>/
pre_path_output = tempPre + tempOutput + software_name + "/" + sub_folder + "/" + dataset_file_name + "/"

# Libs

In [5]:
import string
import re
import json

import enlighten

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from torchvision import transforms
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer
from nltk.text import TextCollection

In [None]:
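# One-time setup: uncomment and run once to download the NLTK stop-word
# corpus that stopwords.words("english") needs.
# import nltk
# nltk.download("stopwords")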

In [None]:
# Record the tool versions used for this run.
# Shell escape: prints the version of the `python` executable found on PATH.
!python -V
import matplotlib
print(matplotlib.__version__)

In [6]:
# Display the punctuation characters that Preprocessing.tokenize() checks tokens against.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

# Config

In [None]:
# Input dataset location and the output files of the TF-IDF step.
mypaths = {
    "dataset": f"{pre_path_data}{dataset_file_name}.csv",
    "tfidf": {
        "output_vec": f"{pre_path_output}tfidf-vector-v01.json",
        "output_brt": f"{pre_path_output}bugRepTokens-v01.json",
    },
}

# Column layout of the CSV and the class labels accepted by the row filter.
preprocessing_params = {
    "columns_name": ["text", "bug_class_2"],
    "columns_dtype": {0: "str", 1: "int64"},  # keyed by column position
    "bug_classes": [0, 1],
    "num_bug_classes": 2,
}

# I. Compose

In [None]:
class Rows(object):
    """Callable row filter for a labelled text DataFrame.

    Applied to a DataFrame, it returns a new frame in which
      1. blank cells (empty or whitespace-only) in the configured columns
         are treated as missing,
      2. rows with a missing value in any configured column are removed, and
      3. rows whose class label is not in ``bug_classes`` are removed.

    Parameters
    ----------
    columns_name : list of str
        Columns checked for blank / missing values.
    bug_classes : list
        Class labels that are kept.
    class_column : str, optional
        Name of the column holding the class label. Defaults to
        ``"bug_class_2"``, the column name used by this notebook.
    """

    def __init__(self, columns_name, bug_classes, class_column="bug_class_2"):
        self.columns_name = columns_name
        self.bug_classes = bug_classes
        self.class_column = class_column


    def __call__(self, df):
        """Return a filtered copy of ``df``; the frame itself is not reassigned in place."""
        # 1. Set cells that are empty or contain only white space to None
        df = df.apply(self.white_spaces_to_None_, axis=1)

        # 2. Delete rows that have a missing value in ANY of the configured columns
        df = df.dropna(axis=0, how="any", subset=self.columns_name)

        # 3. Delete rows whose class value is not one of the accepted classes
        df = df[df[self.class_column].isin(self.bug_classes)]

        return df


    def white_spaces_to_None_(self, row):
        """Replace blank cells of ``row`` (in the configured columns) with None.

        A cell is blank when its string form is empty after stripping white
        space. The explicit ``is not None`` test (rather than a truthiness
        test) makes sure that an empty string ``""`` is also treated as
        blank; NaN is left untouched because ``str(nan)`` is not empty.
        """
        for i in self.columns_name:
            value = row[i]
            if value is not None and len(str(value).strip()) == 0:
                row[i] = None
        return row

# II. Read Files

In [None]:
# Load the header-less CSV, naming and typing its columns from the config.
_csv_options = {
    "names": preprocessing_params["columns_name"],
    "dtype": preprocessing_params["columns_dtype"],
    "header": None,
    "skip_blank_lines": True,
}
df_main = pd.read_csv(mypaths["dataset"], **_csv_options)

print("len df_main before compose: ", len(df_main))

In [None]:
# Wrap the row filter in a Compose pipeline and run it over the loaded frame.
row_filter = Rows(
    columns_name=preprocessing_params["columns_name"],
    bug_classes=preprocessing_params["bug_classes"],
)
composed_pre = transforms.Compose([row_filter])
df_main = composed_pre(df_main)

print("len df_main after compose: ", len(df_main))

In [None]:
# Plain Python list of the raw documents, in row order.
texts = df_main["text"].to_list()

# IV. ProgressLines

In [None]:
class ProgressLines():
    """Creates a group of `enlighten` progress counters with aligned labels.

    After ``progress_lines`` has run, the created counters are available
    as ``self.progresses`` (one per requested line, in order).
    """

    def progress_lines(self, num, total, description, unit, colour):
        """Create ``num`` progress counters and store them in ``self.progresses``.

        ``total``, ``description``, ``unit`` and ``colour`` are sequences that
        are indexed from 0 to ``num - 1``; entry ``i`` configures counter ``i``.
        """
        desc = self.set_strings_to_equal_len_(description)
        manager = enlighten.get_manager()
        progresses = []
        for i in range(num):
            prog = manager.counter(total=total[i], desc=desc[i], unit=unit[i], color=colour[i])
            # Draw the bar immediately so it is visible before the first update.
            prog.refresh()
            progresses.append(prog)
        self.progresses = progresses


    def set_strings_to_equal_len_(self, description):
        """Right-pad every string with spaces to the length of the longest one.

        Returns a new list; an empty input yields an empty list.
        """
        words = list(description)
        # default=0 keeps max() from raising on an empty list.
        width = max((len(word) for word in words), default=0)
        return [word.ljust(width) for word in words]

# V. Preprocessing

In [None]:
class Preprocessing():
    """Tokenizes documents and computes per-document TF-IDF weights.

    Instance attributes
    -------------------
    docMaxTokenNo_org : int
        Number of kept tokens in the longest document seen by ``tokenize``.
    bugRepTokens : list of list of str
        Kept tokens per document: [[w1, w2, w3, ...], [w1, w2, ...], ...]
    vector_tfidf : list of dict
        One dictionary per document: [{"w1": 0.1, "w2": 0.3, ...}, {}, ...],
        ordered by descending weight.
    w2vDic : dict
        Vocabulary dictionary {"w1": vector, "w2": vector, ...}. ``tokenize``
        stores the shared ``paddingVector`` for every token.
    """

    docMaxTokenNo_org = 0
    # Zero vector stored as the value of every vocabulary entry in w2vDic.
    # The same array object is shared by all entries.
    paddingVector = np.zeros(300, dtype="float32")


    def __init__(self):
        # Containers are created per instance. As class attributes they were
        # shared by every Preprocessing object, so re-creating the object
        # (e.g. re-running a notebook cell) kept appending to the old data.
        self.docMaxTokenNo_org = 0
        self.bugRepTokens = []
        self.vector_tfidf = []
        self.w2vDic = {}


    # ************************** tokenize ************************** #

    def tokenize(self, texts):
        """Lower-case and tokenize ``texts``, appending the kept tokens of each
        document to ``self.bugRepTokens``.

        A token is dropped when it is punctuation, an English stop word, one
        of "http"/"url"/"https", contains no word character, or starts with a
        digit. Every kept token is registered in ``self.w2vDic`` and
        ``self.docMaxTokenNo_org`` is raised to the longest document length.
        """
        stop_words = set(stopwords.words("english"))
        excludedTokens = {"http", "url", "https"}

        # Built once instead of once per document; raw strings avoid the
        # invalid-escape-sequence warning of "\w" and "\A".
        tokenizer = WordPunctTokenizer()
        has_word_char = re.compile(r"\w")
        starts_with_digit = re.compile(r"\A[0-9]")

        for doc in texts:
            thisTokens = []
            for token in tokenizer.tokenize(doc.lower()):
                if (token in string.punctuation or token in stop_words or token in excludedTokens or
                    (not has_word_char.search(token)) or starts_with_digit.search(token)):
                    continue
                thisTokens.append(token)
                self.w2vDic[token] = self.paddingVector
            self.bugRepTokens.append(thisTokens)
            self.docMaxTokenNo_org = max(self.docMaxTokenNo_org, len(thisTokens))


    # calculate the tf-idf weight of every word of every document in the corpus
    def vectorize_tfidf(self):
        """Fill ``self.vector_tfidf`` with one {term: tf-idf} dict per document.

        The result is rebuilt from ``self.bugRepTokens`` on every call, so it
        always has exactly one entry per tokenized document. Each dictionary
        is ordered by descending weight; ties keep first-occurrence order.
        """
        texts = TextCollection(self.bugRepTokens)

        # --- ProgressLines
        pl = ProgressLines()
        pl.progress_lines(1, [len(self.bugRepTokens)], ["vectorize_tfidf"], ["bug"], ["blue"])

        # --- vectorize_tfidf
        vectors = []
        for doc in self.bugRepTokens:
            # dict.fromkeys de-duplicates while preserving first-occurrence
            # order, so each distinct term is scored only once.
            weights = {term: texts.tf_idf(term, doc) for term in dict.fromkeys(doc)}
            ranked = dict(sorted(weights.items(), key=lambda item: item[1], reverse=True))
            vectors.append(ranked)
            pl.progresses[0].update()
        self.vector_tfidf = vectors


    def save_to_file_tfidf(self, vector_tfidf_path):
        """Write ``self.vector_tfidf`` as JSON to ``vector_tfidf_path``.

        The parent directory is created when it does not exist yet. The token
        lists (``self.bugRepTokens``) are not written by this method.
        """
        import os

        out_dir = os.path.dirname(vector_tfidf_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        with open(vector_tfidf_path, "w", encoding="utf-8") as fout:
            json.dump(self.vector_tfidf, fout)

## Vectorize

In [None]:
# Run the pipeline: tokenize the documents, compute the TF-IDF weights,
# then write the weight vectors to the configured JSON file.
ds = Preprocessing()
ds.tokenize(texts)
ds.vectorize_tfidf()
ds.save_to_file_tfidf(mypaths["tfidf"]["output_vec"])

In [None]:
# Summarise the sizes of the pipeline outputs, one "label value" line each.
print(
    *(
        f"{label} {value}"
        for label, value in (
            ("df_main length:    ", len(df_main)),
            ("bugRepTokens:      ", len(ds.bugRepTokens)),
            ("vector_tfidf:      ", len(ds.vector_tfidf)),
            ("docMaxTokenNo_org: ", ds.docMaxTokenNo_org),
            ("w2vDic:            ", len(ds.w2vDic)),
        )
    ),
    sep="\n",
)