In [1]:
#Script information:
#Input: File-level and line-level datasets. Files loaded below: "OpenStack_ex.csv" (used as project "nova"), "Base_ex.csv" and "qt_base_ex.csv"; the Ironic dataset is currently commented out
#Output: Line-level results. File: "csv_commented_lineLevel_raw.csv" includes 'dataset','changeId','fileName','lineNumber','token','ngramScore','groundTruth','lengthScore'
#                             "csv_commented_limescore.csv" includes 'dataset','changeId','fileName','lineNumber','token','limeScore'
#Description: This script is used to generate the csv files for predicting the lines to be commented for RQ2
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from scipy.sparse import hstack
from scipy import sparse
from lime.lime_tabular import LimeTabularExplainer
import time, pickle, math, warnings, os, operator
import matplotlib.pyplot as plt
import csv
import math

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from scipy.optimize import differential_evolution
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn import linear_model
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import confusion_matrix
from sklearn.dummy import DummyClassifier

# Directory for the n-gram train/test files written by this notebook and the
# entropy results that are read back from the n-gram model
eval_file_path = './dataset/eval_file'

# Directory where the trained random-forest models are cached as pickles
model_path = './dataset/ml-model'

# Directory where the per-file LIME feature scores are cached as pickles
explainer_model_path = './dataset/lime-feature-model'

# Load the file-level datasets as NumPy arrays.
# NOTE: the OpenStack file is passed around as project "nova" in the cells below.
data_nova = pd.read_csv('./dataset/fileLevel/OpenStack_ex.csv', dtype=None, sep=',').to_numpy()
# data_ironic = pd.read_csv('./dataset/fileLevel/Ironic.csv', dtype=None, sep=',').to_numpy()
data_base = pd.read_csv('./dataset/fileLevel/Base_ex.csv', dtype=None, sep=',').to_numpy()
# Load the line-level datasets as NumPy arrays
line_data_nova = pd.read_csv('./dataset/lineLevel/dataset-openstack-linelevel/OpenStack_ex.csv', dtype=None, sep=',').to_numpy()
# line_data_ironic = pd.read_csv('./dataset/lineLevel/dataset-openstack-linelevel/Ironic.csv', dtype=None, sep=',').to_numpy()
line_data_base = pd.read_csv('./dataset/lineLevel/dataset-qt-linelevel/qt_base_ex.csv', dtype=None, sep=',').to_numpy()


In [2]:
#Generate the training/test dataset
def getDatasetFromRawData(project_source,data, bias):
    """Split a file-level dataset by row position into train/test feature matrices.

    Parameters
    ----------
    project_source : str
        Project key. For "base" the three numeric columns (deletions,
        additions, changed lines) are read from columns 6-8, otherwise from
        columns 7-9.
    data : 2-D array
        File-level rows; column 0 is the label and column 3 the text that is
        vectorized.
    bias : int
        Offset added to the 80% split index. The callers pick it per project
        so that files of the same changeId do not end up on both sides of the
        split (not verified here).

    Returns
    -------
    tuple
        (train_X, train_y, test_X, test_y, divider, fitted CountVectorizer).
        The feature matrices are dense: bag-of-words counts followed by the
        deletions, additions and changed-line columns.
    """
    numeric_start = 6 if project_source == "base" else 7
    text_column = data[0:,3]
    deletions = data[0:,numeric_start]
    additions = data[0:,numeric_start + 1]
    changed_lines = data[0:,numeric_start + 2]

    # Any label other than 0 counts as positive.
    labels = np.array([not (label == 0) for label in data[0:,0]])

    # First 60% + 20% of the rows (plus the hand-tuned bias) form the training set.
    divider = int(len(data)*0.2) + int(len(data)*0.6) + bias

    # The vocabulary is learned from the training rows only; one fit is enough
    # (re-fitting on the same rows would rebuild the identical vocabulary).
    data_count_vect = CountVectorizer(min_df=2, max_df=0.5)
    train_counts = data_count_vect.fit_transform(text_column[:divider])
    test_counts = data_count_vect.transform(text_column[divider:])

    def _with_numeric_columns(counts, rows):
        # Append the three numeric columns to the right of the token counts.
        return np.hstack((counts.toarray(),
                          deletions[rows][:,None],
                          additions[rows][:,None],
                          changed_lines[rows][:,None]))

    final_train_X = _with_numeric_columns(train_counts, slice(None, divider))
    final_test_X = _with_numeric_columns(test_counts, slice(divider, None))
    final_train_y = labels[:divider]
    final_test_y = labels[divider:]
    return final_train_X,final_train_y,final_test_X,final_test_y,divider,data_count_vect

In [3]:
def printResult(x, y, model):
    """Print AUC, precision, recall, F1 and the confusion matrix of `model` on (x, y).

    `model` must expose `predict` and `predict_proba`; column 1 of the
    probability output is used as the positive-class score for the AUC.
    """
    print("AUC:",roc_auc_score(y, model.predict_proba(x)[:,1]))
    # Predict once and reuse the labels for every label-based metric.
    predicted = model.predict(x)
    print("Precision:",precision_score(y, predicted))
    print("Recall:",recall_score(y, predicted))
    print("F1:",f1_score(y, predicted))
    print("Confusion matrix: \n",confusion_matrix(y, predicted))

In [4]:
def trainRFmodel(project,rf_train_X,rf_train_y,rf_test_X,rf_test_y,seed,bias):
    """Train (or load from the pickle cache) the random-forest file-level model.

    When no cached model exists for (project, seed, bias), the training data is
    first under-sampled with RandomUnderSampler(sampling_strategy=0.5), then
    over-sampled with SMOTE(k_neighbors=100), and a 200-tree balanced random
    forest is fitted and pickled under `model_path`. In both cases the test-set
    metrics are printed via `printResult`.

    Returns the fitted RandomForestClassifier.
    """
    # NOTE(review): seed and bias are concatenated without a separator, so
    # different (seed, bias) pairs can map to the same file name. Kept as-is so
    # that already cached models are still found.
    train_rf_model_path = model_path+'/smote_abstr_number_df_2_rf_'+project+'-'+str(seed)+str(bias)+'.pkl'
    if not os.path.exists(train_rf_model_path):
        rf = RandomForestClassifier(n_estimators=200,n_jobs=-1,random_state=seed, class_weight='balanced')
        rf_X, rf_y = RandomUnderSampler(sampling_strategy=0.5, random_state=seed).fit_resample(rf_train_X, rf_train_y)
        rf_X, rf_y = SMOTE(k_neighbors=100, random_state=seed).fit_resample(rf_X, rf_y)
        rf.fit(rf_X,rf_y)
        # Make sure the cache directory exists, and close the file
        # deterministically so the pickle is fully flushed to disk.
        os.makedirs(os.path.dirname(train_rf_model_path), exist_ok=True)
        with open(train_rf_model_path, 'wb') as rf_output:
            pickle.dump(rf,rf_output)
        print("finish to creat a new model")
    else:
        # The cache is produced by this notebook; never point it at untrusted pickles.
        with open(train_rf_model_path,'rb') as f:
            rf = pickle.load(f)
    printResult(rf_test_X,rf_test_y,rf)
    return rf

In [5]:
#Return a dictionary with the files that the model correctly predicted as positive.
def getCorrectedPredictFileDict(model,data,test_X,test_y,divider):
    """Collect the true-positive test files of `model`.

    `data[divider:]` must be row-aligned with `test_X`/`test_y`; column 1 of a
    row is the file name and column 2 the change id.

    Returns
    -------
    (dict, set)
        {changeId: {fileName: feature row from test_X}} and the set of
        (changeId, fileName) pairs it contains.
    """
    held_out_rows = data[divider:]
    predicted = model.predict(test_X)
    true_positive_features = {}
    true_positive_pairs = []
    for position in range(len(predicted)):
        actual = test_y[position]
        # Keep only rows that are positive and were predicted as positive.
        if not (predicted[position] == actual and actual == 1):
            continue
        row = held_out_rows[position]
        change_id = row[2]
        file_name = row[1]
        true_positive_features.setdefault(change_id, {})[file_name] = test_X[position]
        true_positive_pairs.append((change_id, file_name))
    return true_positive_features, set(true_positive_pairs)

In [6]:
#Return a dictionary with all lines of the correctly predicted files.
def getCorrectedPredictLineDict(correct_prediction_dict,line_data, ngram=False):
    """Group the line-level rows that belong to the correctly predicted files.

    Each row of `line_data` is read as: column 0 = label, 2 = file name,
    3 = change id, 6 = code, 9 = line number. With `ngram=True` the code is
    converted to a string and every "<NUMBER>" placeholder is removed.

    Returns
    -------
    (dict, set)
        {changeId: {fileName: [(code, lineNumber, label), ...]}} and the set of
        (changeId, fileName) pairs that had at least one line.
    """
    lines_by_file = {}
    matched_pairs = []
    for row in line_data:
        file_name = row[2]
        change_id = row[3]
        code = str(row[6]).replace("<NUMBER>","") if ngram == True else row[6]
        entry = (code, row[9], row[0])
        # Skip lines whose (changeId, fileName) is not a correctly predicted file.
        if change_id not in correct_prediction_dict:
            continue
        if file_name not in correct_prediction_dict[change_id]:
            continue
        lines_by_file.setdefault(change_id, {}).setdefault(file_name, []).append(entry)
        matched_pairs.append((change_id, file_name))
    return lines_by_file, set(matched_pairs)

In [7]:
def getDistributionLineInTrain(row_data, line_row_data,divider):
    """Compute the token-length distribution of positive lines in the training files.

    Parameters
    ----------
    row_data : sequence of rows
        File-level rows (column 0 = label, 1 = file name, 2 = change id); only
        the first `divider` rows (the training part) are used.
    line_row_data : sequence of rows
        Line-level rows (column 0 = label, 2 = file name, 3 = change id,
        6 = code).
    divider : int
        Index that separates training rows from test rows in `row_data`.

    Returns
    -------
    tuple
        length_count_dict        {token count: share of positive training lines}
        code_length_distribution list with the token count of every such line
        line_row_data_dict       {changeId: {fileName: [code, ...]}}
        train_row_data_dict      {changeId: [fileName, ...]} of positive training files
    """
    #Build a dict that only includes the positively labelled training files.
    train_row_data_dict = {}
    for train_row in row_data[:divider]:
        if train_row[0] == 1:
            train_row_data_dict.setdefault(train_row[2], []).append(train_row[1])

    line_row_data_dict = {}
    code_length_distribution = []
    for line_row in line_row_data:
        label = line_row[0]
        file_name = line_row[2]
        change_id = line_row[3]
        code = line_row[6]
        #Check that the line's file and change id are in the training dataset
        if change_id in train_row_data_dict and file_name in train_row_data_dict[change_id] and label == 1:
            code_length_distribution.append(len(str(code).split()))
            # Add the file under its change without discarding the other files
            # of the same change (previously the per-change dict was replaced).
            line_row_data_dict.setdefault(change_id, {}).setdefault(file_name, []).append(code)

    total_lines = len(code_length_distribution)
    (unique, counts) = np.unique(code_length_distribution,return_counts=True)
    length_count_dict = {length: float(count/total_lines) for length, count in zip(unique, counts)}
    return length_count_dict,code_length_distribution,line_row_data_dict,train_row_data_dict

In [8]:
#Generate the LIME score
def getLimeExplainerScoreDict(project,correct_prediction_dict,data_count_vect,lime_train_X,model,seed,bias):
    """Compute (or load from the pickle cache) LIME token scores for each file.

    Parameters
    ----------
    project : str
        'nova' or 'ironic' (Python stop words) or 'base' (C/C++ stop words).
    correct_prediction_dict : dict
        {changeId: {fileName: feature row}} of the files to explain.
    data_count_vect : CountVectorizer
        Fitted vectorizer; supplies the token feature names.
    lime_train_X : 2-D array
        Training matrix handed to LimeTabularExplainer.
    model : classifier
        Must expose `predict_proba`.
    seed, bias : int
        Used for the explainer's random state (seed) and the cache file name.

    Returns
    -------
    dict
        {changeId: {fileName: {token: LIME weight}}} without stop words,
        language keywords and the three change-size features.

    Raises
    ------
    ValueError
        If the scores must be computed and `project` has no stop-word list.
    """
    explainer_feature_path = explainer_model_path + '/feature_ouput_' + project +'-'+ str(seed)+'-'+str(bias)+'.pkl'

    if not os.path.exists(explainer_feature_path):
        stop_tokens = ["*deletedLine","*addedLine","*changedLine","i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"]
        # NOTE(review): 'passNone' looks like a missing comma ('pass', 'None').
        # Left unchanged so scores stay comparable with earlier runs - confirm
        # before fixing.
        python_common_tokens = ['abs','delattr','hash','memoryview','set','all','dict','help','min','setattr','any','dir','hex','next','slice','ascii','divmod','id','object','sorted','bin','enumerate','input','oct','staticmethod','bool','eval','int','open','str','breakpoint','exec','isinstance','ord','sum','bytearray','filter','issubclass','pow','super','bytes','float','iter','print','tuple','callable','format','len','property','type','chr','frozenset','list','range','vars','classmethod','getattr','locals','repr','zip','compile','globals','map','reversed','import','complex','hasattr','max','round','False','await','else','import','passNone','break','except','in','raise','True','class','finally','is','return','and','continue','for','lambda','try','as','def','from','nonlocal','while','assert','del','global','not','with','async','elif','if','or','yield', 'self']
        c_common_tokens = ['auto','const','double','float','int','short','struct','unsigned','break','continue','else','for','long','signed','switch','void','case','default','enum','goto','register','sizeof','typedef','volatile','char','do','extern','if','return','static','union','while','asm','namespace','try','bool','explicit','new','typeid','catch','false','operator','template','typename','class','friend','private','this','using','inline','public','throw','virtual','delete','mutable','protected','true']
        if project == 'nova' or project == 'ironic':
            print("using python stopword")
            common_tokens = python_common_tokens
        elif project == 'base':
            print("using c stopword")
            common_tokens = c_common_tokens
        else:
            # Fail before the expensive explanation loop instead of hitting an
            # unbound variable in the middle of it.
            raise ValueError("no stop-word list for project %r (expected 'nova', 'ironic' or 'base')" % (project,))
        # One set for constant-time filtering of every explained feature.
        excluded_tokens = set(stop_tokens) | set(common_tokens)

        all_features = np.concatenate([data_count_vect.get_feature_names_out(), ['*deletedLine','*addedLine','*changedLine']])
        # NOTE(review): class_names only labels rendered explanations; its order
        # looks inverted relative to a [False, True] classifier - confirm.
        explainer = LimeTabularExplainer(lime_train_X,
                                          feature_names=all_features,
                                          class_names=['True','False'],
                                          discretize_continuous=False, random_state=seed
                                         )
        # Create the cache directory up front so the results are not lost after
        # the long-running loop below.
        os.makedirs(os.path.dirname(explainer_feature_path), exist_ok=True)

        feature_dict = {}
        feature_training_counter = 0
        print("Training explainer")
        for key,value in correct_prediction_dict.items():
            for fileName, codeArray in value.items():
                print(feature_training_counter)
                # top_labels=1 keeps only the explanation of the top-ranked class;
                # reading label 1 assumes every explained file is predicted as
                # class 1 - presumably guaranteed by the caller, verify.
                exp = explainer.explain_instance(codeArray,model.predict_proba,num_features=len(all_features), top_labels=1)
                feature_dict.setdefault(key, {})[fileName] = {
                    name: weight
                    for name, weight in exp.as_list(label=1)
                    if name not in excluded_tokens
                }
                feature_training_counter += 1

        with open(explainer_feature_path, 'wb') as feature_output:
            pickle.dump(feature_dict,feature_output)

        print("write feature output to pickle")
    else:
        # The cache is produced by this notebook; never point it at untrusted pickles.
        with open(explainer_feature_path, "rb") as f:
            feature_dict = pickle.load(f)
    return feature_dict

In [9]:
#Ngram baseline workflow
# 1. Run "generateNgramFiles" (it calls "outputSlpTestFile" and "outputSlpTrainFile").
# 2. A training file "ngram_train_PROJECTNAME-2.csv" and a testing file "ngram_test_PROJECTNAME-2.csv" for the ngram approach will be generated in the path "data_process/commented/dataset/eval_file/"
# 3. Put these two files in the path "/SLP-Core/src/main/java/slp/core/example/" and follow the code comments in EntrpoyForEachLine.java. It will generate the ngram result file named "entropy_PROJECTNAME-2.csv"
# 4. Then put the ngram result file "entropy_PROJECTNAME-2.csv" in the path "data_process/commented/dataset/eval_file/"
# 5. Run the rest of the code

In [10]:
#Generate the test dataset for the ngram baseline approach
#Output files: "ngram_test_PROJECTNAME-SEED.csv" and "lime_test_PROJECTNAME-SEED.csv"
def outputSlpTestFile(project,line_predict_dict_ngram,seed):
    """Write the lines of the correctly predicted files to the two test CSVs.

    `line_predict_dict_ngram` is {changeId: {fileName: [(code, lineNumber,
    label), ...]}}; `code` must be a string. The n-gram file gets one unquoted
    code line per row; the LIME file gets (changeId, fileName, stripped code,
    lineNumber, label) in the same row order, which the later evaluation step
    relies on when it pairs rows by position.
    """
    csv_ngram_path = eval_file_path + '/ngram_test_'+ project +'-'+str(seed)+'.csv'
    csv_lime_path = eval_file_path + '/lime_test_'+ project +'-'+str(seed)+'.csv'
    # newline='' lets the csv module control line endings; the with-block
    # closes both files even if a row fails to write.
    with open(csv_ngram_path,"w",newline='') as csv_ngram_file, \
         open(csv_lime_path,"w",newline='') as csv_lime_file:
        csv_ngram_writer = csv.writer(csv_ngram_file,quoting=csv.QUOTE_NONE,escapechar='　')
        csv_lime_writer = csv.writer(csv_lime_file)
        for key,value in line_predict_dict_ngram.items():
            for fileName, codeLines in value.items():
                for codeLine in codeLines:
                    csv_ngram_writer.writerow([codeLine[0]])
                    csv_lime_writer.writerow([key, fileName, codeLine[0].strip(),codeLine[1],codeLine[2]])
    print("write test datset done")

In [11]:
#Generate the training dataset for the ngram baseline approach
#Output file: "ngram_train_PROJECTNAME-SEED.csv"
def outputSlpTrainFile(project,seed):
    """Write the n-gram training file: the training lines minus the test lines.

    Reads "ngramTrain<Project>.csv" and the previously written
    "ngram_test_<project>-<seed>.csv" from `eval_file_path`, removes duplicate
    lines and every line that also occurs (after stripping) in the test file,
    and writes the rest to "ngram_train_<project>-<seed>.csv".
    """
    capitalied_project = project.capitalize()
    data_ngram_train = pd.read_csv(eval_file_path + '/ngramTrain' + capitalied_project + '.csv', dtype=None, sep=',',header=None).to_numpy().tolist()
    data_train_ngram_test = pd.read_csv(eval_file_path + '/ngram_test_' + project + '-'+str(seed)+'.csv', dtype=None, sep=',',header=None).to_numpy().tolist()
    train_lines = [x[0] for x in data_ngram_train]
    test_lines = set(x[0].strip() for x in data_train_ngram_test)
    # Same elements as set(train) - set(test), but in first-seen order so the
    # output file is identical from run to run (set order depends on hashing).
    remaining_train_lines = [code for code in dict.fromkeys(train_lines) if code not in test_lines]

    csv_ngram_train_path = eval_file_path + '/ngram_train_'+ project +'-'+str(seed)+'.csv'
    # newline='' lets the csv module control line endings; the with-block
    # closes the file even if a row fails to write.
    with open(csv_ngram_train_path,"w",newline='') as csv_new_train_ngram_file:
        csv_ngram_train_writer = csv.writer(csv_new_train_ngram_file,quoting=csv.QUOTE_NONE,escapechar='　')
        for code in remaining_train_lines:
            csv_ngram_train_writer.writerow([code])
    print("write train datset done")

In [12]:
#Output the ngram training/test datasets
def generateNgramFiles(seed,projectName,data,line_data,bias):
    """Run the file-level model and write the n-gram train/test files for one project.

    Steps: build the train/test split, train or load the random forest, keep
    the correctly predicted files, collect their lines with "<NUMBER>"
    placeholders removed, then write the test file followed by the train file.
    """
    train_X, train_y, test_X, test_y, divider, _vectorizer = getDatasetFromRawData(projectName,data, bias)
    forest = trainRFmodel(projectName,train_X, train_y, test_X, test_y,seed,bias)
    correct_files, _correct_pairs = getCorrectedPredictFileDict(forest,data,test_X,test_y,divider)
    ngram_lines, _ngram_pairs = getCorrectedPredictLineDict(correct_files,line_data, ngram=True)
    # The train file is derived from the test file, so the test file goes first.
    outputSlpTestFile(projectName,ngram_lines,seed)
    outputSlpTrainFile(projectName,seed)

In [13]:
#Generate the ngram rank score
def evalLineNgram(project,seed):
    """Attach the n-gram entropy scores to the lines of the LIME test file.

    Reads "entropy_<project>-<seed>.csv" and "lime_test_<project>-<seed>.csv"
    from `eval_file_path` and pairs their rows by position.

    Returns {changeId: {fileName: [(code, lineNumber, label, entropy), ...]}}.
    """
    file_suffix = project+'-'+str(seed)+'.csv'
    data_entropy = pd.read_csv(eval_file_path + '/entropy_'+file_suffix, dtype=None, sep=',',header=None).to_numpy()
    data_ngram_test = pd.read_csv(eval_file_path + '/lime_test_'+file_suffix, dtype=None, sep=',',header=None).to_numpy()

    ngram_eval_line = {}
    for position, test_row in enumerate(data_ngram_test):
        change_id = test_row[0]
        file_name = test_row[1]
        # Row i of the entropy file is taken as the score of row i of the test file.
        scored_line = (test_row[2], test_row[3], test_row[4], data_entropy[position][0])
        ngram_eval_line.setdefault(change_id, {}).setdefault(file_name, []).append(scored_line)
    return ngram_eval_line

In [14]:
#Generate the csv rows for our approach
def generateRawDataCsv(result,project,eval_dict_ngram,length_count_dict):
    """Append one raw line-level record per line to `result` and return it.

    `eval_dict_ngram` is {changeId: {fileName: [(code, lineNumber, label,
    ngramScore), ...]}}. The length score of a line is looked up in
    `length_count_dict` by its whitespace-separated token count (0 if absent).
    """
    for changeId, files in eval_dict_ngram.items():
        for fileName, scored_lines in files.items():
            for scored_line in scored_lines:
                code = scored_line[0]
                token_count = len(str(code).split())
                record = {
                    "dataset": project,
                    "changeId": changeId,
                    "fileName": fileName,
                    "lineNumber": scored_line[1],
                    "token": code,
                    "ngramScore": str(scored_line[3]),
                    "groundTruth": scored_line[2],
                    "lengthScore": length_count_dict.get(token_count,0),
                }
                result.append(record)
    return result

def generateLimeScoreDataCsv(result,project,feature_dict,line_predict_dict):
    """Append one record per distinct token of every line to `result` and return it.

    `line_predict_dict` is {changeId: {fileName: [(code, lineNumber, label),
    ...]}} and `feature_dict` is {changeId: {fileName: {token: limeScore}}}.
    The code of a line is lower-cased and split on whitespace; a token without
    a LIME score gets 0.
    """
    for changeId, pair in line_predict_dict.items():
        for fileName, value in pair.items():
            for element in value:
                # dict.fromkeys de-duplicates the tokens while keeping their
                # order in the line, so the output rows are the same on every
                # run (a set's order depends on string hashing).
                # NOTE(review): element[0] must be a string; a missing code
                # cell read back as NaN would fail here - confirm against the data.
                tokens = dict.fromkeys(element[0].lower().split())
                for token in tokens:
                    token_score = feature_dict[changeId][fileName].get(token,0)
                    result.append({"dataset": project, "changeId":changeId, "fileName": fileName, "lineNumber":element[1],"token":token,"limeScore":token_score})
    return result

In [15]:
#Generate the result for our approach
def result(seed,projectName,data,line_data,bias):
    """Build the raw line-level records and the LIME token-score records for one project.

    Runs the whole pipeline: train/test split, random forest (trained or
    loaded), correctly predicted files and their lines, LIME scores (computed
    or loaded), training line-length distribution, and the n-gram entropy
    scores read from `eval_file_path`.

    Returns (raw_data_result, lime_score_result), two lists of dicts.
    """
    #RF
    train_X, train_y, test_X, test_y, divider, data_count_vect = getDatasetFromRawData(projectName,data, bias)
    forest = trainRFmodel(projectName,train_X, train_y, test_X, test_y,seed,bias)
    correct_files, _correct_pairs = getCorrectedPredictFileDict(forest,data,test_X,test_y,divider)
    lines_by_file, _line_pairs = getCorrectedPredictLineDict(correct_files,line_data)
    lime_scores = getLimeExplainerScoreDict(projectName,correct_files,data_count_vect,train_X,forest,seed,bias)
    # Only the length distribution is needed from the four returned values.
    length_shares, _lengths, _train_lines, _train_files = getDistributionLineInTrain(data,line_data,divider)
    ngram_scores = evalLineNgram(projectName,seed)

    raw_data_result = generateRawDataCsv([],projectName,ngram_scores,length_shares)
    lime_score_result = generateLimeScoreDataCsv([],projectName,lime_scores,lines_by_file)
    return raw_data_result,lime_score_result

In [16]:
#Output the csv files at path: ./dataset/csv/
def outputcsv():
    """Run the pipeline for the studied projects and write the two result CSVs.

    Writes "csv_commented_lineLevel_raw.csv" (raw line-level records) and
    "csv_commented_limescore.csv" (LIME token scores) to ./dataset/csv/, each
    with a header row, unquoted fields and ';' as the escape character.
    """
    def _write_rows(csv_path, fieldnames, rows):
        # Write a header plus all rows; the with-block guarantees the file is
        # flushed and closed, and newline='' lets the csv module control line endings.
        print("generating csv, length:", len(rows))
        with open(csv_path,"w",newline='') as csv_file:
            writer = csv.DictWriter(csv_file,quoting=csv.QUOTE_NONE,escapechar=';', fieldnames= fieldnames)
            writer.writeheader()
            writer.writerows(rows)
        print("csv generated")

    raw_data_result_nova,lime_score_result_nova = result(2,"nova",data_nova,line_data_nova,5)
    #raw_data_result_ironic,lime_score_result_ironic = result(2,"ironic",data_ironic,line_data_ironic,-4)
    raw_data_result_base,lime_score_result_base = result(2,"base",data_base,line_data_base,6)
    raw_result = raw_data_result_nova + raw_data_result_base
    lime_result = lime_score_result_nova + lime_score_result_base

    # Make sure the output directory exists before writing.
    os.makedirs('./dataset/csv', exist_ok=True)

    _write_rows('./dataset/csv/csv_commented_lineLevel_raw.csv',
                ['dataset','changeId','fileName','lineNumber','token','ngramScore','groundTruth','lengthScore'],
                raw_result)
    # The reported length is now the LIME row count (it used to repeat the raw count).
    _write_rows('./dataset/csv/csv_commented_limescore.csv',
                ['dataset','changeId','fileName','lineNumber','token','limeScore'],
                lime_result)

In [17]:
#Generate the ngram training/test datasets for the studied projects.
#Arguments are (seed, projectName, file-level data, line-level data, bias);
#bias is the per-project offset added to the train/test split index.
generateNgramFiles(2,"nova",data_nova,line_data_nova,5)
# generateNgramFiles(2,"ironic",data_ironic,line_data_ironic,-4)
generateNgramFiles(2,"base",data_base,line_data_base,6)

<class 'sklearn.feature_extraction.text.CountVectorizer'>
<class 'sklearn.feature_extraction.text.CountVectorizer'>
AUC: 0.8396746203904555
Precision: 0.17938931297709923
Recall: 0.5222222222222223
F1: 0.26704545454545453
Confusion matrix: 
 [[2090  215]
 [  43   47]]
write test datset done
write train datset done
<class 'sklearn.feature_extraction.text.CountVectorizer'>
<class 'sklearn.feature_extraction.text.CountVectorizer'>
AUC: 0.7408735687889986
Precision: 0.11736334405144695
Recall: 0.4294117647058823
F1: 0.18434343434343436
Confusion matrix: 
 [[3335  549]
 [  97   73]]
write test datset done
write train datset done


In [18]:
#Generate the csv files for evaluation and data analysis in the R scripts.
#The "entropy_PROJECTNAME-SEED.csv" files produced by the ngram model must
#already be present in eval_file_path, because they are read during this step.
outputcsv()

<class 'sklearn.feature_extraction.text.CountVectorizer'>
<class 'sklearn.feature_extraction.text.CountVectorizer'>
AUC: 0.8396746203904555
Precision: 0.17938931297709923
Recall: 0.5222222222222223
F1: 0.26704545454545453
Confusion matrix: 
 [[2090  215]
 [  43   47]]
using python stopword
Training explainer
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
write feature output to pickle
<class 'sklearn.feature_extraction.text.CountVectorizer'>
<class 'sklearn.feature_extraction.text.CountVectorizer'>
AUC: 0.7408735687889986
Precision: 0.11736334405144695
Recall: 0.4294117647058823
F1: 0.18434343434343436
Confusion matrix: 
 [[3335  549]
 [  97   73]]
using c stopword
Training explainer
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
write feature 