In [2]:
import numpy as np
import pandas as pd
import sys
import re
import time
import datetime
from time import time, gmtime, strftime
from datetime import datetime
from hyperdash import monitor_cell  
import statistics
from tqdm import tqdm
from __future__ import division
from collections import Counter
from gensim.models.keyedvectors import KeyedVectors
from gensim.models import Word2Vec

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, cohen_kappa_score, roc_auc_score, roc_curve, auc, mutual_info_score, average_precision_score, precision_recall_curve
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize import word_tokenize as tokenizer

from ipykernel import kernelapp as app

# no need of "print" for several objects!!!
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

#pd.set_option('display.height', 1000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
#pd.set_option('display.width', 1000)

init_vec = True

2 possibilities:
- Load and merge samples and labels.
- Load pre-built dataset.

(go to either of them with Ctrl+F)

#### Load and merge samples and labels.

In [2]:
path = '/media/hdd/salaun/wikiedit/annotations/data_v1/'

file_list = ['00_09_42',
             '00_09_73',
             '10_19_42',
             '20_24_42',
             '25_29_42',
             '30_34_42',
             '35_39_42',
             '40_44_42_uniq_parag',
             '40_44_42',
             '40_44_73',
             '45_49_42',
             '45_49_73',
             '50_54_42',
             '50_54_73',
             '55_59_42',
             '55_59_73',
             '60_65_42',
             '66-69_72_42',
             '66-69_72_73'
              ]
df_sample = pd.concat([pd.read_csv(path + file + '_sample.tsv', sep="\t") for file in file_list], ignore_index=True)
df_post_annot = pd.concat([pd.read_csv(path + file + '_post_annot_S.csv', sep=';', header=None) for file in file_list], ignore_index=True)
df_post_annot.columns = ['index', 'sample_id', 'raw_annot']

len(file_list)
df_sample.shape
df_post_annot.shape

19

(1900, 35)

(1900, 3)

In [3]:
# get dummies
print('Length df_sample:', len(df_sample))
print('Length df_post_annot:', len(df_post_annot))

labels_lists = ['ortho_gram_typo','wiki_formatting', 'vandal', 'reordering', 
              'revert_vandal', 'revert_other', 'content_remove', 'content_add',
              'nbr_wr', 'nbr_rw', 'nbr_x', 
              'semant_simil', 'semant_diff', 
              'other']

gross_labels_list = ['semant', 'nbr', 'semant_or_nbr']

for label in labels_lists:
    df_post_annot[label] = df_post_annot.raw_annot.str.contains(label)


for label in gross_labels_list: #initialization
    df_post_annot[label] = False


for i in range(0, df_post_annot.shape[0]):
    
    if df_post_annot.semant_simil.iloc[i] or df_post_annot.semant_diff.iloc[i]:
        df_post_annot.loc[i,'semant'] = True

    if df_post_annot.nbr_wr.iloc[i] or df_post_annot.nbr_rw.iloc[i] or df_post_annot.nbr_x.iloc[i]:
        df_post_annot.loc[i,'nbr'] = True

    if df_post_annot.semant.iloc[i]==True or df_post_annot.nbr.iloc[i]==True:
        df_post_annot.loc[i,'semant_or_nbr'] = True

df_sample.shape
df_post_annot.shape

Length df_sample: 1900
Length df_post_annot: 1900


(1900, 35)

(1900, 20)

In [4]:
print("DISTRIBUTION OF LABELS OVER ALL ANNOTATIONS \n")

df_annot_distrib = pd.DataFrame(index=labels_lists, columns=['absolute number', 'share of all observations'])
df_annot_distrib_bis = pd.DataFrame(index=gross_labels_list, columns=['absolute number', 'share of all observations'])

for label in labels_lists:
    df_annot_distrib.loc[label,'absolute number'] = (df_post_annot[label]==1).sum()
    df_annot_distrib.loc[label,'share of all observations'] = 100*((df_post_annot[label]==1).sum())/len(df_post_annot)
df_annot_distrib 
    
for label in gross_labels_list:
    df_annot_distrib_bis.loc[label,'absolute number'] = (df_post_annot[label]==1).sum()
    df_annot_distrib_bis.loc[label,'share of all observations'] = 100*((df_post_annot[label]==1).sum())/len(df_post_annot)
df_annot_distrib_bis   

DISTRIBUTION OF LABELS OVER ALL ANNOTATIONS 



Unnamed: 0,absolute number,share of all observations
ortho_gram_typo,506,26.6316
wiki_formatting,696,36.6316
vandal,114,6.0
reordering,91,4.78947
revert_vandal,12,0.631579
revert_other,6,0.315789
content_remove,116,6.10526
content_add,477,25.1053
nbr_wr,39,2.05263
nbr_rw,7,0.368421


Unnamed: 0,absolute number,share of all observations
semant,355,18.6842
nbr,48,2.52632
semant_or_nbr,403,21.2105


#### Merge samples and labels.

In [5]:
df_XY = pd.concat([df_sample, df_post_annot.loc[:,'raw_annot':'semant_or_nbr']], axis=1)
df_XY.shape
df_XY.columns
df_XY = df_XY.fillna('')
df_XY.tail(2)

(1900, 53)

Index(['Unnamed: 0', 'comment', 'filt_bot', 'filt_coher', 'filt_confli',
       'filt_contradic', 'filt_erreur', 'filt_erron', 'filt_faux',
       'filt_frappe', 'filt_gramma', 'filt_ortho', 'filt_revert', 'filt_sens',
       'filt_tromp', 'filt_typo', 'filt_vandalisme', 'filt_vraise', 'format',
       'id', 'id_file', 'id_modif', 'label_incoherence', 'minor', 'model',
       'modif', 'modif_add', 'modif_remove', 'parentid', 'registered',
       'sample_id', 'timestamp', 'title', 'user_id', 'username', 'raw_annot',
       'ortho_gram_typo', 'wiki_formatting', 'vandal', 'reordering',
       'revert_vandal', 'revert_other', 'content_remove', 'content_add',
       'nbr_wr', 'nbr_rw', 'nbr_x', 'semant_simil', 'semant_diff', 'other',
       'semant', 'nbr', 'semant_or_nbr'],
      dtype='object')

Unnamed: 0.1,Unnamed: 0,comment,filt_bot,filt_coher,filt_confli,filt_contradic,filt_erreur,filt_erron,filt_faux,filt_frappe,filt_gramma,filt_ortho,filt_revert,filt_sens,filt_tromp,filt_typo,filt_vandalisme,filt_vraise,format,id,id_file,id_modif,label_incoherence,minor,model,modif,modif_add,modif_remove,parentid,registered,sample_id,timestamp,title,user_id,username,raw_annot,ortho_gram_typo,wiki_formatting,vandal,reordering,revert_vandal,revert_other,content_remove,content_add,nbr_wr,nbr_rw,nbr_x,semant_simil,semant_diff,other,semant,nbr,semant_or_nbr
1898,98,,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,text/x-wiki,95886840,p5847438p6039053,1279779,False,False,wikitext,"- '''Michael Loewe''', de son vrai nom Mihai L...","'''Michael Loewe''', de son vrai nom Mihai Leu...","'''Michael Loewe''', de son vrai nom Mihai Leu...",95886829.0,True,p5847438p6039053_1279779,2013-08-17T21:34:39Z,Michael Loewe,31767,Skblzz1,ortho_gram_typo,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1899,99,/* Synopsis */,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,text/x-wiki,74105978,p5847438p6039053,1046153,False,False,wikitext,- Un homme et deux femmes se mettent à table. ...,Un homme et deux femmes se mettent à table. Au...,Un homme et deux femmes se mettent à table. Au...,74105972.0,True,p5847438p6039053_1046153,2012-01-09T02:18:33Z,Le Repas fantastique,418936,Celette,ortho_gram_typo,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


#### Load pre-built dataset.

In [None]:
#df_XY.to_csv(path + 'full_dataset_with_all_labels.tsv', sep='\t')

In [6]:
df_XY = pd.read_csv(path + 'full_dataset_with_all_labels.tsv', sep='\t')
df_XY.columns
df_XY = df_XY.loc[:,'comment':'semant_or_nbr']
df_XY.shape

Index(['Unnamed: 0', 'Unnamed: 0.1', 'comment', 'filt_bot', 'filt_coher',
       'filt_confli', 'filt_contradic', 'filt_erreur', 'filt_erron',
       'filt_faux', 'filt_frappe', 'filt_gramma', 'filt_ortho', 'filt_revert',
       'filt_sens', 'filt_tromp', 'filt_typo', 'filt_vandalisme',
       'filt_vraise', 'format', 'id', 'id_file', 'id_modif',
       'label_incoherence', 'minor', 'model', 'modif', 'modif_add',
       'modif_remove', 'parentid', 'registered', 'sample_id', 'timestamp',
       'title', 'user_id', 'username', 'raw_annot', 'ortho_gram_typo',
       'wiki_formatting', 'vandal', 'reordering', 'revert_vandal',
       'revert_other', 'content_remove', 'content_add', 'nbr_wr', 'nbr_rw',
       'nbr_x', 'semant_simil', 'semant_diff', 'other', 'semant', 'nbr',
       'semant_or_nbr'],
      dtype='object')

(1900, 52)

### Compute vector difference for each pair of strings
Vector size = 300

In [7]:
%%time

# use embeddings > sum(vecs_s1) - sum(vecs_s2) = (1,300)

path = '/media/hdd/sileo/embeddings/'

if init_vec:
    word_vectors = KeyedVectors.load_word2vec_format(path+'wiki.fr.vec', binary=False, encoding='utf8')
    word_vectors_en = KeyedVectors.load_word2vec_format(path+'wiki-news-300d-1M.vec', binary=False, encoding='utf8')
    #word_vectors = KeyedVectors.load_word2vec_format(path+'wiki.fr.vec', binary=False, encoding='utf8')  # C text format
    #word_vectors_en_sub = KeyedVectors.load_word2vec_format('/media/hdd/sileo/embeddings/wiki-news-300d-1M-subword.vec', binary=False, encoding='utf8')  # C text format
init_vec = False

CPU times: user 6min 9s, sys: 2.72 s, total: 6min 11s
Wall time: 6min 8s


In [8]:
model = Word2Vec.load('/media/hdd/sileo/wiki-fr/text/full_filtered_language_model_300')

In [9]:
# diff-vec add - remove

k=0
g=0
e=0
v=0
vec_diff_list = []
string_pattern = '([a-zA-Z0-9ÀàÂâÆæÇçÈèÉéÊêËëÎîÏïÔôŒœÙùÛûÜü«»€₣]+|ref>|/ref>|\n\t|\'|[.,\/#\|!$%\^&\*;:<>\[\]{}=\-_`~()])'
for i in range(0, df_XY.shape[0]):
    
    remove_token_list = []
    add_token_list = []
    
    #_______________________________________________________________
    # sum vectors of all elements of past version text

    token_list = []
    s = df_XY.modif_remove.iloc[i].lower()
    token_list = re.findall(string_pattern, s)  
    
    for word in token_list:
        #try:
        #if word in word_vectors:
        if word in model.wv.vocab:
            g+=1
            #remove_token_list.append(np.reshape(word_vectors[word], (1,300)))
            remove_token_list.append(np.reshape(model.wv.word_vec(word), (1,300)))
        elif word in word_vectors_en:
            k+=1
            remove_token_list.append(np.reshape(word_vectors_en[word], (1,300)))
        else:
            e+=1
            remove_token_list.append(np.zeros((1,300)))
            
    remove_matrix = np.vstack(remove_token_list)
    sum_remove = np.sum(remove_matrix, axis=0, keepdims=True)

    #_______________________________________________________________    
    # sum vectors of all elements of new version text

    token_list = []
    s = df_XY.modif_add.iloc[i].lower()
    token_list = re.findall(string_pattern, s)  
    
    for word in token_list:
        #try:
        #if word in word_vectors:
        if word in model.wv.vocab:
            g+=1
            #add_token_list.append(np.reshape(word_vectors[word], (1,300)))
            add_token_list.append(np.reshape(model.wv.word_vec(word), (1,300)))
        elif word in word_vectors_en:
            k+=1
            add_token_list.append(np.reshape(word_vectors_en[word], (1,300)))
        else:
            e+=1
            add_token_list.append(np.zeros((1,300)))

    add_matrix = np.vstack(add_token_list)
    sum_add = np.sum(add_matrix, axis=0, keepdims=True)
    
    #_______________________________________________________________     
    # difference between both sums

    vec_diff = sum_add - sum_remove
    vec_diff_list.append(vec_diff)


vec_diff_matrix = np.vstack(vec_diff_list)
print(vec_diff_matrix.shape)
print('FR voc: {:,}'.format(g))
print('EN voc: {:,}'.format(k))
print('empty vec: {:,}'.format(v))
print('Missing voc: {:,}'.format(e))


"""
(1900, 300)
FR voc: 681,623
EN voc: 150,884
empty vec: 0
Missing voc: 5,305
CPU times: user 3.38 s, sys: 44 ms, total: 3.42 s
Wall time: 3.41 s
"""

(1900, 300)
FR voc: 765,595
EN voc: 65,607
empty vec: 0
Missing voc: 6,610


'\n(1900, 300)\nFR voc: 681,623\nEN voc: 150,884\nempty vec: 0\nMissing voc: 5,305\nCPU times: user 3.38 s, sys: 44 ms, total: 3.42 s\nWall time: 3.41 s\n'

Add features (300 vector coordinates) to the dataframe.

In [10]:
list_col = []
# generate columns labels for each of the 300 dimensions
for i in range(0, 300):
    list_col.append('vdd_{}'.format(i))

vdd = pd.DataFrame(vec_diff_matrix) # vec_diff_dim
vdd.columns = list_col

df_XY = pd.concat([df_XY, vdd], axis=1)
df_XY.shape

(1900, 352)

Generate development and test sets.

In [11]:
df_train = df_XY.sample(frac=0.8)
df_test = df_XY.drop(df_train.index)

df_train = df_train.fillna('')
df_test = df_test.fillna('')

df_train.shape
df_test.shape

(1520, 352)

(380, 352)

#### Check length of train strings.

In [12]:
pd.options.display.float_format = '{:,.2f}'.format

def noun_count(s):
    # s = regex.sub(' ', s) # uncomment if want to remove non-alphanumeric character
    words_len = len(re.findall(string_pattern, s)) #s.split())
    return words_len

def length_summary(df):
    print("Summary statistics modif_remove:")
    print('{}'.format(df.modif_remove.apply(noun_count).describe()), '\n')
    print("Summary statistics modif_add:")
    print('{}'.format(df.modif_add.apply(noun_count).describe()), '\n')

In [13]:
print('CHECK LENGTH OF TRAIN STRINGS \n')
length_summary(df_train)

CHECK LENGTH OF TRAIN STRINGS 

Summary statistics modif_remove:
count    1,520.00
mean       158.04
std        507.49
min          1.00
25%         50.00
50%         94.00
75%        171.00
max     18,733.00
Name: modif_remove, dtype: float64 

Summary statistics modif_add:
count     1,520.00
mean        304.48
std       5,137.61
min           1.00
25%          59.00
50%         105.00
75%         186.00
max     199,405.00
Name: modif_add, dtype: float64 



In [14]:
print('CHECK LENGTH OF TEST STRINGS \n')
length_summary(df_test)

CHECK LENGTH OF TEST STRINGS 

Summary statistics modif_remove:
count     380.00
mean      168.35
std       294.01
min         4.00
25%        61.00
50%       106.00
75%       184.00
max     4,708.00
Name: modif_remove, dtype: float64 

Summary statistics modif_add:
count     380.00
mean      186.38
std       321.16
min         4.00
25%        68.00
50%       117.00
75%       204.25
max     5,188.00
Name: modif_add, dtype: float64 



#### Check diversity of classes between both sets

In [15]:
labels_list = ['ortho_gram_typo', 'wiki_formatting', 'vandal', 'reordering', 'revert_vandal', 'revert_other', 'content_remove', 'content_add', 'nbr_wr', 'nbr_rw', 'nbr_x', 'semant_simil', 'semant_diff', 'other']
labels_list_expand = labels_list +['nbr', 'semant', 'semant_or_nbr']

def labels_sum_stats(df):
    
    df_sum_stats = pd.DataFrame(index=labels_list_expand, columns=['Absolute number', 'Share of all observations'])
    
    for label in labels_list_expand:
        df_sum_stats.loc[label, 'Absolute number'] = df[label].sum()
        df_sum_stats.loc[label, 'Share of all observations'] = df[label].sum() / len(df)
        
    return df_sum_stats

In [16]:
labels_sum_stats(df_train)

Unnamed: 0,Absolute number,Share of all observations
ortho_gram_typo,404,0.27
wiki_formatting,556,0.37
vandal,88,0.06
reordering,72,0.05
revert_vandal,12,0.01
revert_other,6,0.0
content_remove,94,0.06
content_add,387,0.25
nbr_wr,34,0.02
nbr_rw,4,0.0


In [17]:
labels_sum_stats(df_test)

Unnamed: 0,Absolute number,Share of all observations
ortho_gram_typo,102,0.27
wiki_formatting,140,0.37
vandal,26,0.07
reordering,19,0.05
revert_vandal,0,0.0
revert_other,0,0.0
content_remove,22,0.06
content_add,90,0.24
nbr_wr,5,0.01
nbr_rw,3,0.01


#### Set features

In [18]:
def features_labels_builder(df):

    X = df.loc[:, 'vdd_0':'vdd_299'].as_matrix()#.reshape((df_train.shape[0],300))
    '''
    X = np.concatenate((
                          #X_train_modif_diff, 
                          X_train_comment,
                          #X_train_ratio_punct_count_by_diff_len,
                          X_train_registered,
                          #X_train_delta_length,
                          #X_train_delta_nbr_words,
                          #X_train_ratio_UPPERCASE_diff,
                          X_train_vdd,
                          X_train_vdrem,
                          X_train_vdadd
                         ), axis=1)
    '''
    
    y = df.loc[:,'ortho_gram_typo':'other'].as_matrix().astype(int) # semant_or_nbr
    print('X shape:', X.shape)
    print('y shape:', y.shape)
    return X, y

In [19]:
X_train, y_train = features_labels_builder(df_train)
X_test, y_test = features_labels_builder(df_test)

X shape: (1520, 300)
y shape: (1520, 14)
X shape: (380, 300)
y shape: (380, 14)


In [20]:
import warnings
warnings.filterwarnings('ignore')

kf = KFold(n_splits=3, shuffle=True)

list_labels = [
            'ortho_gram_typo',
            'wiki_formatting',
            'vandal',
            'reordering',
            'revert_vandal',

            'revert_other',
            'content_remove',
            'content_add',
            'nbr_wr',
            'nbr_rw',

            'nbr_x',
            'semant_simil',
            'semant_diff',
            'other',

            #'semant',
            #'nbr',
            #'semant_or_nbr'
            ]

metrics_list = ['precision', 'recall', 'f1', 'roc_auc'] 




def cross_val_fn(X, y, clf, jobs):
    
    df_scores = pd.DataFrame(index=list_labels, columns=metrics_list)
    
    for i in range(len(list_labels)):
                
        for j in metrics_list:
            try:
                cv_array = cross_val_score(clf, X, y[:,i], cv=kf, n_jobs=jobs, scoring=j)
                df_scores.loc[list_labels[i], j] = '{:.2f} (+/- {:.2f})'.format(cv_array.mean(), cv_array.std())
            except:
                df_scores.loc[list_labels[i], j] = 'ill-defined'
    
    return df_scores




def model_testing(X_train, y_train, X_test, y_test, clf):
    
    df_scores_test = pd.DataFrame(index=list_labels, columns=metrics_list)
    
    for i in range(len(list_labels)):
    
        clf.fit(X_train,y_train[:,i])
        y_pred = clf.predict(X_test)
        y_pred_proba = clf.predict_proba(X_test)
        
        df_scores_test.loc[list_labels[i], 'precision'] = '{:.2f}'.format(precision_score(y_test[:,i], y_pred))
        df_scores_test.loc[list_labels[i], 'recall'] = '{:.2f}'.format(recall_score(y_test[:,i], y_pred))
        df_scores_test.loc[list_labels[i], 'f1'] = '{:.2f}'.format(f1_score(y_test[:,i], y_pred))
        try:
            df_scores_test.loc[list_labels[i], 'roc_auc'] = '{:.2f}'.format(roc_auc_score(y_test[:,i], y_pred_proba[:,1], average='weighted'))
        except:
            df_scores_test.loc[list_labels[i], 'roc_auc'] = 'ill-defined'
            
    return df_scores_test

#### Test of different models for identifying each label for all samples

In [21]:
%%time
clf = LogisticRegression(solver='sag')
cv_table = cross_val_fn(X_train, y_train, clf, 1)
test_table = model_testing(X_train, y_train, X_test, y_test, clf)

CPU times: user 2min 16s, sys: 656 ms, total: 2min 17s
Wall time: 35.6 s


In [22]:
cv_table
test_table

Unnamed: 0,precision,recall,f1,roc_auc
ortho_gram_typo,0.31 (+/- 0.02),0.36 (+/- 0.03),0.32 (+/- 0.02),0.60 (+/- 0.01)
wiki_formatting,0.42 (+/- 0.04),0.50 (+/- 0.12),0.45 (+/- 0.08),0.56 (+/- 0.11)
vandal,0.06 (+/- 0.03),0.42 (+/- 0.16),0.11 (+/- 0.01),0.53 (+/- 0.09)
reordering,0.02 (+/- 0.00),0.23 (+/- 0.11),0.04 (+/- 0.03),0.45 (+/- 0.11)
revert_vandal,0.02 (+/- 0.01),0.59 (+/- 0.07),0.03 (+/- 0.02),0.60 (+/- 0.10)
revert_other,0.01 (+/- 0.01),0.72 (+/- 0.21),0.02 (+/- 0.01),ill-defined
content_remove,0.16 (+/- 0.05),0.75 (+/- 0.09),0.26 (+/- 0.07),0.77 (+/- 0.03)
content_add,0.25 (+/- 0.11),0.39 (+/- 0.18),0.33 (+/- 0.09),0.49 (+/- 0.15)
nbr_wr,0.04 (+/- 0.01),0.44 (+/- 0.08),0.06 (+/- 0.02),0.64 (+/- 0.08)
nbr_rw,0.00 (+/- 0.00),0.11 (+/- 0.16),0.00 (+/- 0.00),0.62 (+/- 0.14)


Unnamed: 0,precision,recall,f1,roc_auc
ortho_gram_typo,0.28,0.35,0.31,0.57
wiki_formatting,0.39,0.47,0.43,0.54
vandal,0.06,0.38,0.1,0.46
reordering,0.04,0.26,0.07,0.38
revert_vandal,0.0,0.0,0.0,ill-defined
revert_other,0.0,0.0,0.0,ill-defined
content_remove,0.11,0.59,0.18,0.69
content_add,0.19,0.3,0.23,0.37
nbr_wr,0.03,0.6,0.05,0.66
nbr_rw,0.02,0.67,0.03,0.69


In [23]:
%%time
clf = LogisticRegression(solver='sag', C=0.01)
cv_table = cross_val_fn(X_train, y_train, clf, 1)
test_table = model_testing(X_train, y_train, X_test, y_test, clf)

CPU times: user 2min 16s, sys: 608 ms, total: 2min 17s
Wall time: 35.4 s


In [24]:
cv_table
test_table

Unnamed: 0,precision,recall,f1,roc_auc
ortho_gram_typo,0.30 (+/- 0.01),0.38 (+/- 0.02),0.34 (+/- 0.00),0.58 (+/- 0.02)
wiki_formatting,0.42 (+/- 0.07),0.49 (+/- 0.14),0.47 (+/- 0.09),0.56 (+/- 0.11)
vandal,0.06 (+/- 0.01),0.50 (+/- 0.12),0.12 (+/- 0.04),0.50 (+/- 0.06)
reordering,0.03 (+/- 0.02),0.19 (+/- 0.06),0.05 (+/- 0.01),0.44 (+/- 0.05)
revert_vandal,0.02 (+/- 0.01),0.58 (+/- 0.12),0.03 (+/- 0.01),0.72 (+/- 0.18)
revert_other,0.01 (+/- 0.01),0.83 (+/- 0.24),0.01 (+/- 0.00),ill-defined
content_remove,0.15 (+/- 0.02),0.74 (+/- 0.06),0.25 (+/- 0.02),0.75 (+/- 0.03)
content_add,0.26 (+/- 0.09),0.42 (+/- 0.17),0.33 (+/- 0.13),0.51 (+/- 0.14)
nbr_wr,0.03 (+/- 0.01),0.37 (+/- 0.06),0.05 (+/- 0.03),0.61 (+/- 0.01)
nbr_rw,0.00 (+/- 0.00),0.17 (+/- 0.24),0.00 (+/- 0.01),0.57 (+/- 0.08)


Unnamed: 0,precision,recall,f1,roc_auc
ortho_gram_typo,0.29,0.36,0.32,0.57
wiki_formatting,0.39,0.47,0.42,0.54
vandal,0.06,0.38,0.1,0.46
reordering,0.04,0.26,0.07,0.38
revert_vandal,0.0,0.0,0.0,ill-defined
revert_other,0.0,0.0,0.0,ill-defined
content_remove,0.11,0.59,0.18,0.69
content_add,0.19,0.3,0.23,0.37
nbr_wr,0.03,0.6,0.05,0.66
nbr_rw,0.02,0.67,0.03,0.69


In [25]:
%%time
clf = LogisticRegression(solver='sag', C=0.001)
cv_table = cross_val_fn(X_train, y_train, clf, 1)
test_table = model_testing(X_train, y_train, X_test, y_test, clf)

CPU times: user 2min 15s, sys: 696 ms, total: 2min 16s
Wall time: 35.1 s


In [26]:
cv_table
test_table

Unnamed: 0,precision,recall,f1,roc_auc
ortho_gram_typo,0.30 (+/- 0.03),0.36 (+/- 0.04),0.35 (+/- 0.03),0.59 (+/- 0.03)
wiki_formatting,0.41 (+/- 0.09),0.49 (+/- 0.14),0.44 (+/- 0.11),0.55 (+/- 0.11)
vandal,0.06 (+/- 0.02),0.40 (+/- 0.06),0.09 (+/- 0.02),0.46 (+/- 0.09)
reordering,0.04 (+/- 0.02),0.16 (+/- 0.06),0.05 (+/- 0.01),0.45 (+/- 0.04)
revert_vandal,0.02 (+/- 0.01),0.54 (+/- 0.19),0.03 (+/- 0.01),0.57 (+/- 0.11)
revert_other,0.01 (+/- 0.00),0.89 (+/- 0.16),0.02 (+/- 0.02),0.69 (+/- 0.22)
content_remove,0.16 (+/- 0.02),0.74 (+/- 0.08),0.25 (+/- 0.03),0.77 (+/- 0.03)
content_add,0.26 (+/- 0.07),0.36 (+/- 0.18),0.32 (+/- 0.11),0.48 (+/- 0.16)
nbr_wr,0.03 (+/- 0.00),0.44 (+/- 0.10),0.07 (+/- 0.01),0.60 (+/- 0.04)
nbr_rw,0.00 (+/- 0.00),0.00 (+/- 0.00),0.00 (+/- 0.01),ill-defined


Unnamed: 0,precision,recall,f1,roc_auc
ortho_gram_typo,0.28,0.35,0.31,0.57
wiki_formatting,0.39,0.47,0.43,0.54
vandal,0.06,0.38,0.1,0.46
reordering,0.04,0.26,0.07,0.39
revert_vandal,0.0,0.0,0.0,ill-defined
revert_other,0.0,0.0,0.0,ill-defined
content_remove,0.11,0.59,0.18,0.69
content_add,0.19,0.3,0.23,0.37
nbr_wr,0.03,0.6,0.05,0.66
nbr_rw,0.02,0.67,0.03,0.69


In [27]:
%%time
clf = DecisionTreeClassifier()
cv_table = cross_val_fn(X_train, y_train, clf, 1)
test_table = model_testing(X_train, y_train, X_test, y_test, clf)

CPU times: user 23.5 s, sys: 0 ns, total: 23.5 s
Wall time: 23.1 s


In [28]:
cv_table
test_table

Unnamed: 0,precision,recall,f1,roc_auc
ortho_gram_typo,0.32 (+/- 0.03),0.41 (+/- 0.02),0.38 (+/- 0.02),0.51 (+/- 0.02)
wiki_formatting,0.58 (+/- 0.03),0.57 (+/- 0.02),0.59 (+/- 0.02),0.68 (+/- 0.02)
vandal,0.07 (+/- 0.05),0.12 (+/- 0.02),0.10 (+/- 0.07),0.52 (+/- 0.04)
reordering,0.09 (+/- 0.05),0.15 (+/- 0.09),0.11 (+/- 0.01),0.58 (+/- 0.04)
revert_vandal,0.00 (+/- 0.00),0.00 (+/- 0.00),0.00 (+/- 0.00),0.50 (+/- 0.00)
revert_other,0.00 (+/- 0.00),0.00 (+/- 0.00),0.00 (+/- 0.00),0.50 (+/- 0.00)
content_remove,0.28 (+/- 0.04),0.46 (+/- 0.03),0.24 (+/- 0.04),0.64 (+/- 0.04)
content_add,0.40 (+/- 0.02),0.46 (+/- 0.03),0.39 (+/- 0.05),0.62 (+/- 0.02)
nbr_wr,0.03 (+/- 0.04),0.03 (+/- 0.04),0.05 (+/- 0.04),0.53 (+/- 0.03)
nbr_rw,0.00 (+/- 0.00),0.00 (+/- 0.00),0.00 (+/- 0.00),ill-defined


Unnamed: 0,precision,recall,f1,roc_auc
ortho_gram_typo,0.34,0.42,0.38,0.55
wiki_formatting,0.57,0.64,0.6,0.67
vandal,0.18,0.19,0.19,0.55
reordering,0.07,0.11,0.08,0.50
revert_vandal,0.0,0.0,0.0,ill-defined
revert_other,0.0,0.0,0.0,ill-defined
content_remove,0.15,0.23,0.18,0.57
content_add,0.38,0.44,0.41,0.60
nbr_wr,0.0,0.0,0.0,0.49
nbr_rw,0.0,0.0,0.0,0.50


In [29]:
%%time
clf = RandomForestClassifier(n_estimators=10)
cv_table = cross_val_fn(X_train, y_train, clf, 1)
test_table = model_testing(X_train, y_train, X_test, y_test, clf)

CPU times: user 9.38 s, sys: 0 ns, total: 9.38 s
Wall time: 9.38 s


In [30]:
cv_table
test_table

Unnamed: 0,precision,recall,f1,roc_auc
ortho_gram_typo,0.46 (+/- 0.09),0.18 (+/- 0.06),0.30 (+/- 0.02),0.60 (+/- 0.02)
wiki_formatting,0.73 (+/- 0.05),0.53 (+/- 0.02),0.61 (+/- 0.04),0.79 (+/- 0.02)
vandal,0.00 (+/- 0.00),0.01 (+/- 0.01),0.00 (+/- 0.00),0.59 (+/- 0.03)
reordering,0.00 (+/- 0.00),0.02 (+/- 0.03),0.09 (+/- 0.01),0.65 (+/- 0.02)
revert_vandal,0.00 (+/- 0.00),0.00 (+/- 0.00),0.00 (+/- 0.00),0.60 (+/- 0.04)
revert_other,0.00 (+/- 0.00),0.00 (+/- 0.00),0.00 (+/- 0.00),0.52 (+/- 0.09)
content_remove,0.49 (+/- 0.15),0.12 (+/- 0.05),0.29 (+/- 0.06),0.77 (+/- 0.02)
content_add,0.54 (+/- 0.04),0.28 (+/- 0.05),0.34 (+/- 0.03),0.76 (+/- 0.01)
nbr_wr,0.00 (+/- 0.00),0.00 (+/- 0.00),0.00 (+/- 0.00),0.72 (+/- 0.08)
nbr_rw,0.00 (+/- 0.00),0.00 (+/- 0.00),0.00 (+/- 0.00),ill-defined


Unnamed: 0,precision,recall,f1,roc_auc
ortho_gram_typo,0.53,0.25,0.34,0.64
wiki_formatting,0.78,0.55,0.64,0.79
vandal,0.0,0.0,0.0,0.66
reordering,0.0,0.0,0.0,0.64
revert_vandal,0.0,0.0,0.0,ill-defined
revert_other,0.0,0.0,0.0,ill-defined
content_remove,0.14,0.05,0.07,0.75
content_add,0.58,0.29,0.39,0.79
nbr_wr,0.0,0.0,0.0,0.68
nbr_rw,0.0,0.0,0.0,0.47


In [31]:
%%time
clf = RandomForestClassifier(n_estimators=100)
cv_table = cross_val_fn(X_train, y_train, clf, 1)
test_table = model_testing(X_train, y_train, X_test, y_test, clf)

CPU times: user 1min 27s, sys: 0 ns, total: 1min 27s
Wall time: 1min 27s


In [32]:
cv_table
test_table

Unnamed: 0,precision,recall,f1,roc_auc
ortho_gram_typo,0.61 (+/- 0.07),0.14 (+/- 0.01),0.24 (+/- 0.02),0.65 (+/- 0.00)
wiki_formatting,0.76 (+/- 0.02),0.61 (+/- 0.01),0.69 (+/- 0.01),0.83 (+/- 0.01)
vandal,0.00 (+/- 0.00),0.00 (+/- 0.00),0.00 (+/- 0.00),0.64 (+/- 0.03)
reordering,0.11 (+/- 0.16),0.03 (+/- 0.02),0.05 (+/- 0.04),0.74 (+/- 0.04)
revert_vandal,0.00 (+/- 0.00),0.00 (+/- 0.00),0.00 (+/- 0.00),0.44 (+/- 0.13)
revert_other,0.00 (+/- 0.00),0.00 (+/- 0.00),0.00 (+/- 0.00),ill-defined
content_remove,0.49 (+/- 0.10),0.20 (+/- 0.02),0.28 (+/- 0.05),0.85 (+/- 0.03)
content_add,0.63 (+/- 0.01),0.36 (+/- 0.04),0.44 (+/- 0.03),0.80 (+/- 0.02)
nbr_wr,0.00 (+/- 0.00),0.00 (+/- 0.00),0.00 (+/- 0.00),0.81 (+/- 0.01)
nbr_rw,0.00 (+/- 0.00),0.00 (+/- 0.00),0.00 (+/- 0.00),ill-defined


Unnamed: 0,precision,recall,f1,roc_auc
ortho_gram_typo,0.56,0.15,0.23,0.62
wiki_formatting,0.8,0.59,0.68,0.82
vandal,0.0,0.0,0.0,0.71
reordering,0.0,0.0,0.0,0.64
revert_vandal,0.0,0.0,0.0,ill-defined
revert_other,0.0,0.0,0.0,ill-defined
content_remove,0.4,0.18,0.25,0.81
content_add,0.67,0.43,0.53,0.83
nbr_wr,0.0,0.0,0.0,0.79
nbr_rw,0.0,0.0,0.0,0.51
