In [4]:
import pandas as pd
import numpy as np
from abc import ABC, abstractmethod
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import roc_auc_score
from scipy.sparse import csr_matrix, hstack, vstack


class BaseLayerEstimator1(ABC):
    """Abstract base for estimators that expose ``train`` and ``predict``.

    Both methods are abstract, so a subclass has to override each of them
    before it can be instantiated.
    """

    @abstractmethod
    def train(self, x_train, y_train):
        """Fit the estimator on ``x_train`` with targets ``y_train``."""

    @abstractmethod
    def predict(self, x_train):
        """Return predictions for the supplied feature matrix."""
    
    

class OneVSOneReg(BaseLayerEstimator1):
    """Binary classifier on naive-Bayes-scaled features, one label at a time.

    A log-count ratio vector ``r[label]`` is computed for every label column
    when the object is constructed. ``train`` and ``predict`` multiply the
    features element-wise by ``r[label]`` and feed them to either a logistic
    regression or a calibrated linear SVC.

    Only a single fitted model is stored (``self.model``); every call to
    ``train`` replaces it, so ``predict`` must be called with the same
    ``label`` that was trained last.
    """

    def __init__(self, x_train, y_train, model='logistic'):
        """
        x_train: sparse matrix, raw tfidf
        y_train: dataframe, with only label columns. should be 6 columns in total
        model: only support logistic or svc
        """
        self.r = {}
        # setModelName validates the name (AssertionError on anything else).
        self.setModelName(model)

        # Per-label regularisation strength C, used only when training on the
        # one-vs-one subsample (see train()).
        self.param = {
            'logistic': {'identity_hate': 9.0,
                         'insult': 1.5,
                         'obscene': 1.0,
                         'severe_toxic': 4.0,
                         'threat': 9.0,
                         'toxic': 2.7},
            'svc': {'identity_hate': 0.9,
                    'insult': 0.15,
                    'obscene': 0.15,
                    'severe_toxic': 0.15,
                    'threat': 1.0,
                    'toxic': 0.29},
        }

        for col in y_train.columns:
            print('calculating naive bayes for {}'.format(col))
            # Use the same plain array for both classes so the boolean masks
            # are built identically.
            labels = y_train[col].values
            self.r[col] = np.log(self.pr(1, labels, x_train) / self.pr(0, labels, x_train))
        print('initializing done')
        print('OneVsOne is using {} kernel'.format(self.model_name))

    def setModelName(self, name):
        """Select the backend: ``'logistic'`` or ``'svc'``.

        Raises AssertionError for any other name. Takes effect on the next
        call to ``train``; an already fitted ``self.model`` is left untouched.
        """
        self.model_name = name
        assert self.model_name in ['logistic', 'svc']
        print('OneVsOne is using {} kernel'.format(self.model_name))

    def pr(self, y_i, y, train_features):
        """Return the add-one smoothed per-feature sum ratio for class ``y_i``.

        Sums the rows of ``train_features`` whose label equals ``y_i`` and
        divides (after adding 1 to numerator and denominator) by the number of
        such rows.
        """
        mask = np.array(y == y_i)
        p = train_features[mask].sum(0)
        return (p + 1) / (mask.sum() + 1)

    def oneVsOneSplit(self, x_train, y_train, label):
        """Build the subsample used for one-vs-one training.

        Keeps every row with label 1 and the first ``n_pos`` rows with label 0
        (in dataset order, not randomly drawn), stacks positives before
        negatives, and scales the features by ``r[label]``.

        Returns a tuple ``(x_nb, y_nb)`` of CSR features and a label array.
        """
        print('Starting One vs One dataset splitting')
        if isinstance(y_train, pd.Series):
            y_train = y_train.values
        pos_mask = np.array(y_train == 1)
        neg_mask = np.array(y_train == 0)

        model_train = x_train[pos_mask]
        y_model_train = y_train[pos_mask]
        n_pos = model_train.shape[0]

        # Truncate the negatives to the number of positives.
        non_model_train = x_train[neg_mask][:n_pos]
        y_non_model_train = y_train[neg_mask][:n_pos]

        x_model_stack = vstack([model_train, non_model_train])
        y_model_stack = np.concatenate([y_model_train, y_non_model_train])
        x_nb = x_model_stack.multiply(self.r[label]).tocsr()
        y_nb = y_model_stack
        print('splitting done!')
        return (x_nb, y_nb)

    def train(self, x_train, y_train, label, oneVSone=True):
        """Fit ``self.model`` for ``label``.

        x_train: sparse feature matrix (same columns as used in ``__init__``)
        y_train: Series or array of 0/1 targets for ``label``
        label: key into ``self.r`` (and ``self.param`` when ``oneVSone``)
        oneVSone: if True train on the subsample from ``oneVsOneSplit`` with
            the per-label C; otherwise train on every row with a fixed C.

        In both modes the features are scaled by ``r[label]``, because
        ``predict`` always applies that scaling to its input.
        """
        ### construct one vs one
        if oneVSone:
            x_nb, y_nb = self.oneVsOneSplit(x_train, y_train, label)
        else:
            print('Training on whole dataset. One on One is diabled!')
            y_nb = y_train.values if isinstance(y_train, pd.Series) else y_train
            # Train on the same representation predict() uses; fitting on the
            # raw matrix would leave train and test in different feature spaces.
            x_nb = x_train.multiply(self.r[label]).tocsr()

        ### start training
        # Compare by value: identity (`is`) on strings only works by interning
        # accident and would silently pick the SVC branch otherwise.
        if self.model_name == 'logistic':
            print('start training logistic regression')
            c_value = self.param['logistic'][label] if oneVSone else 0.25
            self.model = LogisticRegression(C=c_value)
        else:
            print('start training linear svc regression')
            c_value = self.param['svc'][label] if oneVSone else 0.02
            # LinearSVC has no predict_proba; the calibration wrapper adds it.
            self.model = CalibratedClassifierCV(LinearSVC(C=c_value))
        self.model.fit(x_nb, y_nb)
        print('training done')

    def predict(self, x_test, label):
        """Return positive-class probabilities for ``x_test``.

        ``x_test`` is scaled by ``r[label]`` and passed to the model fitted by
        the last ``train`` call. Raises AttributeError if ``train`` has not
        been called yet, since ``self.model`` is only created there.
        """
        print('applying naive bayes to dataset')
        x_nb_test = x_test.multiply(self.r[label]).tocsr()
        print('predicting')
        pred = self.model.predict_proba(x_nb_test)[:, 1]
        print('predicting done')
        return pred
    
##### example        
# aa = OneVSOneReg(train_tfidf, train[label_cols], model='logistic')
# aa.setModelName('svc')
# aa.train(train_tfidf,train['toxic'], 'toxic')
# aa.predict(test_tfidf, 'toxic')

In [1]:
from onevsone_data import onevsone_data_process

In [2]:
# Load the inputs through the project helper and unpack its four return values
# (presumably train features, label frame, test features and row ids —
# confirm against onevsone_data).
x_train_1v1, y_train_1v1, x_test_1v1, data_id_1v1 = onevsone_data_process()

loading data done!
fitting char


KeyboardInterrupt: 

In [6]:
# Build the estimator with the SVC backend; the constructor computes the
# per-label log-count ratios from the training data passed in here.
onevsone_svc = OneVSOneReg(x_train_1v1, y_train_1v1, model='svc')

OneVsOne is using svc kernel
calculating naive bayes for toxic
calculating naive bayes for severe_toxic
calculating naive bayes for obscene
calculating naive bayes for threat
calculating naive bayes for insult
calculating naive bayes for identity_hate
initializing done
OneVsOne is using svc kernel


In [8]:
# Fit for the 'toxic' label with the default one-vs-one split (all positives
# plus an equally sized slice of negatives).
onevsone_svc.train(x_train_1v1, y_train_1v1['toxic'], 'toxic')

Starting One vs One dataset splitting
splitting done!
start training linear svc regression
training done


In [11]:
# Positive-class probabilities for 'toxic'; the label has to match the one
# trained in the previous cell because only one fitted model is kept.
onevsone_svc.predict(x_test_1v1, 'toxic')

applying naive bayes to dataset
predicting
predicting done


array([ 0.99999993,  0.02563926,  0.25582663, ...,  0.00467807,
        0.08956339,  0.99991951])