In [6]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import roc_auc_score
from scipy.sparse import csr_matrix, hstack, vstack


In [2]:
from toolz import itertoolz, compose
from toolz.curried import map as cmap, sliding_window, pluck
from sklearn.feature_extraction.text import CountVectorizer

class SkipGramCountVectorizer(CountVectorizer):
    """
    To vectorize text with skip-grams in scikit-learn simply passing the skip gram tokens as the vocabulary 
    to CountVectorizer will not work. You need to modify the way tokens are processed which can be done with 
    a custom analyzer. Below is an example vectorizer that produces 1-skip-2-grams
    """
    def build_analyzer(self):    
        preprocess = self.build_preprocessor()
        stop_words = self.get_stop_words()
        tokenize = self.build_tokenizer()
        return lambda doc: self._word_skip_grams(
                compose(tokenize, preprocess, self.decode)(doc),
                stop_words)

    def _word_skip_grams(self, tokens, stop_words=None):
        # handle stop words
        if stop_words is not None:
            tokens = [w for w in tokens if w not in stop_words]

        return compose(cmap(' '.join), pluck([0, 2]), sliding_window(3))(tokens)
    
# """
# examples:
# text = ['the rain in Spain falls mainly on the plain']

# vect = SkipGramVectorizer()
# vect.fit(text)
# vect.get_feature_names()
# """
from sklearn.utils.validation import check_is_fitted
class SkipGramTfidfVectorizer(SkipGramCountVectorizer):
    """Convert a collection of raw documents to a matrix of TF-IDF features.

    Equivalent to CountVectorizer followed by TfidfTransformer.

    Read more in the :ref:`User Guide <text_feature_extraction>`.

    Parameters
    ----------
    input : string {'filename', 'file', 'content'}
        If 'filename', the sequence passed as an argument to fit is
        expected to be a list of filenames that need reading to fetch
        the raw content to analyze.

        If 'file', the sequence items must have a 'read' method (file-like
        object) that is called to fetch the bytes in memory.

        Otherwise the input is expected to be the sequence strings or
        bytes items are expected to be analyzed directly.

    encoding : string, 'utf-8' by default.
        If bytes or files are given to analyze, this encoding is used to
        decode.

    decode_error : {'strict', 'ignore', 'replace'}
        Instruction on what to do if a byte sequence is given to analyze that
        contains characters not of the given `encoding`. By default, it is
        'strict', meaning that a UnicodeDecodeError will be raised. Other
        values are 'ignore' and 'replace'.

    strip_accents : {'ascii', 'unicode', None}
        Remove accents during the preprocessing step.
        'ascii' is a fast method that only works on characters that have
        an direct ASCII mapping.
        'unicode' is a slightly slower method that works on any characters.
        None (default) does nothing.

    analyzer : string, {'word', 'char'} or callable
        Whether the feature should be made of word or character n-grams.

        If a callable is passed it is used to extract the sequence of features
        out of the raw, unprocessed input.

    preprocessor : callable or None (default)
        Override the preprocessing (string transformation) stage while
        preserving the tokenizing and n-grams generation steps.

    tokenizer : callable or None (default)
        Override the string tokenization step while preserving the
        preprocessing and n-grams generation steps.
        Only applies if ``analyzer == 'word'``.

    ngram_range : tuple (min_n, max_n)
        The lower and upper boundary of the range of n-values for different
        n-grams to be extracted. All values of n such that min_n <= n <= max_n
        will be used.

    stop_words : string {'english'}, list, or None (default)
        If a string, it is passed to _check_stop_list and the appropriate stop
        list is returned. 'english' is currently the only supported string
        value.

        If a list, that list is assumed to contain stop words, all of which
        will be removed from the resulting tokens.
        Only applies if ``analyzer == 'word'``.

        If None, no stop words will be used. max_df can be set to a value
        in the range [0.7, 1.0) to automatically detect and filter stop
        words based on intra corpus document frequency of terms.

    lowercase : boolean, default True
        Convert all characters to lowercase before tokenizing.

    token_pattern : string
        Regular expression denoting what constitutes a "token", only used
        if ``analyzer == 'word'``. The default regexp selects tokens of 2
        or more alphanumeric characters (punctuation is completely ignored
        and always treated as a token separator).

    max_df : float in range [0.0, 1.0] or int, default=1.0
        When building the vocabulary ignore terms that have a document
        frequency strictly higher than the given threshold (corpus-specific
        stop words).
        If float, the parameter represents a proportion of documents, integer
        absolute counts.
        This parameter is ignored if vocabulary is not None.

    min_df : float in range [0.0, 1.0] or int, default=1
        When building the vocabulary ignore terms that have a document
        frequency strictly lower than the given threshold. This value is also
        called cut-off in the literature.
        If float, the parameter represents a proportion of documents, integer
        absolute counts.
        This parameter is ignored if vocabulary is not None.

    max_features : int or None, default=None
        If not None, build a vocabulary that only consider the top
        max_features ordered by term frequency across the corpus.

        This parameter is ignored if vocabulary is not None.

    vocabulary : Mapping or iterable, optional
        Either a Mapping (e.g., a dict) where keys are terms and values are
        indices in the feature matrix, or an iterable over terms. If not
        given, a vocabulary is determined from the input documents.

    binary : boolean, default=False
        If True, all non-zero term counts are set to 1. This does not mean
        outputs will have only 0/1 values, only that the tf term in tf-idf
        is binary. (Set idf and normalization to False to get 0/1 outputs.)

    dtype : type, optional
        Type of the matrix returned by fit_transform() or transform().

    norm : 'l1', 'l2' or None, optional
        Norm used to normalize term vectors. None for no normalization.

    use_idf : boolean, default=True
        Enable inverse-document-frequency reweighting.

    smooth_idf : boolean, default=True
        Smooth idf weights by adding one to document frequencies, as if an
        extra document was seen containing every term in the collection
        exactly once. Prevents zero divisions.

    sublinear_tf : boolean, default=False
        Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf).

    Attributes
    ----------
    vocabulary_ : dict
        A mapping of terms to feature indices.

    idf_ : array, shape = [n_features], or None
        The learned idf vector (global term weights)
        when ``use_idf`` is set to True, None otherwise.

    stop_words_ : set
        Terms that were ignored because they either:

          - occurred in too many documents (`max_df`)
          - occurred in too few documents (`min_df`)
          - were cut off by feature selection (`max_features`).

        This is only available if no vocabulary was given.

    See also
    --------
    CountVectorizer
        Tokenize the documents and count the occurrences of token and return
        them as a sparse matrix

    TfidfTransformer
        Apply Term Frequency Inverse Document Frequency normalization to a
        sparse matrix of occurrence counts.

    Notes
    -----
    The ``stop_words_`` attribute can get large and increase the model size
    when pickling. This attribute is provided only for introspection and can
    be safely removed using delattr or set to None before pickling.
    """

    def __init__(self, input='content', encoding='utf-8',
                 decode_error='strict', strip_accents=None, lowercase=True,
                 preprocessor=None, tokenizer=None, analyzer='word',
                 stop_words=None, token_pattern=r"(?u)\b\w\w+\b",
                 ngram_range=(1, 1), max_df=1.0, min_df=1,
                 max_features=None, vocabulary=None, binary=False,
                 dtype=np.int64, norm='l2', use_idf=True, smooth_idf=True,
                 sublinear_tf=False):

        super(SkipGramTfidfVectorizer, self).__init__(
            input=input, encoding=encoding, decode_error=decode_error,
            strip_accents=strip_accents, lowercase=lowercase,
            preprocessor=preprocessor, tokenizer=tokenizer, analyzer=analyzer,
            stop_words=stop_words, token_pattern=token_pattern,
            ngram_range=ngram_range, max_df=max_df, min_df=min_df,
            max_features=max_features, vocabulary=vocabulary, binary=binary,
            dtype=dtype)

        self._tfidf = TfidfTransformer(norm=norm, use_idf=use_idf,
                                       smooth_idf=smooth_idf,
                                       sublinear_tf=sublinear_tf)

    # Broadcast the TF-IDF parameters to the underlying transformer instance
    # for easy grid search and repr

    @property
    def norm(self):
        return self._tfidf.norm

    @norm.setter
    def norm(self, value):
        self._tfidf.norm = value

    @property
    def use_idf(self):
        return self._tfidf.use_idf

    @use_idf.setter
    def use_idf(self, value):
        self._tfidf.use_idf = value

    @property
    def smooth_idf(self):
        return self._tfidf.smooth_idf

    @smooth_idf.setter
    def smooth_idf(self, value):
        self._tfidf.smooth_idf = value

    @property
    def sublinear_tf(self):
        return self._tfidf.sublinear_tf

    @sublinear_tf.setter
    def sublinear_tf(self, value):
        self._tfidf.sublinear_tf = value

    @property
    def idf_(self):
        return self._tfidf.idf_

    def fit(self, raw_documents, y=None):
        """Learn vocabulary and idf from training set.

        Parameters
        ----------
        raw_documents : iterable
            an iterable which yields either str, unicode or file objects

        Returns
        -------
        self : TfidfVectorizer
        """
        X = super(SkipGramTfidfVectorizer, self).fit_transform(raw_documents)
        self._tfidf.fit(X)
        return self

    def fit_transform(self, raw_documents, y=None):
        """Learn vocabulary and idf, return term-document matrix.

        This is equivalent to fit followed by transform, but more efficiently
        implemented.

        Parameters
        ----------
        raw_documents : iterable
            an iterable which yields either str, unicode or file objects

        Returns
        -------
        X : sparse matrix, [n_samples, n_features]
            Tf-idf-weighted document-term matrix.
        """
        X = super(SkipGramTfidfVectorizer, self).fit_transform(raw_documents)
        self._tfidf.fit(X)
        # X is already a transformed view of raw_documents so
        # we set copy to False
        return self._tfidf.transform(X, copy=False)

    def transform(self, raw_documents, copy=True):
        """Transform documents to document-term matrix.

        Uses the vocabulary and document frequencies (df) learned by fit (or
        fit_transform).

        Parameters
        ----------
        raw_documents : iterable
            an iterable which yields either str, unicode or file objects

        copy : boolean, default True
            Whether to copy X and operate on the copy or perform in-place
            operations.

        Returns
        -------
        X : sparse matrix, [n_samples, n_features]
            Tf-idf-weighted document-term matrix.
        """
        check_is_fitted(self, '_tfidf', 'The tfidf vector is not fitted')

        X = super(SkipGramTfidfVectorizer, self).transform(raw_documents)
        return self._tfidf.transform(X, copy=False)

In [3]:
PATH = '../data/'

train = pd.read_csv(PATH + 'cleaned_train.csv')
test = pd.read_csv(PATH + 'cleaned_test.csv')

# train_sentence = train['comment_text_cleaned']
# test_sentence = test['comment_text_cleaned']
train_sentence = train['comment_text_cleaned_polarity']
test_sentence = test['comment_text_cleaned_polarity']
# train_sentence = train['comment_text_cleaned_features']
# test_sentence = test['comment_text_cleaned_features']

train_sentence_retain_punctuation = train['comment_text_cleaned_retain_punctuation']
test_sentence_retain_punctuation = test['comment_text_cleaned_retain_punctuation']
# text = pd.concat([train_sentence, test_sentence])
text = train_sentence
# text_retain_punctuation = pd.concat([train_sentence_retain_punctuation, test_sentence_retain_punctuation])
text_retain_punctuation = train_sentence_retain_punctuation


print(train.shape)
print(test.shape)

(159571, 30)
(153164, 24)


In [4]:
# from textblob import TextBlob
# # x = my_series.apply(my_function, args = (arg1,))
# pol = pd.DataFrame()
# pol_test = pd.DataFrame()
# def add_polarity_phrase(x, col):
#     score = int (TextBlob(x).sentiment.polarity * 20) 
#     if score > 0:
#         senti = 'positive'
#     elif score < 0:
#         senti = 'negative'
#         score = score * (-1)
#     else:
#         senti = 'neutral'
#     return ' {}_{}_{} '.format(col,senti,score)

# pol['cleaned'] = train['comment_text_cleaned'].apply(add_polarity_phrase, args=('cleaned',))
# pol['original'] = train['comment_text'].apply(add_polarity_phrase, args=('original',))

# print('train set done')

# pol_test['cleaned'] = test['comment_text_cleaned'].apply(add_polarity_phrase, args=('cleaned',))
# pol_test['original'] = test['comment_text'].apply(add_polarity_phrase, args=('original',))

# train['comment_text_cleaned_polarity'] = train['comment_text_cleaned'] + pol['cleaned'] + pol['original']
# test['comment_text_cleaned_polarity'] = test['comment_text_cleaned'] + pol_test['cleaned'] + pol_test['original']

# train.to_csv(PATH + 'cleaned_train.csv', index=False)
# test.to_csv(PATH + 'cleaned_test.csv', index=False)

In [5]:


print('getting skip gram tfidf')

skip_vectorizer = SkipGramTfidfVectorizer(ngram_range=(1,3),
                                    strip_accents='unicode', 
                                    max_features=10000, 
                                    analyzer='word',
                                    sublinear_tf=True,
                                    token_pattern=r'\w{1,}')

phrase_vectorizer = TfidfVectorizer(ngram_range=(1,3),
                                    strip_accents='unicode', 
                                    max_features=100000, 
                                    analyzer='word',
                                    sublinear_tf=True,
                                    token_pattern=r'\w{1,}')
char_vectorizer = TfidfVectorizer(ngram_range=(2,5), 
                                  strip_accents='unicode', 
                                  max_features=200000, 
                                  analyzer='char', 
                                  sublinear_tf=True)
print('fitting skip 1 n-gram 2')
# skip_vectorizer.fit(text.values)
print('fitting char')
char_vectorizer.fit(text_retain_punctuation.values)
print('fitting phrase')
phrase_vectorizer.fit(text.values)

print('transforming train skip gram')
# train_skip = skip_vectorizer.transform(train_sentence.values)
print('transforming train char')
train_char = char_vectorizer.transform(train_sentence_retain_punctuation.values)
print('transforming train phrase')
train_phrase = phrase_vectorizer.transform(train_sentence.values)

print('transforming test skip gram')
# test_skip = skip_vectorizer.transform(test_sentence.values)
print('transforming test char')
test_char = char_vectorizer.transform(test_sentence_retain_punctuation.values)
print('transforming test phrase')
test_phrase = phrase_vectorizer.transform(test_sentence.values)

# train_tfidf = hstack((train_skip, train_char, train_phrase), format='csr')
# test_tfidf = hstack((test_skip, test_char, test_phrase), format='csr')

train_tfidf = hstack((train_char, train_phrase), format='csr')
test_tfidf = hstack((test_char, test_phrase), format='csr')

label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

train_tfidf

getting skip gram tfidf
fitting skip 1 n-gram 2
fitting char
fitting phrase
transforming train skip gram
transforming train char
transforming train phrase
transforming test skip gram
transforming test char
transforming test phrase


<159571x300000 sparse matrix of type '<class 'numpy.float64'>'
	with 162813546 stored elements in Compressed Sparse Row format>

In [None]:
def pr(y_i, y, train_features):
    p = train_features[y==y_i].sum(0)
    return (p + 1) / ((y == y_i).sum() + 1)

In [7]:
# Split the dataset
split_index = round(len(train) * 0.9) #################################
x_train = train_tfidf[:split_index]
y_train_df = train.iloc[:split_index]

#######
x_val = train_tfidf[split_index:]
y_val_df = train.iloc[split_index:]
# Get test data ready
x_test = test_tfidf

In [191]:
import gc
gc.collect()

4937

In [192]:
# r = np.log(pr(1, y_stack, x_stack) / pr(0, y_stack, x_stack))
r = np.log(pr(1, train['toxic'].values, train_tfidf) / pr(0, train['toxic'].values, train_tfidf))
x_nb = x_stack.multiply(r).tocsr()
x_val_nb = x_val.multiply(r).tocsr()
lsvc = LinearSVC()
model_svc = CalibratedClassifierCV(lsvc) 
model_svc.fit(x_nb, y_stack)

CalibratedClassifierCV(base_estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
            cv=3, method='sigmoid')

In [240]:
############################# start Grid Search
x_nb = {}
y_nb = {}
x_val_nb = {}

for col in label_cols:
    print(col)
    model_train = x_train[np.array(y_train_df[col] == 1)]
    y_model_train = y_train_df[np.array(y_train_df[col] == 1)][col].values
    
    non_model_train = x_train[np.array(y_train_df[col] == 0)]
    non_model_train = non_model_train[:model_train.shape[0]]

    y_non_model_train = y_train_df[np.array(y_train_df[col] == 0)]
    y_non_model_train = y_non_model_train[:model_train.shape[0]][col].values
    
    x_model_stack = vstack([model_train, non_model_train])
    y_model_stack = np.concatenate([y_model_train, y_non_model_train])
    
    r = np.log(pr(1, train[col].values, train_tfidf) / pr(0, train[col].values, train_tfidf))
    x_nb[col] = x_model_stack.multiply(r).tocsr()
    x_val_nb[col] = x_val.multiply(r).tocsr()
    y_nb[col] = y_model_stack

toxic
severe_toxic
obscene
threat
insult
identity_hate


In [234]:
from sklearn.metrics import roc_auc_score
%env JOBLIB_TEMP_FOLDER=/tmp
##### grid search for logress
df_log = pd.DataFrame({'label':[],'C':[],'ROC':[], 'precision':[], 'tpr':[], 'fpr':[]})
para = [9,8,7,6,5,4,3.5,3.3,3.2,3.1,3,2.9,2.8,2.7,2.6,2.5,2.4,2.3,2.2,2, 1.5, 1.0, 0.9, 0.8,0.7,0.6,0.5,0.4,0.3,0.2,0.1]
i = 0
for col in label_cols:
    y = y_val_df[col].values
    print(col)
    for cc in para:
        model_log = LogisticRegression(C=cc)
        model_log.fit(x_nb[col], y_nb[col])
        pred_prob = model_log.predict_proba(x_val_nb[col])[:,1]
        pred = model_log.predict(x_val_nb[col])
        roc = roc_auc_score(y,pred_prob)
        tn, fp, fn, tp = confusion_matrix(y, pred).ravel()
        precision = tp / (tp + fp)
        tpr = tp / (tp + fn)
        fpr = fp / (fp + tn)
        df_log.loc[i, 'C'] = cc
        df_log.loc[i, 'label'] = col
        df_log.loc[i, 'precision'] = precision
        df_log.loc[i, 'tpr'] = tpr
        df_log.loc[i, 'fpr'] = fpr
        df_log.loc[i, 'ROC'] = roc
        print(df_log.iloc[i])
        i += 1
        print('==================================================')
    
    

toxic
C                    9
ROC           0.983467
fpr          0.0478701
label            toxic
precision     0.669223
tpr           0.904731
Name: 0, dtype: object
C                    8
ROC           0.983529
fpr          0.0478701
label            toxic
precision     0.669065
tpr           0.904083
Name: 1, dtype: object
C                    7
ROC           0.983592
fpr          0.0473151
label            toxic
precision     0.672115
tpr           0.906027
Name: 2, dtype: object
C                    6
ROC           0.983665
fpr          0.0468988
label            toxic
precision     0.673903
tpr           0.905379
Name: 3, dtype: object
C                    5
ROC           0.983746
fpr          0.0469682
label            toxic
precision     0.673578
tpr           0.905379
Name: 4, dtype: object
C                   4
ROC          0.983811
fpr          0.046552
label           toxic
precision    0.675218
tpr          0.904083
Name: 5, dtype: object
C                  3.5
ROC        

C                     3.3
ROC              0.988885
fpr             0.0271399
label        severe_toxic
precision        0.240708
tpr              0.906667
Name: 38, dtype: object
C                     3.2
ROC              0.988885
fpr             0.0271399
label        severe_toxic
precision        0.240708
tpr              0.906667
Name: 39, dtype: object
C                     3.1
ROC              0.988883
fpr             0.0271399
label        severe_toxic
precision        0.240708
tpr              0.906667
Name: 40, dtype: object
C                       3
ROC              0.988883
fpr             0.0271399
label        severe_toxic
precision        0.240708
tpr              0.906667
Name: 41, dtype: object
C                     2.9
ROC              0.988879
fpr             0.0270766
label        severe_toxic
precision        0.241135
tpr              0.906667
Name: 42, dtype: object
C                     2.8
ROC              0.988876
fpr             0.0269501
label        severe_to

C                  2.7
ROC           0.992334
fpr          0.0223945
label          obscene
precision     0.705061
tpr           0.935185
Name: 75, dtype: object
C                  2.6
ROC           0.992341
fpr          0.0223282
label          obscene
precision     0.705677
tpr           0.935185
Name: 76, dtype: object
C                  2.5
ROC           0.992348
fpr          0.0221957
label          obscene
precision     0.706912
tpr           0.935185
Name: 77, dtype: object
C                  2.4
ROC           0.992357
fpr          0.0220632
label          obscene
precision     0.707638
tpr            0.93287
Name: 78, dtype: object
C                 2.3
ROC          0.992366
fpr          0.021997
label         obscene
precision     0.70826
tpr           0.93287
Name: 79, dtype: object
C                  2.2
ROC           0.992377
fpr          0.0219307
label          obscene
precision     0.708627
tpr           0.931713
Name: 80, dtype: object
C                    2
ROC        

C                    1
ROC           0.993305
fpr          0.0164707
label           threat
precision      0.14658
tpr                0.9
Name: 114, dtype: object
C                  0.9
ROC           0.993265
fpr          0.0165336
label           threat
precision     0.146104
tpr                0.9
Name: 115, dtype: object
C                  0.8
ROC           0.993243
fpr          0.0165336
label           threat
precision     0.146104
tpr                0.9
Name: 116, dtype: object
C                  0.7
ROC             0.9932
fpr          0.0166593
label           threat
precision     0.145161
tpr                0.9
Name: 117, dtype: object
C                  0.6
ROC           0.993165
fpr          0.0169108
label           threat
precision     0.143312
tpr                0.9
Name: 118, dtype: object
C                  0.5
ROC           0.993068
fpr          0.0169108
label           threat
precision     0.143312
tpr                0.9
Name: 119, dtype: object
C                  0.4

C                  0.2
ROC           0.983598
fpr          0.0311757
label           insult
precision     0.602024
tpr           0.873929
Name: 153, dtype: object
C                  0.1
ROC           0.982839
fpr          0.0280713
label           insult
precision     0.621886
tpr           0.855569
Name: 154, dtype: object
identity_hate
C                        9
ROC               0.987176
fpr               0.034655
label        identity_hate
precision         0.192931
tpr               0.909722
Name: 155, dtype: object
C                        8
ROC               0.987163
fpr              0.0345286
label        identity_hate
precision         0.193501
tpr               0.909722
Name: 156, dtype: object
C                        7
ROC               0.987146
fpr              0.0345286
label        identity_hate
precision         0.193501
tpr               0.909722
Name: 157, dtype: object
C                        6
ROC               0.987123
fpr              0.0344653
label        ident

In [235]:
df_log.to_csv(PATH + 'OneVsOne_logreg_parameters.csv', index=False)

In [253]:
from sklearn.metrics import roc_auc_score
%env JOBLIB_TEMP_FOLDER=/tmp
##### grid search for logress
df_svm = pd.DataFrame({'label':[],'C':[],'ROC':[], 'precision':[], 'tpr':[], 'fpr':[]})
para = [0.01,0.05,0.1,0.15,0.2,0.23,0.25,0.27,0.28,0.29,0.3,0.32,0.35,0.4,0.5,0.7,0.9,1]
i = 0
for col in label_cols:
    y = y_val_df[col].values
    print(col)
    for cc in para:
        lsvc = LinearSVC(C=cc)
        model_svc = CalibratedClassifierCV(lsvc) 
        model_svc.fit(x_nb[col], y_nb[col])
        pred_prob = model_svc.predict_proba(x_val_nb[col])[:,1]
        pred = model_svc.predict(x_val_nb[col])
        roc = roc_auc_score(y,pred_prob)
        tn, fp, fn, tp = confusion_matrix(y, pred).ravel()
        precision = tp / (tp + fp)
        tpr = tp / (tp + fn)
        fpr = fp / (fp + tn)
        df_svm.loc[i, 'C'] = cc
        df_svm.loc[i, 'label'] = col
        df_svm.loc[i, 'precision'] = precision
        df_svm.loc[i, 'tpr'] = tpr
        df_svm.loc[i, 'fpr'] = fpr
        df_svm.loc[i, 'ROC'] = roc
        print(df_svm.iloc[i])
        i += 1
        print('==================================================')
    
    

env: JOBLIB_TEMP_FOLDER=/tmp
toxic
C                 0.01
ROC            0.97836
fpr          0.0573748
label            toxic
precision     0.626805
tpr           0.900194
Name: 0, dtype: object
C                 0.05
ROC             0.9826
fpr          0.0554322
label            toxic
precision     0.638625
tpr             0.9151
Name: 1, dtype: object
C                  0.1
ROC           0.983652
fpr          0.0541834
label            toxic
precision     0.644839
tpr           0.918989
Name: 2, dtype: object
C                 0.15
ROC           0.984012
fpr          0.0539059
label            toxic
precision     0.646497
tpr           0.920933
Name: 3, dtype: object
C                  0.2
ROC           0.984175
fpr          0.0533509
label            toxic
precision     0.648377
tpr           0.918989
Name: 4, dtype: object
C                 0.23
ROC            0.98423
fpr          0.0529346
label            toxic
precision      0.65016
tpr           0.918989
Name: 5, dtype: object

C                 0.05
ROC           0.992399
fpr          0.0251772
label          obscene
precision     0.681208
tpr           0.939815
Name: 37, dtype: object
C                  0.1
ROC           0.992512
fpr          0.0252435
label          obscene
precision     0.681438
tpr           0.943287
Name: 38, dtype: object
C                0.15
ROC          0.992519
fpr          0.025376
label         obscene
precision    0.680833
tpr          0.945602
Name: 39, dtype: object
C                  0.2
ROC            0.99251
fpr          0.0255085
label          obscene
precision     0.679967
tpr           0.946759
Name: 40, dtype: object
C                 0.23
ROC           0.992497
fpr          0.0255748
label          obscene
precision     0.679402
tpr           0.946759
Name: 41, dtype: object
C                 0.25
ROC           0.992484
fpr          0.0255748
label          obscene
precision     0.679402
tpr           0.946759
Name: 42, dtype: object
C                0.27
ROC         

C                  0.2
ROC           0.985129
fpr          0.0428666
label           insult
precision      0.53709
tpr           0.921665
Name: 76, dtype: object
C                 0.23
ROC           0.985105
fpr          0.0431968
label           insult
precision     0.535181
tpr           0.921665
Name: 77, dtype: object
C                 0.25
ROC           0.985083
fpr          0.0432629
label           insult
precision     0.534801
tpr           0.921665
Name: 78, dtype: object
C                 0.27
ROC           0.985071
fpr          0.0428666
label           insult
precision      0.53742
tpr           0.922889
Name: 79, dtype: object
C                 0.28
ROC           0.985066
fpr          0.0428005
label           insult
precision     0.537803
tpr           0.922889
Name: 80, dtype: object
C                 0.29
ROC           0.985057
fpr          0.0428005
label           insult
precision     0.537803
tpr           0.922889
Name: 81, dtype: object
C                  0.3
ROC  

In [254]:
df_svm.to_csv(PATH + 'OneVsOne_svm_parameters.csv', index=False)

In [238]:
def select_best_parameter(label_cols, df_log, base_on = 'ROC'):
    result = {}
    for each in label_cols:
        df = df_log[df_log['label'] == each]
        result[each] = df.sort_values([base_on], ascending=False)['C'].iloc[0]
    return result
    

In [243]:
############ final training set

x_train = train_tfidf
y_train_df = train.copy()

############################# start Grid Search

x_train_all = {}
y_train_all = {}
x_test_all = {}
for col in label_cols:
    print(col)
    model_train = x_train[np.array(y_train_df[col] == 1)]
    y_model_train = y_train_df[np.array(y_train_df[col] == 1)][col].values
    
    non_model_train = x_train[np.array(y_train_df[col] == 0)]
    non_model_train = non_model_train[:model_train.shape[0]]

    y_non_model_train = y_train_df[np.array(y_train_df[col] == 0)]
    y_non_model_train = y_non_model_train[:model_train.shape[0]][col].values
    
    x_model_stack = vstack([model_train, non_model_train])
    y_model_stack = np.concatenate([y_model_train, y_non_model_train])
    
    r = np.log(pr(1, train[col].values, train_tfidf) / pr(0, train[col].values, train_tfidf))
    x_train_all[col] = x_model_stack.multiply(r).tocsr()
    x_test_all[col] = test_tfidf.multiply(r).tocsr()
    y_train_all[col] = y_model_stack

toxic
severe_toxic
obscene
threat
insult
identity_hate


In [244]:
#####################################
############# logistic regression
#####################################
#### based on ROC
params =   select_best_parameter(label_cols, df_log, base_on = 'ROC')
pred_prob = pd.DataFrame()
pred_prob['id'] = test['id']
for col in label_cols:
    model_log = LogisticRegression(C=params[col])
    model_log.fit(x_train_all[col], y_train_all[col])
    pred_prob[col] = model_log.predict_proba(x_test_all[col])[:,1]
pred_prob.to_csv(PATH + 'OneVsOne_LogRegress_BEST_ROC.csv', index=False)

#### based on tpr
params =   select_best_parameter(label_cols, df_log, base_on = 'tpr')
pred_prob = pd.DataFrame()
pred_prob['id'] = test['id']
for col in label_cols:
    model_log = LogisticRegression(C=params[col])
    model_log.fit(x_train_all[col], y_train_all[col])
    pred_prob[col] = model_log.predict_proba(x_test_all[col])[:,1]
pred_prob.to_csv(PATH + 'OneVsOne_LogRegress_BEST_tpr.csv', index=False)

#### based on tpr
params =   select_best_parameter(label_cols, df_log, base_on = 'precision')
pred_prob = pd.DataFrame()
pred_prob['id'] = test['id']
for col in label_cols:
    model_log = LogisticRegression(C=params[col])
    model_log.fit(x_train_all[col], y_train_all[col])
    pred_prob[col] = model_log.predict_proba(x_test_all[col])[:,1]
pred_prob.to_csv(PATH + 'OneVsOne_LogRegress_BEST_precision.csv', index=False)

In [247]:
select_best_parameter(label_cols, df_log, base_on = 'ROC')

{'identity_hate': 9.0,
 'insult': 1.5,
 'obscene': 1.0,
 'severe_toxic': 4.0,
 'threat': 9.0,
 'toxic': 2.7000000000000002}

In [248]:
select_best_parameter(label_cols, df_log, base_on = 'tpr')

{'identity_hate': 9.0,
 'insult': 9.0,
 'obscene': 9.0,
 'severe_toxic': 9.0,
 'threat': 9.0,
 'toxic': 2.7000000000000002}

In [249]:
select_best_parameter(label_cols, df_log, base_on = 'precision')

{'identity_hate': 0.10000000000000001,
 'insult': 0.10000000000000001,
 'obscene': 0.10000000000000001,
 'severe_toxic': 0.10000000000000001,
 'threat': 2.5,
 'toxic': 0.10000000000000001}

In [258]:
#####################################
############# svm regression
#####################################
#### based on ROC
params =   select_best_parameter(label_cols, df_svm, base_on = 'ROC')
pred_prob = pd.DataFrame()
pred_prob['id'] = test['id']
for col in label_cols:
    lsvc = LinearSVC(C=params[col])
    model_svc = CalibratedClassifierCV(lsvc) 
    model_svc.fit(x_train_all[col], y_train_all[col])
    pred_prob[col] = model_svc.predict_proba(x_test_all[col])[:,1]
pred_prob.to_csv(PATH + 'OneVsOne_SVM_BEST_ROC.csv', index=False)

#### based on tpr
params =   select_best_parameter(label_cols, df_svm, base_on = 'tpr')
pred_prob = pd.DataFrame()
pred_prob['id'] = test['id']
for col in label_cols:
    lsvc = LinearSVC(C=params[col])
    model_svc = CalibratedClassifierCV(lsvc) 
    model_svc.fit(x_train_all[col], y_train_all[col])
    pred_prob[col] = model_svc.predict_proba(x_test_all[col])[:,1]
pred_prob.to_csv(PATH + 'OneVsOne_SVM_BEST_tpr.csv', index=False)

#### based on tpr
params =   select_best_parameter(label_cols, df_svm, base_on = 'precision')
pred_prob = pd.DataFrame()
pred_prob['id'] = test['id']
for col in label_cols:
    lsvc = LinearSVC(C=params[col])
    model_svc = CalibratedClassifierCV(lsvc) 
    model_svc.fit(x_train_all[col], y_train_all[col])
    pred_prob[col] = model_svc.predict_proba(x_test_all[col])[:,1]
pred_prob.to_csv(PATH + 'OneVsOne_SVM_BEST_precision.csv', index=False)

In [255]:
select_best_parameter(label_cols, df_svm, base_on = 'ROC')

{'identity_hate': 0.90000000000000002,
 'insult': 0.14999999999999999,
 'obscene': 0.14999999999999999,
 'severe_toxic': 0.14999999999999999,
 'threat': 1.0,
 'toxic': 0.28999999999999998}

In [256]:
select_best_parameter(label_cols, df_svm, base_on = 'tpr')

{'identity_hate': 0.28999999999999998,
 'insult': 0.40000000000000002,
 'obscene': 0.28999999999999998,
 'severe_toxic': 0.28999999999999998,
 'threat': 0.28999999999999998,
 'toxic': 0.90000000000000002}

In [257]:
select_best_parameter(label_cols, df_svm, base_on = 'precision')

{'identity_hate': 0.01,
 'insult': 0.01,
 'obscene': 0.01,
 'severe_toxic': 0.01,
 'threat': 0.10000000000000001,
 'toxic': 0.34999999999999998}

In [260]:
###################################################
###################################################

In [282]:
import pandas as pd
import numpy as np
from abc import ABC, abstractmethod
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import roc_auc_score
from scipy.sparse import csr_matrix, hstack, vstack




def OneVsOneDataLoader(path_train, path_test, label_cols):
    """
    cleaned_train.csv
    cleaned_test.csv
    return :x_train: sparse matrix
            y_train: DataFrame
            x_test: sparse matrix
    """
    train = pd.read_csv(path_train)
    test = pd.read_csv(path_test)

    train_sentence = train['comment_text_cleaned_polarity']
    test_sentence = test['comment_text_cleaned_polarity']

    train_sentence_retain_punctuation = train['comment_text_cleaned_retain_punctuation']
    test_sentence_retain_punctuation = test['comment_text_cleaned_retain_punctuation']
    print('loading data done!')
    #########################################
    phrase_vectorizer = TfidfVectorizer(ngram_range=(1,3),
                                    strip_accents='unicode', 
                                    max_features=100000, 
                                    analyzer='word',
                                    sublinear_tf=True,
                                    token_pattern=r'\w{1,}')
    char_vectorizer = TfidfVectorizer(ngram_range=(2,5), 
                                      strip_accents='unicode', 
                                      max_features=200000, 
                                      analyzer='char', 
                                      sublinear_tf=True)
    

    print('fitting char')
    char_vectorizer.fit(text_retain_punctuation.values)
    print('fitting phrase')
    phrase_vectorizer.fit(text.values)


    print('transforming train char')
    train_char = char_vectorizer.transform(train_sentence_retain_punctuation.values)
    print('transforming train phrase')
    train_phrase = phrase_vectorizer.transform(train_sentence.values)


    print('transforming test char')
    test_char = char_vectorizer.transform(test_sentence_retain_punctuation.values)
    print('transforming test phrase')
    test_phrase = phrase_vectorizer.transform(test_sentence.values)


    x_train = hstack((train_char, train_phrase), format='csr')
    x_test = hstack((test_char, test_phrase), format='csr')
    y_train = train[label_cols]
    idd = 'wordtfidf_ng13_mf10w_chartfidf_ng25_mf20w'
    
    return (x_train, y_train, x_test, idd)





class BaseLayerEstimator(ABC):
    @abstractmethod
    def train(self, x_train, y_train):
        pass
    
    @abstractmethod
    def predict(self, x_train):
        pass 
    
    

class OneVSOneReg(BaseLayerEstimator):
    def __init__(self, x_train, y_train, model='logistic'):
        """
        x_train: sparse matrix, raw tfidf
        y_train: dataframe, with only label columns. should be 6 columns in total
        model: only support logistic or svc
        """
        self.r = {}
        self.setModelName(model)
        assert self.model_name in ['logistic', 'svc']
        self.param = {}
        self.param['logistic'] = {'identity_hate': 9.0,
                                     'insult': 1.5,
                                     'obscene': 1.0,
                                     'severe_toxic': 4.0,
                                     'threat': 9.0,
                                     'toxic': 2.7}
        self.param['svc'] = {'identity_hate': 0.9,
                             'insult': 0.15,
                             'obscene': 0.15,
                             'severe_toxic': 0.15,
                             'threat': 1.0,
                             'toxic': 0.29}
        
        
        
        for col in y_train.columns:
            print('calculating naive bayes for {}'.format(col))
            self.r[col] = np.log(self.pr(1, train[col].values, train_tfidf) / self.pr(0, train[col].values, train_tfidf))
        print('initializing done')
        print('OneVsOne is using {} kernel'.format(self.model_name))
        
    def setModelName(self, name):
        self.model_name = name
        assert self.model_name in ['logistic', 'svc']
        print('OneVsOne is using {} kernel'.format(self.model_name))
        
    def pr(self, y_i, y, train_features):
        p = train_features[y==y_i].sum(0)
        return (p + 1) / ((y == y_i).sum() + 1)
    
    def oneVsOneSplit(self, x_train, y_train, label):
        print('Starting One vs One dataset splitting')
        if isinstance(y_train, pd.Series):
            y_train = y_train.values
        model_train = x_train[np.array(y_train == 1)]
        y_model_train = y_train[np.array(y_train == 1)]
        non_model_train = x_train[np.array(y_train == 0)]
        non_model_train = non_model_train[:model_train.shape[0]]
        y_non_model_train = y_train[np.array(y_train == 0)]
        y_non_model_train = y_non_model_train[:model_train.shape[0]]
        x_model_stack = vstack([model_train, non_model_train])
        y_model_stack = np.concatenate([y_model_train, y_non_model_train])
        x_nb = x_model_stack.multiply(self.r[label]).tocsr()
        y_nb = y_model_stack
        print('splitting done!')
        return (x_nb, y_nb)
    
    def train(self, x_train, y_train, label):
        ### construct one vs one
        x_nb, y_nb = self.oneVsOneSplit(x_train, y_train, label)
        ### start training
        if self.model_name is 'logistic':
            print('start training logistic regression')
            self.model = LogisticRegression(C=self.param['logistic'][label])
            self.model.fit(x_nb, y_nb)
            print('training done')
            
        else:
            print('start training linear svc regression')
            lsvc = LinearSVC(C=self.param['svc'][label])
            self.model = CalibratedClassifierCV(lsvc) 
            self.model.fit(x_nb, y_nb)
            print('training done')
        

    
    def predict(self, x_test, label):
        print('applying naive bayes to dataset')
        x_nb_test = x_test.multiply(self.r[label]).tocsr()
        print('predicting')
        pred = self.model.predict_proba(x_nb_test)[:,1]
        print('predicting done')
        return pred
    
##### example        
# aa = OneVSOneReg(train_tfidf, train[label_cols], model='logistic')
# aa.setModelName('svc')
# aa.train(train_tfidf,train['toxic'], 'toxic')
# aa.predict(test_tfidf, 'toxic')

OneVsOne is using logistic kernel
calculating naive bayes for toxic
calculating naive bayes for severe_toxic
calculating naive bayes for obscene
calculating naive bayes for threat
calculating naive bayes for insult
calculating naive bayes for identity_hate
initializing done
OneVsOne is using logistic kernel
