In [1]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from scipy.sparse import csr_matrix, hstack


In [2]:
from toolz import itertoolz, compose
from toolz.curried import map as cmap, sliding_window, pluck
from sklearn.feature_extraction.text import CountVectorizer

class SkipGramCountVectorizer(CountVectorizer):
    """
    To vectorize text with skip-grams in scikit-learn simply passing the skip gram tokens as the vocabulary 
    to CountVectorizer will not work. You need to modify the way tokens are processed which can be done with 
    a custom analyzer. Below is an example vectorizer that produces 1-skip-2-grams
    """
    def build_analyzer(self):    
        preprocess = self.build_preprocessor()
        stop_words = self.get_stop_words()
        tokenize = self.build_tokenizer()
        return lambda doc: self._word_skip_grams(
                compose(tokenize, preprocess, self.decode)(doc),
                stop_words)

    def _word_skip_grams(self, tokens, stop_words=None):
        # handle stop words
        if stop_words is not None:
            tokens = [w for w in tokens if w not in stop_words]

        return compose(cmap(' '.join), pluck([0, 2]), sliding_window(3))(tokens)
    
# """
# examples:
# text = ['the rain in Spain falls mainly on the plain']

# vect = SkipGramVectorizer()
# vect.fit(text)
# vect.get_feature_names()
# """
from sklearn.utils.validation import check_is_fitted
class SkipGramTfidfVectorizer(SkipGramCountVectorizer):
    """Convert a collection of raw documents to a matrix of TF-IDF features.

    Equivalent to CountVectorizer followed by TfidfTransformer.

    Read more in the :ref:`User Guide <text_feature_extraction>`.

    Parameters
    ----------
    input : string {'filename', 'file', 'content'}
        If 'filename', the sequence passed as an argument to fit is
        expected to be a list of filenames that need reading to fetch
        the raw content to analyze.

        If 'file', the sequence items must have a 'read' method (file-like
        object) that is called to fetch the bytes in memory.

        Otherwise the input is expected to be the sequence strings or
        bytes items are expected to be analyzed directly.

    encoding : string, 'utf-8' by default.
        If bytes or files are given to analyze, this encoding is used to
        decode.

    decode_error : {'strict', 'ignore', 'replace'}
        Instruction on what to do if a byte sequence is given to analyze that
        contains characters not of the given `encoding`. By default, it is
        'strict', meaning that a UnicodeDecodeError will be raised. Other
        values are 'ignore' and 'replace'.

    strip_accents : {'ascii', 'unicode', None}
        Remove accents during the preprocessing step.
        'ascii' is a fast method that only works on characters that have
        an direct ASCII mapping.
        'unicode' is a slightly slower method that works on any characters.
        None (default) does nothing.

    analyzer : string, {'word', 'char'} or callable
        Whether the feature should be made of word or character n-grams.

        If a callable is passed it is used to extract the sequence of features
        out of the raw, unprocessed input.

    preprocessor : callable or None (default)
        Override the preprocessing (string transformation) stage while
        preserving the tokenizing and n-grams generation steps.

    tokenizer : callable or None (default)
        Override the string tokenization step while preserving the
        preprocessing and n-grams generation steps.
        Only applies if ``analyzer == 'word'``.

    ngram_range : tuple (min_n, max_n)
        The lower and upper boundary of the range of n-values for different
        n-grams to be extracted. All values of n such that min_n <= n <= max_n
        will be used.

    stop_words : string {'english'}, list, or None (default)
        If a string, it is passed to _check_stop_list and the appropriate stop
        list is returned. 'english' is currently the only supported string
        value.

        If a list, that list is assumed to contain stop words, all of which
        will be removed from the resulting tokens.
        Only applies if ``analyzer == 'word'``.

        If None, no stop words will be used. max_df can be set to a value
        in the range [0.7, 1.0) to automatically detect and filter stop
        words based on intra corpus document frequency of terms.

    lowercase : boolean, default True
        Convert all characters to lowercase before tokenizing.

    token_pattern : string
        Regular expression denoting what constitutes a "token", only used
        if ``analyzer == 'word'``. The default regexp selects tokens of 2
        or more alphanumeric characters (punctuation is completely ignored
        and always treated as a token separator).

    max_df : float in range [0.0, 1.0] or int, default=1.0
        When building the vocabulary ignore terms that have a document
        frequency strictly higher than the given threshold (corpus-specific
        stop words).
        If float, the parameter represents a proportion of documents, integer
        absolute counts.
        This parameter is ignored if vocabulary is not None.

    min_df : float in range [0.0, 1.0] or int, default=1
        When building the vocabulary ignore terms that have a document
        frequency strictly lower than the given threshold. This value is also
        called cut-off in the literature.
        If float, the parameter represents a proportion of documents, integer
        absolute counts.
        This parameter is ignored if vocabulary is not None.

    max_features : int or None, default=None
        If not None, build a vocabulary that only consider the top
        max_features ordered by term frequency across the corpus.

        This parameter is ignored if vocabulary is not None.

    vocabulary : Mapping or iterable, optional
        Either a Mapping (e.g., a dict) where keys are terms and values are
        indices in the feature matrix, or an iterable over terms. If not
        given, a vocabulary is determined from the input documents.

    binary : boolean, default=False
        If True, all non-zero term counts are set to 1. This does not mean
        outputs will have only 0/1 values, only that the tf term in tf-idf
        is binary. (Set idf and normalization to False to get 0/1 outputs.)

    dtype : type, optional
        Type of the matrix returned by fit_transform() or transform().

    norm : 'l1', 'l2' or None, optional
        Norm used to normalize term vectors. None for no normalization.

    use_idf : boolean, default=True
        Enable inverse-document-frequency reweighting.

    smooth_idf : boolean, default=True
        Smooth idf weights by adding one to document frequencies, as if an
        extra document was seen containing every term in the collection
        exactly once. Prevents zero divisions.

    sublinear_tf : boolean, default=False
        Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf).

    Attributes
    ----------
    vocabulary_ : dict
        A mapping of terms to feature indices.

    idf_ : array, shape = [n_features], or None
        The learned idf vector (global term weights)
        when ``use_idf`` is set to True, None otherwise.

    stop_words_ : set
        Terms that were ignored because they either:

          - occurred in too many documents (`max_df`)
          - occurred in too few documents (`min_df`)
          - were cut off by feature selection (`max_features`).

        This is only available if no vocabulary was given.

    See also
    --------
    CountVectorizer
        Tokenize the documents and count the occurrences of token and return
        them as a sparse matrix

    TfidfTransformer
        Apply Term Frequency Inverse Document Frequency normalization to a
        sparse matrix of occurrence counts.

    Notes
    -----
    The ``stop_words_`` attribute can get large and increase the model size
    when pickling. This attribute is provided only for introspection and can
    be safely removed using delattr or set to None before pickling.
    """

    def __init__(self, input='content', encoding='utf-8',
                 decode_error='strict', strip_accents=None, lowercase=True,
                 preprocessor=None, tokenizer=None, analyzer='word',
                 stop_words=None, token_pattern=r"(?u)\b\w\w+\b",
                 ngram_range=(1, 1), max_df=1.0, min_df=1,
                 max_features=None, vocabulary=None, binary=False,
                 dtype=np.int64, norm='l2', use_idf=True, smooth_idf=True,
                 sublinear_tf=False):

        super(SkipGramTfidfVectorizer, self).__init__(
            input=input, encoding=encoding, decode_error=decode_error,
            strip_accents=strip_accents, lowercase=lowercase,
            preprocessor=preprocessor, tokenizer=tokenizer, analyzer=analyzer,
            stop_words=stop_words, token_pattern=token_pattern,
            ngram_range=ngram_range, max_df=max_df, min_df=min_df,
            max_features=max_features, vocabulary=vocabulary, binary=binary,
            dtype=dtype)

        self._tfidf = TfidfTransformer(norm=norm, use_idf=use_idf,
                                       smooth_idf=smooth_idf,
                                       sublinear_tf=sublinear_tf)

    # Broadcast the TF-IDF parameters to the underlying transformer instance
    # for easy grid search and repr

    @property
    def norm(self):
        return self._tfidf.norm

    @norm.setter
    def norm(self, value):
        self._tfidf.norm = value

    @property
    def use_idf(self):
        return self._tfidf.use_idf

    @use_idf.setter
    def use_idf(self, value):
        self._tfidf.use_idf = value

    @property
    def smooth_idf(self):
        return self._tfidf.smooth_idf

    @smooth_idf.setter
    def smooth_idf(self, value):
        self._tfidf.smooth_idf = value

    @property
    def sublinear_tf(self):
        return self._tfidf.sublinear_tf

    @sublinear_tf.setter
    def sublinear_tf(self, value):
        self._tfidf.sublinear_tf = value

    @property
    def idf_(self):
        return self._tfidf.idf_

    def fit(self, raw_documents, y=None):
        """Learn vocabulary and idf from training set.

        Parameters
        ----------
        raw_documents : iterable
            an iterable which yields either str, unicode or file objects

        Returns
        -------
        self : TfidfVectorizer
        """
        X = super(SkipGramTfidfVectorizer, self).fit_transform(raw_documents)
        self._tfidf.fit(X)
        return self

    def fit_transform(self, raw_documents, y=None):
        """Learn vocabulary and idf, return term-document matrix.

        This is equivalent to fit followed by transform, but more efficiently
        implemented.

        Parameters
        ----------
        raw_documents : iterable
            an iterable which yields either str, unicode or file objects

        Returns
        -------
        X : sparse matrix, [n_samples, n_features]
            Tf-idf-weighted document-term matrix.
        """
        X = super(SkipGramTfidfVectorizer, self).fit_transform(raw_documents)
        self._tfidf.fit(X)
        # X is already a transformed view of raw_documents so
        # we set copy to False
        return self._tfidf.transform(X, copy=False)

    def transform(self, raw_documents, copy=True):
        """Transform documents to document-term matrix.

        Uses the vocabulary and document frequencies (df) learned by fit (or
        fit_transform).

        Parameters
        ----------
        raw_documents : iterable
            an iterable which yields either str, unicode or file objects

        copy : boolean, default True
            Whether to copy X and operate on the copy or perform in-place
            operations.

        Returns
        -------
        X : sparse matrix, [n_samples, n_features]
            Tf-idf-weighted document-term matrix.
        """
        check_is_fitted(self, '_tfidf', 'The tfidf vector is not fitted')

        X = super(SkipGramTfidfVectorizer, self).transform(raw_documents)
        return self._tfidf.transform(X, copy=False)

In [38]:
PATH = '../data/'

train = pd.read_csv(PATH + 'cleaned_train.csv')
test = pd.read_csv(PATH + 'cleaned_test.csv')

# train_sentence = train['comment_text_cleaned']
# test_sentence = test['comment_text_cleaned']
train_sentence = train['comment_text_cleaned_polarity']
test_sentence = test['comment_text_cleaned_polarity']


train_sentence_retain_punctuation = train['comment_text_cleaned_retain_punctuation']
test_sentence_retain_punctuation = test['comment_text_cleaned_retain_punctuation']
# text = pd.concat([train_sentence, test_sentence])
text = train_sentence
# text_retain_punctuation = pd.concat([train_sentence_retain_punctuation, test_sentence_retain_punctuation])
text_retain_punctuation = train_sentence_retain_punctuation


print(train.shape)
print(test.shape)

(159571, 30)
(153164, 24)


In [3]:
# from textblob import TextBlob
# # x = my_series.apply(my_function, args = (arg1,))
# pol = pd.DataFrame()
# pol_test = pd.DataFrame()
# def add_polarity_phrase(x, col):
#     score = int (TextBlob(x).sentiment.polarity * 20) 
#     if score > 0:
#         senti = 'positive'
#     elif score < 0:
#         senti = 'negative'
#         score = score * (-1)
#     else:
#         senti = 'neutral'
#     return ' {}_{}_{} '.format(col,senti,score)

# pol['cleaned'] = train['comment_text_cleaned'].apply(add_polarity_phrase, args=('cleaned',))
# pol['original'] = train['comment_text'].apply(add_polarity_phrase, args=('original',))

# print('train set done')

# pol_test['cleaned'] = test['comment_text_cleaned'].apply(add_polarity_phrase, args=('cleaned',))
# pol_test['original'] = test['comment_text'].apply(add_polarity_phrase, args=('original',))

# train['comment_text_cleaned_polarity'] = train['comment_text_cleaned'] + pol['cleaned'] + pol['original']
# test['comment_text_cleaned_polarity'] = test['comment_text_cleaned'] + pol_test['cleaned'] + pol_test['original']

# train.to_csv(PATH + 'cleaned_train.csv', index=False)
# test.to_csv(PATH + 'cleaned_test.csv', index=False)

train set done


In [39]:


print('getting skip gram tfidf')

skip_vectorizer = SkipGramTfidfVectorizer(ngram_range=(1,3),
                                    strip_accents='unicode', 
                                    max_features=10000, 
                                    analyzer='word',
                                    sublinear_tf=True,
                                    token_pattern=r'\w{1,}')

phrase_vectorizer = TfidfVectorizer(ngram_range=(1,3),
                                    strip_accents='unicode', 
                                    max_features=100000, 
                                    analyzer='word',
                                    sublinear_tf=True,
                                    token_pattern=r'\w{1,}')
char_vectorizer = TfidfVectorizer(ngram_range=(1,5), 
                                  strip_accents='unicode', 
                                  max_features=200000, 
                                  analyzer='char', 
                                  sublinear_tf=True)
print('fitting skip 1 n-gram 2')
# skip_vectorizer.fit(text.values)
print('fitting char')
char_vectorizer.fit(text_retain_punctuation.values)
print('fitting phrase')
phrase_vectorizer.fit(text.values)

print('transforming train skip gram')
# train_skip = skip_vectorizer.transform(train_sentence.values)
print('transforming train char')
train_char = char_vectorizer.transform(train_sentence_retain_punctuation.values)
print('transforming train phrase')
train_phrase = phrase_vectorizer.transform(train_sentence.values)

print('transforming test skip gram')
# test_skip = skip_vectorizer.transform(test_sentence.values)
print('transforming test char')
test_char = char_vectorizer.transform(test_sentence_retain_punctuation.values)
print('transforming test phrase')
test_phrase = phrase_vectorizer.transform(test_sentence.values)

# train_tfidf = hstack((train_skip, train_char, train_phrase), format='csr')
# test_tfidf = hstack((test_skip, test_char, test_phrase), format='csr')

train_tfidf = hstack((train_char, train_phrase), format='csr')
test_tfidf = hstack((test_char, test_phrase), format='csr')

label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

train_tfidf

getting skip gram tfidf
fitting skip 1 n-gram 2
fitting char


KeyboardInterrupt: 

In [23]:
##### getting naive bayes matrix
def pr(y_i, y, train_features):
    p = train_features[y==y_i].sum(0)
    return (p + 1) / ((y == y_i).sum() + 1)

def get_conditional_matrix(m_train,m_test,y):
    x_train = (m_train > 0) * 1
    x_test = (m_test > 0 ) * 1
    count = x_train.sum(0)
    appearance = x_train[y == 1].sum(0)
    x_train = x_train.multiply(appearance / count).tocsr()
    x_test = x_test.multiply(appearance / count).tocsr()
    return x_train, x_test
    # conditional probality

# train_tfidf_nb = {}
# test_tfidf_nb = {}
# train_appearance_cp = {}
# test_appearance_cp = {}
train_model = {}
test_model = {}
for col in label_cols:
    print(col)
    y = train[col].values
    r = np.log(pr(1, y, train_tfidf) / pr(0, y, train_tfidf))
    train_tfidf_nb = train_tfidf.multiply(r).tocsr()
    test_tfidf_nb = test_tfidf.multiply(r).tocsr()
    print('getting appearance')
    train_appearance_cp,test_appearance_cp = get_conditional_matrix(train_tfidf,test_tfidf,y)
    
    train_model[col] = hstack((train_tfidf_nb, train_appearance_cp), format='csr')
    test_model[col] = hstack((test_tfidf_nb, test_appearance_cp), format='csr')
    


toxic
getting appearance
severe_toxic
getting appearance
obscene
getting appearance
threat
getting appearance
insult
getting appearance
identity_hate
getting appearance


In [37]:
import gc
gc.collect()
%env JOBLIB_TEMP_FOLDER=/tmp
#### train model
for cc in [ 0.15, 0.10, 0.08, 0.07, 0.05 ]:
    print(cc)
    preds_train = pd.DataFrame()
    preds_test = pd.DataFrame()
    for col in label_cols:
        print(col)
        model = LogisticRegression(C=cc)
        model.fit(train_model[col], train[col].values)
        preds_test[col] = model.predict_proba(test_model[col])[:, 1]
        preds_train[col] = model.predict_proba(train_model[col])[:, 1]
        print('accuracy is {}'.format(roc_auc_score(train[col].values, preds_train[col])))
    preds_test['id'] = test['id']
    preds_test.to_csv(PATH + 'nblogreg_ori_trainOntrain_char_punctuation_polarity_appearance_c_{}.csv'.format(cc), index= False)

    

env: JOBLIB_TEMP_FOLDER=/tmp
0.15
toxic
accuracy is 0.9894004626373385
severe_toxic
accuracy is 0.9958259680887998
obscene
accuracy is 0.9959957320951517
threat
accuracy is 0.9993480563866923
insult
accuracy is 0.9912481775218893
identity_hate
accuracy is 0.9959097457993027
0.1
toxic
accuracy is 0.9869235967498293
severe_toxic
accuracy is 0.9946969445618739
obscene
accuracy is 0.9952001217346296
threat
accuracy is 0.998815737023057
insult
accuracy is 0.9897287820357448
identity_hate
accuracy is 0.9940579276972978
0.08
toxic
accuracy is 0.9855126097609673
severe_toxic
accuracy is 0.9940842646944665
obscene
accuracy is 0.9947330375466854
threat
accuracy is 0.9984193214321342
insult
accuracy is 0.9888819928590162
identity_hate
accuracy is 0.992961669668828
0.07
toxic
accuracy is 0.9846491697183067
severe_toxic
accuracy is 0.9937224383752272
obscene
accuracy is 0.994446665779438
threat
accuracy is 0.9981407417103235
insult
accuracy is 0.9883692334128905
identity_hate
accuracy is 0.99229760

In [30]:
aa = pd.read_csv('../data/nblogreg_ori_trainOntrain_char_punctuation_polarity_appearance_c_0.2') 

In [35]:
aa.columns

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate',
       'id'],
      dtype='object')

In [36]:
aa.to_csv('../data/nblogreg_ori_trainOntrain_char_punctuation_polarity_appearance_c_0.2.csv', index=False) 

In [10]:


def get_nblogreg_model(label_cols, train_features, train, test_features, cc):
    preds = np.zeros((test.shape[0], len(label_cols)))
    train_preds = np.zeros((train.shape[0], len(label_cols)))
    for i, j in enumerate(label_cols):
        print('fit', j)
        y = train[j].values
        r = np.log(pr(1, y, train_features) / pr(0, y, train_features))
        model = LogisticRegression(C=cc, max_iter = 300, n_jobs=10)
        x_nb = train_features.multiply(r).tocsr()
        model.fit(x_nb, y)
        preds[:, i] = model.predict_proba(test_features.multiply(r))[:, 1]
        train_preds[:, i] = model.predict_proba(x_nb)[:, 1]
        print('accuracy is {}'.format(roc_auc_score(y, train_preds[:, i])))
    return preds, train_preds

def save(model_name, y_test, label_cols, path, is_train=False):
    if is_train:
        submission = pd.read_csv(path + 'sample_train.csv')
        file_name = 'train_' + model_name
    else:
        submission = pd.read_csv(path + 'sample_submission.csv')
        file_name = model_name
    submission[label_cols] = y_test
    submission.to_csv(path + file_name + '.csv', index=False)
    
print('done')

done


In [11]:

path_save = PATH = '../data/'
for cc in [ 0.20, 0.25, 0.15, 0.10, 0.08, 0.07 ]:
    print('predicting C %s' % cc)
    y_test, y_train = get_nblogreg_model(label_cols, train_tfidf, train, test_tfidf, cc)
    print('total score is {}'.format(roc_auc_score(train[label_cols], y_train)))
    ########################################
    print('saving files')
    model_name = 'nblogreg_ori_trainOntrain_char_punctuation_polarity_c_{}'.format(cc)
    print(model_name)
    save(model_name, y_test, label_cols, PATH)



env: JOBLIB_TEMP_FOLDER=/tmp
predicting C 0.2
fit toxic


  " = {}.".format(self.n_jobs))


accuracy is 0.986595103115305
fit severe_toxic
accuracy is 0.9965187402776788
fit obscene
accuracy is 0.9960058109640987
fit threat
accuracy is 0.9995928672755734
fit insult
accuracy is 0.9911548001731012
fit identity_hate
accuracy is 0.9970560571007812
total score is 0.9944872298177563
saving files
nblogreg_ori_trainOntrain_char_punctuation_polarity_c_0.2
predicting C 0.25
fit toxic
accuracy is 0.9875469413436043
fit severe_toxic
accuracy is 0.9971036392496745
fit obscene
accuracy is 0.9963109589225135
fit threat
accuracy is 0.9997286526995723
fit insult
accuracy is 0.9918339267281295
fit identity_hate
accuracy is 0.9977976267377627
total score is 0.9950536242802094
saving files
nblogreg_ori_trainOntrain_char_punctuation_polarity_c_0.25
predicting C 0.15
fit toxic
accuracy is 0.9853769137852342
fit severe_toxic
accuracy is 0.9957561626360292
fit obscene
accuracy is 0.9956171694955567
fit threat
accuracy is 0.9993460707582762
fit insult
accuracy is 0.9903076310462371
fit identity_hate


In [6]:
one = []
for word in phrase_vectorizer.vocabulary_:
    if len(word.split(' ')) == 1:
        one.append(word)

In [7]:
cleaned = []
original = []
for each in one:
    if each[:7] == 'cleaned':
        cleaned.append(each)
    elif each[:8] == 'original':
        original.append(each)

In [9]:
len(cleaned)

38

In [None]:
original