Importing all the required Libraries

In [1]:
import re
import string
import warnings
from collections import Counter, defaultdict

import matplotlib
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from matplotlib import pyplot as plt
from nltk.corpus import stopwords
from scipy import sparse
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.utils.validation import check_X_y, check_is_fitted

warnings.filterwarnings("ignore", category=FutureWarning)
%matplotlib inline

### 1: Loading the data set from csv files

In [2]:
# Load the raw train and test CSV files into DataFrames.
# The paths are relative to the notebook's working directory.
train_data = pd.read_csv('./data/all/train.csv')
test_data = pd.read_csv('./data/all/test.csv')

In [16]:
train_data.shape

(20000, 8)

In [3]:
# Names of the label columns in the training data (each ranges over 0..1 in
# the describe() output shown below).
CLASS_LABELS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

Utility classes/functions to extract/prepare the data set

In [4]:
# PrepareData
class PrepareData:
    """
    Helpers to prepare a data set for training and testing.

    The instance method `split_data` splits the wrapped data set; the static
    methods operate on the data sets passed to them.

    Args:
    data_set : data set to be split by `split_data`
    """

    def __init__(self, data_set):
        self.data_set = data_set

    def split_data(self, train_size=0.7, test_size=0.30, random_state=42, shuffle=True):
        """
        Split the wrapped data set into a train part and a test part.

        Args:
        train_size : kept for backward compatibility and NOT forwarded to
                     `train_test_split`; the train part is always the
                     complement of `test_size`.
                     default: 0.7
        test_size : fraction (or absolute number) of rows in the test part
                     default: 0.3
        random_state : seed used when shuffling
                     default: 42
        shuffle : whether to shuffle the rows before splitting
                     default: True

        Returns:
        train_x : train part of the data set
        test_x : test part of the data set
        """
        # `shuffle` used to be hard-coded to True, silently ignoring the
        # caller's choice; it is now forwarded.
        train_x, test_x = train_test_split(
            self.data_set, random_state=random_state, test_size=test_size,
            shuffle=shuffle)

        return train_x, test_x

    @staticmethod
    def prepare_data_func(train, test, train_rows=20000, test_rows=7000):
        """
        Truncate the train and test data sets to their leading rows.

        No shuffling or splitting happens here: each data set is simply
        sliced from the start.

        Args:
        train : train data set (anything supporting slicing)
        test : test data set (anything supporting slicing)
        train_rows : number of leading rows of `train` to keep
                     default: 20000
        test_rows : number of leading rows of `test` to keep
                     default: 7000

        Returns:
        train_data : first `train_rows` rows of `train`
        test_data : first `test_rows` rows of `test`
        """
        train_data = train[:train_rows]
        test_data = test[:test_rows]

        return train_data, test_data

    @staticmethod
    def prepare_test_data(labels, data_set):
        """
        Keep only the columns named in `labels`, dropping every other column.

        Args:
        labels : list of column names to keep
        data_set : data frame to filter

        Returns:
        data_set : new data frame containing only the requested columns
                   (in their original order)
        """
        unwanted = [column for column in data_set.columns if column not in labels]
        return data_set.drop(columns=unwanted)

Truncate the data sets: keep the first 20,000 rows of the train data and the first 7,000 rows of the test data

In [5]:
# Keep only the leading rows of each data set (see PrepareData.prepare_data_func).
train_data, test_data = PrepareData.prepare_data_func(train_data, test_data)

In [6]:
train_data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [7]:
train_data.describe()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
count,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0
mean,0.09735,0.0111,0.05385,0.00335,0.05105,0.00905
std,0.296441,0.104773,0.225727,0.057784,0.220105,0.094702
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0


### 2. Cleaning the train and test data

In [8]:
class CleanData:
    """
    Utility class to clean a data set. Provides instance methods that clean
    the wrapped data frame and a static method to combine two data frames.

    Args:
    data_set : data frame to be cleaned
    """

    # Ordered (regex pattern, replacement) pairs applied by `clean_text`.
    # Order matters: URLs and @mentions are removed before punctuation is
    # stripped, contractions are expanded before the remaining apostrophes
    # are dropped by the `\W` rule, and whitespace is collapsed last.
    _REPLACEMENTS = (
        (r"http\S+", ""),
        (r"http", ""),
        (r"@\S+", ""),
        (r"[^A-Za-z0-9(),!?@\'\`\"\_\n]", " "),
        (r"@", "at"),
        (r"what's", "what is"),
        (r"\'ve", " have"),
        (r"can't", "can not"),
        (r"don't", "do not"),
        (r"weren't", "were not"),
        (r"doesn't", "does not"),
        (r"i'm", "i am "),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'s", " "),
        (r"\W", " "),
        (r"\s+", " "),
    )

    def __init__(self, data_set):
        self.data_set = data_set

    def fill_null(self):
        """
        Replace every null value in the data set with a single space.

        Returns:
        data_set : data set without null values
        """
        # fillna returns a new frame; the result used to be discarded, which
        # left the nulls in place.
        self.data_set = self.data_set.fillna(' ')
        return self.data_set

    def clean_text(self, field):
        """
        Normalise the text in one column: lowercase it, strip URLs and
        @mentions, expand common English contractions, replace punctuation
        with spaces and collapse runs of whitespace.

        Args:
        field : name of the text column to clean

        Returns:
        data_set : the data set with the cleaned column
        """
        # Lowercase first: the contraction patterns are lowercase, so doing it
        # last (as before) left "I'm" / "Don't" unexpanded.
        text = self.data_set[field].str.lower()
        for pattern, replacement in self._REPLACEMENTS:
            # regex=True must be explicit; pandas >= 2.0 defaults to literal
            # matching, which would turn every pattern above into a no-op.
            text = text.str.replace(pattern, replacement, regex=True)
        self.data_set[field] = text
        return self.data_set

    def remove_stop_words(self, field):
        """
        Remove English stop words (NLTK list) from the text in one column.

        Args:
        field : name of the text column to filter

        Returns:
        data_set : the data set with stop words removed from the column
        """
        # Build the lookup once; the list used to be reloaded for every word.
        stop_words = set(stopwords.words('english'))
        # Assign the result back; it used to be computed and thrown away.
        self.data_set[field] = self.data_set[field].apply(
            lambda text: ' '.join(
                word for word in text.split() if word not in stop_words))
        return self.data_set

    def get_text(self, field):
        """
        Return one column of the data set.

        Args:
        field : name of the required column

        Returns:
        the requested column
        """
        return self.data_set[field]

    @staticmethod
    def get_all_text(data_set_one, data_set_two, field=None):
        """
        Concatenate two data frames, or one column taken from each of them.

        This replaces two same-named definitions, of which only the later
        (two-argument) one was ever reachable.

        Args:
        data_set_one : first data frame
        data_set_two : second data frame
        field : optional column name; when given, only that column of each
                data frame is concatenated
                default: None (concatenate the whole data frames)

        Returns:
        the concatenated data frame, or the concatenated column when `field`
        is given
        """
        if field is None:
            return pd.concat([data_set_one, data_set_two])
        return pd.concat([data_set_one[field], data_set_two[field]])

The above class provides some utility methods to clean the data.

In order to generate the word embeddings, we first need to clean the data.

In [9]:
# Cleaning the training data: wrap it in CleanData and call fill_null().
clean_train_data = CleanData(train_data)
train_data = clean_train_data.fill_null()

In [17]:
train_data.shape

(20000, 8)

In [10]:
# Cleaning the test data: wrap it in CleanData and call fill_null().
# NOTE(review): the variable is still called `clean_train_data` although it
# now wraps test_data, and it rebinds the name used in the training cell.
clean_train_data = CleanData(test_data)
test_data = clean_train_data.fill_null()


In [18]:
test_data.shape

(7000, 2)

### 3. Feature Implementation, Build and Train Model

In [11]:
class WordEmbedding:
    """
    Trains Word2Vec word embeddings on the text of the train and test sets.

    Args:
    train_data : training data frame
    test_data : test data frame
    """

    def __init__(self, train_data, test_data):
        self.train_data = train_data
        self.test_data = test_data

    def get_embeddings(self, field, vector_size=100):
        """
        Train Word2Vec on one text column taken from both data sets and
        return the learned vectors.

        Each entry is converted to `str`, stripped and split on whitespace to
        form one training sentence.

        Args:
        field : name of the text column in both data frames
        vector_size : dimensionality of the word vectors
                      default: 100

        Returns:
        word_embedding : dict mapping each vocabulary word to its vector
        """
        # Series.append was removed in pandas 2.0; pd.concat is the
        # equivalent that works on every pandas version.
        corpus = pd.concat([self.train_data[field], self.test_data[field]])
        sentences = [str(text).strip().split() for text in corpus]
        print(len(sentences))

        try:
            model = Word2Vec(sentences, vector_size=vector_size)
        except TypeError:
            # gensim < 4.0 names this parameter `size`.
            model = Word2Vec(sentences, size=vector_size)

        keyed_vectors = model.wv
        # gensim >= 4.0 exposes the vocabulary as `index_to_key`; older
        # releases call it `index2word`.
        words = getattr(keyed_vectors, "index_to_key", None)
        if words is None:
            words = keyed_vectors.index2word
        return dict(zip(words, keyed_vectors.vectors))

The class SentEmbeddingVectorizer below implements the fit and transform methods that turn word embeddings into sentence-level embeddings (the mean of a sentence's word vectors), which are then used to train the model.

In [12]:
class SentEmbeddingVectorizer:
    """
    Turns documents into sentence embeddings by averaging the vectors of the
    words they contain.

    Args:
    word_embedding : mapping from word to vector (e.g. the dict returned by
                     WordEmbedding.get_embeddings)
    """

    def __init__(self, word_embedding):
        self.word_embedding = word_embedding
        # Use the real vector length so the all-zero fallback row has the
        # same width as the averaged rows; 100 is kept as the fallback.
        self.dim = self._infer_dim(word_embedding)

    @staticmethod
    def _infer_dim(word_embedding, default=100):
        """Return the length of the first vector in the mapping, else `default`."""
        values = getattr(word_embedding, "values", None)
        if callable(values):
            for vector in values():
                return len(vector)
        return default

    @staticmethod
    def _tokens(document):
        """Split a raw string on whitespace; pass token sequences through."""
        if isinstance(document, str):
            return document.strip().split()
        return document

    def fit(self, X, y=None):
        """
        No-op: the vectorizer has nothing to learn.

        Args:
        X : iterable of documents (ignored)
        y : ignored, present for pipeline compatibility

        Returns:
        self : object
        """
        return self

    def transform(self, X):
        """
        Convert each document into the mean of its known word vectors.

        Args:
        X : iterable of documents; each document is either a raw string
            (split on whitespace) or an already tokenised sequence of words

        Returns:
        array of shape (n_documents, dim); a document with no known word
        becomes a zero vector
        """
        rows = []
        for document in X:
            # Iterating a raw string directly would walk over characters and
            # look those up; tokenise first.
            vectors = [self.word_embedding[token]
                       for token in self._tokens(document)
                       if token in self.word_embedding]
            rows.append(np.mean(vectors, axis=0) if vectors else np.zeros(self.dim))
        return np.array(rows)

The class TfidfSentEmbeddingVectorizer below implements the fit and transform methods that turn word embeddings into sentence-level embeddings to train the model. It additionally weights each word vector by the word's TF-IDF (IDF) weight, which is supposed to give better performance.

In [13]:
class TfidfSentEmbeddingVectorizer:
    """
    Turns documents into sentence embeddings by averaging the word vectors,
    each scaled by the word's IDF weight learned in `fit`.

    Args:
    word_embedding : mapping from word to vector (e.g. the dict returned by
                     WordEmbedding.get_embeddings)
    """

    def __init__(self, word_embedding):
        self.word_embedding = word_embedding
        # Filled in by fit(): word -> IDF weight.
        self.word2weight = None
        # Use the real vector length so the all-zero fallback row has the
        # same width as the averaged rows; 100 is kept as the fallback.
        self.dim = self._infer_dim(word_embedding)

    @staticmethod
    def _infer_dim(word_embedding, default=100):
        """Return the length of the first vector in the mapping, else `default`."""
        values = getattr(word_embedding, "values", None)
        if callable(values):
            for vector in values():
                return len(vector)
        return default

    @staticmethod
    def _tokens(document):
        """Split a raw string on whitespace; pass token sequences through."""
        if isinstance(document, str):
            return document.strip().split()
        return document

    def fit(self, X, y=None):
        """
        Learn an IDF weight for every word that occurs in X.

        Args:
        X : iterable of documents; each document is either a raw string
            (split on whitespace) or an already tokenised sequence of words
        y : ignored, present for pipeline compatibility

        Returns:
        self : object
        """
        tfidf = TfidfVectorizer(ngram_range=(1, 1), analyzer=self._tokens)
        tfidf.fit(X)
        # A word never seen during fit is at least as rare as the rarest
        # known word, so it defaults to the largest IDF.
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf,
            [(word, tfidf.idf_[index]) for word, index in tfidf.vocabulary_.items()])
        return self

    def transform(self, X):
        """
        Convert each document into the mean of its IDF-weighted word vectors.

        Args:
        X : iterable of documents, in the same form accepted by `fit`

        Returns:
        array of shape (n_documents, dim); a document with no known word
        becomes a zero vector

        Raises:
        ValueError : if called before `fit`
        """
        if self.word2weight is None:
            raise ValueError(
                "TfidfSentEmbeddingVectorizer is not fitted yet; call fit() first")

        rows = []
        for document in X:
            # The embedding lives in `word_embedding`; the previous code read
            # a non-existent `word2vec` attribute and always failed.
            vectors = [self.word_embedding[token] * self.word2weight[token]
                       for token in self._tokens(document)
                       if token in self.word_embedding]
            rows.append(np.mean(vectors, axis=0) if vectors else np.zeros(self.dim))
        return np.array(rows)

Defining the classifier Logistic Regression with Naive Bayes features.

In [14]:
class NbLogRegClassifier(BaseEstimator, ClassifierMixin):

    """
    Logistic Regression trained on features scaled by Naive Bayes log-count
    ratios. Inherits from sklearn's BaseEstimator and ClassifierMixin.

    The ratio is the log of smoothed per-class feature sums, so the features
    are expected to be non-negative (e.g. counts or TF-IDF values); negative
    features can make the logarithm undefined.

    Args:
    C :  Inverse of regularization strength; must be a positive float.
        Like in support vector machines, smaller values specify stronger regularization.
        default: 4
    dual: Dual or primal formulation, passed to LogisticRegression.
          default: False
    n_jobs : Number of CPU cores, passed to LogisticRegression.
            default: -1 (using all processors)
    solver : Algorithm to use in the optimization problem, passed to
            LogisticRegression.
            default: 'sag'
    max_iter : Maximum number of iterations taken for the solver to converge,
            passed to LogisticRegression.
            default: 4000

    """

    def __init__(self, C=4, dual=False, n_jobs=-1, solver='sag', max_iter=4000):
        self.C = C
        self.dual = dual
        self.n_jobs = n_jobs
        self.max_iter = max_iter
        self.solver = solver

    def _nb_scale(self, X):
        """Multiply X column-wise by the fitted log-count ratio."""
        check_is_fitted(self, ['_r', '_clf'])
        # Convert first so dense arrays (which have no .multiply) work too.
        return sparse.csr_matrix(X).multiply(self._r).tocsr()

    def predict(self, X):
        """
        Predict class labels for samples in X.

        Args:
        X : array_like or sparse matrix, shape (n_samples, n_features)

        Returns:
        C : array, shape [n_samples]
            Predicted class label per sample.
        """
        return self._clf.predict(self._nb_scale(X))

    def predict_proba(self, X):
        """
        Probability estimates, as returned by the underlying
        LogisticRegression's predict_proba.

        Args:
        X : array-like or sparse matrix, shape = [n_samples, n_features]

        Returns:
        T : array-like, shape = [n_samples, n_classes]
        """
        return self._clf.predict_proba(self._nb_scale(X))

    def fit(self, X, y):
        """
        Fit the model according to the given training data.

        Args:
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Training vector, where n_samples is the number of samples and n_features is the number of features.
        y : array-like, shape (n_samples,)
            Binary (0/1) target vector relative to X; may be a pandas Series,
            a numpy array or a list.

        Returns:
        self : object
        """
        # check_X_y converts pandas/list input itself, so there is no need
        # for `y.values` (which broke on plain arrays and lists).
        X, y = check_X_y(X, y, accept_sparse=True)
        X = sparse.csr_matrix(X)

        def smoothed_feature_rate(label):
            """Add-one smoothed per-feature sum over the rows with this label."""
            mask = (y == label)
            return (X[mask].sum(0) + 1) / (mask.sum() + 1)

        self._r = sparse.csr_matrix(
            np.log(smoothed_feature_rate(1) / smoothed_feature_rate(0)))
        x_nb = X.multiply(self._r).tocsr()
        # solver and max_iter were accepted by __init__ but never forwarded.
        self._clf = LogisticRegression(
            C=self.C, dual=self.dual, n_jobs=self.n_jobs,
            solver=self.solver, max_iter=self.max_iter).fit(x_nb, y)
        return self

Getting the word embeddings for the training and testing data.

In [15]:
# Train Word2Vec on the 'comment_text' column of both data sets;
# get_embeddings prints the number of documents it was given.
word_embeding = WordEmbedding(train_data=train_data, test_data=test_data)
word_vector = word_embeding.get_embeddings(field='comment_text')

27000


In [17]:
# Pipeline: sentence embeddings -> one LinearSVC per label (one-vs-rest).
svc_model = Pipeline(steps=[
    ("word_vector", SentEmbeddingVectorizer(word_vector)),
    ("multilabel", OneVsRestClassifier(LinearSVC(random_state=0))),
])

### 4. Validating the Model

In [18]:
# Select the targets by name: picking "everything except id/comment_text"
# would silently turn any extra column into a label.
y_train = train_data[CLASS_LABELS]

In [19]:
# Training the model: fit the pipeline on the comment text against y_train.
svc_model.fit(train_data['comment_text'],y_train)

Pipeline(memory=None,
     steps=[('word_vector', <__main__.SentEmbeddingVectorizer object at 0x1a2c9309e8>), ('multilabel', OneVsRestClassifier(estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=0, tol=0.0001,
     verbose=0),
          n_jobs=None))])

In [22]:
# Predicting.
# NOTE(review): these predictions are made on the same train_data the model
# was fitted on, so they are in-sample and not a held-out validation.
pred=svc_model.predict(train_data['comment_text'])