# Understanding the "Topic" of toxicity

[https://www.kaggle.com/jagangupta/understanding-the-topic-of-toxicity](https://www.kaggle.com/jagangupta/understanding-the-topic-of-toxicity)

<br>

This kernel is an extension of the EDA notebook: Stop the S@#$ - [Toxic Comments EDA](https://www.kaggle.com/jagangupta/stop-the-s-toxic-comments-eda)

<br>

# 1. Topic Modeling:

Topic modeling can be a useful tool to summarize the content of a huge text corpus by guessing the "Topic", or general theme, of each document.

The topics can also be used as inputs to our classifier if they capture patterns or "Topics" that indicate toxicity.

Let's find out!

The steps followed in this kernel:

- Preprocessing (Tokenization using gensim's simple_preprocess)
- Cleaning
  - Stop word removal
  - Bigram collation
  - Lemmatization
- Creation of dictionary (list of all words in the cleaned text)
- Topic modeling using LDA
- Visualization with pyLDAvis
- Convert topics to sparse vectors
- Feed sparse vectors to the model

<br>

In [1]:
# import required packages

# basic
import pandas as pd
import numpy as np

# misc
import gc
import time
import warnings

# viz
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
import pyLDAvis.gensim

# nlp
import string
import re # for regex
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
import gensim
from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel
from gensim.models.wrappers import LdaMallet
from gensim.corpora import Dictionary

# Modeling
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_is_fitted
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from scipy import sparse

# settings
start_time = time.time()  # timing reference; it is reset again at the top of the data-loading cell
color = sns.color_palette()
sns.set_style("dark")

# constants
eng_stopwords = set(stopwords.words("english"))  # nltk English stop words, consulted by clean()

# settings
warnings.filterwarnings("ignore")  # silence all warnings from this point on
lem = WordNetLemmatizer()  # lemmatizer used by clean()
tokenizer = ToktokTokenizer()

%matplotlib inline

scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()


In [2]:
# restart the clock: the "Time till ..." prints below are measured from here
start_time = time.time()

# importing the dataset
train = pd.read_csv("../../input/jigsaw-toxic-comment-classification-challenge/train.csv")
test = pd.read_csv("../../input/jigsaw-toxic-comment-classification-challenge/test.csv")

end_import = time.time()

# elapsed seconds spent reading the two csv files
print("Time till import:", end_import - start_time, "s")

Time till import: 2.7765743732452393 s


In [3]:
# split a raw comment into a list of word tokens
def preprocess(comment):
    """
    Tokenize a single comment with gensim's simple_preprocess.

    Accent removal is switched on (deacc=True) and the minimum token
    length is three characters (min_len=3).

    Returns the list of tokens.
    """
    tokens = gensim.utils.simple_preprocess(comment, deacc=True, min_len=3)
    return tokens

In [4]:
# run the tokenizer over every comment of both datasets
train_text = train["comment_text"].apply(preprocess)
test_text = test["comment_text"].apply(preprocess)

In [5]:
# stack the train and test tokens into one corpus (train rows first);
# Series.append was removed in pandas 2.0, pd.concat is the supported equivalent
all_text = pd.concat([train_text, test_text])

In [6]:
# checks: size of the combined corpus, plus one comment before and after tokenization
# (position 30 of all_text is a train comment because the train rows come first)
print("Total number of comments:", len(all_text))
print("Before preprocessing:", train.comment_text.iloc[30])
print("After preprocessing:", all_text.iloc[30])

Total number of comments: 312735
Before preprocessing: How could I post before the block expires?  The funny thing is, you think I'm being uncivil!
After preprocessing: ['how', 'could', 'post', 'before', 'the', 'block', 'expires', 'the', 'funny', 'thing', 'you', 'think', 'being', 'uncivil']


In [7]:
# Phrases help us group together bigrams : new + york --> new_york
# the model is built from the whole tokenized corpus (train + test)
bigram = gensim.models.Phrases(all_text)

In [8]:
# check bigram collation functionality:
# indexing the Phrases model with a token list gives back the tokens with
# detected bigrams joined by "_"
print("Before Pharses:", all_text.iloc[30])
print("After Pharses:", bigram[all_text.iloc[30]])

Before Pharses: ['how', 'could', 'post', 'before', 'the', 'block', 'expires', 'the', 'funny', 'thing', 'you', 'think', 'being', 'uncivil']
After Pharses: ['how', 'could', 'post', 'before', 'the', 'block_expires', 'the', 'funny_thing', 'you', 'think', 'being_uncivil']


In [9]:
# tokens that exist only after bigram collation, i.e. the newly joined bigrams
diff = [token for token in bigram[all_text.iloc[30]] if token not in all_text.iloc[30]]

print(diff)

['block_expires', 'funny_thing', 'being_uncivil']


In [10]:
def clean(word_list):
    """
    Normalise one tokenized comment.

    Steps, applied in this order:
    1) drop every token that is in the nltk English stop word set
    2) merge common word pairs into single tokens with the gensim Phrases
       model (e.g. january + utc --> january_utc)
    3) lemmatize every token as a verb (e.g. matches --> match ; stuck --> stick)

    Returns the cleaned list of tokens.
    """
    # 1) stop word removal
    kept = []
    for token in word_list:
        if token in eng_stopwords:
            continue
        kept.append(token)

    # 2) bigram collation
    phrased = bigram[kept]

    # 3) lemmatization, "v" = treat each token as a verb
    return [lem.lemmatize(token, "v") for token in phrased]

In [11]:
# check clean function on a single tokenized comment (position 1 of the corpus)
print("Before clean:", all_text.iloc[1])
print("After clean:", clean(all_text.iloc[1]))

Before clean: ['aww', 'matches', 'this', 'background', 'colour', 'seemingly', 'stuck', 'with', 'thanks', 'talk', 'january', 'utc']
After clean: ['aww', 'match', 'background', 'colour', 'seemingly', 'stick', 'thank', 'talk', 'january_utc']


In [12]:
# run the cleaning step over every tokenized comment
# (all_text is overwritten in place, so this cell must be run only once)
all_text = all_text.apply(clean)

end_clean = time.time()
print("Time till cleaning corpus:", end_clean - start_time, "s")

Time till cleaning corpus: 437.84178829193115 s


In [13]:
# show the cleaned tokens of the comment at position 1
all_text.iloc[1]

['aww',
 'match',
 'background',
 'colour',
 'seemingly',
 'stick',
 'thank',
 'talk',
 'january_utc']

In [14]:
# create the dictionary: a token <-> integer id mapping built from the cleaned corpus
dictionary = Dictionary(all_text)
print("There are", len(dictionary), "number of words in the final dictionary")

There are 322843 number of words in the final dictionary


In [15]:
# convert into lookup tuples within the dictionary using doc2bow
# each tuple is (token id, number of occurrences in this comment)
print(dictionary.doc2bow(all_text.iloc[1]))
print("Wordlist from the sentence:", all_text.iloc[1])

[(21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1)]
Wordlist from the sentence: ['aww', 'match', 'background', 'colour', 'seemingly', 'stick', 'thank', 'talk', 'january_utc']


In [16]:
# to check: map the ids 21..29 returned by doc2bow back to their tokens
print("Wordlist from the dictionary lookup:",
      *(dictionary[token_id] for token_id in range(21, 30)))

Wordlist from the dictionary lookup: aww background colour january_utc match seemingly stick talk thank


In [17]:
# bag-of-words representation of every cleaned comment
corpus = list(map(dictionary.doc2bow, all_text))

end_corpus = time.time()
print("Time till corpus creation:", end_corpus - start_time, "s")

Time till corpus creation: 504.83364510536194 s


In [18]:
# show the bag-of-words tuples of the first comment
corpus[0]

[(0, 1),
 (1, 1),
 (2, 1),
 (3, 1),
 (4, 1),
 (5, 1),
 (6, 1),
 (7, 1),
 (8, 1),
 (9, 1),
 (10, 1),
 (11, 1),
 (12, 1),
 (13, 1),
 (14, 1),
 (15, 1),
 (16, 1),
 (17, 1),
 (18, 1),
 (19, 1),
 (20, 1)]

In [19]:
# create the LDA model with 15 topics;
# LDA training is stochastic, so pin random_state to get the same topics on every run
# (2018 is the seed already used for the train/validation split)
ldamodel = LdaModel(corpus=corpus, num_topics=15, id2word=dictionary, random_state=2018)

end_lda = time.time()
print("Time till LDA model creation:", end_lda - start_time, "s")

Time till LDA model creation: 879.0080683231354 s


In [20]:
# switch pyLDAvis to notebook mode before preparing the visualisation
pyLDAvis.enable_notebook()

In [None]:
# build the topic visualisation from the trained model, the bag-of-words corpus
# and the dictionary; as the last expression of the cell it is what gets displayed
pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)

In [None]:
# elapsed seconds since start_time, including the visualisation step
end_viz = time.time()
print("Time till viz:", end_viz - start_time, "s")

**Chart Desc:**

The above visuals are from the awesome pyLDAvis package, which is the Python port of the R package LDAvis.

The Left side shows the multi-dimensional "word-space" superimposed on two "Principal components" and the relative positions of all the topics.

The size of the circle represents what % of the corpus it contains.

The right side shows the word frequencies within the topic and in the whole corpus.

Clearly, some of the topics show a pattern of toxicity, i.e. they have a high contribution from toxic words.

Now let's feed these topics into a model.

In [None]:
# creating the topic probability matrix:
# indexing the model with the corpus applies the trained LDA model to every document
# NOTE(review): presumably this is evaluated lazily by gensim - confirm
topic_probability_mat = ldamodel[corpus]

In [None]:
# the first len(train) documents are the train comments, the remainder are test
train_matrix = topic_probability_mat[:len(train)]
test_matrix = topic_probability_mat[len(train):]

In [None]:
# drop the big intermediates that are no longer needed, then collect garbage
del topic_probability_mat, corpus, all_text
gc.collect()

In [None]:
# convert to sparse format (CSC matrix with one row per topic and one column per document)
# num_terms pins the number of rows to the number of topics, so train and test always
# get the same feature dimension even if some topic never appears in one of them
train_sparse = gensim.matutils.corpus2csc(train_matrix, num_terms=ldamodel.num_topics)
test_sparse = gensim.matutils.corpus2csc(test_matrix, num_terms=ldamodel.num_topics)

end_time = time.time()
print("total time till Sparse mat creation", end_time - start_time, "s")

In [None]:
# custom NB model
class NbSvmClassifier(BaseEstimator, ClassifierMixin):
    """
    Logistic regression trained on naive-Bayes-scaled features.

    fit() computes a per-feature log-count ratio between the positive (1) and
    negative (0) class, multiplies the features by that ratio and fits a
    LogisticRegression on the scaled features. predict() and predict_proba()
    apply the same scaling before delegating to the fitted model.

    Parameters
    ----------
    C : float
        Inverse regularisation strength passed to LogisticRegression.
    dual : bool
        Passed to LogisticRegression (dual vs primal formulation).
    n_jobs : int
        Passed to LogisticRegression.
    """

    def __init__(self, C=1.0, dual=False, n_jobs=1):
        self.C = C
        self.dual = dual
        self.n_jobs = n_jobs

    def predict(self, x):
        """Return the predicted class labels for the sparse feature matrix x."""
        # Verify that model has been fit
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.predict(x.multiply(self._r))

    def predict_proba(self, x):
        """Return the class probabilities for the sparse feature matrix x."""
        # Verify that model has been fit
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.predict_proba(x.multiply(self._r))

    def fit(self, x, y):
        """
        Fit the classifier on features x and binary (0/1) labels y.

        x must support .multiply (e.g. a scipy sparse matrix); y may be a
        pandas Series, a numpy array or a plain sequence.

        Returns self.
        """
        # np.asarray accepts a Series as well as arrays/lists (y.values did not)
        y = np.asarray(y)
        # Check that X and y have correct shape
        x, y = check_X_y(x, y, accept_sparse=True)

        def pr(x, y_i, y):
            # smoothed per-feature sum over the rows of class y_i,
            # divided by the smoothed number of rows in that class
            p = x[y==y_i].sum(0)
            return (p+1) / ((y==y_i).sum()+1)

        # log-count ratio between the positive and the negative class
        self._r = sparse.csr_matrix(np.log(pr(x,1,y) / pr(x,0,y)))
        x_nb = x.multiply(self._r)
        # the dual formulation is only implemented by the liblinear solver; it used to
        # be the default solver, so naming it keeps dual=True working on newer scikit-learn
        self._clf = LogisticRegression(C=self.C,
                                       dual=self.dual,
                                       solver="liblinear",
                                       n_jobs=self.n_jobs).fit(x_nb, y)
        return self
    
# NOTE(review): this instance is never fitted - the training cell creates a new
# NbSvmClassifier(C=4, ...) under the same name before the first fit
model = NbSvmClassifier(C=2, dual=True, n_jobs=-1)

In [None]:
# labels: the six toxicity columns of the train frame
TARGET_COLS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
target_y = train[TARGET_COLS]

# features: transpose the train topic matrix so that each row is one comment
target_x = train_sparse.T

In [None]:
# the untransposed matrix name is no longer needed
del train_sparse
gc.collect()

In [None]:
model = NbSvmClassifier(C=4, dual=True, n_jobs=-1)

# hold out a third of the train comments for validation
X_train, X_valid, y_train, y_valid = train_test_split(target_x, target_y,
                                                      test_size=0.33, random_state=2018)

train_loss = []
valid_loss = []

# one column of predicted probabilities per target label
preds_train = np.zeros((X_train.shape[0], y_train.shape[1]))
preds_valid = np.zeros((X_valid.shape[0], y_valid.shape[1]))

# fit and score one binary classifier per label
for i, j in enumerate(TARGET_COLS):

    print("Class:= " + j)

    model.fit(X_train, y_train[j])

    # probability of the positive class on each split
    # (the train predictions used to be written into preds_valid, which clobbered the
    # validation predictions and left preds_train at zero)
    preds_train[:,i] = model.predict_proba(X_train)[:, 1]
    preds_valid[:,i] = model.predict_proba(X_valid)[:, 1]

    train_loss_class = log_loss(y_train[j], preds_train[:,i])
    valid_loss_class = log_loss(y_valid[j], preds_valid[:,i])

    print("Trainloss=log loss:", train_loss_class)
    print("Validloss=log loss:", valid_loss_class)

    train_loss.append(train_loss_class)
    valid_loss.append(valid_loss_class)

print("mean column-wise log loss:Train dataset", np.mean(train_loss))
print("mean column-wise log loss:Validation dataset", np.mean(valid_loss))

end_time = time.time()
print("total time till NB base model creation", end_time - start_time)

In [None]:
#credits
#pyLDAviz
#https://nlp.stanford.edu/events/illvi2014/papers/sievert-illvi2014.pdf

#to be continued 
#to do next
#paragraph vectors
#https://arxiv.org/abs/1507.07998
#https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/doc2vec-IMDB.ipynb