## Imports

In [1]:
import numpy as np
import pandas as pd 

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn import metrics

import string
import spacy

np.random.seed(42)

## Data Loading

In [2]:
data = pd.read_csv("toxic_comments.csv", engine="python")
data = data[["comment_text", "toxic"]]
data

Unnamed: 0,comment_text,toxic
0,Explanation\nWhy the edits made under my usern...,0
1,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0
3,"""\nMore\nI can't make any real suggestions on ...",0
4,"You, sir, are my hero. Any chance you remember...",0
...,...,...
495,"Sarek of Vulcan: Unfortunately for you, you ca...",0
496,Keep your chin up! Darwinism was not accepted ...,0
497,"""""""Nazi filth"""" is impolite 04:27, 20 Jan 200...",1
498,Interesting. I checked the other case number K...,0


In [3]:
data["toxic"].value_counts()

toxic
0    451
1     49
Name: count, dtype: int64

## Data Processing

In [4]:
nlp = spacy.load("en_core_web_sm")

stop_words = nlp.Defaults.stop_words
print(stop_words)

{'from', 'amongst', 'very', 'wherever', "'m", "'re", 'among', 'make', 'themselves', 'using', 'whereupon', 'thereby', 'move', 'besides', 'meanwhile', 'such', 'made', 'down', 'only', 'twenty', 'see', 'should', 'and', 'will', 'someone', 'has', 'of', 'others', 'we', 'hereupon', 'part', 'no', 'between', 'how', 'seems', 'thereupon', 'behind', 'say', 'two', 'in', 'side', 'these', 'below', 'towards', 'hereafter', 'it', 'other', 'latterly', 'via', 'hence', 'an', 'a', 'i', 'eleven', 'with', 'becoming', 'was', 'seeming', 'more', 'why', 'fifteen', 'becomes', 'were', 'do', 'afterwards', 'per', 'most', 'whereafter', 'by', 'can', 'further', 'nine', 'or', 'throughout', 'somehow', 'did', "'ll", 'nevertheless', 'except', 'yourself', 'keep', 'take', '‘re', 'within', 'though', 'own', 'your', 'but', 'everything', 'somewhere', 'after', 'top', 'you', '‘m', 'five', '’d', 'as', 'even', 'something', 'the', 'indeed', 'nowhere', '‘ll', "'d", 'his', 'am', 'whence', 'anything', 'neither', 'seem', 'since', 'whoever'

In [5]:
punctuations = string.punctuation
print(punctuations)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


**Creating our tokenizer function**

In [6]:
def spacy_tokenizer(sentence):
    # Creating our token object, which is used to create documents with linguistic annotations.
    doc = nlp(sentence)

    # Lemmatizing each token and converting each token into lowercase. Lemma=> root word
    tokens = [ word.lemma_.lower().strip() for word in doc ]

    # Removing stop words
    tokens = [ word for word in tokens if word not in stop_words and word not in punctuations ]

    # return preprocessed list of tokens
    return tokens

In [7]:
sentence = "I am drinking water ?"
spacy_tokenizer(sentence)

['drink', 'water']

There are several techniques for text representation, including the Bag of Words model, Count Vectorization, and TF-IDF. In addition, more advanced techniques like Word2Vec and Doc2Vec can be used for word embedding.

### 1. Count Vectorization

In [8]:
count_vector = CountVectorizer(tokenizer = spacy_tokenizer)

In [9]:
count_vector.fit_transform(["I am eating apple, I like apple","I am playing cricket"]).toarray() 



array([[2, 0, 1, 1, 0],
       [0, 1, 0, 0, 1]])

In [10]:
count_vector.get_feature_names_out() # here length of feature is 5.

array(['apple', 'cricket', 'eat', 'like', 'play'], dtype=object)

In [11]:
count_vector.vocabulary_

{'eat': 2, 'apple': 0, 'like': 3, 'play': 4, 'cricket': 1}

In [12]:
from sklearn.model_selection import train_test_split

X = data['comment_text'] # the features we want to analyze
y = data['toxic'] # the labels, or answers, we want to test against

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,stratify=y)

In [13]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(400,)
(400,)
(100,)
(100,)


In [14]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()

In [15]:
X_train_vectors= count_vector.fit_transform(X_train)
X_test_vectors= count_vector.transform(X_test)



In [16]:
X_train_vectors.shape

(400, 4175)

In [17]:
X_test_vectors.shape # 4175 word features

(100, 4175)

In [18]:
count_vector.vocabulary_  # word to index value

{'eye': 1302,
 'care': 638,
 'ok': 2430,
 'people': 2539,
 'want': 3664,
 'clean': 729,
 'god': 1517,
 'freakin': 1431,
 'stupid': 3273,
 'omnigan': 2438,
 'vandalism': 3613,
 'matt': 2151,
 'shirvington': 3075,
 'article': 387,
 'revert': 2918,
 'ban': 464,
 'report': 2889,
 'thank': 3391,
 'wikipedia': 3717,
 'administrator': 235,
 'intervention': 1836,
 'remove': 2880,
 'vital': 3648,
 'functioning': 1458,
 'user': 3593,
 'encourage': 1199,
 'warn': 3667,
 'generally': 1484,
 'able': 179,
 'block': 539,
 'receive': 2827,
 'recent': 2828,
 'final': 1367,
 'mention': 2190,
 'recently': 2829,
 'vandalize': 3614,
 'appear': 341,
 'occur': 2413,
 'continue': 847,
 'aiv': 274,
 'noticeboard': 2377,
 'talk': 3349,
 'request': 2894,
 '...': 13,
 'future': 1466,
 'sign': 3094,
 'tag': 3343,
 'photography': 2574,
 'workshop': 3753,
 'page': 2491,
 'bot': 564,
 'currently': 924,
 'i.e.': 1718,
 '—': 4117,
 'precede': 2657,
 'unsigned': 3570,
 'comment': 771,
 'add': 226,
 '14.07.09': 54,
 'kno

In [19]:
X_test_vectors.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

**Fit the model**

In [20]:
classifier.fit(X_train_vectors, y_train)

In [21]:
predicted = classifier.predict(X_test_vectors)
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test, predicted))
print("Logistic Regression Precision:",metrics.precision_score(y_test, predicted))
print("Logistic Regression Recall:",metrics.recall_score(y_test, predicted))

Logistic Regression Accuracy: 0.92
Logistic Regression Precision: 1.0
Logistic Regression Recall: 0.2


### 2. TF-IDF (Term Frequency - Inverse Document Frequency)

In [22]:
tfidf_vector = TfidfVectorizer(tokenizer = spacy_tokenizer)

In [23]:
X_train_vectors= tfidf_vector.fit_transform(X_train)
X_test_vectors= tfidf_vector.transform(X_test)



In [24]:
classifier = LogisticRegression()

classifier.fit(X_train_vectors, y_train)
predicted = classifier.predict(X_test_vectors)

print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test, predicted))
print("Logistic Regression Precision:",metrics.precision_score(y_test, predicted))
print("Logistic Regression Recall:",metrics.recall_score(y_test, predicted))

Logistic Regression Accuracy: 0.9
Logistic Regression Precision: 0.0
Logistic Regression Recall: 0.0


  _warn_prf(average, modifier, msg_start, len(result))
