<a href="https://colab.research.google.com/github/vrjayaprakash/Model/blob/main/nlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import metrics

In [3]:
import string
import spacy
np.random.seed(42)

In [4]:
data = pd.read_csv('/content/nlp/toxic_comments_500.csv')

In [5]:
data.head()

Unnamed: 0,comment_text,toxic
0,Everything you say is nonsense.,1
1,You're the worst person ever.,1
2,Nobody cares what you think.,1
3,Just shut up already.,1
4,"Great job, keep going!",0


In [6]:
nlp = spacy.load("en_core_web_sm")
stop_words = nlp.Defaults.stop_words
print(stop_words)

{'might', 'moreover', 'her', 'becomes', 'many', 'anything', 'due', 'next', 'be', 'off', 'eight', 'whereby', 'will', 'show', 'same', 'below', 're', 'than', 'on', 'those', 'through', 'everyone', 'formerly', 'should', 'would', 'elsewhere', 'always', 'all', 'latter', 'less', 'someone', 'latterly', 'anyhow', 'which', '’re', 'noone', 'call', 'cannot', 'been', 'seemed', 'whose', 'take', 'never', 'used', 'sometime', 'name', 'thence', 'move', 'may', 'do', 'front', 'one', 'at', 'top', 'although', 'n‘t', 'to', 'his', '’ve', 'two', 'once', 'alone', 'neither', 'within', '‘ll', 'whom', 'anyway', 'toward', 'who', 'twelve', 'however', 'our', 'make', 'so', 'why', 'whither', 'sixty', 'even', 'too', 'somehow', 'into', 'unless', 'this', 'what', '’d', "'d", 'former', 'by', 'first', 'more', 'since', 'everything', 'an', '’s', 'whatever', 'or', 'whoever', 'seeming', 'whence', 'bottom', 'whenever', 'they', 'wherein', '’ll', 'doing', 'five', 'also', 'again', '‘s', 'using', 'really', 'already', 'out', 'empty', '

In [7]:
punctuations = string.punctuation
print(punctuations)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [8]:
def spacy_tokenizer(sentence):
  doc = nlp(sentence)
  #print(doc)

  mytokens = [word.lemma_.lower().strip() for word in doc ]
  #print(mytokens)

  mytokens = [word for word  in mytokens if word not in stop_words and word not in punctuations]
  return mytokens


In [9]:
sentence = "I am eating apple"
spacy_tokenizer(sentence)

['eat', 'apple']

In [10]:
count_vector = CountVectorizer(tokenizer = spacy_tokenizer)
tfidf_vector = TfidfVectorizer(tokenizer = spacy_tokenizer)

In [11]:
count_vector.fit_transform(["I am eating apple", "I am playing cricket"]).toarray()



array([[1, 0, 1, 0],
       [0, 1, 0, 1]])

In [12]:
count_vector.get_feature_names_out()

array(['apple', 'cricket', 'eat', 'play'], dtype=object)

In [13]:
count_vector.vocabulary_

{'eat': 2, 'apple': 0, 'play': 3, 'cricket': 1}

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
X = data['comment_text']
y = data['toxic']

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y)

In [17]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()

In [18]:
X_train_vectors = count_vector.fit_transform(X_train)
X_test_vectors = count_vector.transform(X_test)



In [19]:
X_train_vectors.shape

(400, 70)

In [20]:
X_test_vectors.shape

(100, 70)

In [21]:
X_train_vectors.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [22]:
classifier.fit(X_train_vectors, y_train)

In [23]:
y_pred = classifier.predict(X_test_vectors)


In [24]:
print("Logistic Regression accuracy:",metrics.accuracy_score(y_test, y_pred))
print("Logistic Regression precision:",metrics.precision_score(y_test, y_pred))
print("Logistic Regression recall:",metrics.recall_score(y_test, y_pred))

Logistic Regression accuracy: 1.0
Logistic Regression precision: 1.0
Logistic Regression recall: 1.0


In [25]:
tfidf = TfidfVectorizer(tokenizer = spacy_tokenizer)
X_train_vectors = tfidf.fit_transform(X_train)
X_test_vectors = tfidf.transform(X_test)



In [26]:
classifier = LogisticRegression()
classifier.fit(X_train_vectors, y_train)
y_pred = classifier.predict(X_test_vectors)
print("Logistic Regression:", metrics.accuracy_score(y_test, y_pred))
print("Logistic Regression:", metrics.precision_score(y_test, y_pred))
print("Logistic Regression:", metrics.recall_score(y_test, y_pred))

Logistic Regression: 1.0
Logistic Regression: 1.0
Logistic Regression: 1.0
