<a href="https://colab.research.google.com/github/vrjayaprakash/Model/blob/main/wv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import metrics

In [3]:
import spacy
import string
np.random.seed(42)

In [4]:
nlp = spacy.load("en_core_web_sm")
stop_words = nlp.Defaults.stop_words
print(stop_words)

{'whence', 'using', 'moreover', 'almost', 'any', 'them', 'we', 'alone', 'out', 'less', 'twelve', 'get', 'fifteen', 'also', 'each', 'therefore', 'they', 'or', 'too', 'seeming', 'should', 'since', 'thereby', '’s', 'no', "'m", 'because', 'became', 'our', 'once', 'take', 'myself', 'being', 'several', 'whatever', 'anyway', 'give', 'whereby', 'around', 'down', 'until', 'have', 'between', 'you', 'becomes', 'was', 'hundred', 'cannot', "'ll", 'even', 'further', 'latterly', 'i', 'via', 'towards', 'call', 'those', 'every', "'d", 'former', 'elsewhere', 'whole', '’ve', 'more', 'its', 'my', 'when', 'either', 'meanwhile', 'in', 'without', 'hence', 'herself', 'everywhere', 'sometimes', 'are', 'ever', '’re', 'forty', 'here', 'herein', 'mine', 'often', 'become', 'thru', 'above', 'while', 'least', 'a', 'only', 'whereas', 'afterwards', 'back', 'do', 'now', 'for', 'may', 'top', 'everything', 'fifty', 'please', 'always', 'much', 'beyond', 'over', 'somewhere', 'does', 'he', 'somehow', 'move', 'other', 'his',

In [5]:
punctuation = string.punctuation
print(punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [6]:
def spacy_tokenizer(sentence):
  doc = nlp(sentence)

  mytoken = [word.lemma_.lower().strip() for word in doc]
  #print(mytoken)

  mytoken = [word for word in mytoken if word not in stop_words and word not in punctuation]
  return(mytoken)


In [7]:
sentence = "I love Nlp and Machine learning"
spacy_tokenizer(sentence)

['love', 'nlp', 'machine', 'learning']

In [8]:
count_vector = CountVectorizer(tokenizer = spacy_tokenizer)


In [9]:
count_vector.fit_transform(["I love Nlp and I love Machine Learning"]).toarray()



array([[1, 2, 1, 1]])

In [10]:
count_vector.vocabulary_

{'love': 1, 'nlp': 3, 'machine': 2, 'learning': 0}

In [11]:
data = pd.read_csv("/content/nlp/toxicity_en.csv")

In [12]:
from sklearn.preprocessing import LabelEncoder

In [13]:
encoder = LabelEncoder()
data['is_toxic'] = encoder.fit_transform(data['is_toxic'])

In [14]:
X = data['text']
y = data['is_toxic']

In [15]:
X

Unnamed: 0,text
0,"Elon Musk is a piece of shit, greedy capitalis..."
1,The senile credit card shrill from Delaware ne...
2,He does that a lot -- makes everyone look good...
3,F*ck Lizzo
4,Epstein and trump were best buds!!! Pedophiles...
...,...
995,My maternal abuelita taught me how to make pla...
996,Funnily enough I was looking online last week ...
997,I can't bear how nice this is.\n \n I guess it...
998,Going to buy a share of Tesla just to ensure i...


In [16]:
y

Unnamed: 0,is_toxic
0,1
1,1
2,1
3,1
4,1
...,...
995,0
996,0
997,0
998,0


In [17]:
from sklearn.model_selection import train_test_split

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y)
X_train_vec = count_vector.fit_transform(X_train)
X_test_vec = count_vector.transform(X_test)



In [19]:
X_train_vec.shape

(800, 3687)

In [20]:
X_test_vec.shape

(200, 3687)

In [21]:
from sklearn.linear_model import LogisticRegression

In [22]:
X_train_vec.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [23]:
classifier = LogisticRegression()

In [24]:
classifier.fit(X_train_vec, y_train)

In [25]:
y_pred = classifier.predict(X_test_vec)

In [26]:
print("LogisticRegression:", metrics.accuracy_score(y_test, y_pred))
print("LogisticRegression:", metrics.confusion_matrix(y_test, y_pred))

LogisticRegression: 0.82
LogisticRegression: [[88 12]
 [24 76]]


In [27]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.79      0.88      0.83       100
           1       0.86      0.76      0.81       100

    accuracy                           0.82       200
   macro avg       0.82      0.82      0.82       200
weighted avg       0.82      0.82      0.82       200



In [29]:
tfidf_vector  = TfidfVectorizer(tokenizer = spacy_tokenizer)

In [36]:
tfidf_vector.fit_transform(["Ilove nlp" and "I love Machine nlp"]).toarray()



array([[0.57735027, 0.57735027, 0.57735027]])

In [37]:
tfidf_vector.vocabulary_

{'love': 0, 'machine': 1, 'nlp': 2}

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y)
X_train_vec = tfidf_vector.fit_transform(X_train)
X_test_vec = tfidf_vector.transform(X_test)



In [41]:
classifier = LogisticRegression()
classifier.fit(X_train_vec, y_train)
y_pred = classifier.predict(X_test_vec)

In [43]:
print("LogisticRegression:",metrics.accuracy_score(y_test, y_pred))
print("LogisticRegression:",metrics.precision_score(y_test, y_pred))
print("LogisticRegression:",metrics.recall_score(y_test, y_pred))


LogisticRegression: 0.875
LogisticRegression: 0.8865979381443299
LogisticRegression: 0.86


In [46]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.86      0.89      0.88       100
           1       0.89      0.86      0.87       100

    accuracy                           0.88       200
   macro avg       0.88      0.88      0.87       200
weighted avg       0.88      0.88      0.87       200

