In [1]:
import logging
import pandas as pd
import numpy as np
from numpy import random
import gensim
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import re
from bs4 import BeautifulSoup
%matplotlib inline

df_train = pd.read_csv('train_toxic.csv')
print(df_train.head(10))

print("\n\n\n")

df_test = pd.read_csv("test_toxic.csv")
print(df_test.head(10))


df_mixed_train = pd.read_csv("train_toxic_mixed.csv")
print(df_test.head(10))

                                                text is_toxic
0  Yine de Pestagyang kadar adım atarak Gestapo t...      yes
1  Kendimi aptal hissettirdi ya günlerce delirdim...      yes
2  Hakkında hayırlısı olsun Bul birini kuttulak s...      yes
3  Makaleyi yeniden adlandır Bu makalenin adı Alb...       no
4  Atların dişleri yaşlandıkça uzar ve atın gerçe...       no
5      delii ayyaş işte ayık olsa normal dusunecekte      yes
6                     Çilesiz bir günüm olmadı gitti       no
7      ne kıvırcık ne kısa ezelin kör anası mısın aq      yes
8             Bunların topu yakacan bunlar tam kahpe      yes
9  lan dur amk rumu zaten ortalık karışık sen zat...      yes




                                                text is_toxic
0  Lan siktirin gidin AMK pozitif sik kafaları Ül...      yes
1  Kabul Dream Chaser programı ile ilgili olmayan...       no
2  Thanx Teşekkürler Bay horoz Bana son bir şans ...       no
3                Zeka yaşı falan da orası ayrı sjsjd      yes
4  L

In [2]:
x_train = df_train.text
y_train = df_train.is_toxic

x_test= df_test.text
y_test = df_test.is_toxic

x_mixed_train = df_mixed_train.text
y_mixed_train = df_mixed_train.is_toxic

x_mixed_test= df_test.text
y_test = df_test.is_toxic


In [3]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

nb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
              ])
nb.fit(x_train, y_train)

from sklearn.metrics import classification_report
y_pred = nb.predict(x_test)

print("------Naive Bayes, base training------")
my_labels = ["yes", "no"]
print('accuracy %s' % accuracy_score(y_pred, y_test))
NB_base_report = classification_report(y_test, y_pred,target_names=my_labels)
print(NB_base_report)

------Naive Bayes, base training------
accuracy 0.875
              precision    recall  f1-score   support

         yes       0.86      0.89      0.88       995
          no       0.89      0.86      0.87      1005

    accuracy                           0.88      2000
   macro avg       0.88      0.88      0.87      2000
weighted avg       0.88      0.88      0.87      2000



In [4]:
nb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
              ])
nb.fit(x_mixed_train, y_mixed_train)

from sklearn.metrics import classification_report
y_pred = nb.predict(x_mixed_test)

print("------Naive Bayes, data augmented training------")

print('accuracy %s' % accuracy_score(y_pred, y_test))
NB_augmented_report = classification_report(y_test, y_pred,target_names=my_labels)
print(NB_augmented_report)


------Naive Bayes, data augmented training------
accuracy 0.825
              precision    recall  f1-score   support

         yes       0.74      0.99      0.85       995
          no       0.99      0.66      0.79      1005

    accuracy                           0.82      2000
   macro avg       0.87      0.83      0.82      2000
weighted avg       0.87      0.82      0.82      2000



In [5]:
from sklearn.linear_model import SGDClassifier

sgd = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),
               ])
sgd.fit(x_train, y_train)


y_pred = sgd.predict(x_test)
print("------Linear Support Vector Machine, base training------")

print('accuracy %s' % accuracy_score(y_pred, y_test))
LSVM_base_report = classification_report(y_test, y_pred,target_names=my_labels)
print(LSVM_base_report)

------Linear Support Vector Machine, base training------
accuracy 0.8545
              precision    recall  f1-score   support

         yes       0.81      0.93      0.86       995
          no       0.92      0.78      0.84      1005

    accuracy                           0.85      2000
   macro avg       0.86      0.85      0.85      2000
weighted avg       0.86      0.85      0.85      2000



In [6]:
from sklearn.linear_model import SGDClassifier

sgd = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),
               ])
sgd.fit(x_mixed_train, y_mixed_train)


print("------Linear Support Vector Machine, data augmented training------")

y_pred = sgd.predict(x_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
LSVM_augmented_report = classification_report(y_test, y_pred,target_names=my_labels)
print(LSVM_augmented_report)

------Linear Support Vector Machine, data augmented training------
accuracy 0.7495
              precision    recall  f1-score   support

         yes       0.67      0.99      0.80       995
          no       0.99      0.51      0.67      1005

    accuracy                           0.75      2000
   macro avg       0.83      0.75      0.73      2000
weighted avg       0.83      0.75      0.73      2000



In [7]:
reports =  [NB_base_report,NB_augmented_report,LSVM_base_report,LSVM_augmented_report]


for report in reports:
  print(report)

              precision    recall  f1-score   support

         yes       0.86      0.89      0.88       995
          no       0.89      0.86      0.87      1005

    accuracy                           0.88      2000
   macro avg       0.88      0.88      0.87      2000
weighted avg       0.88      0.88      0.87      2000

              precision    recall  f1-score   support

         yes       0.74      0.99      0.85       995
          no       0.99      0.66      0.79      1005

    accuracy                           0.82      2000
   macro avg       0.87      0.83      0.82      2000
weighted avg       0.87      0.82      0.82      2000

              precision    recall  f1-score   support

         yes       0.81      0.93      0.86       995
          no       0.92      0.78      0.84      1005

    accuracy                           0.85      2000
   macro avg       0.86      0.85      0.85      2000
weighted avg       0.86      0.85      0.85      2000

              preci