In [1]:
import warnings

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score
from xgboost import XGBClassifier
# Silence every warning category for the rest of the kernel session.
warnings.filterwarnings('ignore')

# Names of the label columns that are modelled one at a time.
cls = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Load both splits; a missing value in any column is replaced by the string "NA".
train, test = (
    pd.read_csv(csv_path).fillna("NA")
    for csv_path in (r"train.csv", r"test.csv")
)
train_text, test_text = train["comment_text"], test["comment_text"]
all_text = pd.concat([train_text, test_text])

# Word-level TF-IDF restricted to unigrams and capped at 10000 features.
# NOTE(review): the vectorizer is fitted on the train and test comments
# together, so the test text influences the features that are later
# cross-validated on the train split -- confirm this is intended.
word_vectorizer = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 1),
    stop_words='english',
    strip_accents='unicode',
    sublinear_tf=True,
    max_features=10000,
).fit(all_text)

# Project each split onto the shared vocabulary.
train_word_features, test_word_features = (
    word_vectorizer.transform(corpus) for corpus in (train_text, test_text)
)
print(train_word_features.shape)

(159571, 10000)


In [8]:
# Estimate the ROC AUC of a default XGBClassifier for every label column using
# stratified 5-fold cross-validation on the TF-IDF features of the train split.
#
# cross_val_score is used instead of a GridSearchCV with an empty parameter
# grid: there is nothing to search over, and GridSearchCV's default
# refit=True would train one extra full-data model per label that was
# never used. The reported number is still the mean of the per-fold AUCs.
scores = []
for cl in cls:
    target = train[cl]
    fold_scores = cross_val_score(
        XGBClassifier(),
        train_word_features,
        target,
        cv=StratifiedKFold(5),
        scoring='roc_auc',
    )
    class_score = fold_scores.mean()
    # Output format is kept identical to the previously recorded cell output.
    print(r"Best Score", class_score, f"for class {cl}")
    scores.append(class_score)
print(f"Mean ROC AUC Score: {np.mean(scores)}")

Best Score 0.9490041326648591 for class toxic
Best Score 0.9794065285799093 for class severe_toxic
Best Score 0.9805609330813553 for class obscene
Best Score 0.9649650344498788 for class threat
Best Score 0.9667239147287798 for class insult
Best Score 0.9622063732136337 for class identity_hate
Mean ROC AUC Score: 0.9671444861197359
