In [1]:
import numpy as np
import pandas as pd

# Importing Data
-------

In [2]:
df_train = pd.read_csv('train.csv')
del(df_train['id'])
df_train.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [3]:
df_valid = pd.read_csv('validation.csv')
del(df_valid['id'])
df_valid.head()

Unnamed: 0,comment_text,translated,lang,toxic
0,Este usuario ni siquiera llega al rango de ...,This user does not even make it to the rank of...,es,0
1,Il testo di questa voce pare esser scopiazzato...,The text of this entry appears to be like I di...,it,0
2,Vale. Sólo expongo mi pasado. Todo tiempo pasa...,It is worth it. Only expose my past. All time ...,es,1
3,Bu maddenin alt başlığı olarak uluslararası i...,Of this article as a sub-heading with maintain...,tr,0
4,Belçika nın şehirlerinin yanında ilçe ve belde...,"I guess while they're At of the city, district...",tr,0


In [4]:
df_test = pd.read_csv('test.csv')
del(df_test['id'])
df_test.head()

Unnamed: 0,content,lang,translated
0,Doctor Who adlı viki başlığına 12. doctor olar...,tr,Title named Doctor Who wiki 12. doctor has add...
1,"Вполне возможно, но я пока не вижу необходимо...",ru,"It is possible, but I don't see the need to a..."
2,"Quindi tu sei uno di quelli conservativi , ...",it,"Then you're one of those conservative , who wo..."
3,Malesef gerçekleştirilmedi ancak şöyle bir şey...,tr,"Unfortunately, it was not performed, but had s..."
4,:Resim:Seldabagcan.jpg resminde kaynak sorunu ...,tr,:Resim:Seldabagcan.jpg the image of the source...


# Data Analysis
------------------

## Train

In [5]:
import plotly.express as px
import plotly.graph_objects as go

cols = [col for col in df_train.columns]
cols.remove('comment_text')
toxic_cats = {}

for i in cols:
    i1 = i.capitalize()
    i1 = i1.replace("_", " ")
    toxic_cats[i1] = df_train[i].value_counts()[1]





fig = px.bar(x=toxic_cats.values(), y=toxic_cats.keys(), text=toxic_cats.values(),
             width=700, height=400, title='Nº of comments per toxicity level',
             color=toxic_cats.values(),
             labels={'x': 'Nº of comments', 'y': 'Level'})
fig.update_layout(barmode='stack', yaxis={'categoryorder':'total ascending'})

with_toxic = {}

for i in cols:
    i1 = i.capitalize()
    i1 = i1.replace("_", " ")
    with_toxic[i1] = sum(np.where((df_train['toxic'] == df_train[i]) & (df_train['toxic'] == 1),
                                   True, False))

fig = px.bar(x=with_toxic.values(), y=with_toxic.keys(), text=with_toxic.values(),
             width=700, height=400, title='Nº of comments per toxicity level',
             color=with_toxic.values(),
             labels={'x': 'Nº of comments', 'y': 'Level'})
fig.update_layout(barmode='stack', yaxis={'categoryorder':'total ascending'})
fig.show()

In [6]:
fig = px.pie(values=toxic_cats.values(), names=toxic_cats.keys(), width=700, height=400,
            title="Distribution of comments' toxicity categories")
fig.show()

In [7]:
fig = go.Figure(data=[
    go.Bar(y=[a for a in toxic_cats.values()], x=[a for a in toxic_cats.keys()],
           name='Total', marker_color='purple'),
    go.Bar(y=[a for a in with_toxic.values()], x=[a for a in with_toxic.keys()],
          name='Toxic as well', marker_color='yellow')
])

fig.update_layout(title='Are comments in other categories in toxic as well?', barmode='group', xaxis={'categoryorder':'total descending'})


fig.show()

We can clearly see the relation between toxic and other categories, so we will replace the comments that are classified as non-toxic to toxic if they are included in other level of toxicity.

In [8]:
toxic_bfr = df_train.toxic.value_counts()[1]

for i in range(len(df_train)):
    if df_train.loc[i,'toxic'] == 0 and (df_train.loc[i, 'obscene'] == 1 or
                                         df_train.loc[i, 'severe_toxic'] == 1 or
                                         df_train.loc[i, 'threat'] == 1 or
                                         df_train.loc[i, 'insult'] == 1 or
                                         df_train.loc[i, 'identity_hate'] == 1):
        df_train.loc[i,'toxic'] = 1
        
toxic_after = df_train.toxic.value_counts()[1]
toxic_comments = toxic_after - toxic_bfr
print('There are %i new toxic comments.' %toxic_comments)

There are 1084 new toxic comments.


It's a huge dataset, so we will delete some columns and dataframes to save RAM Memory.

In [9]:
import gc 

del(df_train['obscene'])
del(df_train['identity_hate'])
del(df_train['insult'])
del(df_train['threat'])
del(df_train['severe_toxic'])

gc.collect()

1466

## Validation

In [10]:
languages_val = {a:b for a,b in zip(df_valid['lang'].unique(), df_valid['lang'].value_counts())}
languages_val['Spanish'] = languages_val.pop('es')
languages_val['Italian'] = languages_val.pop('it')
languages_val['Turkish'] = languages_val.pop('tr')


fig = px.pie(values=languages_val.values(), names=languages_val.keys(), width=700, height=400,
            title="Distribution of comments' languages in validation data")
fig.show()

## Test

In [11]:
languages_test = {a:b for a,b in zip(df_test['lang'].unique(), df_test['lang'].value_counts())}
languages_test['Spanish'] = languages_test.pop('es')
languages_test['Italian'] = languages_test.pop('it')
languages_test['Turkish'] = languages_test.pop('tr')
languages_test['Russian'] = languages_test.pop('ru')
languages_test['French'] = languages_test.pop('fr')
languages_test['Portuguese'] = languages_test.pop('pt')

fig = px.pie(values=languages_test.values(), names=languages_test.keys(), width=700, height=400,
            title="Distribution of comments' languages in testing data")
fig.show()

# Preprocessing
-------

We already imported a dataset translated to english using Yandex.Translate, so we will use only the translated comments.

In [12]:
print("There are %.2f%% toxic comments in the training data."%(df_train['toxic'].value_counts()[1]/df_train['toxic'].value_counts()[0]*100))

There are 11.17% toxic comments in the training data.


In [13]:
print("There are %.2f%% toxic comments in the validation data."%(df_valid['toxic'].value_counts()[1]/df_valid['toxic'].value_counts()[0]*100))

There are 18.17% toxic comments in the validation data.


In [14]:
print("The validation dataframe represents a %.2f%% of the training data." %(df_valid.shape[0]/(df_train.shape[0]+df_valid.shape[0])))

The validation dataframe represents a 0.03% of the training data.


Our validation dataframe represents only 0.03% of training data, and the toxic comments are disproportionate distributed between both dataframes. So we will need to join them and split them randomly to have a more accurate result.

In [15]:
del(df_valid['lang'])
del(df_valid['comment_text'])
df_valid = df_valid.rename(columns={'translated':'comment_text'})

gc.collect()

1061

In [None]:
df = pd.concat([df_train, df_valid], ignore_index=True, axis=0)

df

In [None]:
from sklearn.model_selection import train_test_split

X = df['comment_text']
y = df['toxic']

x_train, x_valid, y_train, y_valid = train_test_split(X, y,
                                                       random_state=1,
                                                       train_size=0.8
                                                      )

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

vec = TfidfVectorizer(decode_error='ignore',stop_words='english', max_df=0.8, max_features=1600)
x_train = vec.fit_transform(x_train).todense()
x_train = pd.DataFrame(x_train, columns=vec.get_feature_names())

In [None]:
x_valid = vec.transform(x_valid).todense()
x_valid = pd.DataFrame(x_valid, columns=vec.get_feature_names())

In [None]:
del(df)

In [None]:
print("There are %.2f%% toxic comments in train data."%(y_train.sum()/len(y_train)*100))

The training dataset is not well balanced (there are way more non-toxic comments than toxic ones). We will use SMOTE to add new toxic comments. 

In [None]:
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=1)

x_train, y_train = sm.fit_resample(x_train, y_train)

In [None]:
x_train.shape

In [None]:
x_train.tail()

# Training our Model
-----------

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import f1_score

model = XGBClassifier(n_estimators=800,
                      use_label_encoder=False,
                      learning_rate=0.1,
                      max_depth=6,
                      colsample_bytree=1,
                      gamma=1,
                      n_jobs=4,
                      scale_pos_weight=5,
                      random_state=1)

model.fit(x_train, y_train)

preds = model.predict(x_valid)

f1 = f1_score(preds, y_valid)

print("F1 Score: %.4f" %f1)


In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

cm = confusion_matrix(y_valid, preds, labels=model.classes_, normalize='true')
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                             display_labels=model.classes_)
disp.plot() 

# Predictions
------

In [None]:
x_test = vec.transform(df_test['translated']).todense()
x_test = pd.DataFrame(x_test, columns=vec.get_feature_names())

In [None]:
preds_test = model.predict(x_test)

# Save test predictions to file
output = pd.DataFrame({'id': df_test.index,
                       'toxic': preds_test})
output.to_csv('submission.csv', index=False)

output.head()

In [None]:
output['toxic'].value_counts()