In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import pickle 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from joblib import dump
from joblib import load

In [4]:
data=pd.read_csv('jigsaw-toxic-comment-train.csv')

In [5]:
data.shape

(223549, 8)

In [6]:
data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [7]:
data.isnull().any()

id               False
comment_text     False
toxic            False
severe_toxic     False
obscene          False
threat           False
insult           False
identity_hate    False
dtype: bool

In [8]:
def clean_text(text):
    text=text.lower()
    text=re.sub(r"what's","what is", text)
    text=re.sub(r"\ 's", " ", text)
    text=re.sub(r"\ 've", "have ", text)
    text=re.sub(r"can't", "cannot ", text)
    text=re.sub(r"n't", "not ", text)
    text=re.sub(r"i'm", "i am ", text)
    text=re.sub(r"\ 're", "are ", text)
    text=re.sub(r"\ 'd", "would ", text)
    text=re.sub(r"\ 'll", "will ", text)
    text=re.sub(r"\ 'scuse", "excuse ", text)
    text=re.sub('\W', ' ', text)
    text=re.sub('\s+', ' ', text)
    text=text.strip(' ')
    return text

data['comment_text']=data['comment_text'].map(lambda com : clean_text(com))

In [9]:
labels = data[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]
text=data['comment_text']

In [10]:
tfidf = TfidfVectorizer(max_features=5000) 
features = tfidf.fit_transform(text)

In [20]:
x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

In [15]:
x_train_df=pd.DataFrame(x_train)
x_test_df=pd.DataFrame(x_test)
y_train_df=pd.DataFrame(y_train)
y_test_df=pd.DataFrame(y_test)

In [16]:
x_train_df.to_csv('x_train.csv')
x_test_df.to_csv('x_test.csv')
y_train_df.to_csv('y_train.csv')
y_test_df.to_csv('y_test.csv')

In [17]:
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)  # You can adjust the number of estimators
rf_classifier.fit(x_train, y_train)

RandomForestClassifier(random_state=42)

In [16]:
dump(rf_classifier, 'random_forest_model.joblib')
dump(tfidf, 'rf_model_tfidf.joblib')
#rf_classifier = load('random_forest_model.joblib')

['rf_model_tfidf.joblib']

In [17]:
rf_classifier = load('random_forest_model.joblib')

In [21]:
y_pred = rf_classifier.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
classification_report = classification_report(y_test, y_pred)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [22]:
accuracy

0.9145605010064862

In [23]:
print(classification_report)

              precision    recall  f1-score   support

           0       0.86      0.56      0.68      4266
           1       0.52      0.04      0.08       384
           2       0.86      0.64      0.74      2486
           3       0.57      0.09      0.16       133
           4       0.80      0.46      0.58      2294
           5       0.64      0.09      0.15       408

   micro avg       0.85      0.51      0.64      9971
   macro avg       0.71      0.31      0.40      9971
weighted avg       0.82      0.51      0.62      9971
 samples avg       0.05      0.04      0.04      9971

