# Toxicity Analysis using SVM

### Importing necessary Libraries

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import time
from sklearn import svm
from sklearn.metrics import classification_report
import pandas as pd

### Getting Train and Test Datasets

In [2]:
# Read the three CSV files in one pass: training comments with labels,
# test comments, and the separately shipped test labels.
trainData, testData, testLabels = map(
    pd.read_csv,
    ("./data/train.csv", "./data/test.csv", "./data/test_labels.csv"),
)

### Pre-Processing Phase

In [3]:
# Peek at the first five raw training rows to see the column layout.
trainData.head(n=5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


###### The labels are six binary indicator columns (multi-label, not one-hot). They are collapsed into a single tag: Toxic if any indicator is 1, otherwise Non-Toxic. Test rows whose six labels are all -1 are dropped.

In [4]:
# The six binary indicator columns present in both trainData and testLabels.
LABEL_COLUMNS = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]


def collapse_labels(indicators):
    """Collapse per-category indicator columns into one binary tag per row.

    Parameters
    ----------
    indicators : pandas.DataFrame
        One column per toxicity category; a value of 1 marks the category as present.

    Returns
    -------
    pandas.Series
        "Toxic" where at least one column equals 1, otherwise "Non-Toxic",
        indexed like ``indicators``.
    """
    return indicators.eq(1).any(axis=1).map({True: "Toxic", False: "Non-Toxic"})


# Build each frame in one vectorized step. The previous row-by-row version wrote
# through chained indexing (`frame.loc[i]['tag'] = ...`), which assigns to a
# temporary row object and is silently discarded when pandas returns a copy.
train_data = pd.DataFrame({
    "message": trainData["comment_text"],
    "tag": collapse_labels(trainData[LABEL_COLUMNS]),
})

# testData and testLabels are paired row by row through their shared index.
test_indicators = testLabels[LABEL_COLUMNS]

# Rows whose six labels are all -1 carry no usable label and are excluded.
has_labels = ~test_indicators.eq(-1).all(axis=1)

test_data = pd.DataFrame({
    "message": testData["comment_text"],
    "tag": collapse_labels(test_indicators),
})[has_labels]

# Drop rows with an empty or missing message; the original index labels are kept.
test_data = test_data.replace("", float("NaN")).dropna()

###### After conversion, the datasets look like this:

In [5]:
# Show the first 15 rows of the relabelled train and test frames as a pair.
(train_data.head(n=15), test_data.head(n=15))

(                                              message        tag
 0   Explanation\nWhy the edits made under my usern...  Non-Toxic
 1   D'aww! He matches this background colour I'm s...  Non-Toxic
 2   Hey man, I'm really not trying to edit war. It...  Non-Toxic
 3   "\nMore\nI can't make any real suggestions on ...  Non-Toxic
 4   You, sir, are my hero. Any chance you remember...  Non-Toxic
 5   "\n\nCongratulations from me as well, use the ...  Non-Toxic
 6        COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK      Toxic
 7   Your vandalism to the Matt Shirvington article...  Non-Toxic
 8   Sorry if the word 'nonsense' was offensive to ...  Non-Toxic
 9   alignment on this subject and which are contra...  Non-Toxic
 10  "\nFair use rationale for Image:Wonju.jpg\n\nT...  Non-Toxic
 11  bbq \n\nbe a man and lets discuss it-maybe ove...  Non-Toxic
 12  Hey... what is it..\n@ | talk .\nWhat is it......      Toxic
 13  Before you start throwing accusations and warn...  Non-Toxic
 14  Oh, a

### Training Phase

###### Creating vectors for Train and Test Messages

In [6]:
# TF-IDF features: terms must appear in at least 5 documents (min_df) and in at
# most 80% of documents (max_df); sublinear_tf applies 1 + log(tf) scaling.
vectorizer = TfidfVectorizer(min_df = 5,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)

# Fit the vocabulary and IDF weights on the training messages only, then reuse
# the fitted vectorizer for the test messages so both share one feature space.
train_vectors = vectorizer.fit_transform(train_data['message'])
test_vectors = vectorizer.transform(test_data['message'])

# Perform classification with SVM, kernel=linear
classifier_linear = svm.SVC(kernel='linear')

# Time with perf_counter(): it is monotonic, so the measured intervals cannot be
# distorted by system clock adjustments the way time.time() differences can.
t0 = time.perf_counter()
classifier_linear.fit(train_vectors, train_data['tag'])
t1 = time.perf_counter()
prediction_linear = classifier_linear.predict(test_vectors)
t2 = time.perf_counter()
time_linear_train = t1-t0
time_linear_predict = t2-t1


print("Results for SVC(kernel=linear)")
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))

# output_dict=True returns per-class metrics keyed by the tag strings.
report = classification_report(test_data['tag'], prediction_linear, output_dict=True)
print('positive: ', report['Toxic'])
print('negative: ', report['Non-Toxic'])

Results for SVC(kernel=linear)
Training time: 1955.871615s; Prediction time: 381.704015s
positive:  {'precision': 0.5997149305143129, 'recall': 0.8087457952907257, 'f1-score': 0.688719137907516, 'support': 6243}
negative:  {'precision': 0.9785093324213898, 'recall': 0.941629860569845, 'f1-score': 0.9597154306494606, 'support': 57735}


###### Running the trained model on a new message

In [10]:
# Read one message from the user, encode it with the already-fitted TF-IDF
# vectorizer, and print the tag predicted by the linear SVC.
new_message = input()
predicted_tags = classifier_linear.predict(vectorizer.transform([new_message]))
print("Result", predicted_tags[0])

pedophile
Result Toxic
