# Assignment 3: Toxicity Classification in Online Comments 

### Process Data

In [21]:
import pandas as pd
import numpy as np

In [22]:
dev_dataset = pd.read_csv("dataset/dev_tfidf.csv")
train_dataset = pd.read_csv("dataset/train_tfidf.csv")
test_dataset = pd.read_csv("dataset/test_tfidf.csv")

In [23]:
#split the feature and label set
def split_f_l(df):
    features = df.iloc[:, 26:].to_numpy()
    label = df.iloc[:,1].to_numpy()
    return features, label

dev_features, dev_label = split_f_l(dev_dataset)
train_features, train_label = split_f_l(train_dataset)
test_features, _ = split_f_l(test_dataset)
test_ids = test_dataset["ID"].tolist()

### Supervised ML algorithm and Evaluation

In [24]:
from sklearn.metrics import precision_recall_fscore_support

def print_score(clf_name,classifier):
    dev_preds = classifier.predict(dev_features)
    precision_score, recall_score, fscore_score, _ = precision_recall_fscore_support(dev_label, dev_preds,average="weighted")
    
    print(clf_name," Accuracy", classifier.score(dev_features, dev_label))
    print(clf_name," Precision score is ", precision_score)
    print(clf_name," Recall score is ", recall_score)
    print(clf_name," F1_score is ", fscore_score)

In [25]:
#naive bayes
from sklearn.naive_bayes import GaussianNB

NBclassifier = GaussianNB()
NBclassifier.fit(train_features, train_label)

print_score("Naive Bayes",NBclassifier)

Naive Bayes  Accuracy 0.6368
Naive Bayes  Precision score is  0.7917125322109563
Naive Bayes  Recall score is  0.6368
Naive Bayes  F1_score is  0.6761213728499024


In [26]:
#logistic regression
from sklearn.linear_model import LogisticRegression

LRclassifier = LogisticRegression(random_state=66, max_iter=300)
LRclassifier.fit(train_features, train_label)

print_score("Logistic Regressions",LRclassifier)

Logistic Regressions  Accuracy 0.8276
Logistic Regressions  Precision score is  0.8015794457107356
Logistic Regressions  Recall score is  0.8276
Logistic Regressions  F1_score is  0.7894524512179725


In [27]:
#decision tree
from sklearn.tree import DecisionTreeClassifier
DTclassifier = DecisionTreeClassifier(random_state=0)
DTclassifier.fit(train_features, train_label)

print_score("Decision Tree",DTclassifier)

Decision Tree  Accuracy 0.7659333333333334
Decision Tree  Precision score is  0.7494172521425234
Decision Tree  Recall score is  0.7659333333333334
Decision Tree  F1_score is  0.7567892463894531


### Baseline Model

In [28]:
from sklearn.dummy import DummyClassifier
Frequentc = DummyClassifier(strategy="most_frequent")
Frequentc.fit(train_features, train_label)

print_score("Zero Rule baseline accuracy",Frequentc)

Zero Rule baseline accuracy  Accuracy 0.811
Zero Rule baseline accuracy  Precision score is  0.657721
Zero Rule baseline accuracy  Recall score is  0.811
Zero Rule baseline accuracy  F1_score is  0.7263622308117064


  _warn_prf(average, modifier, msg_start, len(result))


### Research Question 2: Bias in different sub-group

In [43]:
def select_sub_group(df):
    christian_df = df[(df['Christian']==1)&(df['Muslim']==0)&(df['Female']==0)&(df['Homosexual gay or lesbian']==0)&(df['Male']==0)]
    muslim_df = df[(df['Christian']==0)&(df['Muslim']==1)&(df['Female']==0)&(df['Homosexual gay or lesbian']==0)&(df['Male']==0)]
    female_df = df[(df['Christian']==0)&(df['Muslim']==0)&(df['Female']==1)&(df['Homosexual gay or lesbian']==0)&(df['Male']==0)]
    homosexual_df = df[(df['Christian']==0)&(df['Muslim']==0)&(df['Female']==0)&(df['Homosexual gay or lesbian']==1)&(df['Male']==0)]
    male_df = df[(df['Christian']==0)&(df['Muslim']==0)&(df['Female']==0)&(df['Homosexual gay or lesbian']==0)&(df['Male']==1)]
    return christian_df, muslim_df, female_df, homosexual_df, male_df
#dev dataset by subgroup
christian_df, muslim_df, female_df, homosexual_df, male_df = select_sub_group(dev_dataset)
christian_features, christian_labels = split_f_l(christian_df)
muslim_features, muslim_labels = split_f_l(muslim_df)
female_features, female_labels = split_f_l(female_df)
homosexual_features, homosexual_labels = split_f_l(homosexual_df)
male_features, male_labels = split_f_l(male_df)

In [42]:
def print_bias_variance(name,classifier):
    dev_acc_scores = []

    dev_acc_scores.append(classifier.score(christian_features, christian_labels))
    print(name,"accuracy in christian group ", dev_acc_scores[-1])

    dev_acc_scores.append(classifier.score(muslim_features, muslim_labels))
    print(name,"accuracy in muslim group ", dev_acc_scores[-1])

    dev_acc_scores.append(classifier.score(female_features, female_labels))
    print(name,"accuracy in female group ", dev_acc_scores[-1])

    dev_acc_scores.append(classifier.score(homosexual_features, homosexual_labels))
    print(name,"accuracy in homosexual group ", dev_acc_scores[-1])

    dev_acc_scores.append(classifier.score(male_features, male_labels))
    print(name,"accuracy in male group ", dev_acc_scores[-1])

    print(name,"variance is ", np.var(dev_acc_scores))

In [31]:
print_bias_variance("LR",LRclassifier)

LR accuracy in christian group  0.9309473684210526
LR accuracy in muslim group  0.7858974358974359
LR accuracy in female group  0.84930966469428
LR accuracy in homosexual group  0.7716535433070866
LR accuracy in male group  0.8247903075489282
LR variance is  0.0031815953747023154


### Solution1 : Class Balance

In [45]:
##trainset by subgroup
christian_tdf, muslim_tdf, female_tdf, homosexual_tdf, male_tdf = select_sub_group(train_dataset)
christian_features2, christian_labels2 = split_f_l(christian_tdf)
muslim_features2, muslim_labels2 = split_f_l(muslim_tdf)
female_features2, female_labels2 = split_f_l(female_tdf)
homosexual_features2, homosexual_labels2 = split_f_l(homosexual_tdf)
male_features2, male_labels2= split_f_l(male_tdf)

In [46]:
from collections import Counter
def show_label_distribution(name,labels,labels2):
    labels_counter = Counter(labels)
    labels_counter2 = Counter(labels2)
    print(name,"——Trainset Class distribution 0:1 =", float(labels_counter[0]) / float(labels_counter[1]),
         " Devset Class distribution 0:1 =", float(labels_counter2[0]) / float(labels_counter2[1]))

show_label_distribution("All",train_label,dev_label)

print("----------------")

show_label_distribution("christian",christian_labels2,christian_labels)
show_label_distribution("muslim",muslim_labels2,muslim_labels)
show_label_distribution("female",female_labels2,female_labels)
show_label_distribution("homosexual",homosexual_labels2,homosexual_labels)
show_label_distribution("male",male_labels2,male_labels)

All ——Trainset Class distribution 0:1 = 5.226096237658988  Devset Class distribution 0:1 = 4.291005291005291
----------------
christian ——Trainset Class distribution 0:1 = 13.834617664493184  Devset Class distribution 0:1 = 12.268156424581006
muslim ——Trainset Class distribution 0:1 = 3.5236749116607773  Devset Class distribution 0:1 = 3.262295081967213
female ——Trainset Class distribution 0:1 = 6.406841783750764  Devset Class distribution 0:1 = 5.401515151515151
homosexual ——Trainset Class distribution 0:1 = 2.645210727969349  Devset Class distribution 0:1 = 2.8484848484848486
male ——Trainset Class distribution 0:1 = 5.427030913012222  Devset Class distribution 0:1 = 4.2727272727272725


In [47]:
christian_tdf, muslim_tdf, female_tdf, homosexual_tdf, male_tdf 
cdf1=christian_df[christian_df["Toxicity"]==1]#positive sample
cdf0=christian_df[christian_df["Toxicity"]==0]#negative sample
#down-sampling the negative sample
cdf2=cdf0.sample(frac=0.89)
#combine the positive and downsampled negative sample
cdf_new=pd.concat([cdf1,cdf2])

mdf1=muslim_tdf[muslim_tdf["Toxicity"]==1]
mdf0=muslim_tdf[muslim_tdf["Toxicity"]==0]
mdf2=mdf0.sample(frac=0.92)
mdf_new=pd.concat([mdf1,mdf2])

fdf1=female_df[female_df["Toxicity"]==1]
fdf0=female_df[female_df["Toxicity"]==0]
fdf2=fdf0.sample(frac=0.84)
fdf_new=pd.concat([fdf1,fdf2])

hdf1=homosexual_df[homosexual_df["Toxicity"]==1]
hdf0=homosexual_df[homosexual_df["Toxicity"]==0]
hdf2=hdf1.sample(frac=0.93)
hdf_new=pd.concat([hdf0,hdf2])

adf1=male_df[male_df["Toxicity"]==1]
adf0=male_df[male_df["Toxicity"]==0]
adf2=adf0.sample(frac=0.78)
adf_new=pd.concat([adf1,adf2])

train_newdf = pd.concat([cdf_new,mdf_new,fdf_new,hdf_new,adf_new])
train_newfeatures = train_newdf.iloc[:, 26:].to_numpy()
train_newlabel = train_newdf.iloc[:,1].to_numpy()

In [48]:
LRclassifier_balanced = LogisticRegression(random_state=66, max_iter=300)
LRclassifier_balanced.fit(train_newfeatures, train_newlabel)

print("Logistic Regressions with balanced label accuracy", LRclassifier_balanced.score(dev_features, dev_label))
print_bias_variance("Logistic Regressions with balanced label",LRclassifier_balanced)

Logistic Regressions with balanced label accuracy 0.8266666666666667
Logistic Regressions with balanced label accuracy in christian group  0.9305263157894736
Logistic Regressions with balanced label accuracy in muslim group  0.7884615384615384
Logistic Regressions with balanced label accuracy in female group  0.855621301775148
Logistic Regressions with balanced label accuracy in homosexual group  0.7779527559055118
Logistic Regressions with balanced label accuracy in male group  0.8252562907735321
Logistic Regressions with balanced label variance is  0.0030128165945094753


### Solution 2 (Overfitting): Bagging

In [49]:
from sklearn.ensemble import BaggingClassifier
LRclassifier= LogisticRegression(random_state=66, max_iter=300)
LRclassifier_bagging = BaggingClassifier(LRclassifier,n_estimators=6, random_state=6).fit(train_features, train_label)

print("Logistic Regression bagging accuracy", LRclassifier_bagging.score(dev_features, dev_label))
print_bias_variance("Logistic Regression bagging accuracy",LRclassifier_bagging)

Logistic Regression bagging accuracy 0.8274666666666667
Logistic Regression bagging accuracy accuracy in christian group  0.9305263157894736
Logistic Regression bagging accuracy accuracy in muslim group  0.7884615384615384
Logistic Regression bagging accuracy accuracy in female group  0.84930966469428
Logistic Regression bagging accuracy accuracy in homosexual group  0.7716535433070866
Logistic Regression bagging accuracy accuracy in male group  0.8233923578751164
Logistic Regression bagging accuracy variance is  0.003123241534944309


### Feature enginnering

In [58]:
#read file
train_df = pd.read_csv("dataset/train_raw.csv")
dev_df = pd.read_csv("dataset/dev_raw.csv")
test_df = pd.read_csv("dataset/test_raw.csv")
#split the feature and label
train_raw_comment = train_df["Comment"].tolist()
train_label = train_df["Toxicity"].to_numpy()
dev_raw_comment = dev_df["Comment"].tolist()
dev_label = dev_df["Toxicity"].to_numpy()
test_raw_comment = test_df["Comment"].tolist()

christian_raw_df, muslim_raw_df, female_raw_df, homosexual_raw_df, male_raw_df = select_sub_group(dev_df)

In [59]:
#preprocessing
#remove punctuation
import string
#defining the function to remove punctuation
def remove_punctuation(text):
    punctuationfree="".join([i for i in text if i not in string.punctuation])
    return punctuationfree
#storing the puntuation free text
train_removed_pun = []
for sen in train_raw_comment:
    temp = remove_punctuation(sen).lower()
    train_removed_pun.append(temp)

In [60]:
#data preprocessing : bow
from sklearn.feature_extraction.text import CountVectorizer

bow_vectorizer = CountVectorizer(stop_words='english')
train_bow = bow_vectorizer.fit_transform(train_removed_pun)
dev_bow = bow_vectorizer.transform(dev_raw_comment)

In [61]:
def split_comment_label(df):
    comment = df["Comment"].tolist()
    label =  df["Toxicity"].to_numpy()
    return comment, label
def print_bias_variance_bow(name,classifier):
    dev_acc_scores = []

    christian_comment, christian_label = split_comment_label(christian_raw_df)
    christian_bow = bow_vectorizer.transform(christian_comment)
    dev_acc_scores.append(classifier.score(christian_bow, christian_label))
    print(name,"accuracy in christian group ", classifier.score(christian_bow, christian_label))

    muslim_comment, muslim_label = split_comment_label(muslim_raw_df)
    muslim_bow = bow_vectorizer.transform(muslim_comment)
    dev_acc_scores.append(classifier.score(muslim_bow, muslim_label))
    print(name,"accuracy in muslim group ", classifier.score(muslim_bow, muslim_label))

    female_comment, female_label = split_comment_label(female_raw_df)
    female_bow = bow_vectorizer.transform(female_comment)
    dev_acc_scores.append(classifier.score(female_bow, female_label))
    print(name,"accuracy in female group ", classifier.score(female_bow, female_label))

    homosexual_comment, homosexual_label = split_comment_label(homosexual_raw_df)
    homosexual_bow = bow_vectorizer.transform(homosexual_comment)
    dev_acc_scores.append(classifier.score(homosexual_bow, homosexual_label))
    print(name,"accuracy in homosexual group ", classifier.score(homosexual_bow, homosexual_label))

    male_comment, male_label = split_comment_label(male_raw_df)
    male_bow = bow_vectorizer.transform(male_comment)
    dev_acc_scores.append(classifier.score(male_bow, male_label))
    print(name,"accuracy in male group ", classifier.score(male_bow, male_label))

    print(name,"variance is ", np.var(dev_acc_scores))

In [62]:
LRclassifier_bow = LogisticRegression(random_state=666, max_iter=800)
LRclassifier_bow.fit(train_bow, train_label)
print("Logistic Regressions bow with balanced label accuracy", LRclassifier_bow.score(dev_bow, dev_label))
print_bias_variance_bow("Logistic Regressions bow with balanced label",LRclassifier_bow)

Logistic Regressions bow with balanced label accuracy 0.8424666666666667
Logistic Regressions bow with balanced label accuracy in christian group  0.9334736842105263
Logistic Regressions bow with balanced label accuracy in muslim group  0.7846153846153846
Logistic Regressions bow with balanced label accuracy in female group  0.8733727810650888
Logistic Regressions bow with balanced label accuracy in homosexual group  0.7763779527559055
Logistic Regressions bow with balanced label accuracy in male group  0.8420316868592731
Logistic Regressions bow with balanced label variance is  0.0033901862326438286


### Final model 

In [63]:
LRclassifier_bow = LogisticRegression(random_state=666, max_iter=800, class_weight="balanced")
LRclassifier_final = BaggingClassifier(LRclassifier_bow,n_estimators=10, random_state=0).fit(train_bow, train_label)

print("LRbagging, bow with balanced label accuracy accuracy", LRclassifier_final.score(dev_bow, dev_label))

LRbagging, bow with balanced label accuracy accuracy 0.8196666666666667


In [64]:
test_bow = bow_vectorizer.transform(test_raw_comment)

test_preds = LRclassifier_final.predict(test_bow).tolist()
assert len(test_ids) == len(test_preds)

f = open("test_predictions.csv", "w")
f.write("ID,Toxicity\n")
for test_id, test_pred in zip(test_ids, test_preds):
    f.write(str(test_id) + "," +str(test_pred) + "\n")
f.close()