In [1]:
import numpy as np # linear algebra
import pandas as pd 
import matplotlib.pyplot as plt
#from wordcloud import WordCloud

#data preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

#NLP tools
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/roohaan111/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
idLabels = pd.read_csv('hate-speech-dataset/annotations_metadata.csv')[["file_id", "label"]]

In [3]:
def get_data(paths, group):
  
    sets = []
    for p in paths:

        with open(p) as f:
            file_id = p.split('/')[-1].split('.')[0]
            sets.append((file_id, f.read(), group))

    return pd.DataFrame(sets, columns=["file_id", "text", "gSet"])

In [4]:
import glob

train_set = get_data(glob.glob('./hate-speech-dataset/sampled_train/*.txt'), 'train')
test_set = get_data(glob.glob('./hate-speech-dataset/sampled_test/*.txt'), 'test')

train_set = train_set.join(idLabels.set_index('file_id'), on='file_id')
test_set = test_set.join(idLabels.set_index('file_id'), on='file_id')

comb_set = pd.concat([train_set, test_set]).reset_index(drop=True)

In [5]:
train_set.head()
train_set.describe()
comb_set.head()
comb_set.tail()

Unnamed: 0,file_id,text,gSet,label
2387,31732972_2,Sometimes I do watch shows about WWII on the H...,test,noHate
2388,30644911_2,It 's a pretty powerful documentary that blew ...,test,noHate
2389,30771611_2,Like the parasites they are they simply moved ...,test,hate
2390,13590005_2,Their lies are so thick and many that people d...,test,hate
2391,14672822_1,We knew that these structures were built by su...,test,noHate


In [6]:
#isolate train labels and text columns
pre_train_data = train_set[['label', 'text']]
pre_test_data = test_set[['label', 'text']]

#isolate labels to transform using One Hot Encoding
labels_actual_train = pre_train_data.iloc[:,:-1].values
labels_actual_test = pre_test_data.iloc[:,:-1].values

In [7]:
#transform hate noHate --> 1, 0
ct1 = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])])
ct2 = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])])

#hate classification = 1, no-hate classification = 0
y_train = np.array(ct1.fit_transform(labels_actual_train))[:,0]
y_test =  np.array(ct2.fit_transform(labels_actual_test))[:,0]


In [8]:
#cleaning the data set 
#improve this using spacy

def clean_data(size, data):
    corpus = []
    for i in range(size):
        #conver text to plain english
        text = re.sub('[^a-zA-Z]', ' ', data['text'][i])
        text = text.lower().split()
        stop_words = set(stopwords.words('english'))
        stemmer = PorterStemmer()
        text = [stemmer.stem(word) for word in text if word not in stop_words]
        text = ' '.join(text)
        corpus.append(text)
        
    return corpus
    

In [9]:
cv1 = CountVectorizer(max_features = 1953)
cv2 = CountVectorizer(max_features = 1953)
corpus_train = clean_data(len(y_train), pre_train_data)
corpus_test = clean_data(len(y_test), pre_test_data)
X_train = cv1.fit_transform(corpus_train).toarray()
X_test = cv2.fit_transform(corpus_test).toarray()

In [10]:
# classifier_np = GaussianNB()
# classifier_np.fit(X_train, y_train)
print(",")

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons
from sklearn.preprocessing import PolynomialFeatures

from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

,


In [12]:
polynomial_svm_clf = Pipeline([
        ("scaler", StandardScaler()),
        ("svm_clf", SVC(C=15, kernel = "poly", coef0 = 1, degree = 4))
    ])

# polynomial_svm_clf = Pipeline([
#         ("scaler", StandardScaler()),
#         ("svm_clf", SVC(C=5, kernel = "poly", coef0 = 100, degree = 10))
#     ])


print("z")

z


In [13]:
polynomial_svm_clf.fit(X_train, y_train)

Pipeline(steps=[('scaler', StandardScaler()),
                ('svm_clf', SVC(C=15, coef0=1, degree=4, kernel='poly'))])

In [14]:
y_pred = polynomial_svm_clf.predict(X_test)

In [15]:

from sklearn.metrics import confusion_matrix, accuracy_score

# y_pred = classifier_np.predict(X_test)

# y_pred = classifier_np.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[149  90]
 [105 134]]


In [16]:
accuracy_score(y_test, y_pred)

0.5920502092050209