In [None]:
# Shell escape: run nvidia-smi to show which GPU, if any, this runtime exposes.
!nvidia-smi

In [None]:
#!unzip /content/dataset.zip

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [None]:
# Load the first dataset from a CSV expected in the working directory.
imbalance_data = pd.read_csv("imbalanced_data.csv")

In [None]:
# Preview the first rows.
imbalance_data.head()

In [None]:
# List the column names.
imbalance_data.columns

## EDA

In [None]:
# Bar chart of the number of rows per value of the 'label' column.
sns.countplot(x = 'label', data = imbalance_data)

In [None]:
# Exact row counts per label value.
imbalance_data["label"].value_counts()

## Label encoding
- 0 = No Hate
- 1 = Hate

In [None]:
# (rows, columns) of the frame.
imbalance_data.shape

In [None]:
# Number of missing values in each column.
imbalance_data.isnull().sum()

In [None]:
# Drop the 'id' column; only the tweet text and its label are used later.
# Reassigning (instead of inplace=True) with errors="ignore" keeps this cell
# safe to re-run: the in-place version raised KeyError on a second execution
# because 'id' was already gone.
imbalance_data = imbalance_data.drop(columns="id", errors="ignore")

In [None]:
# Confirm the 'id' column is gone.
imbalance_data.head()

In [None]:
# Load the second dataset from a CSV expected in the working directory.
raw_data = pd.read_csv("raw_data.csv")
raw_data.head()

In [None]:
# (rows, columns) of the second frame.
raw_data.shape

In [None]:
# Number of missing values in each column.
raw_data.isnull().sum()

In [None]:
# List the column names before deciding which ones to drop.
raw_data.columns

In [None]:
# Remove the columns that are not needed for modelling.
# Reassigning (instead of inplace=True) with errors="ignore" makes the cell
# idempotent: re-running it no longer raises KeyError for columns that were
# already dropped.
raw_data = raw_data.drop(
    columns=['Unnamed: 0', 'count', 'hate_speech', 'offensive_language', 'neither'],
    errors="ignore",
)

In [None]:
# Preview the remaining columns.
raw_data.head()

In [None]:
# Row counts per value of 'class'.
raw_data["class"].value_counts()

In [None]:
# Distinct values present in 'class'.
raw_data["class"].unique()

In [None]:
# Bar chart of the same class distribution.
sns.countplot(x = 'class', data = raw_data)

- class 0: hate
- class 1: abusive
- class 2: no hate

In [None]:
# Combine class 1 and 0: relabel every class-0 row as class 1.
# A single .loc assignment writes into raw_data itself. The previous chained
# form, raw_data[mask]["class"] = 1, assigned to a temporary copy and left
# raw_data unchanged.
raw_data.loc[raw_data["class"] == 0, "class"] = 1

In [None]:
raw_data.head()

In [None]:
raw_data["class"].unique

In [None]:
raw_data['class'].value_counts()

In [None]:
raw_data[raw_data["class"]==0]

In [None]:
# Replace the value of 0 to 1
# The result is assigned back to the column. Calling
# replace(..., inplace=True) on a selected column is chained assignment:
# pandas deprecates it, and under Copy-on-Write it does not update raw_data.
raw_data['class'] = raw_data['class'].replace({0: 1})

In [None]:
# Preview the frame after mapping class 0 to 1.
raw_data.head()

In [None]:
# Distinct class values now present.
raw_data['class'].unique()

In [None]:
# Bar chart of the updated class distribution.
sns.countplot(x = 'class', data = raw_data)

In [None]:
# Exact row counts per class value.
raw_data["class"].value_counts()

In [None]:
# let's replace the values of 2 to 0, so that 0 means "no hate" as in imbalance_data
# The result is assigned back to the column. Calling
# replace(..., inplace=True) on a selected column is chained assignment:
# pandas deprecates it, and under Copy-on-Write it does not update raw_data.
raw_data['class'] = raw_data['class'].replace({2: 0})

In [None]:
# Preview the frame after mapping class 2 to 0.
raw_data.head()

In [None]:
# Distinct class values now present.
raw_data['class'].unique()

In [None]:
# Bar chart of the final class distribution in raw_data.
sns.countplot(x='class', data=raw_data)

In [None]:
# Compare the two frames' columns side by side before merging them.
imbalance_data.head()

In [None]:
raw_data.head()

In [None]:
# Align raw_data's column name with imbalance_data: 'class' becomes 'label'.
raw_data = raw_data.rename(columns={'class': 'label'})

In [None]:
# Confirm the rename, then compare against the other frame's columns.
raw_data.head()

In [None]:
imbalance_data.head()

In [None]:
# merge
# Stack the two frames row-wise into one dataset. ignore_index=True gives the
# result a fresh 0..n-1 index. Without it each input keeps its own row labels,
# so labels repeat and a label lookup such as df['tweet'][1] matches more than
# one row.
frame = [imbalance_data, raw_data]
df = pd.concat(frame, ignore_index=True)

In [None]:
# Preview the merged frame.
df.head()

In [None]:
# (rows, columns) of the merged frame.
df.shape

In [None]:
# Bar chart of the label distribution after merging.
sns.countplot(x = 'label', data = df)

In [None]:
# Exact row counts per label value.
df['label'].value_counts()

# Pre-processing step

In [None]:
import re
import string
import nltk
from nltk.corpus import stopwords
#nltk.download('all')

In [None]:
# Let's apply stemming and stopwords on the data
# `stemmer` and `stopword` are module-level objects read by the text-cleaning
# functions defined in later cells.
stemmer = nltk.SnowballStemmer("english")
try:
    stopword = set(stopwords.words('english'))
except LookupError:
    # The stop-word corpus is not installed on a fresh runtime; fetch just
    # that corpus once instead of failing (or downloading 'all').
    nltk.download('stopwords', quiet=True)
    stopword = set(stopwords.words('english'))

In [None]:
# Display the stop-word set that will be filtered out of the tweets.
stopword

In [None]:
def data_cleaning(words):
    """Normalise one piece of text for the tokenizer.

    Steps, in order: lowercase; remove square-bracketed spans, URLs,
    angle-bracket tags, punctuation, newline characters and any token that
    contains a digit; drop English stop words; stem each remaining token.

    Relies on the module-level ``stopword`` set and ``stemmer`` object.

    Args:
        words: Raw text. Non-string values are converted with ``str()``.

    Returns:
        The surviving tokens, stemmed and joined with spaces. Empty tokens
        produced by consecutive spaces are kept, so runs of spaces can remain.
    """
    text = str(words).lower()

    # Raw-string patterns: sequences such as "\[" or "\S" inside an ordinary
    # string literal are invalid escape sequences and trigger warnings on
    # current Python versions.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Newlines are deleted, not replaced by a space, so words separated only
    # by a line break are fused together.
    text = re.sub('\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)

    # Stop-word removal and stemming in one pass over the tokens.
    kept = (token for token in text.split(' ') if token not in stopword)
    return " ".join(stemmer.stem(token) for token in kept)


In [None]:
df['tweet'][1]

In [None]:
df['tweet'] = df['tweet'].apply(data_cleaning)

In [None]:
df['tweet'][1]

In [None]:
# Model input (cleaned tweet text) and target (label column).
x, y = df['tweet'], df['label']

In [None]:
from sklearn.model_selection import train_test_split

# Let's split the data into train and test
# test_size=0.25 spells out the library default. stratify=y keeps the label
# ratio the same in both splits, so the rarer class is not under- or
# over-represented in the test set by chance.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, stratify=y, random_state=42
)


In [None]:
# Sanity check: features and labels must have equal length within each split.
print(len(x_train),len(y_train))
print(len(x_test),len(y_test))

In [None]:
# Discard the shuffled original row labels so every split is indexed 0..n-1.
x_train, y_train, x_test, y_test = (
    split.reset_index(drop=True)
    for split in (x_train, y_train, x_test, y_test)
)


In [None]:
# Lengths must be unchanged by the index reset.
print(len(x_train),len(y_train))
print(len(x_test),len(y_test))

In [None]:
import numpy as np

# Convert both label Series to plain NumPy arrays before handing them to Keras.
y_train, y_test = np.array(y_train), np.array(y_test)


## Feature Engineering

In [None]:
#!pip install keras
#!pip install tensorflow

In [None]:
import keras
import tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences


In [None]:
# max_words caps the tokenizer vocabulary (num_words) and is reused as the
# Embedding input size when the model is defined; max_len is the fixed length
# every sequence is padded/truncated to.
max_words = 50000
max_len = 300

# Fit the vocabulary on the training text only, so no test-set words leak in.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(x_train)

# Training texts -> lists of word indices -> one 2-D array of width max_len.
sequences = tokenizer.texts_to_sequences(x_train)
sequences_matrix = pad_sequences(sequences,maxlen=max_len)

In [None]:
# Inspect the padded training matrix.
sequences_matrix

In [None]:
from keras.models import Sequential
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding, SpatialDropout1D
from keras.optimizers import RMSprop

In [None]:
# Binary text classifier: embedding -> spatial dropout -> LSTM -> one sigmoid unit.
# The sequence length is declared with an explicit Input layer. Embedding's
# `input_length` argument is deprecated in Keras 3, and declaring the input
# shape up front lets the model be built as soon as its layers are added.
model = Sequential()
model.add(Input(shape=(max_len,)))
model.add(Embedding(max_words, 100))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

In [None]:
# Build the model for batches of max_len-long integer sequences so that
# summary() can report layer shapes and parameter counts.
model.build(input_shape=(None, max_len))

In [None]:
model.summary()

In [None]:
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked as the metric.
model.compile(loss='binary_crossentropy',optimizer=RMSprop(),metrics=['accuracy'])

In [None]:
# starting model training
# validation_split=0.2 asks Keras to hold out part of the training matrix for
# validation; `history` keeps the fit results returned by Keras.
history = model.fit(sequences_matrix,y_train,batch_size=128,epochs = 5,validation_split=0.2)

In [None]:
# Encode the test texts with the tokenizer fitted on the training set and pad
# them to the same width as the training matrix.
test_sequences = tokenizer.texts_to_sequences(x_test)
test_sequences_matrix = pad_sequences(test_sequences,maxlen=max_len)

In [None]:
# Inspect the padded test matrix.
test_sequences_matrix

In [None]:
# Model evaluation
# accr presumably holds [loss, accuracy], given the compile() arguments — confirm.
accr = model.evaluate(test_sequences_matrix,y_test)

In [None]:
# Raw sigmoid outputs for the test set, indexed per row when thresholded later.
lstm_prediction = model.predict(test_sequences_matrix)

In [None]:
# Threshold each sigmoid output at 0.5: below it -> class 0, otherwise class 1.
res = [0 if prediction[0] < 0.5 else 1 for prediction in lstm_prediction]

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [None]:
# Confusion matrix of true labels (rows) against thresholded predictions (columns).
print(confusion_matrix(y_test,res))

In [None]:
# Fraction of test rows whose thresholded prediction equals the true label.
print(accuracy_score(y_test,res))

In [None]:
import pickle
# Persist the fitted tokenizer so inference can reuse the training vocabulary.
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Let's save the model.
model.save("model.h5")

In [None]:
# Reload both artifacts from disk to check that the saved files are usable.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load a tokenizer.pickle that this notebook produced.
load_model=keras.models.load_model("model.h5")
with open('tokenizer.pickle', 'rb') as handle:
    load_tokenizer = pickle.load(handle)

In [None]:
# Let's test our model on custom data.
# Sample sentence to classify; edit this string to try other inputs.
test = 'i love this movie'

def clean_text(text):
    """Clean one text for inference exactly as the training tweets were cleaned.

    Delegates to ``data_cleaning``, the function applied to ``df['tweet']``
    before training, rather than keeping a second copy of the same regex,
    stop-word and stemming steps. A separate copy can silently drift from the
    training preprocessing and skew predictions.

    Args:
        text: Raw input text. Non-string values are converted with ``str()``.

    Returns:
        The cleaned, stop-word-filtered, stemmed text as a single string.
    """
    return data_cleaning(text)

# texts_to_sequences expects a list of texts, so wrap the cleaned string.
test=[clean_text(test)]
print(test)

# Encode with the reloaded tokenizer and pad to max_len, the same constant
# used to pad the training data (previously a second hard-coded 300).
seq = load_tokenizer.texts_to_sequences(test)
padded = pad_sequences(seq, maxlen=max_len)
print(seq)

pred = load_model.predict(padded)

print("pred", pred)
# pred is indexed as [sample][unit]; compare the single scalar score instead
# of relying on the truth value of a whole array.
if pred[0][0] < 0.5:
    print("no hate")
else:
    print("hate and abusive")
