In [47]:
import pandas as pd
import numpy as np
import re
import nltk
import random
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import SimpleRNN, LSTM, GRU, Bidirectional, Dense, Embedding, Activation
from tensorflow.keras.models import Sequential
from collections import Counter

In [20]:
# Read the labelled comments from disk.
df = pd.read_csv("./Final_Dataset/final_dataset")

# Keep only the columns whose name does not start with "Unnamed"
# (presumably leftover index columns from an earlier export -- confirm).
keep_columns = ~df.columns.str.contains('^Unnamed')
df = df.loc[:, keep_columns]

df

Unnamed: 0,Comments,Type
0,I imagine his virginity at 40 will also be ple...,0
1,Happiness can be found even in the darkest of ...,1
2,Are your eyes swollen and squinty from all of ...,0
3,Purely practical note:\n\nNow that you have sh...,1
4,You look like you could be the spokesperson fo...,1
...,...,...
73307,I bet you wish you could transfer some of the ...,0
73308,"I hope your dick is as big as your teeth, beca...",0
73309,"You need to put the XXL butt plug down bro, I ...",0
73310,Do you actually have a neck or did you eat tha...,0


In [21]:
# Only the last expression of a cell is rendered automatically, so the
# per-class summary has to be displayed explicitly; as a bare statement its
# result was computed and then thrown away. `display` is provided by the
# IPython kernel, no import is needed inside a notebook.
display(df.groupby('Type').describe())

# Number of rows per label value.
df["Type"].value_counts()

Type
0    36656
1    36656
Name: count, dtype: int64

# Punctuation & Stop Words Filtering


In [22]:
# Characters stripped by remove_punc. The backslash is escaped explicitly:
# the previous literal relied on the invalid escape sequence "\," which only
# works by accident and triggers a warning on recent Python versions.
_PUNCTUATION = '''!()-[]}{;:'"\\,<>./?@#$%^&*_~'''
_PUNCTUATION_TABLE = str.maketrans('', '', _PUNCTUATION)


def remove_punc(comment):
    """Return ``comment`` with every punctuation character removed.

    The characters removed are exactly those listed in ``_PUNCTUATION``;
    anything else (letters, digits, whitespace, symbols such as ``+`` or
    ``=``) is left untouched.

    Args:
        comment: The text to clean, as a ``str``.

    Returns:
        A new string without the listed punctuation characters.
    """
    # One pass over the string instead of one full-string replace per
    # punctuation character encountered.
    return comment.translate(_PUNCTUATION_TABLE)

# Make sure the NLTK stop-word corpus is available before it is read below.
nltk.download("stopwords")

# English stop words that normalize_document filters out of each comment.
stop_words = nltk.corpus.stopwords.words('english')

# Tokenizer instance that normalize_document uses to split a comment into tokens.
wpt = nltk.WordPunctTokenizer()

def normalize_document(doc):
    """Reduce a comment to lower-case words with stop words removed.

    Steps: drop every character that is not an ASCII letter or whitespace,
    lower-case and trim the text, tokenize it with the module-level ``wpt``
    tokenizer, discard tokens found in the module-level ``stop_words``
    collection, and join the remaining tokens with single spaces.

    Args:
        doc: The text to normalise, as a ``str``.

    Returns:
        The normalised text; an empty string when nothing survives filtering.
    """
    # The flags must be passed by keyword: the fourth positional parameter of
    # re.sub is `count`, so passing them positionally capped the number of
    # removed characters and never applied the flags at all.
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, flags=re.I | re.A)
    doc = doc.lower().strip()
    # Split into tokens, then keep only the ones that are not stop words.
    tokens = wpt.tokenize(doc)
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # Rebuild the document from the surviving tokens.
    return ' '.join(filtered_tokens)


# Clean every comment: strip punctuation first, then normalise the result
# (letters only, lower case, stop words removed).
df["Comments"] = df["Comments"].apply(remove_punc).apply(normalize_document)

df

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ketansharma14/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,Comments,Type
0,imagine virginity also pleated,0
1,happiness found even darkest times one remembe...,1
2,eyes swollen squinty loads random truckers you...,0
3,purely practical note short hair first time,1
4,look like could spokesperson kind wholesome co...,1
...,...,...
73307,bet wish could transfer fat thighs non existen...,0
73308,hope dick big teeth looks one going want fuck,0
73309,need put xxl butt plug bro doubt anything left...,0
73310,actually neck eat,0


# Train & Test Splitting


In [33]:
# Inputs are the comment texts, targets are the values of the "Type" column.
X, y = df["Comments"], df["Type"]

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

# Tokenizing, Padding The Sequences & Reshaping


In [34]:
# Fit the tokenizer on the training comments only, so the test comments do not
# influence the vocabulary.
tokenizer = Tokenizer(num_words=1000, lower=True)
tokenizer.fit_on_texts(X_train)

# Size of the fitted word index plus one (presumably to account for the id 0
# used for padding -- confirm before relying on it).
vocab_size = len(tokenizer.word_index) + 1

# Fixed length of every padded sequence.
max_len = 150

# Text -> integer sequences -> fixed-length rows padded at the end.
X_train = pad_sequences(tokenizer.texts_to_sequences(X_train), padding="post", maxlen=max_len)
X_test = pad_sequences(tokenizer.texts_to_sequences(X_test), padding="post", maxlen=max_len)

# Add a trailing axis of size 1 so each sample has shape (max_len, 1).
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

print(X_train.shape)
print(X_test.shape)

(58649, 150, 1)
(14663, 150, 1)


# One-Hot Encoding of Y Labels


In [43]:
# Width of each one-hot label row.
num_classes = 2

# Turn each integer label into a one-hot row of length num_classes.
y_train = to_categorical(y_train, num_classes=num_classes)
y_test = to_categorical(y_test, num_classes=num_classes)

# Quick look at the resulting label matrix and its first row.
print(y_train.shape)
print(y_train[0])

(58649, 2)
[0. 1.]


# Creating the Model


In [45]:
num_classes = 2


def vanilla_rnn(units=50, learning_rate=0.001):
    """Build, summarise and compile a single-layer SimpleRNN classifier.

    The network reads inputs of shape ``(max_len, 1)`` (``max_len`` is the
    module-level padding length), feeds the final RNN state into a dense
    layer with ``num_classes`` outputs and applies a softmax. The model
    summary is printed as a side effect.

    Args:
        units: Number of units in the SimpleRNN layer.
        learning_rate: Step size passed to the Adam optimizer.

    Returns:
        The compiled Keras ``Sequential`` model, ready for ``fit``.
    """
    model = Sequential()
    # return_sequences=False: only the last time step's output is passed on.
    model.add(SimpleRNN(units, input_shape=(max_len, 1), return_sequences=False))
    model.add(Dense(num_classes))
    model.add(Activation('softmax'))
    model.summary()

    # `learning_rate` is the supported keyword; the old `lr` alias is
    # deprecated and rejected by newer Keras releases.
    adam = Adam(learning_rate=learning_rate)
    # categorical_crossentropy matches the one-hot encoded labels.
    model.compile(loss="categorical_crossentropy",
                  optimizer=adam,
                  metrics=['accuracy'])

    return model

In [None]:
# Build the compiled network, then train it for five epochs on the training set.
RNN_Classifier = vanilla_rnn()
RNN_Classifier.fit(x=X_train, y=y_train, epochs=5)

# Accuracy Score

In [None]:
# Per-class scores predicted for every test sample (one column per class).
y_pred = RNN_Classifier.predict(X_test)

# accuracy_score compares two vectors of class ids, so both the predicted
# scores and the one-hot ground truth are collapsed with argmax. Previously the
# predictions were scored against their own argmax and y_test was never used.
y_pred_ = np.argmax(y_pred, axis=1)
y_test_ = np.argmax(y_test, axis=1)

print(accuracy_score(y_test_, y_pred_))

# Test Actual Data

In [None]:
a = input("Enter any sentence: ")

# Apply the same cleaning that was applied to the training comments so the
# tokenizer sees text in the form it was fitted on.
a = normalize_document(remove_punc(a))

# texts_to_sequences expects a list of texts; given a bare string it iterates
# over it and treats every character as its own document.
a = tokenizer.texts_to_sequences([a])
a = pad_sequences(a, padding='post', maxlen=max_len)

# Same (samples, max_len, 1) layout the model was trained on.
a = a.reshape((a.shape[0], a.shape[1], 1))

prediction = RNN_Classifier.predict(a)
print(prediction)