<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [90]:
# Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding

# NLTK
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

import Stemmer

# Other
import re
import timeit
import string
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split


In [4]:
df = pd.read_csv("labeled_data.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24783 entries, 0 to 24782
Data columns (total 7 columns):
index                 24783 non-null int64
count                 24783 non-null int64
hate_speech           24783 non-null int64
offensive_language    24783 non-null int64
neither               24783 non-null int64
class                 24783 non-null int64
tweet                 24783 non-null object
dtypes: int64(6), object(1)
memory usage: 1.3+ MB


In [164]:

filepath = "labeled_data.csv"
op_file = "cleaned_tweets.csv"

stopWords = set(stopwords.words("english"))
stemmer = Stemmer.Stemmer('english', 100000)    

def clean_text(text):
    
    #Remove puncuation
    #text = text.translate(string.punctuation)
    
    #split based on everything except a-z0-9_'.\-
    tokens = re.findall("[a-z0-9_'.\-]+", text.lower())
    #tokens = text.lower().split()
    
    tokens = [stemmer.stemWord(w) for w in tokens if not w in stopWords and len(w) > 2 and len(w)<20]
    text = " ".join(tokens)
    
    return text

def build_data(filepath):
    df = pd.read_csv(filepath)
    data = pd.DataFrame()
    
    data['hate+offensive_count'] = df['offensive_language'] + df['hate_speech']
    data['non-hate_count'] = df['neither']
    
    classes = []
    for index, row in data.iterrows():
        temp = 1 if row['hate+offensive_count'] > row['non-hate_count'] else 0
        classes.append(temp)
        
    data['class'] = classes
    
    #label =  {1:hate, 0:non-hate}
    labels = data['class'].map(lambda x : 1 if int(x) == 1 else 0)
    
    #cleaning text
    data['tweet'] = df['tweet'].map(lambda x: clean_text(x))

    data.to_csv(op_file)
    
    return (data, labels)

data, labels = build_data(filepath)
print("Done!!!")

Done!!!


In [202]:
data.info()
data.head(10)
data['class'].value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24783 entries, 0 to 24782
Data columns (total 4 columns):
hate+offensive_count    24783 non-null int64
non-hate_count          24783 non-null int64
class                   24783 non-null int64
tweet                   24783 non-null object
dtypes: int64(3), object(1)
memory usage: 774.5+ KB


1    20620
0     4163
Name: class, dtype: int64

In [159]:
print(type(data['tweet']))
print(len(data['tweet']))

print(type(data['tweet'][0:10]))
print(len(data['tweet'][0:10]))


<class 'pandas.core.series.Series'>
24783
<class 'pandas.core.series.Series'>
10


In [158]:
def process_sample(sample):
    
    print("\nGiven sample size:", len(sample))
    
    #Keras tokenizer function to tokenize the strings and 
    #‘texts_to_sequences’ to make sequences of words.

    vocabulary_size = 20000

    #Maximum number of words to work with 
    #if set, tokenization will be restricted to the top nb_words most common words in the dataset).
    tokenizer = Tokenizer(num_words= vocabulary_size)

    #fit_on_texts(texts):
    #Arguments: list of texts to train on.
    #tokenizer.fit_on_texts(data['tweet'])
    tokenizer.fit_on_texts(sample)

    #texts_to_sequences(texts)
    #texts: list of texts to turn to sequences.
    #Return: list of sequences (one per text input).
    
    #sequences = tokenizer.texts_to_sequences(data['tweet'])
    sequences = tokenizer.texts_to_sequences(sample)
    sample = pad_sequences(sequences, maxlen=50)
    
    print("Processed sample shape:", sample.shape)
    #print("Sample1:", sample[0])
    
    return sample


In [182]:

#sklearn.model_selection.train_test_split(*arrays, **options)[source]

#*arrays : sequence of indexables with same length / shape[0]
#Allowed inputs are lists, numpy arrays, scipy-sparse matrices or pandas dataframes.

#test_size/train_size 
#If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test split. 
#If int, represents the absolute number of test samples. 
#If None, the value is set to the complement of the train size

#shuffle : boolean, optional (default=True)
#Whether or not to shuffle the data before splitting. If shuffle=False then stratify must be None.

#Returns splitting : list, length=2 * len(arrays)
#List containing train-test split of inputs.


X = data['tweet']
Y = labels
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2)

print("Train data len:", len(X_train), "\nTest data len:", len(X_test))

X_train_seq = process_sample(X_train)
X_test_seq = process_sample(X_test)


Train data len: 19826 
Test data len: 4957

Given sample size: 19826
Processed sample shape: (19826, 50)

Given sample size: 4957
Processed sample shape: (4957, 50)


In [184]:
print(type(X_train), type(X_train_seq), "\n")

for i in range(10):
    print(X_train.iat[i], "|Y:", Y_train.iat[i])
    
X_train.head(10)

<class 'pandas.core.series.Series'> <class 'numpy.ndarray'> 

yuh bitch ass ull get smack quick |Y: 1
imnottwitfam deal cyber bull log outsid that stop pussi |Y: 1
even mom know fuck ghetto peopl |Y: 1
colinfreez isi support america jihadi next door http t.co aimv7gib24 mlnestel vocativ http t.co ajuau5ywiv |Y: 0
woridstarhlphop phone goe class teacher say turn trash song mixtap https 8230 |Y: 0
rino trash like speakerboehn karlrov frankluntz see conserv democrat parti see blacks. slave property. tcot |Y: 1
burrgo hang hoe gonna turn hoe |Y: 1
pablo_baaih good pussi pussi make realli think bout make breakfast pussi make think father |Y: 1
harmssi strangekeith 8220 quanndadon word tan1aaa zero hoe 8221 |Y: 1
wow fucken bradley suck ass. coach said niggah play barcelona hahahahahaha |Y: 1


23242                    yuh bitch ass ull get smack quick
15634    imnottwitfam deal cyber bull log outsid that s...
8964                       even mom know fuck ghetto peopl
14751    colinfreez isi support america jihadi next doo...
18013    woridstarhlphop phone goe class teacher say tu...
13974    rino trash like speakerboehn karlrov franklunt...
14544                       burrgo hang hoe gonna turn hoe
16835    pablo_baaih good pussi pussi make realli think...
15439    harmssi strangekeith 8220 quanndadon word tan1...
22993    wow fucken bradley suck ass. coach said niggah...
Name: tweet, dtype: object

In [185]:
#sequence for sentences
for i in range(10):
    print(X_train.iat[i], "|Y:", Y_train.iat[i], ":--\n", X_train_seq[i], "\n")


yuh bitch ass ull get smack quick |Y: 1 :--
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0 2616    1   14 9810   11  677  554] 

imnottwitfam deal cyber bull log outsid that stop pussi |Y: 1 :--
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0 4846
  744 6344 1282 1721  569   31   66    9] 

even mom know fuck ghetto peopl |Y: 1 :--
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0  56 223  22   8 106  61] 

colinfreez isi support america jihadi next door http t.co aimv7gib24 mlnestel vocativ http t.co ajuau5ywiv |Y: 0 :--
 [   0  

In [186]:
#MLP Network architecture

print('Building model...')
model_mlp = Sequential()

max_features = 20000 #size of vocab

#Embedding(input_dim, output_dim, embeddings_initializer='uniform', ***, input_length=None)
#o/p will be model.output_shape == (None, 10 :input_dim, 64:output_dim), where None is the batch dimension of the matrix given.
#model_mlp.add(Embedding(max_features, 100, input_length=50))

## Dense(64) is a fully-connected layer with 64 hidden units.
# in the first layer, you must specify the expected input data shape: here, 20-dimensional vectors.
model_mlp.add(Dense(64, input_dim= 50, activation='relu'))
model_mlp.add(Dropout(0.5))

model_mlp.add(Dense(64, activation='relu'))
model_mlp.add(Dropout(0.5))

model_mlp.add(Dense(1, activation='sigmoid'))

model_mlp.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

print("Train data len:", len(X_train), ", Test data len:", len(X_test))

EPOCHS = 3
BATCH_SIZE = 32

print('\n\nTraining Model...')

start = timeit.default_timer()

#batch_size: Integer or None. Number of samples per gradient update. 
#If unspecified, batch_size will default to 32.
model_mlp.fit(X_train_seq, Y_train,
              batch_size = BATCH_SIZE,
              epochs = EPOCHS,
              validation_data=(X_test_seq, Y_test))

#model_lstm.fit(data, np.array(labels), validation_split=0.2, epochs=3)

print("\n\nTotal training time: %.4f seconds." % (timeit.default_timer() - start))


start = timeit.default_timer()
score, acc = model_mlp.evaluate(X_test_seq, Y_test, batch_size = BATCH_SIZE)

print("\nTesting time: %.4f seconds." % (timeit.default_timer() - start))
print('\nTest score:', score)
print('Test accuracy:', acc)


Building model...
Train data len: 19826 , Test data len: 4957


Training Model...
Train on 19826 samples, validate on 4957 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


Total training time: 7.8478 seconds.

Testing time: 0.1434 seconds.

Test score: 2.7338222154379137
Test accuracy: 0.8285253178647672


In [133]:
model_mlp.output_shape

(None, 50, 1)

In [187]:

#LSTM Network architecture
print('Building model...')
model_lstm = Sequential()

#The network starts with an embedding layer.
#Turns positive integers (indexes) into dense vectors of fixed size allowing the n/w to represent a word in a meaningful way.
#eg. [[4], [20]] -> [[0.25, 0.1], [0.6, -0.2]]
#This layer can only be used as the first layer in a model.

#keras.layers.Embedding(input_dim, output_dim, embeddings_initializer='uniform', ***, input_length=None)

#input_dim: int > 0. Size of the vocabulary, i.e. maximum integer index + 1.

#output_dim: int >= 0. Dimension of the dense embedding.

#input_length: Length of input sequences, when it is constant. 
#This argument is required if you are going to connect Flatten then Dense layers upstream 
#(without it, the shape of the dense outputs cannot be computed).

#eg. model.add(Embedding(1000, 64, input_length=10))

# the model will take as input an integer matrix of size (batch, input_length).
# where the largest integer (i.e. word index) in the input should be no larger than 999 (vocabulary size).

# o/p will be model.output_shape == (None, 10 :input_dim, 64:output_dim), where None is the batch dimension of the matrix given.


model_lstm.add(Embedding(20000, 100, input_length=50))
model_lstm.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model_lstm.add(Dense(1, activation='sigmoid'))

model_lstm.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

print("Train data len:", len(X_train), ", Test data len:", len(X_test))

EPOCHS = 3
BATCH_SIZE = 32

print('\n\nTraining Model...')

start = timeit.default_timer()

#batch_size: Integer or None. Number of samples per gradient update. 
#If unspecified, batch_size will default to 32.
model_lstm.fit(X_train_seq, Y_train,
              batch_size = BATCH_SIZE,
              epochs = EPOCHS,
              validation_data=(X_test_seq, Y_test))

#model_lstm.fit(data, np.array(labels), validation_split=0.2, epochs=3)

print("\n\nTotal training time: %.4f seconds." % (timeit.default_timer() - start))

start = timeit.default_timer()
score, acc = model_lstm.evaluate(X_test_seq, Y_test, batch_size = BATCH_SIZE)

print("\nTesting time: %.4f seconds." % (timeit.default_timer() - start))
print('\nTest score:', score)
print('Test accuracy:', acc)


Building model...
Train data len: 19826 , Test data len: 4957


Training Model...
Train on 19826 samples, validate on 4957 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


Total training time: 254.2815 seconds.

Testing time: 4.7194 seconds.

Test score: 0.41748238686642775
Test accuracy: 0.8448658461938203


In [236]:
print(type(X_train), type(X_train_seq), type(labels))

print(len(X_train_seq), len(X_test_seq))
print(labels.shape)

data_x = pd.concat([pd.DataFrame(X_train_seq), pd.DataFrame(X_test_seq)] , axis = 0)
print(data_x.shape)

data_y = pd.concat([Y_train, Y_test] , axis = 0)
print(data_y.shape)

#to handle InvalidIndexError: Reindexing only valid with uniquely valued Index objects
data_x.reset_index(inplace=True, drop=True)
data_y.reset_index(inplace=True, drop=True)

data_save = pd.concat([data_x, data_y], axis = 1)
print(data_save.shape)

data_save.to_csv('vector_labels.csv', index=False)

data_save.head(10)


<class 'pandas.core.series.Series'> <class 'numpy.ndarray'> <class 'pandas.core.series.Series'>
19826 4957
(24783,)
(24783, 50)
(24783,)
(24783, 51)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,41,42,43,44,45,46,47,48,49,class
0,0,0,0,0,0,0,0,0,0,0,...,0,0,2616,1,14,9810,11,677,554,1
1,0,0,0,0,0,0,0,0,0,0,...,4846,744,6344,1282,1721,569,31,66,9,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,56,223,22,8,106,61,1
3,0,0,0,0,0,0,0,0,0,0,...,4,5,9812,9813,3942,6,4,5,9814,0
4,0,0,0,0,0,0,0,0,0,0,...,389,678,29,142,19,248,1416,138,18,0
5,0,0,0,0,0,0,0,0,0,0,...,44,1983,875,373,44,9818,2170,6345,611,1
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,9819,571,2,90,142,2,1
7,0,0,0,0,0,0,0,0,0,0,...,52,37,72,28,1224,9,28,37,1340,1
8,0,0,0,0,0,0,0,0,0,0,...,9821,9822,12,9823,211,6347,1501,2,13,1
9,0,0,0,0,0,0,0,0,0,0,...,9824,173,14,1341,60,79,74,9825,6348,1
