In [90]:
# Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding

## Plot
import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode(connected=True)
import matplotlib as plt

# NLTK
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

import Stemmer

# Other
import re
import timeit
import string
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split

In [4]:
df = pd.read_csv("labeled_data.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24783 entries, 0 to 24782
Data columns (total 7 columns):
index                 24783 non-null int64
count                 24783 non-null int64
hate_speech           24783 non-null int64
offensive_language    24783 non-null int64
neither               24783 non-null int64
class                 24783 non-null int64
tweet                 24783 non-null object
dtypes: int64(6), object(1)
memory usage: 1.3+ MB


In [97]:
data = pd.DataFrame()

data['hate+offensive_count'] = df['offensive_language']
data['non-hate_count'] = df['neither']

classes = []
for index, row in data.iterrows():
    temp = 1 if row['hate+offensive_count'] > row['non-hate_count'] else 0
    classes.append(temp)
    
data['class'] = classes

#label =  {1:hate, 0:non-hate}
labels = data['class'].map(lambda x : 1 if int(x) == 1 else 0)

In [98]:

stopWords = set(stopwords.words("english"))
stemmer = Stemmer.Stemmer('english', 100000)    

def clean_text(text):
    
    ## Remove puncuation
    #text = text.translate(string.punctuation)
    
    #split based on everything except a-z0-9_'.\-
    #tokens = re.findall("[a-z0-9_'.\-]+", text.lower())
    tokens = text.lower().split()
    
    tokens = [stemmer.stemWord(w) for w in tokens if not w in stopWords and len(w) > 2 and len(w)<20]
    text = " ".join(tokens)
    
    return text


#cleaning text
data['tweet'] = df['tweet'].map(lambda x: clean_text(x))


In [127]:
data.info()
data.head(10)

#print(len(labels), len(data['tweet']))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24783 entries, 0 to 24782
Data columns (total 4 columns):
hate+offensive_count    24783 non-null int64
non-hate_count          24783 non-null int64
class                   24783 non-null int64
tweet                   24783 non-null object
dtypes: int64(3), object(1)
memory usage: 774.5+ KB


Unnamed: 0,hate+offensive_count,non-hate_count,class,tweet
0,0,3,0,mayasolov woman complain clean hous amp; man a...
1,3,0,1,mleew17 boy dat cold tyga dwn bad cuffin dat h...
2,3,0,1,urkindofbrand dawg 80sbaby4lif ever fuck bitch...
3,2,1,1,anderson viva base look like tranni
4,6,0,1,shenikarobert shit hear might true might faker...
5,2,0,1,madison shit blow claim faith somebodi still f...
6,3,0,1,brighterday sit hate anoth bitch got much shit go
7,3,0,1,8220; selfiequeenbri caus tire big bitch come ...
8,3,0,1,amp; might get bitch back amp; that
9,2,0,1,rhythmixx hobbi includ fight mariam bitch


In [42]:
data.to_csv("cleaned_tweets1.csv", index=False)

In [100]:
def process_sample(sample):
    
    print("\nGiven sample size:", len(sample))
    
    #Keras tokenizer function to tokenize the strings and 
    #‘texts_to_sequences’ to make sequences of words.

    vocabulary_size = 20000

    #Maximum number of words to work with 
    #(if set, tokenization will be restricted to the top nb_words most common words in the dataset).
    tokenizer = Tokenizer(num_words= vocabulary_size)

    #fit_on_texts(texts):
    #Arguments: list of texts to train on.
    #tokenizer.fit_on_texts(data['tweet'])
    tokenizer.fit_on_texts(sample)

    #texts_to_sequences(texts)
    #texts: list of texts to turn to sequences.
    #Return: list of sequences (one per text input).
    
    #sequences = tokenizer.texts_to_sequences(data['tweet'])
    sequences = tokenizer.texts_to_sequences(sample)
    sample = pad_sequences(sequences, maxlen=50)
    
    print("Processed sample shape:", sample.shape)
    print("Sample1:", sample[0])
    
    return sample



In [128]:
print(type(data['tweet']))
print(len(data['tweet']))

print(type(data['tweet'][0:10]))
print(len(data['tweet'][0:10]))

#sklearn.model_selection.train_test_split(*arrays, **options)[source]

#*arrays : sequence of indexables with same length / shape[0]
#Allowed inputs are lists, numpy arrays, scipy-sparse matrices or pandas dataframes.

#test_size/train_size 
#If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test split. 
#If int, represents the absolute number of test samples. 
#If None, the value is set to the complement of the train size

#shuffle : boolean, optional (default=True)
#Whether or not to shuffle the data before splitting. If shuffle=False then stratify must be None.

#splitting : list, length=2 * len(arrays)
#List containing train-test split of inputs.


X = data['tweet']
Y = labels
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2)

print("Train data len:", len(X_train), "\nTest data len:", len(X_test))

print(X_train[0])
X_train = process_sample(X_train)
X_test = process_sample(X_test)

print(X_train[0])


<class 'pandas.core.series.Series'>
24783
<class 'pandas.core.series.Series'>
10
Train data len: 19826 
Test data len: 4957
mayasolov woman complain clean hous amp; man alway take trash

Given sample size: 19826
Processed sample shape: (19826, 50)
Sample1: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 2]

Given sample size: 4957
Processed sample shape: (4957, 50)
Sample1: [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
 3503 3504 3505 1271 2194 3506 1608   17]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 2]


In [126]:
#The network starts with an embedding layer. 
#The layer lets the system expand each token to a more massive vector, 
#allowing the network to represent a word in a meaningful way. 
#The layer takes 20000 as the first argument, which is the size of our vocabulary, 
#and 100 as the second input parameter, which is the dimension of the embedding i.e. output_dim
#The third parameter is the input_length of 50, which is the length of each comment sequence.

## Network architecture
print('Building model...')
model_mlp = Sequential()

max_features = 20000 #size of vocab

#Embedding(input_dim, output_dim, embeddings_initializer='uniform', ***, input_length=None)
#o/p will be model.output_shape == (None, 10 :input_dim, 64:output_dim), where None is the batch dimension of the matrix given.
#model_mlp.add(Embedding(max_features, 100, input_length=50))

## Dense(64) is a fully-connected layer with 64 hidden units.
# in the first layer, you must specify the expected input data shape: here, 20-dimensional vectors.
model_mlp.add(Dense(64, input_dim= 50, activation='relu'))
model_mlp.add(Dropout(0.5))

model_mlp.add(Dense(64, activation='relu'))
model_mlp.add(Dropout(0.5))

model_mlp.add(Dense(1, activation='sigmoid'))

model_mlp.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

print("Train data len:", len(X_train), ", Test data len:", len(X_test))

EPOCHS = 3
BATCH_SIZE = 32

print('\n\nTraining Model...')

start = timeit.default_timer()

#batch_size: Integer or None. Number of samples per gradient update. 
#If unspecified, batch_size will default to 32.
model_mlp.fit(X_train, Y_train,
              batch_size = BATCH_SIZE,
              epochs = EPOCHS,
              validation_data=(X_test, Y_test))

#model_lstm.fit(data, np.array(labels), validation_split=0.2, epochs=3)

print("\n\nTotal training time: %.4f seconds." % (timeit.default_timer() - start))


start = timeit.default_timer()
score, acc = model_mlp.evaluate(X_test, Y_test, batch_size = BATCH_SIZE)

print("\nTesting time: %.4f seconds." % (timeit.default_timer() - start))
print('\nTest score:', score)
print('Test accuracy:', acc)


Building model...
Train data len: 19826 , Test data len: 4957


Training Model...
Train on 19826 samples, validate on 4957 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


Total training time: 5.9940 seconds.

Testing time: 0.1332 seconds.

Test score: 3.007087031607429
Test accuracy: 0.8113778494215791


In [117]:

## Network architecture
print('Building model...')
model_lstm = Sequential()

#The network starts with an embedding layer.
#Turns positive integers (indexes) into dense vectors of fixed size allowing the n/w to represent a word in a meaningful way.
#eg. [[4], [20]] -> [[0.25, 0.1], [0.6, -0.2]]
#This layer can only be used as the first layer in a model.

#keras.layers.Embedding(input_dim, output_dim, embeddings_initializer='uniform', ***, input_length=None)

#input_dim: int > 0. Size of the vocabulary, i.e. maximum integer index + 1.

#output_dim: int >= 0. Dimension of the dense embedding.

#input_length: Length of input sequences, when it is constant. 
#This argument is required if you are going to connect Flatten then Dense layers upstream 
#(without it, the shape of the dense outputs cannot be computed).

#eg. model.add(Embedding(1000, 64, input_length=10))

# the model will take as input an integer matrix of size (batch, input_length).
# where the largest integer (i.e. word index) in the input should be no larger than 999 (vocabulary size).

# o/p will be model.output_shape == (None, 10 :input_dim, 64:output_dim), where None is the batch dimension of the matrix given.


model_lstm.add(Embedding(20000, 100, input_length=50))
model_lstm.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model_lstm.add(Dense(1, activation='sigmoid'))

model_lstm.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

print("Train data len:", len(X_train), ", Test data len:", len(X_test))

EPOCHS = 3
BATCH_SIZE = 32

print('\n\nTraining Model...')

start = timeit.default_timer()

#batch_size: Integer or None. Number of samples per gradient update. 
#If unspecified, batch_size will default to 32.
model_lstm.fit(X_train, Y_train,
              batch_size = BATCH_SIZE,
              epochs = EPOCHS,
              validation_data=(X_test, Y_test))

#model_lstm.fit(data, np.array(labels), validation_split=0.2, epochs=3)

print("\n\nTotal training time: %.4f seconds." % (timeit.default_timer() - start))




Building model...
Train data len: 19826 , Test data len: 4957


Training Model...
Train on 19826 samples, validate on 4957 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


Total training time: 254.8442 seconds.


In [118]:
start = timeit.default_timer()
score, acc = model_lstm.evaluate(X_test, Y_test, batch_size = BATCH_SIZE)

print("\nTesting time: %.4f seconds." % (timeit.default_timer() - start))
print('\nTest score:', score)
print('Test accuracy:', acc)



Testing time: 3.0772 seconds.

Test score: 0.38537159742475974
Test accuracy: 0.8374016542503953


In [129]:

"""
model.predict(x, batch_size=None, verbose=0, steps=None)
Generates output predictions for the input samples.

Computation is done in batches.

Arguments

x: The input data, as a Numpy array (or list of Numpy arrays if the model has multiple inputs).

batch_size: Integer. If unspecified, it will default to 32.

verbose: Verbosity mode, 0 or 1.

steps: Total number of steps (batches of samples) before declaring the prediction round finished. 
Ignored with the default value of None.

Returns: Numpy array(s) of predictions.

Raises

ValueError: In case of mismatch between the provided input data and the model's expectations, 
or in case a stateful model receives a number of samples that is not a multiple of the batch size.

"""

sample = process_sample(data['tweet'][0:50])
model_lstm.predict_classes(sample)


Given sample size: 50
Processed sample shape: (50, 50)
Sample1: [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 42 43 44 45 46  9 47 48
 49 50]


array([[0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [1],
       [0],
       [1],
       [1],
       [1],
       [1],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1]])