In [93]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import model_selection, preprocessing, naive_bayes, metrics
from keras.preprocessing import text, sequence
from keras.models import Sequential
from keras import layers

df_spam = pd.read_csv("data/SPAM text message 20170820 - Data.csv")
df_yelp = pd.read_csv("data/yelp.csv")
df_corona = pd.read_csv("data/Corona_NLP_train.csv")

In [94]:

#Splitting Test/Train spam
train_x, test_x, train_y, test_y = model_selection.train_test_split(df_spam['Message'], df_spam['Category'])
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
test_y = encoder.fit_transform(test_y)

#Splitting Test/Train yelp
train_x2, test_x2, train_y2, test_y2 = model_selection.train_test_split(df_yelp['Message'], df_yelp['Category'])
encoder2 = preprocessing.LabelEncoder()
train_y2 = encoder2.fit_transform(train_y2)
test_y2 = encoder2.fit_transform(test_y2)

#Splitting Test/Train corona
train_x3, test_x3, train_y3, test_y3 = model_selection.train_test_split(df_corona['OriginalTweet'], df_corona['Sentiment'])
encoder3 = preprocessing.LabelEncoder()
train_y3 = encoder3.fit_transform(train_y3)
test_y3 = encoder3.fit_transform(test_y3)

#Preprocessing tfidf spam
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(df_spam['Message'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xtest_tfidf =  tfidf_vect.transform(test_x)

#Preprocessing tfidf yelp
tfidf_vect2 = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect2.fit(df_yelp['Message'])
xtrain_tfidf2 =  tfidf_vect.transform(train_x2)
xtest_tfidf2 =  tfidf_vect.transform(test_x2)

#Preprocessing tfidf corona
tfidf_vect3 = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect3.fit(df_corona['OriginalTweet'])
xtrain_tfidf3 =  tfidf_vect3.transform(train_x3)
xtest_tfidf3 =  tfidf_vect3.transform(test_x3)

'''# load the pre-trained word-embedding vectors 
embeddings_index = {}
for i, line in enumerate(open("data/wiki-news-300d-1M.vec", encoding=“utf8”)):
    values = line.split()
    embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

# create a tokenizer 
token = text.Tokenizer()
token.fit_on_texts(df_spam['Message'])
word_index = token.word_index

# convert text to sequence of tokens and pad them to ensure equal length vectors 
train_seq_x = sequence.pad_sequences(token.texts_to_sequences(train_x), maxlen=70)
test_seq_x = sequence.pad_sequences(token.texts_to_sequences(test_x), maxlen=70)

# create token-embedding mapping
embedding_matrix = np.zeros((len(word_index) + 1, 300))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector'''

'# load the pre-trained word-embedding vectors \nembeddings_index = {}\nfor i, line in enumerate(open("data/wiki-news-300d-1M.vec", encoding=“utf8”)):\n    values = line.split()\n    embeddings_index[values[0]] = np.asarray(values[1:], dtype=\'float32\')\n\n# create a tokenizer \ntoken = text.Tokenizer()\ntoken.fit_on_texts(df_spam[\'Message\'])\nword_index = token.word_index\n\n# convert text to sequence of tokens and pad them to ensure equal length vectors \ntrain_seq_x = sequence.pad_sequences(token.texts_to_sequences(train_x), maxlen=70)\ntest_seq_x = sequence.pad_sequences(token.texts_to_sequences(test_x), maxlen=70)\n\n# create token-embedding mapping\nembedding_matrix = np.zeros((len(word_index) + 1, 300))\nfor word, i in word_index.items():\n    embedding_vector = embeddings_index.get(word)\n    if embedding_vector is not None:\n        embedding_matrix[i] = embedding_vector'

In [95]:
#spam Naive Bayes
cls = naive_bayes.MultinomialNB().fit(xtrain_tfidf, train_y)
pred = cls.predict(xtest_tfidf)
print(metrics.accuracy_score(pred, test_y))

#yelp Naive Bayes
cls = naive_bayes.MultinomialNB().fit(xtrain_tfidf2, train_y2)
pred = cls.predict(xtest_tfidf2)
print(metrics.accuracy_score(pred, test_y2))

#covid Naive Bayes
cls = naive_bayes.MultinomialNB().fit(xtrain_tfidf3, train_y3)
pred = cls.predict(xtest_tfidf3)
print(metrics.accuracy_score(pred, test_y3))

0.9705671213208902
0.48542258709609515
0.46472303206997084


In [96]:
model = Sequential()
model.add(layers.Embedding(100, 100, input_length=100))
model.add(layers.Conv1D(128, 5, activation='relu'))
model.add(layers.GlobalMaxPooling1D())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 100)          10000     
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 96, 128)           64128     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 10)                1290      
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 11        
Total params: 75,429
Trainable params: 75,429
Non-trainable params: 0
_________________________________________________________________


In [97]:
history = model.fit(train_x, train_y, epochs=50, verbose=False, validation_data=(test_x, test_y), batch_size=10)
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))
plot_history(history)



ValueError: in user code:

    C:\Users\Brian\anaconda3\lib\site-packages\tensorflow\python\keras\engine\training.py:806 train_function  *
        return step_function(self, iterator)
    C:\Users\Brian\anaconda3\lib\site-packages\tensorflow\python\keras\engine\training.py:796 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    C:\Users\Brian\anaconda3\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:1211 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    C:\Users\Brian\anaconda3\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:2585 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    C:\Users\Brian\anaconda3\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:2945 _call_for_each_replica
        return fn(*args, **kwargs)
    C:\Users\Brian\anaconda3\lib\site-packages\tensorflow\python\keras\engine\training.py:789 run_step  **
        outputs = model.train_step(data)
    C:\Users\Brian\anaconda3\lib\site-packages\tensorflow\python\keras\engine\training.py:747 train_step
        y_pred = self(x, training=True)
    C:\Users\Brian\anaconda3\lib\site-packages\tensorflow\python\keras\engine\base_layer.py:985 __call__
        outputs = call_fn(inputs, *args, **kwargs)
    C:\Users\Brian\anaconda3\lib\site-packages\tensorflow\python\keras\engine\sequential.py:372 call
        return super(Sequential, self).call(inputs, training=training, mask=mask)
    C:\Users\Brian\anaconda3\lib\site-packages\tensorflow\python\keras\engine\functional.py:385 call
        return self._run_internal_graph(
    C:\Users\Brian\anaconda3\lib\site-packages\tensorflow\python\keras\engine\functional.py:508 _run_internal_graph
        outputs = node.layer(*args, **kwargs)
    C:\Users\Brian\anaconda3\lib\site-packages\tensorflow\python\keras\engine\base_layer.py:985 __call__
        outputs = call_fn(inputs, *args, **kwargs)
    C:\Users\Brian\anaconda3\lib\site-packages\tensorflow\python\keras\layers\convolutional.py:247 call
        outputs = self._convolution_op(inputs, self.kernel)
    C:\Users\Brian\anaconda3\lib\site-packages\tensorflow\python\util\dispatch.py:201 wrapper
        return target(*args, **kwargs)
    C:\Users\Brian\anaconda3\lib\site-packages\tensorflow\python\ops\nn_ops.py:1011 convolution_v2
        return convolution_internal(
    C:\Users\Brian\anaconda3\lib\site-packages\tensorflow\python\ops\nn_ops.py:1141 convolution_internal
        return op(
    C:\Users\Brian\anaconda3\lib\site-packages\tensorflow\python\util\dispatch.py:201 wrapper
        return target(*args, **kwargs)
    C:\Users\Brian\anaconda3\lib\site-packages\tensorflow\python\util\deprecation.py:574 new_func
        return func(*args, **kwargs)
    C:\Users\Brian\anaconda3\lib\site-packages\tensorflow\python\util\deprecation.py:574 new_func
        return func(*args, **kwargs)
    C:\Users\Brian\anaconda3\lib\site-packages\tensorflow\python\ops\nn_ops.py:1881 conv1d
        result = gen_nn_ops.conv2d(
    C:\Users\Brian\anaconda3\lib\site-packages\tensorflow\python\ops\gen_nn_ops.py:974 conv2d
        _, _, _op, _outputs = _op_def_library._apply_op_helper(
    C:\Users\Brian\anaconda3\lib\site-packages\tensorflow\python\framework\op_def_library.py:742 _apply_op_helper
        op = g._create_op_internal(op_type_name, inputs, dtypes=None,
    C:\Users\Brian\anaconda3\lib\site-packages\tensorflow\python\framework\func_graph.py:591 _create_op_internal
        return super(FuncGraph, self)._create_op_internal(  # pylint: disable=protected-access
    C:\Users\Brian\anaconda3\lib\site-packages\tensorflow\python\framework\ops.py:3477 _create_op_internal
        ret = Operation(
    C:\Users\Brian\anaconda3\lib\site-packages\tensorflow\python\framework\ops.py:1974 __init__
        self._c_op = _create_c_op(self._graph, node_def, inputs,
    C:\Users\Brian\anaconda3\lib\site-packages\tensorflow\python\framework\ops.py:1815 _create_c_op
        raise ValueError(str(e))

    ValueError: Negative dimension size caused by subtracting 5 from 1 for '{{node sequential_2/conv1d_1/conv1d}} = Conv2D[T=DT_FLOAT, data_format="NHWC", dilations=[1, 1, 1, 1], explicit_paddings=[], padding="VALID", strides=[1, 1, 1, 1], use_cudnn_on_gpu=true](sequential_2/conv1d_1/conv1d/ExpandDims, sequential_2/conv1d_1/conv1d/ExpandDims_1)' with input shapes: [?,1,1,100], [1,5,100,128].
