In [6]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import model_selection, preprocessing, naive_bayes, metrics
from keras.preprocessing import text, sequence
from keras.models import Sequential
from keras import layers, models, optimizers
import scipy

df_spam = pd.read_csv("data/SPAM text message 20170820 - Data.csv")
df_yelp = pd.read_csv("data/yelp.csv")
df_corona = pd.read_csv("data/Corona_NLP_train.csv")
df_imdb = pd.read_csv("data/IMDB Dataset.csv")

In [7]:

#Splitting Test/Train spam
train_x, test_x, train_y, test_y = model_selection.train_test_split(df_spam['Message'], df_spam['Category'])
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
test_y = encoder.fit_transform(test_y)

#Splitting Test/Train yelp
train_x2, test_x2, train_y2, test_y2 = model_selection.train_test_split(df_yelp['Message'], df_yelp['Category'])
encoder2 = preprocessing.LabelEncoder()
train_y2 = encoder2.fit_transform(train_y2)
test_y2 = encoder2.fit_transform(test_y2)

#Splitting Test/Train corona
train_x3, test_x3, train_y3, test_y3 = model_selection.train_test_split(df_corona['OriginalTweet'], df_corona['Sentiment'])
encoder3 = preprocessing.LabelEncoder()
train_y3 = encoder3.fit_transform(train_y3)
test_y3 = encoder3.fit_transform(test_y3)

#Splitting Test/Train IMDB
train_x4, test_x4, train_y4, test_y4 = model_selection.train_test_split(df_imdb['review'], df_imdb['sentiment'])
encoder4 = preprocessing.LabelEncoder()
train_y4 = encoder4.fit_transform(train_y4)
test_y4 = encoder4.fit_transform(test_y4)

#Preprocessing tfidf spam
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(df_spam['Message'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xtest_tfidf =  tfidf_vect.transform(test_x)

#Preprocessing tfidf yelp
tfidf_vect2 = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect2.fit(df_yelp['Message'])
xtrain_tfidf2 =  tfidf_vect.transform(train_x2)
xtest_tfidf2 =  tfidf_vect.transform(test_x2)

#Preprocessing tfidf corona
tfidf_vect3 = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect3.fit(df_corona['OriginalTweet'])
xtrain_tfidf3 =  tfidf_vect3.transform(train_x3)
xtest_tfidf3 =  tfidf_vect3.transform(test_x3)

#Preprocessing tfidf IMDB
tfidf_vect4 = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect4.fit(df_imdb['review'])
xtrain_tfidf4 =  tfidf_vect4.transform(train_x4)
xtest_tfidf4 =  tfidf_vect4.transform(test_x4)

In [8]:
#spam Naive Bayes
cls = naive_bayes.MultinomialNB().fit(xtrain_tfidf, train_y)
pred = cls.predict(xtest_tfidf)
print(metrics.accuracy_score(pred, test_y))

#yelp Naive Bayes
cls = naive_bayes.MultinomialNB().fit(xtrain_tfidf2, train_y2)
pred = cls.predict(xtest_tfidf2)
print(metrics.accuracy_score(pred, test_y2))

#covid Naive Bayes
cls = naive_bayes.MultinomialNB().fit(xtrain_tfidf3, train_y3)
pred = cls.predict(xtest_tfidf3)
print(metrics.accuracy_score(pred, test_y3))

#imdb Naive Bayes
cls = naive_bayes.MultinomialNB().fit(xtrain_tfidf4, train_y4)
pred = cls.predict(xtest_tfidf4)
print(metrics.accuracy_score(pred, test_y4))

0.9576453697056713
0.4895274181665088
0.4564625850340136
0.85368


In [13]:
embeddings_index = {}
for i, line in enumerate(open('data/wiki-news-300d-1M.vec', 'r', encoding='utf8')):
    values = line.split()
    embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

# create a tokenizer 
token = text.Tokenizer()
token.fit_on_texts(df_spam['Message'])
word_index = token.word_index

# convert text to sequence of tokens and pad them to ensure equal length vectors 
train_seq_x = sequence.pad_sequences(token.texts_to_sequences(train_x), maxlen=70)
test_seq_x = sequence.pad_sequences(token.texts_to_sequences(test_x), maxlen=70)

# create token-embedding mapping
embedding_matrix = np.zeros((len(word_index) + 1, 300))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [26]:
def create_cnn():
    # Add an Input Layer
    input_layer = layers.Input((70, ))

    # Add the word embedding Layer
    embedding_layer = layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)(input_layer)
    embedding_layer = layers.SpatialDropout1D(0.3)(embedding_layer)

    # Add the convolutional Layer
    conv_layer = layers.Convolution1D(100, 3, activation="relu")(embedding_layer)

    # Add the pooling Layer
    pooling_layer = layers.GlobalMaxPool1D()(conv_layer)

    # Add the output Layers
    output_layer1 = layers.Dense(50, activation="relu")(pooling_layer)
    output_layer1 = layers.Dropout(0.25)(output_layer1)
    output_layer2 = layers.Dense(1, activation="sigmoid")(output_layer1)

    # Compile the model
    model = models.Model(inputs=input_layer, outputs=output_layer2)
    model.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')
    
    return model

cls = create_cnn()
cls.fit(train_seq_x, train_y)
pred = cls.predict(test_seq_x)
pred = pred.argmax(axis=-1)
print(metrics.accuracy_score(pred, test_y))

0.8521177315147165
