In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np

# Constants
DF_LOC = '../dataset/twitter-spam/train.csv'

# 1. Base Model

In [2]:
raw_df = pd.read_csv(DF_LOC).drop('following', axis=1).drop('followers', axis=1).drop('actions', axis=1).drop('is_retweet', axis=1).drop('location', axis=1).drop('Unnamed: 7', axis=1)
raw_df.iloc[7552]

Tweet    #Virgos are usually very helpful around the ho...
Type                                          South Dakota
Name: 7552, dtype: object

In [3]:
# Preprocess type
raw_df['Type'] = raw_df['Type'].map({'Quality': 0, 'Spam': 1})
raw_df.head()

Unnamed: 0,Tweet,Type
0,Good Morning Love @LeeBrown_V,0.0
1,'@realDonaldTrump @USNavy RIP TO HEROES',1.0
2,Haven't been following the news but I understa...,0.0
3,pic.twitter.com/dy9q4ftLhZ What to do with pap...,0.0
4,#DidYouKnow ► Mahatma Gandhi made a brief visi...,0.0


In [4]:
raw_df = raw_df.drop(raw_df['Type'].loc[raw_df['Type'].isnull()].index.values)
# raw_df.isnull()

In [5]:
raw_df['Type'].loc[raw_df['Type'].isnull()]

Series([], Name: Type, dtype: float64)

In [6]:
sentences = raw_df['Tweet']

tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(sentences)

In [7]:
sentences

0                           Good Morning Love  @LeeBrown_V
1                 '@realDonaldTrump @USNavy RIP TO HEROES'
2        Haven't been following the news but I understa...
3        pic.twitter.com/dy9q4ftLhZ What to do with pap...
4        #DidYouKnow ► Mahatma Gandhi made a brief visi...
                               ...                        
14894    #AllWentWrongWhen I told my hair stylist to "g...
14895    They don't have to like you, and you don't hav...
14896    #Miami Graham Nash Live at Parker Playhouse  #...
14897    @bethannhamilton is in the business of one-upp...
14898      Chasing Success  by  Space Cadetz  Listen up...
Name: Tweet, Length: 14897, dtype: object

In [7]:
# Saving via pickle
import pickle
with open('../dataset/basic-tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

[31mERROR: Could not find a version that satisfies the requirement pickle[0m
[31mERROR: No matching distribution found for pickle[0m


In [53]:
sequences = tokenizer.texts_to_sequences(sentences)
padded = tf.keras.preprocessing.sequence.pad_sequences(sequences, padding='post')

print(padded[0])
print(padded.shape)

[   86   233    73 11601   158     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0]
(14897, 71)


In [95]:
# len(tokenizer.word_index)
padded[0]

array([   86,   233,    73, 11601,   158,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0],
      dtype=int32)

In [77]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(len(tokenizer.word_index), 16, input_length=71),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(128, activation=tf.nn.relu),
    tf.keras.layers.Dense(1, activation=tf.nn.sigmoid)
])

model.compile(loss=tf.keras.losses.BinaryCrossentropy(), optimizer=tf.keras.optimizers.Adam(), metrics=tf.keras.metrics.BinaryAccuracy())
model.summary()

Model: "sequential_12"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_12 (Embedding)     (None, 71, 16)            649152    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 64)                12544     
_________________________________________________________________
dense_24 (Dense)             (None, 128)               8320      
_________________________________________________________________
dense_25 (Dense)             (None, 1)                 129       
Total params: 670,145
Trainable params: 670,145
Non-trainable params: 0
_________________________________________________________________


In [78]:
model.fit(padded, raw_df['Type'].values, epochs=150)

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

<keras.callbacks.History at 0x7f8942661070>

In [79]:
model.save('../model/twitter-spam')

2021-10-27 19:32:32.794185: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: ../model/twitter-spam/assets


INFO:tensorflow:Assets written to: ../model/twitter-spam/assets


In [81]:
# Testing
squid_df = pd.read_json('../dataset/squid-tweets/squid_tweets.json')
squid_df = squid_df.rename(columns={0: "tweets"})
squid_df.head()

Unnamed: 0,tweets
0,RT @sweetshrubs_: More skts x squid game doodl...
1,@milesSI Squid Game any good?
2,I've just watched episode S01 | E02 of Squid G...
3,RT @binseolovely: SQUID GAME (#STRAYKIDS VER.)...
4,RT @a_leesha1: We kinda already have a real li...


In [127]:
input_pred = tokenizer.texts_to_sequences(squid_df['tweets'])
# print(input_pred, end='\n===\n')
input_pad = tf.keras.preprocessing.sequence.pad_sequences(input_pred, padding='post', maxlen=71)
# print(input_pad, end='\n===\n')
is_spam = model.predict(input_pad)

squid_df['is_spam'] = [True if i >= 0.5 else False for i in is_spam]

# for i in squid_df['tweets']:
#     input_pred = tokenizer.texts_to_sequences(i)
#     print(input_pred, end='\n===\n')
#     input_pad = tf.keras.preprocessing.sequence.pad_sequences([input_pred], padding='post', maxlen=71)
#     print(input_pad, end='\n===\n')
#     model.predict(input_pad)

In [128]:
squid_df.head()

Unnamed: 0,tweets,is_spam
0,RT @sweetshrubs_: More skts x squid game doodl...,True
1,@milesSI Squid Game any good?,False
2,I've just watched episode S01 | E02 of Squid G...,True
3,RT @binseolovely: SQUID GAME (#STRAYKIDS VER.)...,True
4,RT @a_leesha1: We kinda already have a real li...,False


In [130]:
!pip install openpyxl
squid_df.to_excel('../dataset/squid-tweets/squid-dataset.xlsx')

Collecting openpyxl
  Downloading openpyxl-3.0.9-py2.py3-none-any.whl (242 kB)
[K     |████████████████████████████████| 242 kB 527 kB/s 
[?25hCollecting et-xmlfile
  Downloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.0.9


In [None]:
squid_df['spam'] = model.predict()