In [276]:
import urllib
import pandas as pd
from tensorflow import keras

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import numpy as np
from numpy import argmax
from sklearn.model_selection import train_test_split
import pickle

import random

In [277]:
#set paths:

#downloads the dataset for the machine learning model
import pathlib
BASE_DIR = pathlib.Path().resolve()
DATASET_DIR = BASE_DIR / 'datasets'
EXPORT_DIR = DATASET_DIR /'exports'
EXPORT_DIR.mkdir(parents=True, exist_ok=True)
SPAM_DATASET_DIR = EXPORT_DIR /'spam_dataset.csv'

SPAM_METADATA_PATH = EXPORT_DIR /'spam_metadata.pkl'
TOKENIZER_EXPORT_PATH = EXPORT_DIR /'spam-tokenizer.json'



In [278]:
df = pd.read_csv(SPAM_DATASET_DIR)
df.head()

Unnamed: 0,text,label,Source
0,"Huh, anyway check out this you[tube] channel: ...",spam,YT_spam
1,Hey guys check out my new channel and our firs...,spam,YT_spam
2,just for test I have to say murdev.com,spam,YT_spam
3,me shaking my sexy ass on my channel enjoy ^_^ ﻿,spam,YT_spam
4,watch?v=vtaRGgvGtWQ Check this out .﻿,spam,YT_spam


In [279]:
labels = df.label.to_list()
texts = df.text.to_list()

In [280]:
labels[234], texts[234]

('ham',
 'What Can i say....This Song He Just Change The World Completely... So good job PSY... (and your girls are awesome :))) )\ufeff')

In [281]:
label_legend = {'ham': '0', 'spam':'1'}
label_legend_inverted = {'0': 'ham', '1':'spam'}
labels_as_int = [label_legend[x] for x in labels]

random_idx = random.randint(0,len(labels))
texts[random_idx]
df.iloc[random_idx].label

'spam'

In [282]:
#setting maximum number of words to 280, since this is the twitter limit.
MAX_NUM_WORDS = 280
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)


#fit the tokenizer:
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
sequences


[[23, 17, 11, 3, 102],
 [94,
  131,
  23,
  17,
  7,
  74,
  102,
  6,
  88,
  190,
  11,
  9,
  161,
  5,
  42,
  5,
  10,
  5,
  48,
  249,
  4,
  38,
  236,
  6,
  48,
  86],
 [30, 14, 1, 20, 2, 182, 50],
 [12, 7, 16, 7, 102, 19],
 [191, 206, 23, 11, 17, 19],
 [94, 23, 17, 7, 74, 11, 9, 89, 50],
 [86, 2, 7, 102, 19],
 [1, 13, 16, 76, 245, 9, 1, 16, 1, 30, 2, 23, 5, 184, 19],
 [3, 146, 23, 7, 102, 14, 251, 19],
 [6, 8, 146, 128, 23, 7, 102, 6, 132, 12, 61, 1, 146, 36, 246, 19],
 [94, 86, 2],
 [3,
  20,
  36,
  32,
  106,
  27,
  3,
  36,
  32,
  86,
  2,
  12,
  81,
  77,
  3,
  6,
  35,
  21,
  27,
  3,
  79,
  2,
  86,
  170,
  31,
  19],
 [179, 50],
 [86, 38],
 [48, 38, 128, 179, 50],
 [241,
  36,
  3,
  38,
  251,
  251,
  6,
  117,
  117,
  113,
  15,
  22,
  27,
  3,
  36,
  48,
  23,
  17,
  88,
  102,
  6,
  86,
  2,
  13,
  30,
  28,
  245,
  43,
  153,
  43,
  35,
  33,
  2,
  49,
  15,
  88,
  3,
  29,
  180,
  23,
  17,
  61,
  68,
  21,
  19],
 [42, 66, 5],
 [121, 82, 50

In [283]:
word_index = tokenizer.word_index
word_index


{'i': 1,
 'to': 2,
 'you': 3,
 'a': 4,
 'the': 5,
 'and': 6,
 'my': 7,
 'u': 8,
 'is': 9,
 'in': 10,
 'this': 11,
 'me': 12,
 'it': 13,
 'for': 14,
 'of': 15,
 'on': 16,
 'out': 17,
 'your': 18,
 '\ufeff': 19,
 'have': 20,
 'so': 21,
 'that': 22,
 'check': 23,
 'are': 24,
 '2': 25,
 'call': 26,
 'if': 27,
 'but': 28,
 'can': 29,
 'just': 30,
 'now': 31,
 'not': 32,
 'be': 33,
 'at': 34,
 'will': 35,
 'do': 36,
 'or': 37,
 'like': 38,
 'get': 39,
 'with': 40,
 'up': 41,
 "i'm": 42,
 'we': 43,
 'no': 44,
 'love': 45,
 'ur': 46,
 'from': 47,
 'please': 48,
 'all': 49,
 'com': 50,
 'lt': 51,
 'gt': 52,
 'how': 53,
 'when': 54,
 'go': 55,
 '4': 56,
 'video': 57,
 'know': 58,
 'free': 59,
 'am': 60,
 'what': 61,
 'good': 62,
 'was': 63,
 'ok': 64,
 'time': 65,
 'only': 66,
 'then': 67,
 'got': 68,
 'its': 69,
 'song': 70,
 'come': 71,
 '39': 72,
 'youtube': 73,
 'new': 74,
 'br': 75,
 'as': 76,
 'day': 77,
 'there': 78,
 'want': 79,
 'he': 80,
 'one': 81,
 'www': 82,
 'by': 83,
 'amp': 84,
 

In [284]:
#pad the word sequences so they are the same matrix length:
MAX_SEQUENCE_LENGTH = 300

#generate the training data:
X = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
X


array([[  0,   0,   0, ...,  11,   3, 102],
       [  0,   0,   0, ...,   6,  48,  86],
       [  0,   0,   0, ...,   2, 182,  50],
       ...,
       [  0,   0,   0, ...,  21, 114, 276],
       [  0,   0,   0, ..., 161,  14,  59],
       [  0,   0,   0, ...,   2,  69, 268]])

In [285]:
#one hot encoding

labels_as_int_array = np.array(labels_as_int)
y = to_categorical(labels_as_int_array)
y

array([[0., 1.],
       [0., 1.],
       [0., 1.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]], dtype=float32)

In [286]:
#split training and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2022)


In [287]:
training_data = {
    "X_train": X_train,
    "X_test": X_test,
    "y_train": y_train,
    "y_test": y_test,
    "max_words": MAX_NUM_WORDS,
    "label_legend": label_legend,
    "label_legend_inverted":label_legend_inverted
}
tokenizer_json = tokenizer.to_json()
TOKENIZER_EXPORT_PATH.write_text(tokenizer_json)

1090335

In [288]:
with open(SPAM_METADATA_PATH, 'wb') as file:
    pickle.dump(training_data, file)