### Library import

In [1]:
# Library Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping, ReduceLROnPlateau

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\codej\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Util Function for preprocessing

In [2]:
# Utils Function

def convert_to_number(series):
    for i  in range(len(series)):
        try:
            series[i] = int(series[i])
        except ValueError:
            series[i] = None
    return series


def remove_html_tags_special_character(col: pd.Series) -> pd.Series:
    tags_list = ['<p>' ,'</p>' , '<p*>',
                 '<ul>','</ul>',
                 '<li>','</li>',
                 '<br>',
                 '<strong>','</strong>',
                 '<span*>','</span>',
                 '<a href*>','</a>',
                 '<em>','</em>','<br>','<br />','<div>','</div>','\\n','~']
    for tag in tags_list:
        col.replace(to_replace=tag,value='',regex=False,inplace=True)
    return col

punctuations_list = string.punctuation
def remove_punctuations(text):
    temp = str.maketrans('', '', punctuations_list)
    text = str(text)
    return text.translate(temp)


def remove_stopwords(text):
    stop_words = stopwords.words('english')
 
    imp_words = []
 
    # Storing the important words
    for word in str(text).split():
        word = word.lower()
 
        if (word not in stop_words) and 'br' not in word:
            imp_words.append(word)
 
    output = " ".join(imp_words)
 
    return output

def balance_data(df,y_column_name):
    ham_msg = df[df[y_column_name] == 0]
    spam_msg = df[df[y_column_name] == 1]
    print(ham_msg.shape)
    print(spam_msg.shape)

    if len(ham_msg) >= len(spam_msg):
        ham_msg.sample(n=len(spam_msg),random_state=42)
    else:
        spam_msg.sample(n=len(ham_msg),random_state=42)
    return pd.concat([ham_msg, spam_msg],ignore_index=True)

### Preprocessing Main Function

In [3]:
def preprocess(df,X_column_name="C3",y_column_name="C5",split_ratio=[0.1,0.2]):
    """
    Preprocess the data for model training
    Arguments:
    df: Dataframe of raw data
    X_column_name: represent the input x column name in DataFrame
    Y_Column_nameL represent the target y column name in DataFrame
    split_ratio: use to define spliting ratio for training and testing data default is 0.2 (20% of data is use for testing and 80% for training)
    """
    # Target value preprocessing
    df = df[[X_column_name,y_column_name]] # C3 for input column ,C5 target column
    df[y_column_name] = pd.Series(convert_to_number(df[y_column_name].to_list()),name=y_column_name)
    df = df[ df[y_column_name] <= 1]

    #input Value preprocessing
    #Step 1 remove html Tags an extra special character
    df[X_column_name] = remove_html_tags_special_character(df[X_column_name])
    df[X_column_name].replace(to_replace='\n',value='',inplace=True,regex=True)
    df[X_column_name].replace(to_replace='\\?',value='',inplace=True,regex=True)
    df[X_column_name].dropna(inplace=True)

    # Balance Data
    df = balance_data(df,y_column_name)
    # Step 3 NLP Text Preprocessing
    df[X_column_name] = df[X_column_name].apply(lambda x: remove_punctuations(x))
    df[X_column_name] = df[X_column_name].apply(lambda text: remove_stopwords(text))
    
    if len(split_ratio) == 2:
        train_x, test_x, train_y, test_y = train_test_split(df[X_column_name],df[y_column_name],test_size=split_ratio[1])
        train_x, val_x, train_y, val_y = train_test_split(train_x,train_y,test_size=split_ratio[0])
        return ((train_x,train_y), (val_x, val_y), (test_x, test_y))
    else:
        train_x, test_x, train_y, test_y = train_test_split(df[X_column_name],df[y_column_name],test_size=split_ratio[0])
        return ((train_x, train_y), (test_x, test_y))

### Enter Raw dataset File name for preprocessing

In [4]:
#Preprocessing raw Data
DATASET_PATH = r"A:\CJ_Personal\Upwork\Text Moderation\Dataset\messages.csv"
INPUT_COLUMN_NAME = "C3"
TARGET_COLUMN_NAME = "C5"
Max_COLUMN = 100
df = pd.read_csv(DATASET_PATH,encoding="UTF-8",names=[f"C{i}" for i in range(Max_COLUMN)])
VALIDATION_RATIO = 0.1
TESTING_RATIO = 0.2
raw_df = pd.read_csv(DATASET_PATH, encoding="UTF-8",names=[f"C{i}" for i in range(Max_COLUMN)])
train_set, val_set, test_set = preprocess(raw_df,
                                          INPUT_COLUMN_NAME,
                                          TARGET_COLUMN_NAME,
                                          [VALIDATION_RATIO,TESTING_RATIO])

  raw_df = pd.read_csv(raw_file, encoding="UTF-8",names=[f"C{i}" for i in range(Max_columns)])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[y_column_name] = pd.Series(convert_to_number(df[y_column_name].to_list()),name=y_column_name)


(578489, 2)
(91143, 2)


## Convert Text to Number
### Plase Set the max length for padding

In [5]:
# Convert Text Data into Number
max_len = 150
train_tokenizer = Tokenizer()
train_tokenizer.fit_on_texts(train_set[0])

train_sequence = train_tokenizer.texts_to_sequences(train_set[0])
val_sequence = train_tokenizer.texts_to_sequences(val_set[0])
test_sequence = train_tokenizer.texts_to_sequences(test_set[0])

train_sequence = pad_sequences(train_sequence,
                                maxlen=max_len,
                                padding='post',
                                truncating='post')


val_sequence = pad_sequences(val_sequence,
                                maxlen=max_len,
                                padding='post',
                                truncating='post')


test_sequence = pad_sequences(test_sequence,
                                maxlen=max_len,
                                padding='post',
                                truncating='post')

# Data is ready for Model Training

## Model Selection ( run the respective cells for model traning)

### Machine learning Model SVM ( Support vector machine)

In [54]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [55]:
model = SVC()
model.fit(train_sequence,train_set[1])

prediction = model.predict(test_sequence)

print(accuracy_score(test_set[1],prediction))

### TensorFlow Custom Neural Network

In [None]:
# Build the model
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(input_dim=len(tokenizer.word_index) + 1,
                                    output_dim=64,
                                    input_length=max_len))
model.add(tf.keras.layers.LSTM(32,return_sequences=True))
model.add(tf.keras.layers.LSTM(16))
model.add(tf.keras.layers.Dense(32, activation='relu'))
model.add(tf.keras.layers.Dense(16, activation='relu'))
model.add(tf.keras.layers.Dense(8, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
 
# Print the model summary
model.summary()


In [None]:
model.compile(loss = tf.keras.losses.BinaryCrossentropy(from_logits = True),
              metrics = ['accuracy'],
              optimizer = 'SGD')

es = EarlyStopping(patience=3,
                   monitor = 'val_accuracy',
                   restore_best_weights = True)
 
lr = ReduceLROnPlateau(patience = 3,
                       monitor = 'val_loss',
                       factor = 0.8,
                       verbose = 0)


In [None]:
history = model.fit(train_sequence, train_set[1],
                    validation_data=(val_sequence, val_set[1]),
                    epochs=20,
                    batch_size=32,
                    callbacks = [lr, es]
                   )

### Bert Model

In [6]:
import tensorflow_text as text
import tensorflow_hub as hub

#### Bert Model Download

In [8]:
# Downloading Pre-build Bert Model Layer

bert_preprocessor = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3')
bert_encoder = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4')


#### Bert Model Wrapper ( Input and output layer Added)

In [9]:
# bert model wth input and output layer wrapper

text_input = tf.keras.layers.Input(shape = (), dtype = tf.string, name = 'Inputs')
preprocessed_text = bert_preprocessor(text_input)
embeed = bert_encoder(preprocessed_text)
dropout = tf.keras.layers.Dropout(0.1, name = 'Dropout')(embeed['pooled_output'])
outputs = tf.keras.layers.Dense(1, activation = 'sigmoid', name = 'Dense')(dropout)
# creating final model
model = tf.keras.Model(inputs = [text_input], outputs = [outputs])


#### GPU Configuration for model training (Run Only System have supported GPU Card)

In [11]:
gpus = tf.config.list_physical_devices('GPU')
tf.config.set_visible_devices(gpus[0], 'GPU')
logical_gpus = tf.config.list_logical_devices('GPU')

#### Model Summry to show trainable and non-trainable Parameters

In [12]:

model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 Inputs (InputLayer)            [(None,)]            0           []                               
                                                                                                  
 keras_layer_1 (KerasLayer)     {'input_mask': (Non  0           ['Inputs[0][0]']                 
                                e, 128),                                                          
                                 'input_word_ids':                                                
                                (None, 128),                                                      
                                 'input_type_ids':                                                
                                (None, 128)}                                                  

#### Model Compiling with Call backs and metrics

In [13]:
# model Compiling

Metrics = [tf.keras.metrics.BinaryAccuracy(name = 'accuracy'),
           tf.keras.metrics.Precision(name = 'precision'),
           tf.keras.metrics.Recall(name = 'recall')
           ]
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath="/Model CheckPoints",
                                                 save_weights_only=True,
                                                 verbose=1)
# compiling our model
model.compile(optimizer ='adam',
               loss = 'binary_crossentropy',
               metrics = Metrics)

#### Model Training ()

In [15]:
history = model.fit(train_set[0], train_set[1],batch_size=128,validation_data=val_set,shuffle=True, epochs = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


#### Full Model Save ( Large File)

In [44]:
opt = tf.saved_model.SaveOptions(
    namespace_whitelist=None,
    save_debug_info=False,
    function_aliases=None,
    experimental_io_device="CPU:0",
    experimental_variable_policy=None,
    experimental_custom_gradients=True
)
model.save("models/spam_model",options=opt)



INFO:tensorflow:Assets written to: models/spam_model\assets


INFO:tensorflow:Assets written to: models/spam_model\assets


#### Save Model Weights Only ( lite weight)

In [17]:
model.save_weights("spam_model_weight")

#### Model Evalution

In [18]:
model.evaluate(test_set[0],test_set[1])



[0.39939022064208984, 0.8637391924858093, 0.0, 0.0]

#### Model Classificatio Report

In [40]:
from sklearn.metrics import precision_recall_fscore_support

pred = model.predict(test_set[0])



In [43]:
pred_int = tf.cast(pred, tf.int32)
precision_recall_fscore_support(test_set[1],pred_int)

  _warn_prf(average, modifier, msg_start, len(result))


(array([0.8637392, 0.       ]),
 array([1., 0.]),
 array([0.92688848, 0.        ]),
 array([115678,  18249], dtype=int64))