In [1]:
import os

# TF_CPP_MIN_LOG_LEVEL must be in the environment *before* TensorFlow is
# imported, otherwise the native log output produced while the library loads
# is not filtered. "3" is the most restrictive level.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

import re
import string

import numpy as np
import pandas as pd
import tensorflow as tf
from pandarallel import pandarallel
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization

import preprocess_text as pt

In [None]:
import gc

# Run a garbage-collection pass before the data is loaded.
gc.collect()

In [None]:
# Load the train and validation frames through the project helper; later cells
# read their "text" and "target" columns.
train_df,val_df = pt.suicidal_intent_data_load(test_dataset=False,standardization=False)

In [None]:

# Set up pandarallel; the stemming cells below call `Series.parallel_apply`.
pandarallel.initialize()

In [None]:
def stemmer_parallel(input):
    """Stem each space-separated token of ``input`` with NLTK's English Snowball stemmer.

    The text is cut on a literal single space, so consecutive spaces produce
    empty tokens that are handed to the stemmer like any other token. The
    stemmed tokens are joined back together with single spaces.

    ``nltk`` is imported inside the function body
    (presumably so pandarallel worker processes can resolve it -- TODO confirm).
    """
    import nltk

    snowball = nltk.SnowballStemmer("english")
    stemmed_tokens = map(snowball.stem, input.split(' '))
    return ' '.join(stemmed_tokens)

In [None]:
# Stem both splits in parallel, overwriting the "text" column in place.
# NOTE(review): re-running this cell stems the already-stemmed text again.
train_df["text"] = train_df["text"].parallel_apply(stemmer_parallel)
val_df["text"] = val_df["text"].parallel_apply(stemmer_parallel)

In [None]:
# Number of whitespace-separated tokens in each training text.
SQ_LEN = train_df["text"].str.split().str.len()

In [None]:
# Number of whitespace-separated tokens in each validation text.
val_SQ_LEN = val_df["text"].str.split().str.len()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
sns.set_style("darkgrid")
# `sns.displot` is a figure-level function: it creates its own figure, so a
# preceding `plt.figure(figsize=...)` only leaves an empty figure behind and
# does not size the plot. The size is set through height/aspect instead:
# 10 in tall, 10 * 1.6 = 16 in wide.
sns.displot(SQ_LEN, kde=True, height=10, aspect=1.6)

In [None]:
# `sns.displot` is a figure-level function: it creates its own figure, so a
# preceding `plt.figure(figsize=...)` only leaves an empty figure behind and
# does not size the plot. The size is set through height/aspect instead:
# 10 in tall, 10 * 1.6 = 16 in wide.
sns.displot(val_SQ_LEN, kde=True, height=10, aspect=1.6)

In [None]:
# Largest token count found in the training texts.
# NOTE(review): the vectorizer further down is configured with a fixed
# sequence_length of 500 rather than with this value.
max_length = SQ_LEN.max()
max_length

In [None]:
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    """Turn a frame with "text" and "target" columns into a batched ``tf.data.Dataset``.

    Args:
        dataframe: pandas DataFrame holding a "text" and a "target" column.
            The caller's frame is not modified; a copy is used internally.
        shuffle: when true, elements are shuffled with a buffer as large as
            the frame before batching.
        batch_size: number of (text, target) pairs per batch.

    Returns:
        A dataset yielding ``(text_batch, target_batch)`` tuples.
    """
    working = dataframe.copy()
    targets = working.pop('target')
    dataset = tf.data.Dataset.from_tensor_slices((working["text"].values, targets))
    if shuffle:
        dataset = dataset.shuffle(buffer_size=len(working))
    return dataset.batch(batch_size)

In [None]:
raw_train_ds = df_to_dataset(train_df)
# The validation split is only evaluated, never trained on, so it does not
# need shuffling (or the full-size shuffle buffer that comes with it).
raw_val_ds = df_to_dataset(val_df, shuffle=False)


print(f"Number of batches in raw_train_ds: {raw_train_ds.cardinality()}")
print(f"Number of batches in raw_val_ds: {raw_val_ds.cardinality()}")


In [None]:
# Peek at one raw batch: the text strings and their matching targets.
for feature_batch,label_batch in raw_train_ds.take(1):
    print("feature:",feature_batch) 
    print("label:",label_batch)

In [None]:
class Patterns:
    """Regular-expression sources that `custom_standardizer` strips from the text.

    The patterns are applied with `tf.strings.regex_replace` after the text
    has been lowercased.
    """

    # A link is anything that starts with a scheme (http/https/ftp/smtp) or
    # with "www." and runs up to the next whitespace. The pattern is not
    # anchored, so links embedded in a longer text are matched too.
    URL_PATTERN_STR = r"(?:(?:https?|ftp|smtp)://|www\.)\S+"
    # "#" followed by any number of word characters.
    HASHTAG_STR = r'#\w*'
    # "@" followed by any number of word characters.
    MENTION_STR = r'@\w*'
    # Three code-point ranges treated as emoji.
    EMOJIS_STR = u'([\U00002600-\U000027BF])|([\U0001f300-\U0001f64F])|([\U0001f680-\U0001f6FF])'
    # Text smileys such as ":)" or ";-(".
    # NOTE(review): `custom_standardizer` lowercases before this pattern is
    # applied, so the uppercase alternatives (X, O, D, P, S) cannot match there.
    SMILEYS_STR = r"(\s?:X|:|;|=)(?:-)?(?:\)+|\(|O|D|P|S|\\|\/\s){1,}"
    # A number at the start of the text or after whitespace; "." and ","
    # are allowed as separators between digit groups.
    NUMBERS_STR = r"(^|\s)(-?\d+([.,]?\d+)*)"
    # Every character that is neither a word character nor whitespace.
    PUNCTUATION_STR = r"[^\w\s]"

def custom_standardizer(input):
    """Lowercase ``input`` and delete every `Patterns` match from it.

    The patterns are removed one after another in a fixed order: URLs,
    hashtags, mentions, emojis, smileys, numbers and finally punctuation.
    Each match is replaced with the empty string.

    Args:
        input: string tensor handed over by the `TextVectorization` layer.

    Returns:
        The cleaned string tensor.
    """
    removal_order = (
        Patterns.URL_PATTERN_STR,
        Patterns.HASHTAG_STR,
        Patterns.MENTION_STR,
        Patterns.EMOJIS_STR,
        Patterns.SMILEYS_STR,
        Patterns.NUMBERS_STR,
        Patterns.PUNCTUATION_STR,
    )
    cleaned = tf.strings.lower(input)
    for pattern in removal_order:
        cleaned = tf.strings.regex_replace(cleaned, pattern, "")
    return cleaned

In [None]:
# Model constants.
max_features = 20000  # vocabulary cap, passed to the vectorizer as max_tokens
embedding_dim = 128  # not used in this cell
sequence_length = 500  # passed to the vectorizer as output_sequence_length

# Text -> integer token ids, using the custom cleaning function above as the
# standardization step.
vectorize_layer = TextVectorization(
    standardize=custom_standardizer,
    max_tokens=max_features,
    output_mode="int",
    output_sequence_length=sequence_length,
)

In [None]:
# Text-only view of the training batches (the labels are dropped).
text_ds = raw_train_ds.map(lambda x, y: x)

In [None]:
# Fit the vectorizer on the training text only.
vectorize_layer.adapt(text_ds)

In [None]:
def vectorize_text(text, label):
    """Run ``text`` through `vectorize_layer` and pass ``label`` along unchanged.

    A trailing axis is appended to ``text`` before it is handed to the layer.

    Returns:
        A ``(vectorized_text, label)`` tuple.
    """
    text_column = tf.expand_dims(text, axis=-1)
    return vectorize_layer(text_column), label


# Vectorize each split, cache the result, and prefetch with a buffer of 10
# so data preparation overlaps with training on the GPU.
train_ds = raw_train_ds.map(vectorize_text).cache().prefetch(buffer_size=10)
val_ds = raw_val_ds.map(vectorize_text).cache().prefetch(buffer_size=10)


In [None]:
# Peek at one vectorized batch: integer token ids and their matching targets.
for feature_batch,label_batch in train_ds.take(1):
    print("feature:",feature_batch) 
    print("label:",label_batch)

In [None]:
# Write the vectorized splits to disk; they are read back from these paths
# further down.
train_ds.save("model_checkpoints/neural_net/train_data")
val_ds.save("model_checkpoints/neural_net/val_data")

In [None]:
# Drop the in-memory frames and datasets and collect garbage; the vectorized
# data is reloaded from disk in the next cell. `vectorize_layer` is kept
# because the end-to-end model near the bottom still needs it.
del raw_train_ds,raw_val_ds,train_df,val_df,train_ds,val_ds,text_ds
gc.collect()

In [None]:
# `Dataset.save` stores the elements only, so the prefetch stage that was
# attached before saving is not part of the reloaded pipeline. Re-attach it
# with the same buffer size so input loading overlaps with training again.
train_ds = tf.data.Dataset.load("model_checkpoints/neural_net/train_data").prefetch(buffer_size=10)
val_ds = tf.data.Dataset.load("model_checkpoints/neural_net/val_data").prefetch(buffer_size=10)

In [None]:
max_features = 20000
embedding_dim = 128
sequence_length = 500
# An integer input for vocab indices.
inputs = tf.keras.Input(shape=(None,), dtype="int64")

# Next, we add a layer to map those vocab indices into a space of dimensionality
# 'embedding_dim'.
x = layers.Embedding(max_features, embedding_dim)(inputs)
x = layers.Dropout(0.5)(x)

# Conv1D + global max pooling
x = layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x)
x = layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x)
x = layers.GlobalMaxPooling1D()(x)

# We add a vanilla hidden layer:
x = layers.Dense(128, activation="relu")(x)
x = layers.Dropout(0.5)(x)

# We project onto a single unit output layer, and squash it with a sigmoid:
predictions = layers.Dense(1, activation="sigmoid", name="predictions")(x)

model = tf.keras.Model(inputs, predictions)

# Compile the model with binary crossentropy loss and an adam optimizer.
# `run_eagerly` is left at its default: forcing eager execution is a debugging
# aid that keeps the train step out of `tf.function` and slows every batch,
# and this model consists of standard Keras layers only.
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
# Training callbacks: keep only the best model seen so far on disk, and stop
# once val_loss has not improved for 5 epochs.
checkpoint = tf.keras.callbacks.ModelCheckpoint("model_outputs/neural_net/best_model",save_best_only=True)
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

In [None]:
# Print the layer-by-layer overview of the classifier.
model.summary()

In [None]:
epochs = 10
# Fit the model on the training split and validate on the validation split.
# The `checkpoint` and `stop_early` callbacks created alongside the model are
# handed over here; without `callbacks=` they would never be invoked.
model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=epochs,
    callbacks=[checkpoint, stop_early],
)

In [None]:
# One hand-written example with target 1, used to smoke-test the pipeline.
test_df = pd.DataFrame({"text":["I kill myself"],"target":[1]})

In [None]:
# Apply the same stemming step that was applied to the training text.
test_df["text"] = test_df["text"].parallel_apply(stemmer_parallel)

In [None]:
# Keep the rows in frame order: this dataset is only used for prediction, and
# shuffling would break the row-to-prediction correspondence with `test_df`.
new_test_ds = df_to_dataset(test_df, shuffle=False)

In [None]:
# A string input
inputs = tf.keras.Input(shape=(1,), dtype="string")
# Turn strings into vocab indices
indices = vectorize_layer(inputs)
# Turn vocab indices into predictions
outputs = model(indices)

# Our end to end model: raw text in, sigmoid score out.
end_to_end_model = tf.keras.Model(inputs, outputs)
end_to_end_model.compile(
    loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"]
)

# Try it on `new_test_ds`, which yields batches of raw (stemmed) strings
# together with their targets.
end_to_end_model.predict(new_test_ds)

In [None]:
# Persist the end-to-end model (vectorizer + classifier) to disk.
end_to_end_model.save('model_outputs/neural_net')