In [1]:
import os

# TF_CPP_MIN_LOG_LEVEL has to be in the environment *before* TensorFlow is
# imported; setting it afterwards does not silence the C++ log messages that
# are emitted while the library loads.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

import re
import string

import numpy as np
import pandas as pd
import tensorflow as tf
from pandarallel import pandarallel
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization

In [2]:
# Training file: read without a header row, keeping only columns 0 and 5.
train = pd.read_csv(
    "Dataset/Twitter/train.csv",
    encoding="ISO-8859-1",
    header=None,
    usecols=[0, 5],
)

# Test file: has a header row; keep only the two named columns.
test = pd.read_csv(
    "Dataset/Twitter/test.csv",
    encoding="ISO-8859-1",
    usecols=["Sentiment", "SentimentText"],
)

In [3]:
# The training frame was read with header=None and usecols=[0, 5]; pandas
# returns those columns in file order (0, then 5), so a positional rename is
# safe here.
train.columns = ["target", "text"]

# read_csv ignores the order of the names given to `usecols` and returns the
# columns in file order.  Rename by name and then fix the column order
# explicitly instead of assuming "Sentiment" precedes "SentimentText" in the
# file; a positional rename would silently swap label and text otherwise.
# Re-running this cell is harmless: the rename becomes a no-op.
test = test.rename(columns={"Sentiment": "target", "SentimentText": "text"})[
    ["target", "text"]
]

In [4]:
# Collapse the raw labels to a binary target: rows whose raw label is 4 become
# class 0, every other row becomes class 1.
# NOTE(review): presumably 4 marks positive tweets in the training file -- confirm.
# NOTE: not idempotent -- running this cell a second time remaps the already
# binarized labels (nothing equals 4 any more; the test labels flip).
train_is_label_4 = train["target"].eq(4)
train["target"] = np.where(train_is_label_4, 0, 1)

# Same scheme for the test frame, where the raw label mapped to class 0 is 1.
test_is_label_1 = test["target"].eq(1)
test["target"] = np.where(test_is_label_1, 0, 1)

In [5]:
def custom_standardization(input_data):
    """Clean one raw text with ``preprocessor.clean`` and lower-case the result.

    Args:
        input_data: The raw text to clean.

    Returns:
        The cleaned text converted to lower case.
    """
    # Imported inside the function so the module is resolved wherever the
    # function body actually runs.
    # NOTE(review): presumably required so pandarallel worker processes can
    # find the module -- confirm before hoisting this to the import cell.
    import preprocessor as p

    return p.clean(input_data).lower()

In [7]:
# Set up pandarallel (needed for the `parallel_apply` calls in the next cell)
# with progress bars enabled.
pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 6 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/


In [8]:
# Clean and lower-case the text column of both frames in parallel.
# NOTE: this overwrites the raw text in place, so the cell is applied to
# already-cleaned text if it is run a second time.
for _frame in (train, test):
    _frame["text"] = _frame["text"].parallel_apply(custom_standardization)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=266667), Label(value='0 / 266667')…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=263103), Label(value='0 / 263103')…

In [9]:
# Hold out 20% of the training frame for validation.  A fixed `random_state`
# makes the split -- and every metric derived from it -- reproducible across
# kernel restarts; without it each run draws a different split.
# NOTE: despite its name, `test_df` is the validation split taken from `train`,
# not the separate test set held in `test`.
SPLIT_SEED = 42
train_df, test_df = train_test_split(train, test_size=0.2, random_state=SPLIT_SEED)

In [36]:
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    """Build a batched ``tf.data.Dataset`` of ``(text, target)`` pairs.

    Args:
        dataframe: DataFrame with a ``"text"`` and a ``"target"`` column.
            It is not modified.
        shuffle: When true, shuffle the rows with a buffer as large as the
            frame before batching.
        batch_size: Number of rows per batch.

    Returns:
        The batched dataset.
    """
    texts = dataframe["text"].values
    labels = dataframe["target"]
    dataset = tf.data.Dataset.from_tensor_slices((texts, labels))
    if shuffle:
        dataset = dataset.shuffle(buffer_size=len(dataframe))
    return dataset.batch(batch_size)

In [37]:
# Only the training data benefits from shuffling.  The validation and test
# datasets are used purely for evaluation, where row order does not change the
# loss or accuracy, so skip the full-length shuffle buffer for them.
raw_train_ds = df_to_dataset(train_df)
raw_val_ds = df_to_dataset(test_df, shuffle=False)
raw_test_ds = df_to_dataset(test, shuffle=False)

In [38]:
# Peek at a single batch of raw (text, label) pairs from the training dataset.
for feature_batch, label_batch in raw_train_ds.take(1):
    print('Lables:', label_batch)
    print('Text:', feature_batch)

Lables: tf.Tensor([1 1 1 1 1 0 1 0 0 1 1 0 1 1 0 1 1 0 0 1 0 1 0 1 1 1 1 1 1 1 0 1], shape=(32,), dtype=int32)
Text: tf.Tensor(
[b"yeah, and the mazda mpv had the taurus' l duratec v6; i thought about one of those, but they don't make 'em anymore."
 b"seriously?! why didn't you say so? lmao. tell me how."
 b'tried bleu cheese for the first time in a long time. i am not sure how the tummy is accepting this.'
 b"idk n their mic doesn't work to well apparently. so, disneyland!!!"
 b"ok i'm bored... wat shall i do? rly wish i can drive"
 b'no man no man no man no man!!!'
 b"ma3arf it's like yom kan yrmes an what was wrong in the presentations kel el comments aggi w ana el wa7eeda eli ys2alni!"
 b'oh man i love that place...enjoy it'
 b"yea...dad hijacked the putah. i'm off for tonight my lovelies. -"
 b'got woken up early by my dog...'
 b'hope i can get out of this meeting early...i need to find a home'
 b"i have arrived in la. waitin on my bags. lax aint as big as hartsfield jackson tho! 

In [39]:
# Report how many batches each raw dataset holds.
print(f"Number of batches in raw_train_ds: {raw_train_ds.cardinality()}")
print(f"Number of batches in raw_val_ds: {raw_val_ds.cardinality()}")
print(f"Number of batches in raw_test_ds: {raw_test_ds.cardinality()}")

Number of batches in raw_train_ds: 40000
Number of batches in raw_val_ds: 10000
Number of batches in raw_test_ds: 49332


In [40]:
# Model constants.
max_features = 200000  # vocabulary cap passed to TextVectorization as max_tokens
embedding_dim = 128  # width of each embedding vector in the model
sequence_length = 500  # fixed length of every vectorized sample

# Maps raw strings to fixed-length sequences of integer token ids.
vectorize_layer = TextVectorization(
    max_tokens=max_features, output_mode="int", output_sequence_length=sequence_length
)

In [41]:
# Text-only view of the training dataset (labels dropped).
text_ds = raw_train_ds.map(lambda text, _label: text)

In [42]:
# Fit the vectorizer's vocabulary on the training text only.
vectorize_layer.adapt(text_ds)

In [43]:
def vectorize_text(text, label):
    """Turn a batch of raw strings into token ids, passing the labels through.

    Args:
        text: Tensor of raw strings; a trailing axis of size 1 is appended
            before it is handed to ``vectorize_layer``.
        label: Labels belonging to ``text``; returned unchanged.

    Returns:
        A ``(token_ids, label)`` tuple.
    """
    text_column = tf.expand_dims(text, axis=-1)
    token_ids = vectorize_layer(text_column)
    return token_ids, label


# Vectorize the data.
train_ds = raw_train_ds.map(vectorize_text)
val_ds = raw_val_ds.map(vectorize_text)
test_ds = raw_test_ds.map(vectorize_text)

# Cache the vectorized batches and prefetch them for best performance on GPU.
#
# `cache()` replays exactly the same elements in the same order on every pass,
# so any shuffle applied *before* it is frozen after the first epoch.  To get a
# fresh order each epoch, the training dataset is shuffled again *after* the
# cache.  This shuffles whole batches (the rows inside a cached batch stay
# together); the buffer is kept modest because every buffered element is a
# full batch of token ids.
SHUFFLE_BUFFER_BATCHES = 1024
train_ds = (
    train_ds.cache()
    .shuffle(buffer_size=SHUFFLE_BUFFER_BATCHES)
    .prefetch(buffer_size=10)
)
val_ds = val_ds.cache().prefetch(buffer_size=10)
test_ds = test_ds.cache().prefetch(buffer_size=10)

In [44]:
# Peek at one vectorized training batch: the labels first, then the matrix of
# token ids produced by `vectorize_text`.
for feature_batch,label_batch in train_ds.take(1):
    print(label_batch)
    print(feature_batch)

tf.Tensor([1 0 1 0 1 1 0 1 0 1 0 1 0 0 0 1 1 1 1 0 0 1 0 1 1 1 1 1 0 1 0 0], shape=(32,), dtype=int32)
tf.Tensor(
[[   64   160    31 ...     0     0     0]
 [  127   128   199 ...     0     0     0]
 [   45     3   141 ...     0     0     0]
 ...
 [  316   394     0 ...     0     0     0]
 [   39    77   603 ...     0     0     0]
 [  898 22161    52 ...     0     0     0]], shape=(32, 500), dtype=int64)


In [45]:
# An integer input of vocab indices; the sequence axis is left unspecified
# (shape=(None,)).
inputs = tf.keras.Input(shape=(None,), dtype="int64")

# Map each vocab index to a vector of dimensionality `embedding_dim`, then
# apply dropout (rate 0.5) to the embedded sequence.
x = layers.Embedding(max_features, embedding_dim)(inputs)
x = layers.Dropout(0.5)(x)

# Two Conv1D layers (128 filters, kernel size 7, stride 3, ReLU) followed by
# global max pooling over the sequence axis.
x = layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x)
x = layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x)
x = layers.GlobalMaxPooling1D()(x)

# A plain fully connected hidden layer with dropout (rate 0.5).
x = layers.Dense(128, activation="relu")(x)
x = layers.Dropout(0.5)(x)

# Project onto a single output unit and squash it with a sigmoid.
predictions = layers.Dense(1, activation="sigmoid", name="predictions")(x)

model = tf.keras.Model(inputs, predictions)

# Compile the model with binary crossentropy loss and the Adam optimizer,
# tracking accuracy as the metric.
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [46]:
epochs = 3

# Fit the model on the training dataset, using `val_ds` as validation data.
model.fit(train_ds, validation_data=val_ds, epochs=epochs)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x2a612f78520>

In [47]:
# Evaluate on the separate test dataset.  The result lists the loss followed by
# the accuracy metric configured in `model.compile`.
model.evaluate(test_ds)



[0.3614481985569, 0.8398500084877014]

In [65]:
# One hand-written example used to try out the trained model.
# NOTE(review): this rebinds `test_df`, which earlier held the validation split
# returned by `train_test_split`; re-running the dataset-building cell after
# this one would use this one-row frame instead -- consider a distinct name.
# NOTE(review): unlike the training text, this string is not passed through
# `custom_standardization` -- confirm that is intended.
test_df = pd.DataFrame({"text":["I want to kill myself"],"target":[1]})

In [66]:
# Wrap the frame currently bound to `test_df` as a batched dataset of
# (text, target) pairs.
new_test_ds = df_to_dataset(test_df)

In [67]:
# A string input (one raw string per example).
# NOTE: this rebinds `inputs`, which previously named the integer input of `model`.
inputs = tf.keras.Input(shape=(1,), dtype="string")
# Turn strings into vocab indices
indices = vectorize_layer(inputs)
# Turn vocab indices into predictions using the already-trained `model`
outputs = model(indices)

# Our end to end model: raw strings in, sigmoid outputs out
end_to_end_model = tf.keras.Model(inputs, outputs)
end_to_end_model.compile(
    loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"]
)

# Run it on `new_test_ds`, which yields raw strings

end_to_end_model.predict(new_test_ds)



array([[0.8786312]], dtype=float32)