In [96]:
import pandas as pd
import matplotlib.pyplot as plt

In [97]:
df = pd.read_csv('./df_test.csv')
#df = df.sample(100000)

In [98]:
nr_samples = 50000
df0 = df[df['positive'] == 1].sample(nr_samples)
df1 = df[df['negative'] == 1].sample(nr_samples)
df2 = df[df['neutral'] == 1].sample(nr_samples)
df = pd.concat([df0,df1,df2])
df.shape

(150000, 5)

In [99]:
import string
from nltk.stem import WordNetLemmatizer

def __remove_punctuation(text):
    """
        remove punctuation from text and lower case it
    """
    text = str(text)

    punctuations = string.punctuation
    punctuations += '“'
    punctuations += '’'
    punctuations += '”'
    punctuations += '’'
    punctuations += ' — '
    punctuations += 'â€œ'
    punctuations += 'â€¦'
    punctuations += 'â€'
    punctuations += '€™'
    punctuations += '€'
    punctuations += '™'
    punctuations += '¦'
    punctuations += 'œ'
    punctuations += 'Â'
    punctuations += 'Ã'
    punctuations += '— '
    punctuations += '¶'
    punctuations += '§'
    punctuations += '£'
    punctuations += '©'
    punctuations += 'ª'
    punctuations += '³'

    # text = emoji.get_emoji_regexp().sub(u'', text)

    for punctuation in punctuations:
        text = text.replace(punctuation, ' ') 
        #text = text.replace('donald', 'trump')
        #text = text.replace('clinton', 'hillary')
    return text.lower() # lower case

def __remove_numbers(text):
    """
        remove number from text
    """
    text = str(text)

    words_only = ''.join([i for i in text if not i.isdigit()])
    return words_only.strip()

# def __remove_stopwords(text):
#     """
#         remove stop words from text
#     """
#     text = str(text)

#     # stop_words = stopwords.words('english')
#     #stop_words += stopwords.words('portuguese')
#     stop_words.append('mr')
#     stop_words = set(stop_words)

#     tokenized = word_tokenize(text)
#     without_stopwords = [word for word in tokenized if not word in stop_words]
#     return without_stopwords

def __lemmatize(text):
    """
        lemmatize text
    """
    lemmatizer = WordNetLemmatizer()
    lemmatized = [lemmatizer.lemmatize(word) for word in text]
    lemmatized_string = " ".join(lemmatized)
    return lemmatized_string


def process_data(df):
    """
        process the data
    """

    df_ = df.copy()
        
    df_['text'] = df_['text'].apply(__remove_punctuation)

    df_['text'] = df_['text'].apply(__remove_numbers)

    # df_['text'] = df_['text'].apply(__remove_stopwords)

    # df_['text'] = df_['text'].apply(__lemmatize)
    
    return df_

In [100]:
cleaned_sentences = process_data(df)
cleaned_sentences.shape

(150000, 5)

In [101]:
cleaned_sentences.dropna(inplace=True)
cleaned_sentences.shape

(150000, 5)

In [102]:
cleaned_sentences['text']

124918    that should be fun  i may have call in sick to...
106577    you ll know it s true love when he proposals y...
85955     thanks for the responses y all  i m going to d...
115516    it was just as illegal yesterday     nothing c...
88279     yeah  for sure  i ve definitely come pretty cl...
                                ...                        
81282                       you guys got hate in your heart
53340     i have a straight lebanese friend  girl  and t...
70836                           damn rui lol  serving looks
105466    what   the straights   have            stereot...
78664     so now disabled people don t have rights   you...
Name: text, Length: 150000, dtype: object

In [103]:
cleaned_sentences["num_words"] = cleaned_sentences["text"].apply(lambda x:len(str(x).split()))
cleaned_sentences["num_words"].describe()


count    150000.000000
mean         13.322627
std           6.869168
min           0.000000
25%           8.000000
50%          13.000000
75%          19.000000
max          35.000000
Name: num_words, dtype: float64

In [122]:
cleaned_sentences[cleaned_sentences["num_words"]==0]

Unnamed: 0.1,Unnamed: 0,text,positive,neutral,negative,num_words
15432,16910,,0,1,0,0
109483,120041,,0,1,0,0
60933,66808,,0,1,0,0
100173,109849,,0,1,0,0
18273,20020,,0,1,0,0
81407,89246,,0,1,0,0


In [104]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

### Let's create some mock data
max_len = 64
def get_mock_up_data(tk, X):
    X_token = tk.texts_to_sequences(X)

    ### Pad the inputs
    X_pad = pad_sequences(X_token, maxlen=max_len, dtype='float32', padding='post')
    
    return X_pad

In [105]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(cleaned_sentences['text'], df[['positive', 'negative', 'neutral']], test_size=0.3, random_state=0)

In [106]:
from tensorflow.keras.layers import Normalization
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN, Flatten, Embedding, LSTM, Conv1D, Dropout, SpatialDropout1D, Bidirectional 
from tensorflow.keras.metrics import Precision
from tensorflow.keras.metrics import Recall
import tensorflow as tf

In [107]:
tk = Tokenizer()
tk.fit_on_texts(cleaned_sentences['text'])
vocab_size = len(tk.word_index)+1
print(f'There are {vocab_size} different words in your corpus')

X_train_pad = get_mock_up_data(tk, X_train)

There are 29970 different words in your corpus


In [108]:
X_train_pad.shape

(105000, 64)

In [109]:
X_test_pad = get_mock_up_data(tk, X_test)

In [110]:
X_test_pad.shape

(45000, 64)

In [130]:
# Size of your embedding space = size of the vector representing each word
embedding_size = 32

model = Sequential()
model.add(Embedding(
    input_dim=vocab_size, # 16 +1 for the 0 padding
    input_length=max_len, # Max_sentence_length (optional, for model summary)
    output_dim=embedding_size, # 100
    mask_zero=True, # Built-in masking layer :)
))

#model.add(LSTM(32))

model.add(LSTM(32, return_sequences=True))
#model.add(LSTM(32, return_sequences=True))
model.add(LSTM(20))
model.add(Dense(20, activation='relu'))
model.add(Dense(20, activation='relu'))
model.add(Dense(20, activation='relu'))
model.add(Dense(20, activation='relu'))
#model.add(Dense(20, activation='relu'))
#model.add(Dense(20, activation='relu'))
model.add(Dense(3, activation='softmax'))
model.summary()

Model: "sequential_16"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_16 (Embedding)    (None, 64, 32)            959328    
                                                                 
 lstm_32 (LSTM)              (None, 64, 32)            8320      
                                                                 
 lstm_33 (LSTM)              (None, 20)                4240      
                                                                 
 dense_61 (Dense)            (None, 20)                420       
                                                                 
 dense_62 (Dense)            (None, 20)                420       
                                                                 
 dense_63 (Dense)            (None, 20)                420       
                                                                 
 dense_64 (Dense)            (None, 20)              

In [31]:
# Size of your embedding space = size of the vector representing each word
embedding_size = 32

model = Sequential()
model.add(Embedding(
    input_dim=vocab_size, # 16 +1 for the 0 padding
    input_length=max_len, # Max_sentence_length (optional, for model summary)
    output_dim=embedding_size, # 100
    mask_zero=True, # Built-in masking layer :)
))



model.add(Conv1D(64, kernel_size=16))

model.add(Conv1D(32, kernel_size=8))
model.add(Dense(20, activation='relu'))
model.add(Dense(20, activation='relu'))
model.add(Dense(20, activation='relu'))
model.add(Flatten())
model.add(Dense(3, activation='softmax'))
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 64, 32)            459520    
                                                                 
 conv1d_2 (Conv1D)           (None, 49, 64)            32832     
                                                                 
 conv1d_3 (Conv1D)           (None, 42, 32)            16416     
                                                                 
 dense_3 (Dense)             (None, 42, 20)            660       
                                                                 
 dense_4 (Dense)             (None, 42, 20)            420       
                                                                 
 dense_5 (Dense)             (None, 42, 20)            420       
                                                                 
 flatten (Flatten)           (None, 840)              

In [54]:
embedding_size = 32

model = Sequential()
model.add(Embedding(
    input_dim=vocab_size, # 16 +1 for the 0 padding
    input_length=max_len, # Max_sentence_length (optional, for model summary)
    output_dim=embedding_size, # 100
    mask_zero=True, # Built-in masking layer :)
))

model.add(LSTM(128,activation='relu',return_sequences=True))

model.add(Dropout(0.2))

model.add(LSTM(128,activation='relu'))

model.add(Dropout(0.2))

# for units in [128,128,64,32]:

# model.add(Dense(units,activation='relu'))

# model.add(Dropout(0.2))

model.add(Dense(32,activation='relu'))

model.add(Dropout(0.2))

model.add(Dense(3,activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

print(model.summary())


Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_7 (Embedding)     (None, 64, 32)            459520    
                                                                 
 lstm_6 (LSTM)               (None, 64, 128)           82432     
                                                                 
 dropout_6 (Dropout)         (None, 64, 128)           0         
                                                                 
 lstm_7 (LSTM)               (None, 128)               131584    
                                                                 
 dropout_7 (Dropout)         (None, 128)               0         
                                                                 
 dense_11 (Dense)            (None, 32)                4128      
                                                                 
 dropout_8 (Dropout)         (None, 32)               

In [63]:
embedding_size = 32

model = Sequential()
model.add(Embedding(
    input_dim=vocab_size, # 16 +1 for the 0 padding
    input_length=max_len, # Max_sentence_length (optional, for model summary)
    output_dim=embedding_size, # 100
    mask_zero=True, # Built-in masking layer :)
))

model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(3, activation='softmax'))




In [70]:
embedding_size = 32

model = Sequential()
model.add(Embedding(
    input_dim=vocab_size, # 16 +1 for the 0 padding
    input_length=max_len, # Max_sentence_length (optional, for model summary)
    output_dim=embedding_size, # 100
    mask_zero=True, # Built-in masking layer :)
))


model.add(Bidirectional(LSTM(64)))
model.add(Dense(64, activation='relu'))
model.add(Dense(3, activation='softmax'))


In [83]:

model = tf.keras.applications.VGG19(
    include_top=False,
    weights="imagenet",
    input_tensor=None,
    input_shape=None,
    pooling=None,
    classes=3,
    classifier_activation="softmax",
)



TypeError: object of type 'int' has no len()

In [111]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers


class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

embed_dim = 32  # Embedding size for each token
num_heads = 2  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer

inputs = layers.Input(shape=(max_len,))
embedding_layer = Embedding(
    input_dim=vocab_size, # 16 +1 for the 0 padding
    input_length=max_len, # Max_sentence_length (optional, for model summary)
    output_dim=embedding_size, # 100
    mask_zero=True, # Built-in masking layer :)
)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(20, activation="relu")(x)
x = layers.Dropout(0.1)(x)
outputs = layers.Dense(3, activation="softmax")(x)

model = keras.Model(inputs=inputs, outputs=outputs)
 
    

In [112]:
from keras.optimizers import Adam
learning_rate = 1e-3
opt = Adam(learning_rate=learning_rate)
model.compile(loss='categorical_crossentropy', 
              optimizer='rmsprop',
              #optimizer=opt,
             metrics=['accuracy',Precision(),Recall()]) # Use `rmsprop`

In [87]:
#monitor="accuracy",

In [113]:
from tensorflow.keras.callbacks import EarlyStopping

es = EarlyStopping(monitor="val_loss", patience=6, restore_best_weights=True)

In [116]:
model.fit(X_train_pad, y_train, 
          epochs=1000, 
          batch_size=256, 
          verbose=1, 
          callbacks = [es],
          validation_data=(X_test_pad,y_test))

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000


<keras.callbacks.History at 0x7f0bc4dad790>

model_3 = Sequential()
#model_3.add(layers.LSTM(units=40, input_shape=(12575,1), activation='tanh', return_sequences=True))
#model_3.add(layers.LSTM(units=20, activation='tanh', return_sequences=False))
model_3.add(Dense(20, activation="relu"))
model_3.add(Dense(20, activation="relu"))
model_3.add(Dense(20, activation="relu"))
model_3.add(Dense(10, activation="relu"))
model_3.add(Dense(3, activation='softmax'))
model_3.summary()