In [112]:
import pandas as pd
import matplotlib.pyplot as plt

In [115]:
df = pd.read_csv('./df_test.csv')
#df = df.sample(100000)

In [116]:
nr_samples = 50000
df0 = df[df['positive'] == 1].sample(nr_samples)
df1 = df[df['negative'] == 1].sample(nr_samples)
df2 = df[df['neutral'] == 1].sample(nr_samples)
df = pd.concat([df0,df1,df2])
df.shape

(150000, 5)

In [117]:
import string
from nltk.stem import WordNetLemmatizer

def __remove_punctuation(text):
    """
        remove punctuation from text and lower case it
    """
    text = str(text)

    punctuations = string.punctuation
    punctuations += '“'
    punctuations += '’'
    punctuations += '”'
    punctuations += '’'
    punctuations += ' — '
    punctuations += 'â€œ'
    punctuations += 'â€¦'
    punctuations += 'â€'
    punctuations += '€™'
    punctuations += '€'
    punctuations += '™'
    punctuations += '¦'
    punctuations += 'œ'
    punctuations += 'Â'
    punctuations += 'Ã'
    punctuations += '— '
    punctuations += '¶'
    punctuations += '§'
    punctuations += '£'
    punctuations += '©'
    punctuations += 'ª'
    punctuations += '³'

    # text = emoji.get_emoji_regexp().sub(u'', text)

    for punctuation in punctuations:
        text = text.replace(punctuation, ' ') 
        #text = text.replace('donald', 'trump')
        #text = text.replace('clinton', 'hillary')
    return text.lower() # lower case

def __remove_numbers(text):
    """
        remove number from text
    """
    text = str(text)

    words_only = ''.join([i for i in text if not i.isdigit()])
    return words_only.strip()

# def __remove_stopwords(text):
#     """
#         remove stop words from text
#     """
#     text = str(text)

#     # stop_words = stopwords.words('english')
#     #stop_words += stopwords.words('portuguese')
#     stop_words.append('mr')
#     stop_words = set(stop_words)

#     tokenized = word_tokenize(text)
#     without_stopwords = [word for word in tokenized if not word in stop_words]
#     return without_stopwords

def __lemmatize(text):
    """
        lemmatize text
    """
    lemmatizer = WordNetLemmatizer()
    lemmatized = [lemmatizer.lemmatize(word) for word in text]
    lemmatized_string = " ".join(lemmatized)
    return lemmatized_string


def process_data(df):
    """
        process the data
    """

    df_ = df.copy()
        
    df_['text'] = df_['text'].apply(__remove_punctuation)

    df_['text'] = df_['text'].apply(__remove_numbers)

    # df_['text'] = df_['text'].apply(__remove_stopwords)

    # df_['text'] = df_['text'].apply(__lemmatize)
    
    return df_

In [118]:
cleaned_sentences = process_data(df)
cleaned_sentences.shape

(150000, 5)

In [119]:
cleaned_sentences.dropna(inplace=True)
cleaned_sentences.shape

(150000, 5)

In [120]:
cleaned_sentences['text']

76433     well  my upcoming adverts are going to be inte...
67561                                                my man
50063                                      great job thanks
75636     i m glad you had a great time here  wishing yo...
37226     we can hammer out the details later      don t...
                                ...                        
122274                       and yet you have so much karma
174771    suddenly  dorion will fleece another gm of the...
90525     no child should go without needed medical care...
138418    yeah  the only time i ever parked like that wa...
91123     if we shoot by accident we don t keep feeding ...
Name: text, Length: 150000, dtype: object

In [121]:
cleaned_sentences["num_words"] = cleaned_sentences["text"].apply(lambda x:len(str(x).split()))
cleaned_sentences["num_words"].describe()


count    150000.000000
mean         13.332967
std           6.871415
min           0.000000
25%           8.000000
50%          13.000000
75%          19.000000
max          35.000000
Name: num_words, dtype: float64

In [122]:
cleaned_sentences[cleaned_sentences["num_words"]==0]

Unnamed: 0.1,Unnamed: 0,text,positive,neutral,negative,num_words
15432,16910,,0,1,0,0
109483,120041,,0,1,0,0
60933,66808,,0,1,0,0
100173,109849,,0,1,0,0
18273,20020,,0,1,0,0
81407,89246,,0,1,0,0


In [123]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

### Let's create some mock data
max_len = 64
def get_mock_up_data(tk, X):
    X_token = tk.texts_to_sequences(X)

    ### Pad the inputs
    X_pad = pad_sequences(X_token, maxlen=max_len, dtype='float32', padding='post')
    
    return X_pad

In [124]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(cleaned_sentences['text'], df[['positive', 'negative', 'neutral']], test_size=0.3, random_state=0)

In [125]:
from tensorflow.keras.layers import Normalization
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN, Flatten, Embedding, LSTM 
from tensorflow.keras.metrics import Precision
from tensorflow.keras.metrics import Recall

In [126]:
tk = Tokenizer()
tk.fit_on_texts(cleaned_sentences['text'])
vocab_size = len(tk.word_index)+1
print(f'There are {vocab_size} different words in your corpus')

X_train_pad = get_mock_up_data(tk, X_train)

There are 29979 different words in your corpus


In [127]:
X_train_pad.shape

(105000, 64)

In [128]:
X_test_pad = get_mock_up_data(tk, X_test)

In [129]:
X_test_pad.shape

(45000, 64)

In [130]:
# Size of your embedding space = size of the vector representing each word
embedding_size = 32

model = Sequential()
model.add(Embedding(
    input_dim=vocab_size, # 16 +1 for the 0 padding
    input_length=max_len, # Max_sentence_length (optional, for model summary)
    output_dim=embedding_size, # 100
    mask_zero=True, # Built-in masking layer :)
))

#model.add(LSTM(32))

model.add(LSTM(32, return_sequences=True))
#model.add(LSTM(32, return_sequences=True))
model.add(LSTM(20))
model.add(Dense(20, activation='relu'))
model.add(Dense(20, activation='relu'))
model.add(Dense(20, activation='relu'))
model.add(Dense(20, activation='relu'))
#model.add(Dense(20, activation='relu'))
#model.add(Dense(20, activation='relu'))
model.add(Dense(3, activation='softmax'))
model.summary()

Model: "sequential_16"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_16 (Embedding)    (None, 64, 32)            959328    
                                                                 
 lstm_32 (LSTM)              (None, 64, 32)            8320      
                                                                 
 lstm_33 (LSTM)              (None, 20)                4240      
                                                                 
 dense_61 (Dense)            (None, 20)                420       
                                                                 
 dense_62 (Dense)            (None, 20)                420       
                                                                 
 dense_63 (Dense)            (None, 20)                420       
                                                                 
 dense_64 (Dense)            (None, 20)              

In [131]:
from keras.optimizers import Adam
learning_rate = 1e-3
opt = Adam(learning_rate=learning_rate)
model.compile(loss='categorical_crossentropy', 
              optimizer='rmsprop',
              #optimizer=opt,
             metrics=['accuracy',Precision(),Recall()]) # Use `rmsprop`

In [100]:
#monitor="accuracy",

In [132]:
from tensorflow.keras.callbacks import EarlyStopping

es = EarlyStopping(monitor="val_loss", patience=6, restore_best_weights=True)

In [133]:
model.fit(X_train_pad, y_train, 
          epochs=1000, 
          batch_size=32, 
          verbose=1, 
          callbacks = [es],
          validation_data=(X_test_pad,y_test))

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000


<keras.callbacks.History at 0x7f5419917370>

model_3 = Sequential()
#model_3.add(layers.LSTM(units=40, input_shape=(12575,1), activation='tanh', return_sequences=True))
#model_3.add(layers.LSTM(units=20, activation='tanh', return_sequences=False))
model_3.add(Dense(20, activation="relu"))
model_3.add(Dense(20, activation="relu"))
model_3.add(Dense(20, activation="relu"))
model_3.add(Dense(10, activation="relu"))
model_3.add(Dense(3, activation='softmax'))
model_3.summary()