In [4]:
#colab
#!pip install pyspellchecker

In [5]:
import pandas as pd
import numpy as np
import string

from spellchecker import SpellChecker
import nltk
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.tokenize import word_tokenize


## Dataset

In [6]:
#colab
#from google.colab import files
#upload = files.upload()

In [7]:
#colab
#import io
#df = pd.read_csv(io.BytesIO(upload['data.csv']))
#df

In [8]:
# Load the dataset from the CSV file in the working directory and display it.
df = pd.read_csv('./data.csv')
df

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,According to the Finnish-Russian Chamber of Co...,neutral
4,The Swedish buyout firm has sold its remaining...,neutral
...,...,...
5837,RISING costs have forced packaging producer Hu...,negative
5838,Nordic Walking was first used as a summer trai...,neutral
5839,"According shipping company Viking Line , the E...",neutral
5840,"In the building and home improvement trade , s...",neutral


## Descricao

In [9]:
# Report the (rows, columns) shape of the loaded frame.
print(f'Shape {df.shape}')

Shape (5842, 2)


In [10]:
# Count the missing values in every column and print the totals.
null_counts = df.isnull().sum()
print('is there Null?')
print(null_counts)

is there Null?
Sentence     0
Sentiment    0
dtype: int64


In [11]:
# Summary statistics of the remaining columns, computed separately per sentiment label.
df.groupby('Sentiment').describe()

Unnamed: 0_level_0,Sentence,Sentence,Sentence,Sentence
Unnamed: 0_level_1,count,unique,top,freq
Sentiment,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
negative,860,860,"$ESI on lows, down $1.50 to $2.50 BK a real po...",1
neutral,3130,3124,SSH Communications Security Corporation is hea...,2
positive,1852,1852,The GeoSolutions technology will leverage Bene...,1


## PRE-PROCESSAMENTO

In [12]:
# `corpus` is a second name for the same DataFrame object as `df` (no copy is
# made), so columns assigned on `corpus` in the following cells also appear on `df`.
corpus = df

In [13]:
# Give every distinct sentiment label an integer id, numbered in sorted order.
l_unique = sorted(corpus['Sentiment'].unique())
label_map = dict(zip(l_unique, range(len(l_unique))))
label_map

{'negative': 0, 'neutral': 1, 'positive': 2}

In [None]:
# Encode the sentiment label of every row as its integer id from `label_map`.
corpus['Class'] = corpus['Sentiment'].map(label_map)
corpus

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [None]:
#https://www.kaggle.com/code/jovanchua/financial-statement-analysis


# tokenizer = TreebankWordTokenizer()
# lemmatizer = WordNetLemmatizer()
# stop_words = set(stopwords.words('english'))
# tool = language_tool_python.LanguageTool('en-US')
#def preprocess_text(text, tokenizer, lemmatizer, stop_words, spellchecker):

# Normalise the raw sentences: lowercase, strip ASCII punctuation, drop digits.
textos = corpus['Sentence']
textos = textos.str.lower()
textos = textos.str.translate(str.maketrans('', '', string.punctuation))
# `regex=True` must be explicit: pandas >= 2.0 treats the pattern as a literal
# string by default, which would leave every digit in place. The raw string
# avoids the invalid '\d' escape. '+' is already removed with the punctuation
# above, so matching digit runs only is sufficient.
textos = textos.str.replace(r'\d+', '', regex=True)  # remove digits
#textos = textos.str.replace(None, '')


In [None]:

# Attach the cleaned text, then keep only the model input and target columns.
corpus['SentenceAdj'] = textos
corpus = corpus.loc[:, ['SentenceAdj', 'Class']]
corpus

In [None]:
#colab
#import nltk
#nltk.download('punkt')

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization
from sklearn.model_selection import train_test_split

In [None]:
import os
# Ask TensorFlow's native logger to hide INFO and WARNING messages.
# NOTE(review): TensorFlow is already imported in the preceding cell; this
# variable may need to be set before that import to affect its startup logs — confirm.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    corpus['SentenceAdj'],
    corpus['Class'],
    test_size=0.2,
    random_state=42,
)
print('df split done')

In [None]:
# Fraction of all rows that ended up in the training features / training labels.
aux = len(df)
train_fraction_x = x_train.count() / aux
train_fraction_y = y_train.count() / aux
print('Train size: x ' + str(train_fraction_x) + ', y ' + str(train_fraction_y))

## MODEL - BERT
[Source](https://github.com/codebasics/deep-learning-keras-tf-tutorial/blob/master/47_BERT_text_classification/BERT_email_classification-handle-imbalance.ipynb)

In [None]:
#Colab
#!pip install tensorflow-hub
#!pip install tensorflow-text

In [None]:
import tensorflow_hub as hub
import tensorflow_text as text

In [None]:
# Wrap the TF Hub BERT preprocessing model and the matching uncased BERT
# encoder as Keras layers.
bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4")

In [None]:
def get_sentence_embeding(sentences):
    """Return the BERT pooled-output embedding for a batch of sentences.

    The sentences are passed through the module-level ``bert_preprocess``
    layer, the result is fed to ``bert_encoder``, and the ``'pooled_output'``
    entry of the encoder result is returned.

    Args:
        sentences: Batch of raw sentences accepted by ``bert_preprocess``.

    Returns:
        The value stored under ``'pooled_output'`` in the encoder output.
    """
    encoder_outputs = bert_encoder(bert_preprocess(sentences))
    return encoder_outputs['pooled_output']

In [None]:
# Show the first two cleaned sentences, each wrapped in its own list.
[[corpus['SentenceAdj'][row]] for row in (0, 1)]

In [None]:
# Embed the first two cleaned sentences in a single batch.
first_two_sentences = [corpus['SentenceAdj'][row] for row in (0, 1)]
emb = get_sentence_embeding(first_two_sentences)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Cosine similarity between the two sentence embeddings; each vector is
# wrapped in a list so that it is passed as a one-row input.
cosine_similarity([emb[0]],[emb[1]])

In [None]:
# NOTE(review): these two layers are created with the same URLs in an earlier
# cell of this notebook; this cell rebinds both names to freshly created layers.
bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4")

In [None]:
# Bert layers
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)

# Neural network layers
l = tf.keras.layers.Dropout(0.1, name="dropout")(outputs['pooled_output'])
l = tf.keras.layers.Dense(1, activation='sigmoid', name="output")(l)

# Use inputs and outputs to construct a final model
model = tf.keras.Model(inputs=[text_input], outputs = [l])

In [None]:
model.summary()

In [None]:
METRICS = [
      tf.keras.metrics.BinaryAccuracy(name='accuracy'),
      tf.keras.metrics.Precision(name='precision'),
      tf.keras.metrics.Recall(name='recall')
]

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=METRICS)

### Model-Train

In [None]:
model.fit(x_train, y_train, epochs=1)


In [None]:
model.evaluate(x_test, y_test)

In [None]:
y_predicted = model.predict(x_test)
y_predicted = y_predicted.flatten()

### Model - Evaluate

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
from matplotlib import pyplot as plt
import seaborn as sn

In [None]:
y_predicted = np.where(y_predicted > 0.5, 1, 0)
y_predicted

In [None]:
# Confusion matrix of the true test labels against the predicted labels.
cm = confusion_matrix(y_test, y_predicted)
cm

In [None]:
# Draw the confusion matrix as an annotated heatmap and label both axes.
ax = sn.heatmap(cm, annot=True, fmt='d')
ax.set_xlabel('Predicted')
ax.set_ylabel('Truth')

## MODEL - LSTM