## Load Model

In [24]:
import spacy

nlp = spacy.load('en_core_web_sm')

## Read Data

In [25]:
import pandas as pd
import os

def read_folder(path_to_folder):
    data_list = []
    try:
        filenames = [f for f in os.listdir(path_to_folder) if f.endswith('.txt')]
        for filename in filenames:
            full_path = os.path.join(path_to_folder, filename)
            temp_df = pd.read_csv(full_path, sep='\t', header=None, names=['texto'])

            # Adicione o DataFrame lido à nossa lista
            data_list.append(temp_df)

        # 4. Concatene todos os DataFrames da lista em um único DataFrame final
        if data_list:
            final_df = pd.concat(data_list, ignore_index=True)
            print("Arquivos lidos com sucesso!")
            return final_df
        else:
            print(f"Nenhum arquivo .txt encontrado no diretório: {path_to_folder}")

    except FileNotFoundError:
        print(f"Erro: O diretório não foi encontrado em '{path_to_folder}'")
    except Exception as e:
        print(f"Ocorreu um erro inesperado: {e}")

In [26]:
path_to_folder = 'data/train/neg/'
df_neg = read_folder(path_to_folder)
path_to_folder = 'data/train/pos/'
df_pos = read_folder(path_to_folder)

Arquivos lidos com sucesso!
Arquivos lidos com sucesso!


## Add target

In [27]:
df_pos['sentiment'] = 1
df_pos

Unnamed: 0,texto,sentiment
0,Bromwell High is a cartoon comedy. It ran at t...,1
1,Homelessness (or Houselessness as George Carli...,1
2,Brilliant over-acting by Lesley Ann Warren. Be...,1
3,This is easily the most underrated film inn th...,1
4,This is not the typical Mel Brooks film. It wa...,1
...,...,...
12495,"Seeing as the vote average was pretty low, and...",1
12496,"The plot had some wretched, unbelievable twist...",1
12497,I am amazed at how this movie(and most others ...,1
12498,A Christmas Together actually came before my t...,1


In [28]:
df_neg['sentiment'] = 0
df_neg

Unnamed: 0,texto,sentiment
0,Story of a man who has unnatural feelings for ...,0
1,Airport '77 starts as a brand new luxury 747 p...,0
2,This film lacked something I couldn't put my f...,0
3,"Sorry everyone,,, I know this is supposed to b...",0
4,When I was little my parents took me along to ...,0
...,...,...
12495,"Towards the end of the movie, I felt it was to...",0
12496,This is the kind of movie that my enemies cont...,0
12497,I saw 'Descent' last night at the Stockholm Fi...,0
12498,Some films that you pick up for a pound turn o...,0


## Concat data

In [29]:
df = pd.concat([df_neg, df_pos], ignore_index=True)
df

Unnamed: 0,texto,sentiment
0,Story of a man who has unnatural feelings for ...,0
1,Airport '77 starts as a brand new luxury 747 p...,0
2,This film lacked something I couldn't put my f...,0
3,"Sorry everyone,,, I know this is supposed to b...",0
4,When I was little my parents took me along to ...,0
...,...,...
24995,"Seeing as the vote average was pretty low, and...",1
24996,"The plot had some wretched, unbelievable twist...",1
24997,I am amazed at how this movie(and most others ...,1
24998,A Christmas Together actually came before my t...,1


## Preprocessing Data

In [31]:
def preprocess(text):
    doc = nlp(text)
    return [token.lemma_ for token in doc if not token.is_stop and token.is_alpha and not token.is_punct]

df['texto_preprocessado'] = df['texto'].apply(preprocess)
df

Unnamed: 0,texto,sentiment,texto_preprocessado
0,Story of a man who has unnatural feelings for ...,0,"[story, man, unnatural, feeling, pig, start, o..."
1,Airport '77 starts as a brand new luxury 747 p...,0,"[airport, start, brand, new, luxury, plane, lo..."
2,This film lacked something I couldn't put my f...,0,"[film, lack, finger, charisma, lead, actress, ..."
3,"Sorry everyone,,, I know this is supposed to b...",0,"[sorry, know, suppose, art, film, wow, hand, g..."
4,When I was little my parents took me along to ...,0,"[little, parent, take, theater, Interiors, mov..."
...,...,...,...
24995,"Seeing as the vote average was pretty low, and...",1,"[see, vote, average, pretty, low, fact, clerk,..."
24996,"The plot had some wretched, unbelievable twist...",1,"[plot, wretched, unbelievable, twist, chemistr..."
24997,I am amazed at how this movie(and most others ...,1,"[amazed, average, star, low, crappy, movie, av..."
24998,A Christmas Together actually came before my t...,1,"[Christmas, actually, come, time, raise, John,..."


## Vectorizing Data

In [32]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

all_processed_reviews = df['texto_preprocessado'].tolist()

MAX_VOCAB_SIZE = 10000

tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, oov_token="<unk>")
tokenizer.fit_on_texts(all_processed_reviews)

word_index = tokenizer.word_index
print(f"Encontradas {len(word_index)} palavras únicas.")

Encontradas 61623 palavras únicas.


In [33]:
sequences = tokenizer.texts_to_sequences(all_processed_reviews)
print("\nSequências de inteiros:")
print(sequences)

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [34]:
MAX_SEQUENCE_LENGTH = 50

X = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='pre')

y = np.array(df['sentiment'])

print("\nSequências com padding (formato final para o Keras):")
print(X)
print("\nShape do tensor de dados (X):", X.shape)
print("Shape do tensor de rótulos (y):", y.shape)


Sequências com padding (formato final para o Keras):
[[ 510   13 1025 ... 6255   11 2773]
 [ 471 3263   58 ...  164    1 3549]
 [   0    0    0 ... 1710  495  954]
 ...
 [ 109   43   18 ...  826   78 6204]
 [ 776  208 7483 ...   79 7190 1401]
 [ 377 2661  474 ...  204 2207   19]]

Shape do tensor de dados (X): (25000, 50)
Shape do tensor de rótulos (y): (25000,)


In [35]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional

EMBEDDING_DIM = 64

model = Sequential()

model.add(Embedding(input_dim=MAX_VOCAB_SIZE,
                    output_dim=EMBEDDING_DIM,
                    input_length=MAX_SEQUENCE_LENGTH))

model.add(Bidirectional(LSTM(64)))

model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.summary()



In [37]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)

history = model.fit(
    X,
    y,
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=64
)

Epoch 1/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 17ms/step - accuracy: 0.9134 - loss: 0.2207 - val_accuracy: 0.9262 - val_loss: 0.1905
Epoch 2/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 16ms/step - accuracy: 0.9279 - loss: 0.1901 - val_accuracy: 0.9452 - val_loss: 0.1604
Epoch 3/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 15ms/step - accuracy: 0.9415 - loss: 0.1619 - val_accuracy: 0.9598 - val_loss: 0.1294
Epoch 4/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 14ms/step - accuracy: 0.9558 - loss: 0.1307 - val_accuracy: 0.9724 - val_loss: 0.0948
Epoch 5/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 14ms/step - accuracy: 0.9684 - loss: 0.0988 - val_accuracy: 0.9828 - val_loss: 0.0688
Epoch 6/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 15ms/step - accuracy: 0.9800 - loss: 0.0706 - val_accuracy: 0.9888 - val_loss: 0.0487
Epoch 7/10
[1m391/391

In [38]:
model.evaluate(X_test, y_test) * 100

[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9966 - loss: 0.0157


[0.015669720247387886,
 0.9965999722480774,
 0.015669720247387886,
 0.9965999722480774,
 0.015669720247387886,
 0.9965999722480774,
 0.015669720247387886,
 0.9965999722480774,
 0.015669720247387886,
 0.9965999722480774,
 0.015669720247387886,
 0.9965999722480774,
 0.015669720247387886,
 0.9965999722480774,
 0.015669720247387886,
 0.9965999722480774,
 0.015669720247387886,
 0.9965999722480774,
 0.015669720247387886,
 0.9965999722480774,
 0.015669720247387886,
 0.9965999722480774,
 0.015669720247387886,
 0.9965999722480774,
 0.015669720247387886,
 0.9965999722480774,
 0.015669720247387886,
 0.9965999722480774,
 0.015669720247387886,
 0.9965999722480774,
 0.015669720247387886,
 0.9965999722480774,
 0.015669720247387886,
 0.9965999722480774,
 0.015669720247387886,
 0.9965999722480774,
 0.015669720247387886,
 0.9965999722480774,
 0.015669720247387886,
 0.9965999722480774,
 0.015669720247387886,
 0.9965999722480774,
 0.015669720247387886,
 0.9965999722480774,
 0.015669720247387886,
 0.996599

In [None]:
git