In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install -q statsmodels
!pip install -q scikit-learn

In [3]:
# Importing the required libraries
import numpy as np
import tensorflow as tf
from keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Bidirectional, Dropout, BatchNormalization, Conv1D, MaxPooling1D, Flatten, Embedding
from sklearn.model_selection import KFold
import pickle

tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [4]:
# Load the data from the file

with open('/content/drive/MyDrive/MacMorpho/macmorpho-train.txt', 'r') as f:
    data = f.readlines()
    
# Preprocessing the data
X, Y = [], []
for line in data:
    tokens = line.strip().split()
    X.append([t.split('_')[0].lower() for t in tokens])
    Y.append([t.split('_')[1] for t in tokens])
    
# Creating vocabulary and dictionaries
vocab = sorted(set(np.concatenate(X)))
vocab.append('<UNK>')  # add <UNK> to represent unknown words
tag_dict = {t: i for i, t in enumerate(sorted(set(np.concatenate(Y))))}
reverse_tag_dict = {v: k for k, v in tag_dict.items()}
word_dict = {w: i+1 for i, w in enumerate(vocab)}  # shift indices by 1 to make room for <UNK>
word_dict['<UNK>'] = 0  # add <UNK> to dictionary and give it index 0

# Converting words and tags to numbers
X = [[word_dict[word] for word in sent] for sent in X]
Y = [[tag_dict[tag] for tag in sent] for sent in Y]

# Padding the sequences
X = tf.keras.preprocessing.sequence.pad_sequences(X)
Y = tf.keras.preprocessing.sequence.pad_sequences(Y)

# # Creating train and validation sets
# X_train, X_val = X[:50000], X[50000:]
# Y_train, Y_val = Y[:50000], Y[50000:]

In [5]:
# Set the number of folds
n_splits = 5

# Initialize the KFold object
kf = KFold(n_splits=n_splits)

# Create an empty list to store the validation accuracie, AICs and models
val_accuracies = []
val_aics = []
models = []


# Loop over the folds
for train_index, val_index in kf.split(X):
    # Split the data into train and validation sets
    X_train, X_val = X[train_index], X[val_index]
    Y_train, Y_val = Y[train_index], Y[val_index]

    # Define the model architecture
    model = Sequential()
    model.add(Embedding(input_dim=len(vocab), output_dim=128, mask_zero=True))
    model.add(Bidirectional(LSTM(units=128, return_sequences=True)))
    model.add(Dropout(0.7))
    model.add(Bidirectional(LSTM(units=64, return_sequences=True)))
    model.add(Dropout(0.7))
    model.add(Dense(units=len(tag_dict), activation='softmax'))

    # Compile the model
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    # Train the model
    model.fit(X_train, Y_train, validation_data=(X_val, Y_val), epochs=10, batch_size=128)

    # Evaluate the model on the validation set
    _, accuracy = model.evaluate(X_val, Y_val, batch_size=128)

    # Calculate the AIC for the model
    k = model.count_params()
    L = np.exp(-accuracy / 2)
    aic = 2 * k - 2 * np.log(L)

    # Append the validation accuracy and AIC to the lists
    val_accuracies.append(accuracy)
    val_aics.append(aic)
    models.append(model)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [9]:
# Calculate the average validation accuracy
avg_val_accuracy = np.mean(val_accuracies)

# Print the average validation accuracy
print('Average validation accuracy:', avg_val_accuracy)

Average validation accuracy: 0.9298357963562012


In [10]:
# Choose the model with the lowest AIC
best_model_index = np.argmin(val_aics)
best_model = models[best_model_index]

# Salva o modelo escolhido em um arquivo
with open('/content/drive/MyDrive/MacMorpho/modelo_treinado.pkl', 'wb') as f:
    pickle.dump(best_model, f)
print("Modelo Salvo!")
# Preprocess test data
with open('/content/drive/MyDrive/MacMorpho/macmorpho-test.txt', 'r') as f:
    test_data = f.readlines()

test_X, test_Y = [], []
for line in test_data:
    tokens = line.strip().split()
    test_X.append([word_dict.get(t.split('_')[0].lower(), 0) for t in tokens])  # replace unknown words with <UNK>
    test_Y.append([tag_dict[t.split('_')[1]] for t in tokens])

# Padding the sequences
test_X = tf.keras.preprocessing.sequence.pad_sequences(test_X)
test_Y = tf.keras.preprocessing.sequence.pad_sequences(test_Y)

# Evaluating the model on test data
loss, accuracy = model.evaluate(test_X, test_Y, batch_size=128)
print('Test Loss:', loss)
print('Test Accuracy:', accuracy)

Modelo Salvo!
Test Loss: 0.2738824784755707
Test Accuracy: 0.9413623213768005


In [11]:
# Exemplo de frase para classificar
input_text = "Era uma vez um rapaz chamado Ivan . Num certo dia o Ivan foi à escola ele era tão distraído, que precisava dos professores. Passado algum tempo chegou a hora do lanche, o Ivan enquanto lanchava imitava as pessoas quando tocou para o intervalo. Como estava um dia de sol o Ivan foi jogar futebol como ele era um dos melhores a jogar estava na equipe principal da escola"

# Converte a frase em uma sequência de índices de palavras
input_sequence = [word_dict.get(word.lower(), 0) for word in input_text.split()]

# Adiciona padding à sequência
padded_input_sequence = tf.keras.preprocessing.sequence.pad_sequences([input_sequence])

# Faz a previsão das etiquetas das palavras na frase
predicted_tags = model.predict(padded_input_sequence)[0]

# Converte os índices das etiquetas em suas representações em texto
predicted_tags_text = [reverse_tag_dict[np.argmax(tag)] for tag in predicted_tags]

# Imprime as palavras e suas etiquetas previstas
for i, word in enumerate(input_text.split()):
    print(word, predicted_tags_text[i])

Era V
uma ART
vez N
um ART
rapaz N
chamado PCP
Ivan NPROP
. PU
Num PREP+ART
certo ADJ
dia N
o ART
Ivan NPROP
foi V
à PREP+ART
escola N
ele PROPESS
era V
tão ADV
distraído, ADV
que KS
precisava V
dos PREP+ART
professores. ADV
Passado N
algum PROADJ
tempo N
chegou V
a ART
hora N
do PREP+ART
lanche, ADV
o ART
Ivan NPROP
enquanto KS
lanchava ADV
imitava ADV
as ART
pessoas N
quando KS
tocou V
para PREP
o PROSUB
intervalo. ADV
Como KS
estava V
um ART
dia N
de PREP
sol N
o ART
Ivan NPROP
foi V
jogar V
futebol N
como KS
ele PROPESS
era V
um PROSUB
dos PREP+ART
melhores ADJ
a PREP
jogar V
estava V
na PREP+ART
equipe N
principal ADJ
da PREP+ART
escola N
