In [1]:
!pip install numpy pandas scikit-learn tensorflow setuptools matplotlib pyarrow fastparquet transformers torch spacy seaborn tf-keras
!python3 -m spacy download en_core_web_sm

Collecting numpy
  Using cached numpy-2.0.2-cp312-cp312-macosx_14_0_arm64.whl.metadata (60 kB)
Using cached numpy-2.0.2-cp312-cp312-macosx_14_0_arm64.whl (5.0 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.4
    Uninstalling numpy-1.26.4:
      Successfully uninstalled numpy-1.26.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
openai-whisper 20231117 requires tiktoken, which is not installed.
gensim 4.3.3 requires numpy<2.0,>=1.18.5, but you have numpy 2.0.2 which is incompatible.
langchain 0.3.3 requires numpy<2.0.0,>=1.26.0; python_version >= "3.12", but you have numpy 2.0.2 which is incompatible.
faiss-cpu 1.8.0.post1 requires numpy<2.0,>=1.0, but you have numpy 2.0.2 which is incompatible.
langchain-community 0.3.2 requires numpy<2.0.0,>=1.26.0; python_version >= "3.12", but you have

In [2]:
# Importação das bibliotecas necessárias
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import spacy
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt

# Carregar o conjunto de dados
df = pd.read_csv('wiki_movie_plots_deduped.csv')

# Verificar as primeiras linhas do DataFrame
print(df.head())

# Preprocessamento da coluna 'Genre'
# Dividir os gêneros em listas
df['Genre'] = df['Genre'].str.split(', ')

# Padronizar os gêneros: remover espaços extras e converter para minúsculas
df['Genre'] = df['Genre'].apply(lambda genres: [g.lower().strip() for g in genres])

# Atualizar a lista de gêneros selecionados para minúsculas
selected_genres = [genre.lower() for genre in ['Action', 'Comedy', 'Drama', 'Horror', 'Romance']]

# Filtrar filmes que tenham pelo menos um dos gêneros selecionados
df['Selected_Genre'] = df['Genre'].apply(lambda genres: [g for g in genres if g in selected_genres])

# Remover filmes que não possuem os gêneros selecionados
df = df[df['Selected_Genre'].map(len) > 0]

# Atribuir o primeiro gênero selecionado como o gênero principal
df['Primary_Genre'] = df['Selected_Genre'].apply(lambda genres: genres[0])

# Verificar o número de linhas após o filtro
print(f"Número de linhas após filtrar os gêneros selecionados: {len(df)}")
print(df[['Title', 'Primary_Genre']].head())

# Preprocessamento: Tokenização, StopWords e Lematização
nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):
    doc = nlp(text.lower())  # Converter para minúsculas
    lemmatized_text = ' '.join([token.lemma_ for token in doc if not token.is_stop and not token.is_punct])
    return lemmatized_text

df['Processed_Plot'] = df['Plot'].apply(preprocess_text)

# Preparação dos dados para o BERT
texts = df['Processed_Plot'].tolist()
labels = df['Primary_Genre'].tolist()

# Codificar os rótulos
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels)
num_labels = len(label_encoder.classes_)

# Dividir os dados em treino e teste
X_train_texts, X_test_texts, y_train, y_test = train_test_split(
    texts, labels_encoded, test_size=0.2, random_state=42, stratify=labels_encoded
)

# Carregar o tokenizer do BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenizar os textos
def tokenize_texts(texts):
    return tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=256,
        return_tensors='tf'
    )

X_train = tokenize_texts(X_train_texts)
X_test = tokenize_texts(X_test_texts)

# Criar os datasets do TensorFlow
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(X_train),
    y_train
)).shuffle(1000).batch(16)

test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(X_test),
    y_test
)).batch(16)

# Carregar o modelo pré-treinado BERT
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

# Compilar o modelo
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

model.compile(optimizer=optimizer, loss=loss_fn, metrics=[metric])

# Configurar o EarlyStopping
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True
)

# Treinar o modelo
history = model.fit(
    train_dataset,
    validation_data=test_dataset,
    epochs=3,
    callbacks=[early_stopping]
)

# Avaliar o modelo no conjunto de teste
loss, accuracy = model.evaluate(test_dataset)
print(f"Acurácia no conjunto de teste: {accuracy * 100:.2f}%")

# Obter previsões
y_pred_logits = model.predict(test_dataset).logits
y_pred = np.argmax(y_pred_logits, axis=1)

# Relatório de classificação
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

# Matriz de confusão
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(10, 8))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    xticklabels=label_encoder.classes_,
    yticklabels=label_encoder.classes_,
    cmap='Blues'
)
plt.xlabel('Predito')
plt.ylabel('Real')
plt.title('Matriz de Confusão')
plt.show()


   Release Year                             Title Origin/Ethnicity  \
0          1901            Kansas Saloon Smashers         American   
1          1901     Love by the Light of the Moon         American   
2          1901           The Martyred Presidents         American   
3          1901  Terrible Teddy, the Grizzly King         American   
4          1902            Jack and the Beanstalk         American   

                             Director Cast    Genre  \
0                             Unknown  NaN  unknown   
1                             Unknown  NaN  unknown   
2                             Unknown  NaN  unknown   
3                             Unknown  NaN  unknown   
4  George S. Fleming, Edwin S. Porter  NaN  unknown   

                                           Wiki Page  \
0  https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...   
1  https://en.wikipedia.org/wiki/Love_by_the_Ligh...   
2  https://en.wikipedia.org/wiki/The_Martyred_Pre...   
3  https://en.wikipedia.

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
 85/809 [==>...........................] - ETA: 11:01:40 - loss: 1.3091 - accuracy: 0.4463

KeyboardInterrupt: 