<a href="https://colab.research.google.com/github/wesleycoutinhodev/pln-b2w-reviews/blob/main/notebooks/Operacao_10_Doc_Embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Monte o Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Download do dataset
!gdown --id 1_Elg5O_H0fJ1mIWEKkb6q4ul9x7SIhnk -O /content/drive/MyDrive/pln_data/

In [None]:
!pip install gensim
# Executar no início dos notebooks
!pip install gensim sentence-transformers
!pip install wordcloud plotly

# Para downloads do dataset
!pip install gdown

In [None]:
# Verificar uso de RAM
!cat /proc/meminfo | head -n 3

# Verificar GPU disponível
!nvidia-smi

In [None]:
# Bibliotecas principais
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Pré-processamento
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [None]:
# TensorFlow/Keras
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization

# Word embeddings: gensim (Word2Vec / Doc2Vec) and Sentence-Transformers
import gensim
from gensim.models import Word2Vec, Doc2Vec
from sentence_transformers import SentenceTransformer

# NLTK: fetch the stopwords corpus data before importing the corpus reader
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

In [None]:
# Load the processed review dataset from Google Drive into a DataFrame.
# NOTE(review): this path is not the directory the download cell writes to
# (/content/drive/MyDrive/pln_data/) — confirm the file is placed here.
reviews = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/PLN_B2W_Reviews/data/processed/b2w_reviews_samsung_modified.csv'
)

# Display the loaded frame
reviews

In [None]:
# Select the review text column.
# Missing values are replaced with empty strings (and every value coerced to
# str) because read_csv loads empty fields as NaN by default, and a float NaN
# has no .split() when the reviews are tokenized later on.
# fillna keeps one entry per row, so positions still line up with `reviews`.
all_reviews = reviews['review_text'].fillna('').astype(str)
all_reviews

In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [None]:
# Whitespace-tokenize each review into a list of tokens
reviews_tokens = [review.split() for review in all_reviews]

# Preview only the first few tokenized reviews; echoing the whole corpus
# would flood the cell output and bloat the saved notebook.
reviews_tokens[:5]

In [None]:
# Wrap each tokenized review in a TaggedDocument whose single tag is the
# review's position (as a string), so its vector can be looked up by that key.
tagged_data = [
    TaggedDocument(words=tokens, tags=[str(position)])
    for position, tokens in enumerate(reviews_tokens)
]

# Configure Doc2Vec: 100-dimensional vectors, context window of 5, keep every
# word (min_count=1), 4 worker threads, 20 epochs, dm=1 (distributed memory).
model_doc2vec = Doc2Vec(vector_size=100, window=5, min_count=1, workers=4, epochs=20, dm=1)

# Build the vocabulary from the tagged corpus, then train on that same corpus
# using the document count and epoch setting stored on the model.
model_doc2vec.build_vocab(tagged_data)
model_doc2vec.train(
    tagged_data,
    total_examples=model_doc2vec.corpus_count,
    epochs=model_doc2vec.epochs,
)

In [None]:
# Collect the trained document vector of every review, in row order; the
# lookup keys are the string positions "0", "1", ... up to len(reviews) - 1.
vectors_doc2vec = [
    model_doc2vec.dv[tag]
    for tag in map(str, range(len(reviews)))
]

print("Embedding do primeiro review:", vectors_doc2vec[0])

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
# Load the multilingual paraphrase MiniLM checkpoint used to embed the reviews
model_sbert = SentenceTransformer(
    model_name_or_path="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
)

In [None]:
# Encode every review with SBERT.
# encode() is documented to take a string or a list of strings, so the pandas
# Series is converted to a plain list first; a Series only works there when
# integer-key lookups happen to coincide with positions (default 0..n-1 index).
vectors_sbert = model_sbert.encode(all_reviews.tolist())

# Report the embedding matrix shape and show the first review's vector
print("Dimensão dos embeddings SBERT:", vectors_sbert.shape)
print("Embedding do primeiro review:", vectors_sbert[0])