In [5]:
# Importación de librerías necesarias
import numpy as np
import pandas as pd
import networkx as nx
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

# Para embeddings y modelos de gráficos
import torch
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv, GATConv, global_mean_pool
from torch_geometric.utils import from_networkx
from gensim.models import Word2Vec

# Cargar el dataset de IMDB
from sklearn.datasets import load_files
import re
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ymamani/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
def preprocess_text(text, stop_words=None):
    """Lower-case ``text``, replace non-word characters and drop stop words.

    Args:
        text: Raw document string.
        stop_words: Optional collection of words to remove. When ``None``
            (the default) the NLTK English stop-word list is used.

    Returns:
        The remaining tokens joined by single spaces.
    """
    if stop_words is None:
        # Load the list once per call and keep it as a set. The previous
        # version called stopwords.words('english') again for every token
        # and searched the returned list linearly.
        stop_words = set(stopwords.words('english'))
    elif not isinstance(stop_words, (set, frozenset)):
        stop_words = set(stop_words)

    # Every character matched by \W becomes a space, so punctuation also
    # acts as a token separator.
    text = re.sub(r'\W', ' ', text.lower())
    tokens = [word for word in text.split() if word not in stop_words]
    return ' '.join(tokens)

# Load the dataset from a CSV file (make sure the path points to the right file)
df = pd.read_csv('/home/ymamani/projects/code/experimentos2/imdb_ds_2k_clean.csv')

# The columns are assumed to be 'sw_text' (document) and 'sentiment' (label)
X = df['sw_text'].values
y = df['sentiment'].values

# Clean every document with preprocess_text
X = list(map(preprocess_text, X))

# Encode the sentiment labels as integers. LabelEncoder numbers the classes
# in sorted order, so labels 'negative'/'positive' would become 0/1.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

# Hold out 20% of the documents for testing, with a fixed seed
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [3]:
# Build a word co-occurrence graph to feed Node2Vec
def build_cooccurrence_graph(texts):
    """Return an undirected graph linking each token to the token after it.

    Args:
        texts: Iterable of whitespace-tokenisable strings.

    Returns:
        An ``nx.Graph`` whose nodes are tokens and whose edges join tokens
        that appear next to each other in at least one text.
    """
    graph = nx.Graph()
    for document in texts:
        words = document.split()
        # zip stops at the shorter sequence, so this yields exactly the
        # (words[i], words[i + 1]) pairs and nothing for 0- or 1-token texts.
        graph.add_edges_from(zip(words, words[1:]))
    return graph

# Generate embeddings with Node2Vec
def node2vec_embeddings(G, dimensions=300, walk_length=40, num_walks=300,
                        workers=30, window=5, min_count=1, batch_words=4):
    """Run Node2Vec on ``G`` and return the fitted model.

    All hyper-parameters were previously hard-coded; the defaults below are
    those same values, so ``node2vec_embeddings(G)`` behaves as before.
    The recorded run of this notebook was interrupted while generating walks
    with these defaults, so smaller values can be passed for quick
    experiments.

    Args:
        G: Graph passed to ``Node2Vec``.
        dimensions: Forwarded to the ``Node2Vec`` constructor.
        walk_length: Forwarded to the ``Node2Vec`` constructor.
        num_walks: Forwarded to the ``Node2Vec`` constructor.
        workers: Forwarded to the ``Node2Vec`` constructor.
        window: Forwarded to ``Node2Vec.fit``.
        min_count: Forwarded to ``Node2Vec.fit``.
        batch_words: Forwarded to ``Node2Vec.fit``.

    Returns:
        Whatever ``Node2Vec.fit`` returns; later cells read its ``.wv``
        attribute.
    """
    # Imported lazily so the notebook still loads when node2vec is missing.
    from node2vec import Node2Vec

    walker = Node2Vec(
        G,
        dimensions=dimensions,
        walk_length=walk_length,
        num_walks=num_walks,
        workers=workers,
    )
    model = walker.fit(window=window, min_count=min_count, batch_words=batch_words)
    return model

# Build the co-occurrence graph from the training texts only
G = build_cooccurrence_graph(X_train_text)
# Fit Node2Vec on that graph; the recorded run shows this step taking many
# minutes and being interrupted.
node2vec_model = node2vec_embeddings(G)

  from .autonotebook import tqdm as notebook_tqdm
Computing transition probabilities: 100%|██████████| 21720/21720 [01:41<00:00, 214.62it/s] 
Generating walks (CPU: 1): 100%|██████████| 50/50 [07:55<00:00,  9.52s/it]
Generating walks (CPU: 2): 100%|██████████| 50/50 [07:56<00:00,  9.53s/it]
Generating walks (CPU: 3): 100%|██████████| 50/50 [07:55<00:00,  9.50s/it]
Generating walks (CPU: 4): 100%|██████████| 50/50 [07:58<00:00,  9.57s/it]


KeyboardInterrupt: 

In [None]:
# Compute one embedding per document
def get_doc_embedding(text, model):
    """Average the vectors of the tokens of ``text`` that ``model.wv`` knows.

    Args:
        text: Whitespace-separated document string.
        model: Object exposing ``wv``, which supports ``in``, item lookup by
            token and a ``vector_size`` attribute.

    Returns:
        The element-wise mean of the known token vectors, or a zero vector of
        length ``model.wv.vector_size`` when no token is known.
    """
    keyed_vectors = model.wv
    vectors = [keyed_vectors[token] for token in text.split() if token in keyed_vectors]
    if not vectors:
        # No known token: fall back to an all-zero embedding.
        return np.zeros(keyed_vectors.vector_size)
    return np.mean(vectors, axis=0)

# Stack one Node2Vec document embedding per row for the train and test splits
X_train_embeddings = np.array([get_doc_embedding(text, node2vec_model) for text in X_train_text])
X_test_embeddings = np.array([get_doc_embedding(text, node2vec_model) for text in X_test_text])

# Train and evaluate classifiers
def evaluate_classifiers(X_train, X_test, y_train, y_test):
    """Fit five classifiers on the training features and print test metrics.

    For each classifier this prints accuracy and binary precision, recall
    and F1 on the test split.

    NOTE: a later cell of this notebook defines another function with the
    same name and an extra ``embedding_name`` parameter; whichever cell ran
    last wins.

    Args:
        X_train: Training feature matrix.
        X_test: Test feature matrix.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        None. Results are printed.
    """
    classifiers = {
        'Random Forest': RandomForestClassifier(),
        'SVM': SVC(),
        'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
        'KNN': KNeighborsClassifier(),
        'Logistic Regression': LogisticRegression(max_iter=1000)
    }
    for name, clf in classifiers.items():
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        # zero_division=0 matches the later definition in this notebook and
        # avoids the undefined-metric warning when a classifier never
        # predicts the positive class.
        precision, recall, f1, _ = precision_recall_fscore_support(
            y_test, y_pred, average='binary', zero_division=0)
        print(f'Classifier: {name}')
        print(f'Accuracy: {acc:.4f}')
        print(f'Precision: {precision:.4f}')
        print(f'Recall: {recall:.4f}')
        print(f'F1-score: {f1:.4f}')
        print('-----------------------------')

# Report classifier metrics on the Node2Vec document embeddings
evaluate_classifiers(X_train_embeddings, X_test_embeddings, y_train, y_test)

In [7]:






# GCN and GAT need the data represented as a graph.
# As a simplification we build a complete graph over the documents.

# Build the graph for PyTorch Geometric
def build_pyg_graph(X_text, y):
    """Build a fully connected document graph and convert it with from_networkx.

    Each document becomes a node (id = its position in ``X_text``) carrying
    ``text`` and ``label`` attributes, and every pair of distinct nodes is
    joined by an edge.

    NOTE(review): with a complete graph every node is a neighbour of every
    other node; this may be why the recorded training loss stalls near
    0.693 - confirm.

    Args:
        X_text: Sequence of document strings.
        y: Labels, indexable by document position.

    Returns:
        The result of ``from_networkx`` on the constructed graph.
    """
    n_docs = len(X_text)
    doc_graph = nx.Graph()
    doc_graph.add_nodes_from(
        (doc_id, {'text': doc, 'label': y[doc_id]})
        for doc_id, doc in enumerate(X_text)
    )
    # All unordered pairs (src < dst): the graph is complete.
    doc_graph.add_edges_from(
        (src, dst)
        for src in range(n_docs)
        for dst in range(src + 1, n_docs)
    )
    return from_networkx(doc_graph)

# Turn the texts into vectors (here: bag-of-words counts, vocabulary capped at 5000)
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(max_features=5000)
# The vocabulary is fitted on the training texts only and reused for the test texts
X_train_vec = vectorizer.fit_transform(X_train_text).toarray()
X_test_vec = vectorizer.transform(X_test_text).toarray()

# Build the PyTorch Geometric Data object for training: count vectors as node
# features, edges from the complete document graph, labels as targets
data = Data(x=torch.tensor(X_train_vec, dtype=torch.float), edge_index=build_pyg_graph(X_train_text, y_train).edge_index, y=torch.tensor(y_train))

# Define the GCN and GAT models
class GCN(torch.nn.Module):
    """Two GCNConv layers (input_dim -> 64 -> 32) followed by a 2-way linear head."""

    def __init__(self, input_dim):
        """Create the layers; ``input_dim`` is the node feature size."""
        super().__init__()
        self.conv1 = GCNConv(input_dim, 64)
        self.conv2 = GCNConv(64, 32)
        self.lin = torch.nn.Linear(32, 2)

    def forward(self, data):
        """Return per-node outputs of the linear head for ``data.x`` / ``data.edge_index``."""
        edges = data.edge_index
        hidden = torch.relu(self.conv1(data.x, edges))
        hidden = torch.relu(self.conv2(hidden, edges))
        return self.lin(hidden)

class GAT(torch.nn.Module):
    """Two GATConv layers (8 heads of 64, then 1 head of 32) plus a 2-way linear head."""

    def __init__(self, input_dim):
        """Create the layers; ``input_dim`` is the node feature size."""
        super().__init__()
        first_heads = 8
        first_width = 64
        self.conv1 = GATConv(input_dim, first_width, heads=first_heads, concat=True)
        # conv1 is built with concat=True, so conv2 is sized for heads * width inputs.
        self.conv2 = GATConv(first_width * first_heads, 32, heads=1, concat=True)
        self.lin = torch.nn.Linear(32, 2)

    def forward(self, data):
        """Return per-node outputs of the linear head for ``data.x`` / ``data.edge_index``."""
        edges = data.edge_index
        hidden = torch.relu(self.conv1(data.x, edges))
        hidden = torch.relu(self.conv2(hidden, edges))
        return self.lin(hidden)

# Train the GCN and GAT models
def train_graph_model(model, data, epochs=100, lr=0.01, log_every=1):
    """Train ``model`` on the full graph ``data`` with Adam and cross-entropy.

    The learning rate and logging frequency used to be hard-coded; the
    defaults reproduce the previous behaviour (lr 0.01, one line per epoch).

    Args:
        model: Module called as ``model(data)`` that returns per-node logits.
        data: Object passed to the model; ``data.y`` holds the targets.
        epochs: Number of full-batch optimisation steps.
        lr: Learning rate passed to ``torch.optim.Adam``.
        log_every: Print the loss every ``log_every`` epochs; ``0`` or
            ``None`` disables printing.

    Returns:
        None. The model is updated in place and left in training mode.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = torch.nn.CrossEntropyLoss()
    model.train()
    for epoch in range(1, epochs + 1):
        optimizer.zero_grad()
        out = model(data)
        loss = criterion(out, data.y)
        loss.backward()
        optimizer.step()
        if log_every and epoch % log_every == 0:
            print(f'Epoch {epoch}, Loss: {loss.item():.4f}')

# Prepare the test data for PyTorch Geometric (its own complete graph over the test documents)
data_test = Data(x=torch.tensor(X_test_vec, dtype=torch.float), edge_index=build_pyg_graph(X_test_text, y_test).edge_index, y=torch.tensor(y_test))


def _evaluate_graph_model(model, graph, y_true, title):
    """Predict on ``graph`` with ``model`` and print accuracy/precision/recall/F1.

    Replaces the two copy-pasted evaluation sections for GCN and GAT.

    Args:
        model: Trained module called as ``model(graph)``.
        graph: Data object to predict on.
        y_true: Ground-truth labels for the nodes of ``graph``.
        title: First line printed in the report.

    Returns:
        Tuple ``(out, y_pred, acc, precision, recall, f1)``.
    """
    model.eval()
    with torch.no_grad():
        out = model(graph)
        y_pred = out.argmax(dim=1).numpy()
    acc = accuracy_score(y_true, y_pred)
    # zero_division=0: the recorded output shows the undefined-metric warning,
    # raised when the model never predicts the positive class.
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average='binary', zero_division=0)
    print(title)
    print(f'Accuracy: {acc:.4f}')
    print(f'Precision: {precision:.4f}')
    print(f'Recall: {recall:.4f}')
    print(f'F1-score: {f1:.4f}')
    print('-----------------------------')
    return out, y_pred, acc, precision, recall, f1


# Train and evaluate GCN
gcn_model = GCN(input_dim=X_train_vec.shape[1])
train_graph_model(gcn_model, data)
out, y_pred, acc, precision, recall, f1 = _evaluate_graph_model(
    gcn_model, data_test, y_test, 'GCN Model Evaluation')

# Train and evaluate GAT
gat_model = GAT(input_dim=X_train_vec.shape[1])
train_graph_model(gat_model, data)
out, y_pred, acc, precision, recall, f1 = _evaluate_graph_model(
    gat_model, data_test, y_test, 'GAT Model Evaluation')


Epoch 1, Loss: 0.6957
Epoch 2, Loss: 0.8368
Epoch 3, Loss: 0.7010
Epoch 4, Loss: 0.6952
Epoch 5, Loss: 0.6956
Epoch 6, Loss: 0.6959
Epoch 7, Loss: 0.6960
Epoch 8, Loss: 0.6960
Epoch 9, Loss: 0.6960
Epoch 10, Loss: 0.6958
Epoch 11, Loss: 0.6957
Epoch 12, Loss: 0.6954
Epoch 13, Loss: 0.6952
Epoch 14, Loss: 0.6949
Epoch 15, Loss: 0.6947
Epoch 16, Loss: 0.6944
Epoch 17, Loss: 0.6941
Epoch 18, Loss: 0.6939
Epoch 19, Loss: 0.6937
Epoch 20, Loss: 0.6935
Epoch 21, Loss: 0.6934
Epoch 22, Loss: 0.6933
Epoch 23, Loss: 0.6932
Epoch 24, Loss: 0.6932
Epoch 25, Loss: 0.6932
Epoch 26, Loss: 0.6932
Epoch 27, Loss: 0.6932
Epoch 28, Loss: 0.6933
Epoch 29, Loss: 0.6933
Epoch 30, Loss: 0.6934
Epoch 31, Loss: 0.6934
Epoch 32, Loss: 0.6934
Epoch 33, Loss: 0.6934
Epoch 34, Loss: 0.6933
Epoch 35, Loss: 0.6933
Epoch 36, Loss: 0.6933
Epoch 37, Loss: 0.6932
Epoch 38, Loss: 0.6932
Epoch 39, Loss: 0.6932
Epoch 40, Loss: 0.6932
Epoch 41, Loss: 0.6932
Epoch 42, Loss: 0.6931
Epoch 43, Loss: 0.6931
Epoch 44, Loss: 0.69

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [8]:
# Extract the hidden embeddings of GCN and GAT

def get_embeddings_gcn(model, data):
    """Return the ReLU activations after ``model.conv2`` (the linear head is skipped)."""
    edges = data.edge_index
    hidden = model.conv1(data.x, edges).relu()
    return model.conv2(hidden, edges).relu()


# The GAT extraction performs exactly the same steps, so it shares the implementation.
get_embeddings_gat = get_embeddings_gcn

# GCN
gcn_model.eval()
with torch.no_grad():
    embeddings_gcn_train = get_embeddings_gcn(gcn_model, data).cpu().numpy()
    embeddings_gcn_test = get_embeddings_gcn(gcn_model, data_test).cpu().numpy()

# GAT
gat_model.eval()
with torch.no_grad():
    embeddings_gat_train = get_embeddings_gat(gat_model, data).cpu().numpy()
    embeddings_gat_test = get_embeddings_gat(gat_model, data_test).cpu().numpy()

# Ahora, entrenamos los clasificadores utilizando estas embeddings

from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Classifiers to evaluate, keyed by the display name printed in the report.
# The same instances are reused (re-fitted) by every evaluate_classifiers call.
classifiers = {
    'SVM': SVC(probability=True),
    'Random Forest': RandomForestClassifier(),
    'Regresión Logística': LogisticRegression(max_iter=1000),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    'KNN': KNeighborsClassifier()
}

def evaluate_classifiers(X_train_emb, X_test_emb, y_train, y_test, embedding_name=None):
    """Fit every entry of the module-level ``classifiers`` dict and print test metrics.

    This definition replaces an earlier four-argument function of the same
    name. ``embedding_name`` is now optional, so calls written for the
    earlier signature still work after this cell has run.

    Args:
        X_train_emb: Training feature matrix.
        X_test_emb: Test feature matrix.
        y_train: Training labels.
        y_test: Test labels.
        embedding_name: Label shown in the report header; when ``None`` a
            generic header is printed.

    Returns:
        None. Results are printed.
    """
    if embedding_name is None:
        print('\nResultados:')
    else:
        print(f'\nResultados usando embeddings de {embedding_name}:')
    for name, clf in classifiers.items():
        clf.fit(X_train_emb, y_train)
        y_pred = clf.predict(X_test_emb)
        acc = accuracy_score(y_test, y_pred)
        # zero_division=0 reports 0 instead of warning when a classifier
        # never predicts the positive class.
        precision, recall, f1, _ = precision_recall_fscore_support(
            y_test, y_pred, average='binary', zero_division=0)
        print(f'Clasificador: {name}')
        print(f'Exactitud: {acc:.4f}')
        print(f'Precisión: {precision:.4f}')
        print(f'Recall: {recall:.4f}')
        print(f'F1-score: {f1:.4f}')
        print('----------------------------------------')

# Convert the label tensors stored on the Data objects to numpy arrays
y_train_np = data.y.cpu().numpy()
y_test_np = data_test.y.cpu().numpy()

# Evaluate the classifiers on the GCN embeddings
evaluate_classifiers(embeddings_gcn_train, embeddings_gcn_test, y_train_np, y_test_np, 'GCN')

# Evaluate the classifiers on the GAT embeddings
evaluate_classifiers(embeddings_gat_train, embeddings_gat_test, y_train_np, y_test_np, 'GAT')


Resultados usando embeddings de GCN:
Clasificador: SVM
Exactitud: 0.4775
Precisión: 0.4231
Recall: 0.0537
F1-score: 0.0952
----------------------------------------
Clasificador: Random Forest
Exactitud: 0.5125
Precisión: 0.5125
Recall: 1.0000
F1-score: 0.6777
----------------------------------------
Clasificador: Regresión Logística
Exactitud: 0.4875
Precisión: 0.0000
Recall: 0.0000
F1-score: 0.0000
----------------------------------------
Clasificador: XGBoost
Exactitud: 0.4875
Precisión: 0.0000
Recall: 0.0000
F1-score: 0.0000
----------------------------------------
Clasificador: KNN
Exactitud: 0.4875
Precisión: 0.0000
Recall: 0.0000
F1-score: 0.0000
----------------------------------------

Resultados usando embeddings de GAT:
Clasificador: SVM
Exactitud: 0.5175
Precisión: 0.5283
Recall: 0.5463
F1-score: 0.5372
----------------------------------------
Clasificador: Random Forest
Exactitud: 0.4875
Precisión: 0.0000
Recall: 0.0000
F1-score: 0.0000
------------------------------------