In [None]:
import numpy as np
import pandas as pd
import networkx as nx
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import kneighbors_graph
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from node2vec import Node2Vec
import torch
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv, GATConv
import torch.nn.functional as F


In [None]:
# Download the NLTK resources needed for tokenisation and stop-word removal.
# 'punkt' covers older NLTK releases; 'punkt_tab' is the tokenizer data that
# word_tokenize looks up on NLTK >= 3.8.2, where having only 'punkt' raises a
# LookupError.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Location of the dataset CSV. Adjust this single constant to point at your copy.
DATA_PATH = '/home/ymamani/projects/code/experimentos2/imdb_ds_2k_clean.csv'

# Load the dataset
df = pd.read_csv(DATA_PATH)

# Show the first rows as a quick sanity check
print(df.head())

# Text preprocessing
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the English stop-word set, reading the NLTK corpus only once."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def preprocess_text(text):
    """Lower-case, tokenize and filter a single document.

    Args:
        text: Raw document. Any value that is not a ``str`` (for example
            ``None`` or the float NaN pandas uses for an empty CSV cell) is
            treated as an empty document instead of raising on ``.lower()``.

    Returns:
        A space-separated string of the tokens that are purely alphabetic and
        are not English stop words.
    """
    if not isinstance(text, str):
        return ''
    # Tokenization
    tokens = word_tokenize(text.lower())
    # Stop words are loaded lazily and cached: this function is applied to
    # every row, and rebuilding the set per call re-reads the corpus each time.
    stop_words = _english_stop_words()
    # Keep alphabetic, non-stop-word tokens and join them back together
    return ' '.join(tok for tok in tokens if tok.isalpha() and tok not in stop_words)

# Store the preprocessed version of every 'sw_text' entry in a new column.
df['clean_text'] = df['sw_text'].map(preprocess_text)

# Encode the 'sentiment' column as integer class ids.
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit(df['sentiment']).transform(df['sentiment'])


In [3]:
X, y = df['clean_text'], df['label']

# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [4]:
# TF-IDF over the full corpus X, limited to 5000 features.
vectorizer = TfidfVectorizer(
    max_features=5000,
)
X_all_tfidf = vectorizer.fit_transform(raw_documents=X)

# Dense ndarray copy of the sparse TF-IDF matrix.
X_all_tfidf_array = X_all_tfidf.toarray()


In [5]:
# Build the kNN graph over the TF-IDF vectors.
k = 5  # number of neighbours per document; tune as needed
A = kneighbors_graph(
    X_all_tfidf_array,
    n_neighbors=k,
    mode='connectivity',
    include_self=False,
)

# Turn the sparse adjacency matrix into a NetworkX graph.
G = nx.from_scipy_sparse_array(A)

In [None]:


# Apply Node2Vec to the kNN graph and fit its embedding model.
node2vec = Node2Vec(
    G,
    dimensions=300,
    walk_length=40,
    num_walks=300,
    workers=30,
)
model = node2vec.fit(
    window=5,
    min_count=1,
    batch_words=4,
)

# One embedding row per node, in G.nodes() order; vectors are looked up by the
# string form of the node id.
embeddings = np.array([model.wv[str(node)] for node in G.nodes])


In [7]:
# Map the train/test rows to their *positions* inside df. The per-document
# matrices (embeddings, TF-IDF, masks) are indexed by position, whereas
# X_train.index holds index labels; the two only coincide for a default
# RangeIndex. Resolving positions explicitly keeps this correct if df is ever
# filtered or re-indexed, and get_indexer raises on a non-unique index instead
# of silently selecting the wrong rows.
train_indices = df.index.get_indexer(X_train.index)
test_indices = df.index.get_indexer(X_test.index)

# Node2Vec features and labels for each side of the split.
X_train_embeddings = embeddings[train_indices]
X_test_embeddings = embeddings[test_indices]
y_train_array = y_train.values
y_test_array = y_test.values


In [None]:
# Classifiers to compare, keyed by the display name used in the reports.
classifiers = dict([
    ('Random Forest', RandomForestClassifier()),
    ('SVM', SVC(probability=True)),
    ('XGBoost', XGBClassifier(use_label_encoder=False, eval_metric='logloss')),
    ('KNN', KNeighborsClassifier()),
    ('Regresión Logística', LogisticRegression(max_iter=1000)),
])

# Fit each classifier on the Node2Vec training embeddings and report its
# test-set metrics.
for name, clf in classifiers.items():
    y_pred = clf.fit(X_train_embeddings, y_train_array).predict(X_test_embeddings)
    acc = accuracy_score(y_test_array, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_test_array,
        y_pred,
        average='binary',
    )
    # A single print call emitting the same six lines.
    print(
        f'Clasificador: {name}',
        f'Exactitud: {acc:.4f}',
        f'Precisión: {precision:.4f}',
        f'Exhaustividad (Recall): {recall:.4f}',
        f'F1-score: {f1:.4f}',
        '----------------------------------------',
        sep='\n',
    )


In [9]:
# Node features: the dense TF-IDF matrix as a float tensor.
x = torch.tensor(X_all_tfidf_array, dtype=torch.float)

# A comes from kneighbors_graph and is not symmetric in general (i listing j as
# a neighbour does not imply the reverse), while the NetworkX graph built from
# the same matrix is undirected. Use the symmetric closure here too so that the
# GNNs and Node2Vec operate on the same graph and every kNN link carries
# messages in both directions.
A_undirected = A.maximum(A.T)
# nonzero() is evaluated once and unpacked, rather than once per edge_index row.
rows, cols = A_undirected.nonzero()
edge_index = torch.tensor(np.vstack((rows, cols)), dtype=torch.long)
y_tensor = torch.tensor(df['label'].values, dtype=torch.long)

data = Data(x=x, edge_index=edge_index, y=y_tensor)

# Boolean node masks for the train/test split.
# NOTE(review): assumes train_indices / test_indices are row positions into
# the TF-IDF matrix - confirm if df does not use a default RangeIndex.
train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
train_mask[train_indices] = True
test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
test_mask[test_indices] = True
data.train_mask = train_mask
data.test_mask = test_mask


In [None]:
# GCN model definition
class GCN(torch.nn.Module):
    """Two-layer graph convolutional network returning per-node class logits.

    Args:
        num_features: Size of each input node feature vector.
        num_classes: Number of output classes (width of the returned logits).
        hidden_channels: Width of the hidden layer. Defaults to 64, the value
            that used to be hard-coded.
        dropout: Dropout probability applied after the hidden activation.
            Defaults to 0.5, which is what ``F.dropout`` used implicitly.
    """

    def __init__(self, num_features, num_classes, hidden_channels=64, dropout=0.5):
        super().__init__()
        self.dropout = dropout
        self.conv1 = GCNConv(num_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, num_classes)

    def forward(self, data):
        """Compute logits for every node in ``data`` (reads ``data.x`` and ``data.edge_index``)."""
        x, edge_index = data.x, data.edge_index
        x = F.relu(self.conv1(x, edge_index))
        # Only active in training mode; a no-op after model.eval().
        x = F.dropout(x, p=self.dropout, training=self.training)
        return self.conv2(x, edge_index)

# GCN training setup: pick the device, move model and graph onto it, and
# create the optimizer and loss.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_gcn = GCN(num_features=x.size(1), num_classes=2)
model_gcn = model_gcn.to(device)
data = data.to(device)
optimizer = torch.optim.Adam(
    model_gcn.parameters(),
    lr=0.01,
    weight_decay=5e-4,
)
criterion = torch.nn.CrossEntropyLoss()

def train_gcn():
    """Run one optimisation step of ``model_gcn`` and return the loss as a float.

    The loss is the cross-entropy over the nodes selected by ``data.train_mask``.
    """
    model_gcn.train()
    optimizer.zero_grad()
    mask = data.train_mask
    logits = model_gcn(data)
    loss = criterion(logits[mask], data.y[mask])
    loss.backward()
    optimizer.step()
    return loss.item()

def test_gcn():
    """Evaluate ``model_gcn`` on the nodes selected by ``data.test_mask``.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``; precision, recall and F1
        use ``average='binary'``.
    """
    model_gcn.eval()
    # Inference only: disable autograd so no computation graph is built or
    # kept alive for the full-graph forward pass.
    with torch.no_grad():
        pred = model_gcn(data).argmax(dim=1)
    mask = data.test_mask
    y_true = data.y[mask].cpu()
    y_pred = pred[mask].cpu()
    acc = accuracy_score(y_true, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary')
    return acc, precision, recall, f1

# Training: epochs are numbered 1..200 with a test-set report every 5th epoch.
# range(1, 200) stopped at epoch 199, so the 200th epoch never ran and its
# report was never printed; the upper bound is exclusive, hence 201.
for epoch in range(1, 201):
    loss = train_gcn()
    if epoch % 5 == 0:
        acc, precision, recall, f1 = test_gcn()
        print(f'Epoch: {epoch}, Loss: {loss:.4f}, Acc: {acc:.4f}, F1: {f1:.4f}')


In [None]:
# GAT model definition
class GAT(torch.nn.Module):
    """Two-layer graph attention network returning per-node class logits.

    Args:
        num_features: Size of each input node feature vector.
        num_classes: Number of output classes (width of the returned logits).
        hidden_channels: Output width of each first-layer attention head.
            Defaults to 8, the value that used to be hard-coded.
        heads: Number of attention heads in the first layer. Defaults to 8.
        dropout: Probability used both for the feature dropout in ``forward``
            and for the ``dropout`` argument of the two ``GATConv`` layers.
            Defaults to 0.6.
    """

    def __init__(self, num_features, num_classes, hidden_channels=8, heads=8, dropout=0.6):
        super().__init__()
        self.dropout = dropout
        self.conv1 = GATConv(num_features, hidden_channels, heads=heads, dropout=dropout)
        # The first layer's input to conv2 has hidden_channels * heads columns.
        self.conv2 = GATConv(hidden_channels * heads, num_classes, heads=1, concat=False, dropout=dropout)

    def forward(self, data):
        """Compute logits for every node in ``data`` (reads ``data.x`` and ``data.edge_index``)."""
        x, edge_index = data.x, data.edge_index
        # Both dropout calls are active only in training mode.
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = F.elu(self.conv1(x, edge_index))
        x = F.dropout(x, p=self.dropout, training=self.training)
        return self.conv2(x, edge_index)

# GAT training setup; reuses the `device` and `criterion` already defined for
# the GCN run.
model_gat = GAT(num_features=x.size(1), num_classes=2)
model_gat = model_gat.to(device)
optimizer_gat = torch.optim.Adam(
    model_gat.parameters(),
    lr=0.005,
    weight_decay=5e-4,
)

def train_gat():
    """Run one optimisation step of ``model_gat`` and return the loss as a float.

    The loss is the cross-entropy over the nodes selected by ``data.train_mask``.
    """
    model_gat.train()
    optimizer_gat.zero_grad()
    mask = data.train_mask
    logits = model_gat(data)
    loss = criterion(logits[mask], data.y[mask])
    loss.backward()
    optimizer_gat.step()
    return loss.item()

def test_gat():
    """Evaluate ``model_gat`` on the nodes selected by ``data.test_mask``.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``; precision, recall and F1
        use ``average='binary'``.
    """
    model_gat.eval()
    # Inference only: disable autograd so no computation graph is built or
    # kept alive for the full-graph forward pass.
    with torch.no_grad():
        pred = model_gat(data).argmax(dim=1)
    mask = data.test_mask
    y_true = data.y[mask].cpu()
    y_pred = pred[mask].cpu()
    acc = accuracy_score(y_true, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary')
    return acc, precision, recall, f1

# Training: epochs are numbered 1..200 with a test-set report every 5th epoch.
# range(1, 200) stopped at epoch 199, so the 200th epoch never ran and its
# report was never printed; the upper bound is exclusive, hence 201.
for epoch in range(1, 201):
    loss = train_gat()
    if epoch % 5 == 0:
        acc, precision, recall, f1 = test_gat()
        print(f'Epoch: {epoch}, Loss: {loss:.4f}, Acc: {acc:.4f}, F1: {f1:.4f}')


In [12]:
# GCN embeddings: activated output of the first GCN layer, computed in eval
# mode without gradient tracking.
model_gcn.eval()
with torch.no_grad():
    x_gcn = F.relu(model_gcn.conv1(data.x, data.edge_index))
    embeddings_gcn = x_gcn.cpu().numpy()

# GAT embeddings: activated output of the first GAT layer, obtained the same way.
model_gat.eval()
with torch.no_grad():
    x_gat = F.elu(model_gat.conv1(data.x, data.edge_index))
    embeddings_gat = x_gat.cpu().numpy()

# Split both embedding matrices into their train and test rows.
X_train_embeddings_gcn, X_test_embeddings_gcn = (
    embeddings_gcn[train_indices],
    embeddings_gcn[test_indices],
)
X_train_embeddings_gat, X_test_embeddings_gat = (
    embeddings_gat[train_indices],
    embeddings_gat[test_indices],
)


In [None]:
# Train and evaluate every classifier on a given set of embeddings
def evaluate_classifiers(X_train_emb, X_test_emb, y_train, y_test, embedding_name):
    """Fit each entry of the global ``classifiers`` dict and print its test metrics.

    Args:
        X_train_emb: Training feature matrix passed to ``fit``.
        X_test_emb: Test feature matrix passed to ``predict``.
        y_train: Training labels.
        y_test: Test labels the predictions are scored against.
        embedding_name: Label shown in the results header.

    Prints accuracy, precision, recall and F1 (``average='binary'``) per
    classifier; returns nothing. The estimators in ``classifiers`` are fitted
    in place.
    """
    print(f'\n--- Resultados usando embeddings de {embedding_name} ---')
    for label, estimator in classifiers.items():
        predictions = estimator.fit(X_train_emb, y_train).predict(X_test_emb)
        accuracy = accuracy_score(y_test, predictions)
        prec, rec, f_score, _ = precision_recall_fscore_support(
            y_test,
            predictions,
            average='binary',
        )
        # A single print call emitting the same six lines.
        print(
            f'Clasificador: {label}',
            f'Exactitud: {accuracy:.4f}',
            f'Precisión: {prec:.4f}',
            f'Exhaustividad (Recall): {rec:.4f}',
            f'F1-score: {f_score:.4f}',
            '----------------------------------------',
            sep='\n',
        )

# Evaluate the classifiers on the GCN embeddings
evaluate_classifiers(
    X_train_emb=X_train_embeddings_gcn,
    X_test_emb=X_test_embeddings_gcn,
    y_train=y_train_array,
    y_test=y_test_array,
    embedding_name='GCN',
)

# Evaluate the classifiers on the GAT embeddings
evaluate_classifiers(
    X_train_emb=X_train_embeddings_gat,
    X_test_emb=X_test_embeddings_gat,
    y_train=y_train_array,
    y_test=y_test_array,
    embedding_name='GAT',
)
