In [1]:
import networkx as nx
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from stellargraph import StellarGraph
from stellargraph.layer import GraphSAGE
from stellargraph.mapper import GraphSAGENodeGenerator
from tensorflow.keras import Model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from xgboost import XGBClassifier

2024-09-24 11:37:35.151565: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-09-24 11:37:35.228237: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2024-09-24 11:37:35.228254: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2024-09-24 11:37:35.684616: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2024-

In [2]:
# Load the dataset from a CSV file located next to the notebook
DATASET_PATH = 'imdb_ds_2k_clean.csv'
df = pd.read_csv(DATASET_PATH)
# For a quick smoke test, restrict to the first rows, e.g.: df = df[:110]

In [3]:
# Text preprocessing
def preprocess_text(text):
    """Lowercase ``text`` and keep only alphanumeric characters and spaces.

    Any whitespace character (newline, tab, ...) is turned into a plain space
    so that the words on either side of it stay separate tokens; every other
    non-alphanumeric character is removed.

    Parameters
    ----------
    text : str or any
        Raw review text. Non-string values (for example the float NaN that
        pandas uses for a missing cell) are treated as an empty review.

    Returns
    -------
    str
        The cleaned, lowercased text.
    """
    if not isinstance(text, str):
        # Missing cells must not crash the whole .apply() with an AttributeError.
        return ''

    kept = []
    for char in text.lower():
        if char.isalnum():
            kept.append(char)
        elif char.isspace():
            # Normalise every whitespace variant to ' ' instead of dropping it,
            # otherwise "one\ntwo" would collapse into the single token "onetwo".
            kept.append(' ')
    return ''.join(kept)

# Clean every review in 'sw_text' and store the result in a new column
df['clean_review'] = df['sw_text'].map(preprocess_text)

In [4]:

# Build the TF-IDF matrix; the vocabulary is capped at the 1000 top terms
tfidf = TfidfVectorizer(max_features=1000, stop_words='english')
X_tfidf = tfidf.fit_transform(df['clean_review'])  # sparse, one row per document, one column per term

terms = tfidf.get_feature_names_out()  # vocabulary terms, aligned with the columns of X_tfidf

# Term-document matrix: one row per term, one column per document
X_td = X_tfidf.T

# Reduce each term's document profile to 100 dimensions; these vectors become the node features.
# random_state is pinned because, for an input of this size, PCA's default solver
# selection may use randomized SVD, which would otherwise yield different
# features (and therefore a different graph model) on every run.
pca = PCA(n_components=100, random_state=42)
X_reduced = pca.fit_transform(X_td.toarray())  # one 100-dim row per term

# Lookup table pairing each term with its row index in X_td / X_reduced
terms_df = pd.DataFrame({'term': terms})
terms_df['term_id'] = terms_df.index

In [5]:
# Build the term co-occurrence graph
G_nx = nx.Graph()

# One node per vocabulary term, carrying its PCA-reduced feature vector
for idx, term in enumerate(terms):
    G_nx.add_node(term, features=X_reduced[idx])

# Edge weight = number of documents in which both terms appear.
# With B the binary document-term presence matrix, (B.T @ B)[i, j] is exactly
# that count, so a single sparse product replaces densifying the whole TF-IDF
# matrix and looping over every term pair of every document in Python.
presence = (X_tfidf > 0).astype(np.int64)
cooccurrence = (presence.T @ presence).tocoo()

# The product is symmetric and its diagonal holds each term's own document
# count; keeping only i < j yields one edge per unordered pair and no self-loops.
G_nx.add_weighted_edges_from(
    (terms[i], terms[j], int(count))
    for i, j, count in zip(cooccurrence.row, cooccurrence.col, cooccurrence.data)
    if i < j
)

# Convert the NetworkX graph to a StellarGraph, reading node features from the 'features' attribute
G = StellarGraph.from_networkx(G_nx, node_features='features')

In [6]:
# Collect the inputs for GraphSAGE: the node ids and the feature vector of each node
nodes = list(G.nodes())
node_features = G.node_features(nodes)

# Hold out 20% of the nodes for testing; the split keeps every node paired
# with its own feature vector (used as the target further down)
(train_nodes, test_nodes,
 train_targets, test_targets) = train_test_split(
    nodes,
    node_features,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Configure the GraphSAGE node generator
batch_size = 50
num_samples = [40, 40]  # number of neighbours sampled at the first and second layer
generator = GraphSAGENodeGenerator(G, batch_size=batch_size, num_samples=num_samples)

In [8]:

# Batch sequences for training (shuffled) and testing, each node paired with its target vector
train_gen = generator.flow(train_nodes, train_targets, shuffle=True)
test_gen = generator.flow(test_nodes, test_targets)

In [9]:
# Define the GraphSAGE encoder; the last entry of layer_sizes is the embedding size
layer_sizes = [50, 300]
graphsage = GraphSAGE(
    layer_sizes=layer_sizes, generator=generator, bias=True, dropout=0.3
)

# Input/output tensors of the encoder; x_out is reused later as the node embedding
x_inp, x_out = graphsage.in_out_tensors()
# Linear head mapping the embedding to 100 dimensions, the size of the node feature targets
prediction = tf.keras.layers.Dense(units=100, activation='linear')(x_out)

# Trainable model: encoder plus reconstruction head
model = Model(inputs=x_inp, outputs=prediction)


model.compile(
    # `learning_rate` is the supported argument name; the former `lr` alias is
    # deprecated in tf.keras optimizers.
    optimizer=Adam(learning_rate=1e-3),
    loss='mse',  # MSE against the node features (feature reconstruction)
)

# Train the model; stop early once val_loss has not improved for 2 epochs
history = model.fit(
    train_gen,
    validation_data=test_gen,
    epochs=10,
    verbose=1,
    use_multiprocessing=False,
    workers=1,
    callbacks=[EarlyStopping(monitor='val_loss', patience=2)],
)

  f"The initializer {self.__class__.__name__} is unseeded "


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [10]:

# Encoder-only model: same inputs as the trained model, but it outputs x_out (the embedding)
embedding_model = Model(inputs=x_inp, outputs=x_out)
node_embeddings = embedding_model.predict(generator.flow(nodes), verbose=1)

# Map every node (term) to its embedding vector
node_embeddings_dict = dict(zip(nodes, node_embeddings))


# Save the embeddings to a .txt file.
# The encoding is explicit so the file does not depend on the platform default
# when a term contains non-ASCII characters.
with open('embedding_imdb2k_gsage_le40nw300.txt', 'w', encoding='utf-8') as f:
    # Header line: number of nodes and embedding size, the latter read from the
    # array itself so it always matches the rows written below
    f.write(f"{len(node_embeddings_dict)} {node_embeddings.shape[1]}\n")

    # One line per node: the term followed by its space-separated embedding values
    for node, embedding in node_embeddings_dict.items():
        embedding_str = ' '.join(map(str, embedding))
        f.write(f"{node} {embedding_str}\n")



In [11]:

# Function that computes the embedding of a document
def get_doc_embedding(doc, embeddings=None, dim=None):
    """Return the mean embedding of the words of ``doc`` that have an embedding.

    Parameters
    ----------
    doc : str
        Document text; it is split on whitespace into words.
    embeddings : mapping of str to array-like, optional
        Word -> vector lookup. Defaults to the notebook-level
        ``node_embeddings_dict``.
    dim : int, optional
        Length of the zero vector returned when no word of ``doc`` is found in
        ``embeddings``. Defaults to ``layer_sizes[-1]``.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the matching word vectors, or a zero vector of
        length ``dim`` when there is no match.
    """
    if embeddings is None:
        embeddings = node_embeddings_dict

    vectors = [embeddings[word] for word in doc.split() if word in embeddings]
    if vectors:
        return np.mean(vectors, axis=0)

    # No known word: fall back to a zero vector. The global is only looked up
    # here, when it is actually needed.
    if dim is None:
        dim = layer_sizes[-1]
    return np.zeros(dim)

# Embed every review as the mean of the embeddings of its known terms
df['embedding'] = df['clean_review'].apply(get_doc_embedding)

# Encode the labels as 1 (positive) / 0 (negative).
# The guard makes this cell safe to re-run: mapping an already-numeric column
# through the string dictionary would silently turn every label into NaN.
if not pd.api.types.is_numeric_dtype(df['sentiment']):
    df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})
if df['sentiment'].isna().any():
    raise ValueError(
        "df['sentiment'] contains missing or unexpected labels; "
        "reload the dataset and check that labels are 'positive'/'negative'"
    )

X = np.vstack(df['embedding'])
y = df['sentiment']

# Split the documents into training and test sets (80/20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [12]:
def evaluate_model(true_labels, predictions, model_name):
    """Print accuracy, precision, recall and F1 for one set of predictions.

    Parameters
    ----------
    true_labels : array-like
        Ground-truth labels.
    predictions : array-like
        Labels predicted by the model.
    model_name : str
        Name shown in the report header.
    """
    # Evaluate the four metrics in the same order as they are reported
    acc, prec, rec, f1 = (
        metric(true_labels, predictions)
        for metric in (accuracy_score, precision_score, recall_score, f1_score)
    )

    report_lines = [
        f"Resultados del modelo {model_name}:",
        f"Accuracy: {acc:.4f}",
        f"Precision: {prec:.4f}",
        f"Recall: {rec:.4f}",
        f"F1-Score: {f1:.4f}\n",
    ]
    print("\n".join(report_lines))

In [13]:
# Fit each baseline classifier on the document embeddings and predict the test split.
# All classifier classes are imported in the notebook's import cell.

# Linear SVM
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train, y_train)
svm_predictions = svm_model.predict(X_test)

# Random forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)

# XGBoost (the rf_* prefix is kept only because the reporting cell uses this name)
rf_model3 = XGBClassifier()
rf_model3.fit(X_train, y_train)
rf_predictions3 = rf_model3.predict(X_test)

# k-nearest neighbours (the rf_* prefix is kept only because the reporting cell uses this name)
rf_model4 = KNeighborsClassifier(n_neighbors=5)
rf_model4.fit(X_train, y_train)
rf_predictions4 = rf_model4.predict(X_test)

# Logistic regression
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [14]:
# Report test-set metrics for every classifier, in the order they were trained.
# The XGBoost label previously read "XGBoots" (typo in the printed report).
for model_name, model_predictions in (
    ("SVM", svm_predictions),
    ("Random Forest", rf_predictions),
    ("XGBoost", rf_predictions3),
    ("KNN", rf_predictions4),
    ("LR", y_pred),
):
    evaluate_model(y_test, model_predictions, model_name)

Resultados del modelo SVM:
Accuracy: 0.6500
Precision: 0.6498
Recall: 0.6878
F1-Score: 0.6682

Resultados del modelo Random Forest:
Accuracy: 0.7150
Precision: 0.7358
Recall: 0.6927
F1-Score: 0.7136

Resultados del modelo XGBoots:
Accuracy: 0.7100
Precision: 0.7259
Recall: 0.6976
F1-Score: 0.7114

Resultados del modelo KNN:
Accuracy: 0.6175
Precision: 0.6444
Recall: 0.5659
F1-Score: 0.6026

Resultados del modelo LR:
Accuracy: 0.6425
Precision: 0.6520
Recall: 0.6488
F1-Score: 0.6504

