In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from node2vec import Node2Vec
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Cargar datos de IMDB
# Asegúrate de tener un dataframe con columnas "review" y "sentiment" (1 = positivo, 0 = negativo)
df = pd.read_csv('imdb_ds_2k_clean.csv')  # Cambia por la ruta correcta
df

Unnamed: 0,sw_text,sentiment
0,one reviewer ha mention watch oz episode youll...,positive
1,wonderful little production film technique una...,positive
2,think wa wonderful way spend time hot summer w...,positive
3,basically family little boy jake think zombie ...,negative
4,petter matteis love time money visually stun f...,positive
...,...,...
1995,feel minnesota direct steven baigelmann star k...,negative
1996,cell rat cell like antz must watch twice appre...,positive
1997,movie despite list list celebs complete waste ...,negative
1998,love movie wa could break tear watch really up...,positive


In [3]:
# Preprocesamiento de texto
def preprocess_text(text):
    # Limpieza básica (minúsculas, quitar puntuación, etc.)
    text = text.lower()
    text = ''.join([char for char in text if char.isalnum() or char == ' '])
    return text

df['clean_review'] = df['sw_text'].apply(preprocess_text)

In [4]:
# Crear grafo basado en co-ocurrencias de términos
tfidf = TfidfVectorizer(max_features=1000, stop_words='english')
X_tfidf = tfidf.fit_transform(df['clean_review'])

In [5]:

G = nx.Graph()
terms = tfidf.get_feature_names_out()

# Añadir nodos y conexiones al grafo optimizado
for doc in X_tfidf.toarray():
    # Obtener índices de términos con peso mayor a 0
    non_zero_indices = np.where(doc > 0)[0]
    non_zero_terms = [(terms[idx], doc[idx]) for idx in non_zero_indices]
    
    # Añadir nodos
    for term, weight in non_zero_terms:
        G.add_node(term)
    
    # Añadir aristas con pesos
    for i, (term1, weight1) in enumerate(non_zero_terms):
        for term2, weight2 in non_zero_terms[i+1:]:
            G.add_edge(term1, term2, weight=weight1 * weight2)


In [6]:

# Aplicar Node2Vec para generar embeddings
node2vec = Node2Vec(G, dimensions=300, walk_length=40, num_walks=300, workers=28)
model = node2vec.fit(window=5, min_count=1)

Computing transition probabilities: 100%|██████████| 1000/1000 [18:25<00:00,  1.11s/it]
Generating walks (CPU: 1): 100%|██████████| 11/11 [00:21<00:00,  2.00s/it]
Generating walks (CPU: 2): 100%|██████████| 11/11 [00:21<00:00,  1.95s/it]
Generating walks (CPU: 3): 100%|██████████| 11/11 [00:21<00:00,  1.98s/it]
Generating walks (CPU: 4): 100%|██████████| 11/11 [00:22<00:00,  2.04s/it]
Generating walks (CPU: 5): 100%|██████████| 11/11 [00:22<00:00,  2.01s/it]
Generating walks (CPU: 6): 100%|██████████| 11/11 [00:22<00:00,  2.03s/it]
Generating walks (CPU: 7): 100%|██████████| 11/11 [00:22<00:00,  2.04s/it]
Generating walks (CPU: 8): 100%|██████████| 11/11 [00:22<00:00,  2.05s/it]
Generating walks (CPU: 9): 100%|██████████| 11/11 [00:21<00:00,  1.96s/it]
Generating walks (CPU: 10): 100%|██████████| 11/11 [00:22<00:00,  2.03s/it]
Generating walks (CPU: 11): 100%|██████████| 11/11 [00:21<00:00,  1.98s/it]
Generating walks (CPU: 12): 100%|██████████| 11/11 [00:22<00:00,  2.06s/it]
Generatin

In [7]:
# Abre un archivo en modo escritura
with open('embedding_imdb2k_n2v_w5d300_le40nw300.txt', 'w') as f:
    # Escribe la cantidad de palabras y las dimensiones
    f.write(f"{len(model.wv.vectors)} {model.wv.vector_size}\n")
    
    # Itera sobre cada palabra y sus embeddings
    for word in model.wv.index_to_key:
        # Obtén los embeddings de la palabra
        vector = model.wv[word]
        # Escribe la palabra seguida de sus valores de embedding
        f.write(f"{word} {' '.join(map(str, vector))}\n")

In [8]:
#2000 records to imdb
#18 min probability calculate
#8 min walks

# Obtener embeddings para cada documento
def get_doc_embedding(doc):
    words = doc.split()
    embedding = np.mean([model.wv[word] for word in words if word in model.wv], axis=0)
    return embedding if embedding.size else np.zeros(300)

df['embedding'] = df['clean_review'].apply(get_doc_embedding)

In [9]:
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

In [10]:
# Preparar datos para clasificación
X = np.vstack(df['embedding'])
y = df['sentiment']

# Dividir los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
def evaluate_model(true_labels, predictions, model_name):
    acc = accuracy_score(true_labels, predictions)
    prec = precision_score(true_labels, predictions)
    rec = recall_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions)
    
    print(f"Resultados del modelo {model_name}:")
    print(f"Accuracy: {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall: {rec:.4f}")
    print(f"F1-Score: {f1:.4f}\n")

In [12]:
from sklearn.svm import SVC
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train, y_train)
svm_predictions = svm_model.predict(X_test)


from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)


from xgboost import XGBClassifier
rf_model3 = XGBClassifier()
rf_model3.fit(X_train, y_train)
rf_predictions3 = rf_model3.predict(X_test)

from sklearn.neighbors import KNeighborsClassifier
rf_model4 = KNeighborsClassifier(n_neighbors=5)
rf_model4.fit(X_train, y_train)
rf_predictions4 = rf_model4.predict(X_test)


from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [13]:
evaluate_model(y_test, svm_predictions, "SVM")
evaluate_model(y_test, rf_predictions, "Random Forest")
evaluate_model(y_test, rf_predictions3, "XGBoots")
evaluate_model(y_test, rf_predictions4, "KNN")
evaluate_model(y_test, y_pred, "LR")

Resultados del modelo SVM:
Accuracy: 0.8075
Precision: 0.8019
Recall: 0.8293
F1-Score: 0.8153

Resultados del modelo Random Forest:
Accuracy: 0.7075
Precision: 0.7115
Recall: 0.7220
F1-Score: 0.7167

Resultados del modelo XGBoots:
Accuracy: 0.7700
Precision: 0.7783
Recall: 0.7707
F1-Score: 0.7745

Resultados del modelo KNN:
Accuracy: 0.6325
Precision: 0.6686
Recall: 0.5610
F1-Score: 0.6101

Resultados del modelo LR:
Accuracy: 0.7925
Precision: 0.8020
Recall: 0.7902
F1-Score: 0.7961

