In [1]:
import pandas as pd
import nltk
import re
import networkx as nx
from collections import defaultdict
import random
from gensim.models import Word2Vec
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Download the NLTK resources this notebook relies on
for resource in ('punkt', 'wordnet', 'stopwords', 'vader_lexicon'):
    nltk.download(resource)

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords, wordnet as wn
from nltk.sentiment.vader import SentimentIntensityAnalyzer

[nltk_data] Downloading package punkt to /home/ymamani/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ymamani/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ymamani/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/ymamani/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
import pandas as pd
import networkx as nx
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load the dataset from CSV into a DataFrame and display it.
df = pd.read_csv('imdb_ds_2k_clean.csv')  # Change this to the correct path
df

Unnamed: 0,sw_text,sentiment
0,one reviewer ha mention watch oz episode youll...,positive
1,wonderful little production film technique una...,positive
2,think wa wonderful way spend time hot summer w...,positive
3,basically family little boy jake think zombie ...,negative
4,petter matteis love time money visually stun f...,positive
...,...,...
1995,feel minnesota direct steven baigelmann star k...,negative
1996,cell rat cell like antz must watch twice appre...,positive
1997,movie despite list list celebs complete waste ...,negative
1998,love movie wa could break tear watch really up...,positive


In [3]:
# Show the (rows, columns) dimensions of the loaded DataFrame.
df.shape

(2000, 2)

In [21]:
#df1['sentiment'].value_counts()

In [4]:
# Initialize the lemmatizer, the English stop-word set, and the VADER analyzer.
# These module-level objects are read by the preprocessing and lexicon cells below.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
sia = SentimentIntensityAnalyzer()

# Preprocessing function
def preprocess_text(text):
    """Lowercase, tokenize, remove stop words, and lemmatize one document.

    Parameters
    ----------
    text : str
        Document text. Missing values (``None`` / NaN, which pandas
        produces for empty CSV cells) yield an empty token list instead
        of raising; any other non-string value is converted with ``str``.

    Returns
    -------
    list of str
        Lemmatized tokens that are not in the module-level ``stop_words``.
    """
    if not isinstance(text, str):
        # A missing cell has no tokens; without this guard `.lower()` would
        # raise AttributeError on a float NaN.
        if pd.api.types.is_scalar(text) and pd.isna(text):
            return []
        text = str(text)
    # NOTE(review): URL and non-letter stripping are not applied here;
    # presumably `sw_text` is already cleaned upstream -- confirm.
    # Tokenize the lowercased text
    tokens = word_tokenize(text.lower())
    # Drop stop words, then lemmatize what remains
    return [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

# Preprocess the text of every row and store the resulting token lists
df['tokens'] = df['sw_text'].apply(preprocess_text)

In [5]:
from multiprocessing import Pool
import multiprocessing
# VADER lexicon: a dictionary {word: score}
vader_lexicon = sia.lexicon

# Score cut-offs for treating a lexicon word as positive or negative
positive_threshold = 2.0
negative_threshold = -2.0

# Split the lexicon by score; a word that qualifies as positive is never
# also counted as negative
positive_words = {
    entry for entry, value in vader_lexicon.items() if value >= positive_threshold
}
negative_words = {
    entry
    for entry, value in vader_lexicon.items()
    if not value >= positive_threshold and value <= negative_threshold
}

# Build word_polarity and all_words before the parallel stage:
# +1 for positive lexicon words, -1 for negative ones
word_polarity = dict.fromkeys(positive_words, 1)
word_polarity.update(dict.fromkeys(negative_words, -1))
all_words = positive_words | negative_words

# Add every document token to the vocabulary; tokens that did not get a
# polarity from the lexicon sets are marked neutral (0)
for doc_tokens in df['tokens']:
    all_words.update(doc_tokens)
    for tok in doc_tokens:
        word_polarity.setdefault(tok, 0)

In [6]:
# Document-processing function
def process_document(tokens, window_size=15):
    """Collect the graph nodes and edges contributed by one document.

    Parameters
    ----------
    tokens : list of str
        Tokens of one document, in order.
    window_size : int, optional
        Number of positions on each side of a token that are linked to it
        as co-occurrence edges (default 15, the previously fixed value).

    Returns
    -------
    tuple (set, list)
        ``nodes`` is the set of distinct tokens. ``edges`` is a list of
        ``(token, other)`` pairs: for each position, first the tokens found
        inside its context window, then the token's WordNet lemma names
        that differ from it and are present in the module-level
        ``all_words``.
    """
    nodes = set(tokens)
    edges = []
    # token -> synonym set, so WordNet is queried once per distinct token
    # instead of once per occurrence.
    synonym_cache = {}
    n_tokens = len(tokens)

    for i, token in enumerate(tokens):
        # Connect with the words inside the context window.
        # NOTE(review): when the same word repeats inside the window this
        # emits a (token, token) self-loop -- confirm that is intended.
        lo = max(0, i - window_size)
        hi = min(n_tokens, i + window_size + 1)
        edges.extend((token, tokens[j]) for j in range(lo, hi) if j != i)

        # Semantic connections based on WordNet
        synonyms = synonym_cache.get(token)
        if synonyms is None:
            synonyms = set()
            for syn in wn.synsets(token):
                for lemma in syn.lemmas():
                    synonym = lemma.name().lower()
                    # Skip the word itself and anything outside our vocabulary
                    if synonym != token and synonym in all_words:
                        synonyms.add(synonym)
            synonym_cache[token] = synonyms
        edges.extend((token, synonym) for synonym in synonyms)
    return nodes, edges


In [7]:
# One token list per document
tokens_list = df['tokens'].tolist()

# Number of worker processes (one per CPU core); adjust if needed
n = multiprocessing.cpu_count()

# Extract the nodes and edges of every document in a process pool
with Pool(processes=n) as pool:
    results = pool.map(process_document, tokens_list)

# Merge the nodes and edges returned for each document
all_nodes = set()
all_edges = set()
for doc_nodes, doc_edges in results:
    all_nodes |= doc_nodes
    all_edges.update(doc_edges)

# Build the undirected graph
G = nx.Graph()
G.add_nodes_from(all_nodes)
G.add_edges_from(all_edges)
# building the graph took about 8s

In [8]:
# Report the size of the graph
print("Numero de nodos: ", G.number_of_nodes())
print("Numero de aristas: ", G.number_of_edges())

# Recorded values when using WordNet:
# Number of nodes:  25265
# Number of edges:  1761140

Numero de nodos:  25265
Numero de aristas:  1761140


In [9]:
# Sentiment-biased random walk
def sentiment_biased_walk(start_node, graph=None, polarity=None, length=None, bias=None):
    """Generate one random walk that prefers neighbors of matching polarity.

    At each step every neighbor of the current node gets weight ``bias``
    when it has the same non-neutral polarity as the current node and
    ``1 - bias`` otherwise; the next node is drawn in proportion to those
    weights.

    Parameters
    ----------
    start_node : hashable
        Node at which the walk begins.
    graph : optional
        Object exposing ``neighbors(node)``. Defaults to the module-level ``G``.
    polarity : mapping, optional
        Node -> polarity; nodes missing from it count as neutral (0).
        Defaults to the module-level ``word_polarity``.
    length : int, optional
        Maximum number of nodes in the walk. Defaults to ``walk_length``.
    bias : float, optional
        Weight given to same-polarity neighbors. Defaults to ``alpha``.

    Returns
    -------
    list
        Visited nodes, starting with ``start_node``. It is shorter than
        ``length`` only when a node without neighbors is reached.
    """
    # Fall back to the notebook-level settings when not given explicitly
    graph = G if graph is None else graph
    polarity = word_polarity if polarity is None else polarity
    length = walk_length if length is None else length
    bias = alpha if bias is None else bias

    walk = [start_node]
    for _ in range(length - 1):
        curr_node = walk[-1]
        neighbors = list(graph.neighbors(curr_node))
        if not neighbors:
            break
        # The current node's polarity does not depend on the neighbor,
        # so look it up once per step.
        curr_polarity = polarity.get(curr_node, 0)
        if curr_polarity == 0:
            # A neutral node never "matches": every neighbor gets 1 - bias
            weights = [1 - bias] * len(neighbors)
        else:
            weights = [
                bias if polarity.get(neighbor, 0) == curr_polarity else 1 - bias
                for neighbor in neighbors
            ]
        # random.choices takes relative weights, so no manual normalisation
        # is needed. If every weight is zero (e.g. bias == 1 at a neutral
        # node) choose uniformly instead of dividing by zero.
        if sum(weights) <= 0:
            weights = None
        next_node = random.choices(neighbors, weights=weights, k=1)[0]
        walk.append(next_node)
    return walk

# Helper: generate all the walks that start at a single node
def generate_walks_for_node(node):
    """Return ``num_walks_per_node`` sentiment-biased walks starting at ``node``."""
    return [sentiment_biased_walk(node) for _ in range(num_walks_per_node)]

In [None]:
# Walk parameters
num_walks_per_node = 300
walk_length = 40
alpha = 0.95

# Start nodes: every node of the graph
nodes = list(G.nodes())

# Generate the walks in parallel, one task per start node
with Pool(processes=n) as pool:
    results = pool.map(generate_walks_for_node, nodes)

# Flatten the per-node lists into a single list of walks
walks = [walk for node_walks in results for walk in node_walks]

# recorded runtime: 101m 30.7s (Sep 2025)


In [None]:
# Train Word2Vec on the walks
print("numero de nucleos utilizando...:", n)
# Convert every node of every walk to a string
walks_str = [list(map(str, walk)) for walk in walks]
model = Word2Vec(
    sentences=walks_str,
    vector_size=300,
    window=5,
    min_count=1,
    sg=1,
    workers=n,
)

# recorded runtime: about 20 min (Sep 2025)

numero de nucleos utilizando...: 32


In [12]:
# Export the trained vectors as a text file in word2vec format
print("Guardando embeddings en archivo txt...")
model.wv.save_word2vec_format(
    fname="graph_model_embedding_wn_w5d300_le40nw300a095_2k_092025.txt",
    binary=False,
)

Guardando embeddings en archivo txt...


In [13]:
# Look up the embedding of a single word
def get_word_embedding(word):
    """Return the vector of ``word`` from ``model.wv``, or zeros if it is absent.

    The zero vector has length ``model.vector_size``.
    """
    if word not in model.wv:
        return np.zeros(model.vector_size)
    return model.wv[word]

# Represent a document as the average of its word embeddings
def document_embedding(tokens):
    """Return the element-wise mean of the embeddings of ``tokens``.

    A document without tokens maps to a zero vector of length
    ``model.vector_size``.
    """
    vectors = list(map(get_word_embedding, tokens))
    if not vectors:
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)

# Compute one embedding per document
df['embedding'] = df['tokens'].apply(document_embedding)

# Training data: stacked document vectors and binary labels
# (positive -> 1, negative -> 0)
X = np.stack(df['embedding'].to_numpy())
y = df['sentiment'].map({'positive': 1, 'negative': 0}).to_numpy()

# Hold out 20% of the documents as the test set
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)




In [14]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import (
    accuracy_score,
    accuracy_score,
    roc_auc_score,
    f1_score,
    classification_report,
    precision_score,
    recall_score,
)

def _split_metrics(model, X, y):
    """Score an already-fitted classifier on one data split.

    Returns ``(accuracy, precision, recall, f1, roc_auc)``. ROC AUC is
    computed from the second column of ``model.predict_proba(X)``.
    """
    y_pred = model.predict(X)
    y_score = model.predict_proba(X)[:, 1]
    return (
        accuracy_score(y, y_pred),
        precision_score(y, y_pred),
        recall_score(y, y_pred),
        f1_score(y, y_pred),
        roc_auc_score(y, y_score),
    )


def Evaluation(model, X_train, X_test, y_train, y_test, hypertuning=False):
    """Compute train/test metrics and a cross-validation score for ``model``.

    Parameters
    ----------
    model
        Fitted classifier exposing ``predict`` and ``predict_proba``.
    X_train, X_test
        Feature matrices of the training and test splits.
    y_train, y_test
        Labels of the training and test splits.
    hypertuning : bool, optional
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    tuple of 11 values
        ``(accuracy, precision, recall, f1, roc_auc)`` on the training
        split, the same five metrics on the test split, and finally the
        mean of ``cross_val_score(model, X_train, y_train, cv=5)``.
    """
    train_metrics = _split_metrics(model, X_train, y_train)
    test_metrics = _split_metrics(model, X_test, y_test)

    # Cross-validation on the training split only
    cross_val = cross_val_score(model, X_train, y_train, cv=5).mean()

    return (*train_metrics, *test_metrics, cross_val)
    
    
def apply_models_with_default_paramters(X_train, X_test, y_train, y_test):
    """Fit a fixed set of classifiers and tabulate their metrics.

    Each of SVM, RF, KNN, XGB and LR is fitted on the training split and
    scored with ``Evaluation``.

    Returns
    -------
    pandas.DataFrame
        One row per model with the cross-validation mean, the test-set and
        train-set metrics, and the train-minus-test F1 difference. Rows are
        sorted by test F1 (descending), then by that difference (ascending).
    """
    candidates = [
        ("SVM", SVC(kernel="poly", probability=True)),
        ("RF", RandomForestClassifier(n_estimators=100, random_state=42)),
        ("KNN", KNeighborsClassifier(n_neighbors=5)),
        ("XGB", XGBClassifier()),
        ("LR", LogisticRegression(max_iter=1000)),
    ]

    rows = []
    for label, estimator in candidates:
        estimator.fit(X_train, y_train)

        (
            acc_tr,
            prec_tr,
            rec_tr,
            f1_tr,
            auc_tr,
            acc_te,
            prec_te,
            rec_te,
            f1_te,
            auc_te,
            cv_mean,
        ) = Evaluation(estimator, X_train, X_test, y_train, y_test, False)

        # Key order here fixes the column order of the resulting table
        rows.append(
            {
                "Model_Name": label,
                "Cross validation mean": cv_mean,
                "Accuracy on Test Set": acc_te,
                "Precision on Test Set": prec_te,
                "Recall on Test Set": rec_te,
                "F1_Score on Test Set": f1_te,
                "ROC_AUC_Score on Test Set": auc_te,
                "Accuracy on Train Set": acc_tr,
                "Precision on Train Set": prec_tr,
                "Recall on Train Set": rec_tr,
                "F1_Score on Train Set": f1_tr,
                "ROC_AUC_Score on Train Set": auc_tr,
                # Train-minus-test F1 gap, to check overfitting/underfitting
                "Difference of F1_Score on train and test": f1_tr - f1_te,
            }
        )

    results = pd.DataFrame(rows)
    return results.sort_values(
        by=["F1_Score on Test Set", "Difference of F1_Score on train and test"],
        ascending=[False, True],
    )

In [15]:
# Fit and score every default-parameter classifier on the document embeddings
Results_wn_supervised = apply_models_with_default_paramters(X_train, X_test, y_train, y_test)


In [16]:
# Display the comparison table (one row per model)
Results_wn_supervised

Unnamed: 0,Model_Name,Cross validation mean,Accuracy on Test Set,Precision on Test Set,Recall on Test Set,F1_Score on Test Set,ROC_AUC_Score on Test Set,Accuracy on Train Set,Precision on Train Set,Recall on Train Set,F1_Score on Train Set,ROC_AUC_Score on Train Set,Difference of F1_Score on train and test
0,SVM,0.84375,0.8475,0.856436,0.843902,0.850123,0.920675,0.955,0.966667,0.9425,0.95443,0.991275,0.104308
4,LR,0.845,0.84,0.857868,0.82439,0.840796,0.916923,0.893125,0.895597,0.89,0.89279,0.958917,0.051994
1,RF,0.755,0.8025,0.794393,0.829268,0.811456,0.867167,1.0,1.0,1.0,1.0,1.0,0.188544
3,XGB,0.773125,0.775,0.764977,0.809756,0.78673,0.864665,1.0,1.0,1.0,1.0,1.0,0.21327
2,KNN,0.68375,0.6825,0.825,0.482927,0.609231,0.792921,0.808125,0.912898,0.68125,0.780243,0.916808,0.171013
