In [1]:
import pandas as pd
import nltk
import re
import networkx as nx
from collections import defaultdict
import random
from gensim.models import Word2Vec
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import requests

# Descargar recursos de NLTK
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('vader_lexicon')

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer

[nltk_data] Downloading package punkt to /home/ymamani/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ymamani/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/ymamani/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [None]:
df = pd.read_csv('imdb_ds_2k_clean.csv')  # ruta del dataset
df = df[:1]

In [3]:
df

Unnamed: 0,sw_text,sentiment
0,one reviewer ha mention watch oz episode youll...,positive


In [4]:
df_synonyms = pd.read_csv('graph_conceptnet_dic_synonym_limit5_imdb2k_full.csv', sep=";") 
df_synonyms

Unnamed: 0,Original,Conceptos
0,godyou,
1,govt,government
2,whyits,
3,modestly,modest
4,moneywhatever,
...,...,...
25011,betternow,
25012,chronological,"time,chronograph,chronology,chronologically"
25013,demonize,"vilify,demon,demonology,represent,demonic"
25014,ministry,"minister,cabinet,portfolio,secretary-of-state,..."


In [5]:
# Inicializar lematizador, stop words y VADER
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


In [6]:
# Función de preprocesamiento
def preprocess_text(text):
    # Convertir a minúsculas
    text = text.lower()
    # Eliminar URLs
    text = re.sub(r'http\S+|www.\S+', '', text)
    # Eliminar caracteres especiales y números
    text = re.sub(r'[^a-z\s]', '', text)
    # Tokenización
    tokens = word_tokenize(text)
    # Eliminar stop words y lematizar
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return tokens

# Preprocesar el texto y almacenar los tokens
df['tokens'] = df['sw_text'].apply(preprocess_text)

In [7]:

# Obtener el lexicón de VADER
sia = SentimentIntensityAnalyzer()
vader_lexicon = sia.lexicon  # Es un diccionario {palabra: puntuación}

# Definir umbrales para palabras positivas y negativas
positive_threshold = 2.0
negative_threshold = -2.0

# Inicializar conjuntos de palabras positivas y negativas
positive_words = set()
negative_words = set()

# Iterar sobre el lexicón y separar palabras según su puntuación
for word, score in vader_lexicon.items():
    if score >= positive_threshold:
        positive_words.add(word)
    elif score <= negative_threshold:
        negative_words.add(word)

# Construir word_polarity y all_words antes de la paralelización
word_polarity = {}
all_words = set()

for word in positive_words:
    word_polarity[word] = 1
    all_words.add(word)

for word in negative_words:
    word_polarity[word] = -1
    all_words.add(word)

# Recopilar todas las palabras de los documentos
for tokens in df['tokens']:
    for token in tokens:
        all_words.add(token)
        if token not in word_polarity:
            word_polarity[token] = 0  # Asignar polaridad neutral

In [8]:
import multiprocessing
from multiprocessing import Pool
from functools import partial
import networkx as nx
import pandas as pd

# Supongamos que ya tienes tus DataFrames `df_imdb` y `df_synonym`

# Definir la función para obtener sinónimos desde el DataFrame `df_synonym`
def get_synonyms_from_df(word, df_synonym):
    synonyms = set()
    # Filtrar el DataFrame para encontrar la fila correspondiente a la palabra
    synonyms_row = df_synonym[df_synonym['Original'] == word]['Conceptos']
    if not synonyms_row.empty:
        conceptos = synonyms_row.values[0]
        if isinstance(conceptos, str) and conceptos.strip():
            # Dividir los conceptos por comas y limpiar espacios
            synonyms.update([syn.strip().lower() for syn in conceptos.split(',')])
    return synonyms

# Definir la función de procesamiento
def process_document(tokens, df_synonym, all_words):
    nodes = set()
    edges = []
    window_size = 2  # Puedes ajustar el tamaño de la ventana si lo deseas

    for i, token in enumerate(tokens):
        nodes.add(token)
        # Conectar con palabras dentro de la ventana de contexto
        for j in range(max(0, i - window_size), min(len(tokens), i + window_size + 1)):
            if i != j:
                edges.append((token, tokens[j]))
        
        # Obtener sinónimos desde el DataFrame `df_synonym`
        synonyms = get_synonyms_from_df(token, df_synonym)
        
        
        # Añadir aristas para los sinónimos
        for synonym in synonyms:
            #print(synonym)
            #if synonym in all_words:
            edges.append((token, synonym))
            #print(token, "----", synonym)
    return nodes, edges

In [9]:
# Preparar los datos
# Suponiendo que la columna 'tokens' de `df_imdb` contiene listas de tokens por documento
tokens_list = df['tokens'].tolist()
# Asegurarse de que `all_words` sea un conjunto de todas las palabras únicas en los tokens
all_words = set(word for tokens in tokens_list for word in tokens)
# Número de procesos
n_processes = multiprocessing.cpu_count()

# Usar functools.partial para fijar `df_synonym` y `all_words`
process_document_partial = partial(process_document, df_synonym=df_synonyms, all_words=all_words)

# Crear un pool de procesos y mapear la función
with Pool(processes=n_processes) as pool:
    results = pool.map(process_document_partial, tokens_list)

# Recopilar nodos y aristas de todos los procesos
all_nodes = set()
all_edges = set()

for nodes, edges in results:
    all_nodes.update(nodes)
    all_edges.update(edges)

# Construir el grafo
G = nx.Graph()
G.add_nodes_from(all_nodes)
G.add_edges_from(all_edges)

In [10]:
# from pyvis.network import Network

# net = Network(notebook=True)
# net.from_nx(G)
# net.show("example_graph_conceptnet12.html")


In [16]:
# Definir la función de caminata aleatoria sesgada
def sentiment_biased_walk(start_node):
    walk = [start_node]
    for _ in range(walk_length - 1):
        curr_node = walk[-1]
        neighbors = list(G.neighbors(curr_node))
        if not neighbors:
            break
        probabilities = []
        for neighbor in neighbors:
            # Calcular polaridad
            curr_polarity = word_polarity.get(curr_node, 0)
           
            neighbor_polarity = word_polarity.get(neighbor, 0)
            
            # Si las polaridades coinciden y no son neutras, aumentar probabilidad
            if curr_polarity == neighbor_polarity and curr_polarity != 0:
                prob = alpha
                #print("curr_polarity ", curr_polarity, "-", curr_node)
                #print("neighbor_polarity ", neighbor_polarity, "-", neighbor)
                #print("proba = ",  prob)
            else:                
                prob = 1 - alpha
                #print("else proba = ",  prob)
            probabilities.append(prob)
        # Normalizar probabilidades
        total = sum(probabilities)
        #print("total sum prob = ",  total)
        probabilities = [prob / total for prob in probabilities]
        # Seleccionar siguiente nodo basado en las probabilidades
        next_node = random.choices(neighbors, weights=probabilities, k=1)[0]
        walk.append(next_node)
    return walk

# Función auxiliar para generar caminatas para un nodo
def generate_walks_for_node(node):
    node_walks = []
    for _ in range(num_walks_per_node):
        walk = sentiment_biased_walk(node)
        node_walks.append(walk)
    return node_walks

In [14]:
# Parámetros para las caminatas
num_walks_per_node = 300
walk_length = 40
alpha = 0.9  # Sesgo hacia la misma polaridad

In [None]:
# Lista de nodos
nodes = list(G.nodes())

# Generar caminatas en paralelo
with Pool(processes=n_processes) as pool:
    results = pool.map(generate_walks_for_node, nodes)

# Combinar las caminatas
walks = []
for node_walks in results:
    walks.extend(node_walks)

else proba = else proba = else proba = else proba = else proba = else proba = else proba = else proba = else proba = else proba = else proba = else proba = else proba =   else proba = else proba =   else proba = curr_polarity  else proba = else proba =  else proba = else proba =  else proba = else proba =   else proba = else proba =   else proba = else proba =  0.09999999999999998else proba =  else proba = 0.09999999999999998else proba = else proba = else proba =  0.09999999999999998 0.09999999999999998  0.09999999999999998  0.09999999999999998  0.09999999999999998 0.09999999999999998 0.099999999999999980.09999999999999998  0.09999999999999998  0.09999999999999998
0.09999999999999998  
 0.09999999999999998 
 0.09999999999999998
0.09999999999999998-1
0.099999999999999980.09999999999999998
0.099999999999999980.09999999999999998
0.09999999999999998
0.09999999999999998

0.099999999999999980.09999999999999998
0.099999999999999980.09999999999999998
else proba = 
0.099999999999999980.09999999

In [31]:
# Entrenar Word2Vec
walks_str = [[str(node) for node in walk] for walk in walks]  # Convertir nodos a cadenas
model = Word2Vec(walks_str, vector_size=300, window=5, min_count=1, sg=1, workers=30)

In [None]:
print("Guardando embeddings en archivo txt...")
model.wv.save_word2vec_format("graph_model_embedding_cn_w5d300_le40nw300a09_2k.txt", binary=False)

In [32]:
# Obtener embeddings de palabras
def get_word_embedding(word):
    if word in model.wv:
        return model.wv[word]
    else:
        return np.zeros(model.vector_size)

# Representar documentos promediando los embeddings de sus palabras
def document_embedding(tokens):
    embeddings = [get_word_embedding(token) for token in tokens]
    if embeddings:
        return np.mean(embeddings, axis=0)
    else:
        return np.zeros(model.vector_size)

# Obtener embeddings de los documentos
df['embedding'] = df['tokens'].apply(document_embedding)


In [33]:
# Preparar datos para entrenamiento
X = np.stack(df['embedding'].values)
y = df['sentiment'].map({'positive': 1, 'negative': 0}).values

# Dividir en conjunto de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [34]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_model(true_labels, predictions, model_name):
    acc = accuracy_score(true_labels, predictions)
    prec = precision_score(true_labels, predictions)
    rec = recall_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions)
    
    print(f"Resultados del modelo {model_name}:")
    print(f"Accuracy: {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall: {rec:.4f}")
    print(f"F1-Score: {f1:.4f}\n")

In [35]:
from sklearn.svm import SVC
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train, y_train)
svm_predictions = svm_model.predict(X_test)


from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)


from xgboost import XGBClassifier
rf_model3 = XGBClassifier()
rf_model3.fit(X_train, y_train)
xgb_predictions = rf_model3.predict(X_test)

from sklearn.neighbors import KNeighborsClassifier
rf_model4 = KNeighborsClassifier(n_neighbors=5)
rf_model4.fit(X_train, y_train)
knn_predictions = rf_model4.predict(X_test)


clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)
lr_predictions = clf.predict(X_test)
#print(classification_report(y_test, y_pred, target_names=['negative', 'positive']))

In [None]:
evaluate_model(y_test, svm_predictions, "SVM")
evaluate_model(y_test, rf_predictions, "Random Forest")
evaluate_model(y_test, lr_predictions, "LR")
evaluate_model(y_test, xgb_predictions, "XGBoots")
evaluate_model(y_test, knn_predictions, "KNN")