In [16]:
from datetime import datetime
# Record the wall-clock time at which this run of the notebook started.
fecha_hora_actual = datetime.now()
print("Fecha y Hora actual Inicio:", fecha_hora_actual)

Fecha y Hora actual Inicio: 2024-05-20 11:17:22.567120


In [1]:
import string
from gensim.models import Word2Vec
import numpy as np
import pandas as pd

import re
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources requested by this notebook (WordNet, stop-word lists,
# and the "punkt" tokenizer data). NLTK reports packages that are already present
# as up-to-date, as the captured output shows.
nltk.download("wordnet")
nltk.download("stopwords")
nltk.download("punkt")

from gensim.models.word2vec import Word2Vec
from gensim.parsing.preprocessing import (
    strip_punctuation,
    strip_numeric,
    strip_short,
    stem_text,
    strip_multiple_whitespaces,
    remove_stopwords,
    STOPWORDS,
)

from bs4 import BeautifulSoup
from collections import defaultdict
import networkx as nx
import matplotlib.pyplot as plt
from pyvis.network import Network
import time

# Run DataFrame operations in parallel
from pandarallel import pandarallel  # import pandarallel

# NOTE(review): no parallel_apply/parallel_map call appears in the visible cells,
# so this initialization may be unnecessary — confirm before removing.
pandarallel.initialize()  # initialize pandarallel

[nltk_data] Downloading package wordnet to /home/ymamani/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ymamani/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/ymamani/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:

# Load the IMDB review sample (the file name suggests a pre-cleaned 2k-row subset)
# and display it as the cell's last expression.
ds_imdb = pd.read_csv('imdb_ds_2k_clean.csv')
ds_imdb

Unnamed: 0,sw_text,sentiment
0,one reviewer ha mention watch oz episode youll...,positive
1,wonderful little production film technique una...,positive
2,think wa wonderful way spend time hot summer w...,positive
3,basically family little boy jake think zombie ...,negative
4,petter matteis love time money visually stun f...,positive
...,...,...
1995,feel minnesota direct steven baigelmann star k...,negative
1996,cell rat cell like antz must watch twice appre...,positive
1997,movie despite list list celebs complete waste ...,negative
1998,love movie wa could break tear watch really up...,positive


In [3]:
# Report the (rows, columns) shape of the loaded frame.
print("Shape de dataset", ds_imdb.shape)

Shape de dataset (2000, 2)


In [4]:
# Word-level tokenization: one token list per review, stored in a new column
print("Realizando tokenizacion de palabras ...")
ds_imdb["token_text"] = ds_imdb["sw_text"].map(word_tokenize)
print(len(ds_imdb["token_text"]))

Realizando tokenizacion de palabras ...
2000


In [5]:
# Collect the distinct tokens that occur anywhere in the tokenized reviews
all_words_ds_sentiment = list(set().union(*ds_imdb["token_text"]))
print("Total de palabras unicas:", len(all_words_ds_sentiment))

Total de palabras unicas: 25016


In [7]:
# Extract the text column as a NumPy array so the reviews can be iterated one by one.
ds_array = ds_imdb['sw_text'].to_numpy()

In [8]:
# Build the training corpus: one list of lowercase, purely alphabetic tokens per review.
# The punctuation-removal table is the same for every review, so it is built once
# instead of on every loop iteration.
tabla_puntuacion = str.maketrans('', '', string.punctuation)

oracion_tokens = []
for oracion in ds_array:
    # Empty text read back from a CSV arrives as NaN (a float); skip anything that
    # is not a string instead of failing on .translate().
    if not isinstance(oracion, str):
        continue
    # Remove punctuation, split on whitespace, keep alphabetic words in lowercase.
    tokens = [
        word.lower()
        for word in oracion.translate(tabla_puntuacion).split()
        if word.isalpha()
    ]
    if tokens:  # reviews left without any token are not added to the corpus
        oracion_tokens.append(tokens)

# Embeddings

## Word2Vec (skip-gram)

In [9]:
from gensim.models.word2vec import Word2Vec
def word_embeddings_w2v(imdb_sentences, window_size, embedding_vector_size, num_workers, min_count=1):
    """Train a skip-gram Word2Vec model on a tokenized corpus.

    Args:
        imdb_sentences: Iterable of token lists, one list per document.
        window_size: Context window, forwarded to gensim as ``window``.
        embedding_vector_size: Dimensionality of the vectors, forwarded as
            ``vector_size``.
        num_workers: Number of training threads, forwarded as ``workers``.
        min_count: Minimum corpus frequency for a word to be kept in the
            vocabulary. Defaults to 1, the value that used to be hard-coded,
            so existing calls behave exactly as before. Pass a higher value
            to align the vocabulary with models trained with a frequency
            cut-off.

    Returns:
        The trained gensim ``Word2Vec`` model.
    """
    model = Word2Vec(
        sentences=imdb_sentences,
        window=window_size,
        vector_size=embedding_vector_size,
        sg=1,  # 1 selects the skip-gram architecture
        workers=num_workers,
        min_count=min_count,
    )
    return model

In [10]:
import multiprocessing

# Time the skip-gram Word2Vec training from start to finish
tiempo_inicio = time.time()
print("Generando vectorizacion de palabras ...")

# One worker per CPU reported by the OS; 5-token window, 300-dimensional vectors
num_cores = multiprocessing.cpu_count()
word_emb_wv = word_embeddings_w2v(
    imdb_sentences=oracion_tokens,
    window_size=5,
    embedding_vector_size=300,
    num_workers=num_cores,
)

tiempo_final = time.time()
tiempo_ejecucion_horas = (tiempo_final - tiempo_inicio) / 3600  # seconds -> hours
print(f"Tiempo de ejecución vectorizacion de palabras Skip-gram W2V: {tiempo_ejecucion_horas:.4f} horas")

Generando vectorizacion de palabras ...
Tiempo de ejecución vectorizacion de palabras Skip-gram W2V: 0.0004 horas


In [11]:
# Switch for persisting the Word2Vec vectors in word2vec text format.
# The export call was commented out while the "saving" message was still printed,
# which made the log claim a file had been written. Both now depend on one flag.
GUARDAR_EMBEDDINGS_W2V = False

if GUARDAR_EMBEDDINGS_W2V:
    print("Guardando embeddings W2V en archivo txt ...")
    word_emb_wv.wv.save_word2vec_format("embedding_imdb2k_w2v.txt", binary=False)
else:
    print("Guardado de embeddings W2V omitido (GUARDAR_EMBEDDINGS_W2V = False)")

Guardando embeddings W2V en archivo txt ...


## FastText

In [12]:
from gensim.models.fasttext import FastText
def word_embeddings_ft(imdb_sentences, window_size, embedding_vector_size, num_workers, min_count=5):
    """Train a skip-gram FastText model on a tokenized corpus.

    Args:
        imdb_sentences: Iterable of token lists, one list per document.
        window_size: Context window, forwarded to gensim as ``window``.
        embedding_vector_size: Dimensionality of the vectors, forwarded as
            ``vector_size``.
        num_workers: Number of training threads, forwarded as ``workers``.
        min_count: Minimum corpus frequency for a word to be kept in the
            vocabulary. Defaults to 5, which is gensim's own default and the
            value the original call relied on implicitly, so existing calls
            behave exactly as before.

    Returns:
        The trained gensim ``FastText`` model.
    """
    model = FastText(
        sentences=imdb_sentences,
        window=window_size,
        vector_size=embedding_vector_size,
        sg=1,  # 1 selects the skip-gram architecture
        workers=num_workers,
        min_count=min_count,
    )
    return model

In [14]:
import multiprocessing

# Time the skip-gram FastText training from start to finish
tiempo_inicio = time.time()
print("Generando vectorizacion de palabras FastText ...")

# One worker per CPU reported by the OS; 5-token window, 300-dimensional vectors
num_cores = multiprocessing.cpu_count()
word_emb_ft = word_embeddings_ft(
    imdb_sentences=oracion_tokens,
    window_size=5,
    embedding_vector_size=300,
    num_workers=num_cores,
)

tiempo_final = time.time()
tiempo_ejecucion_horas = (tiempo_final - tiempo_inicio) / 3600  # seconds -> hours
print(f"Tiempo de ejecución vectorizacion de palabras Skip-gram FT: {tiempo_ejecucion_horas:.4f} horas")

Generando vectorizacion de palabras FastText ...
Tiempo de ejecución vectorizacion de palabras Skip-gram FT: 0.0009 horas


In [15]:
# Switch for persisting the FastText vectors in word2vec text format.
# The export call was commented out while the "saving" message was still printed,
# which made the log claim a file had been written. Both now depend on one flag.
GUARDAR_EMBEDDINGS_FT = False

if GUARDAR_EMBEDDINGS_FT:
    print("Guardando embeddings FT en archivo txt ...")
    word_emb_ft.wv.save_word2vec_format("embedding_imdb2k_ft.txt", binary=False)
else:
    print("Guardado de embeddings FT omitido (GUARDAR_EMBEDDINGS_FT = False)")

Guardando embeddings FT en archivo txt ...


## GloVe

In [17]:
! git clone https://github.com/stanfordnlp/glove
! cd glove && make

Cloning into 'glove'...
remote: Enumerating objects: 656, done.[K
remote: Counting objects: 100% (64/64), done.[K
remote: Compressing objects: 100% (32/32), done.[K
remote: Total 656 (delta 36), reused 47 (delta 32), pack-reused 592 (from 1)[K
Receiving objects: 100% (656/656), 245.96 KiB | 375.00 KiB/s, done.
Resolving deltas: 100% (374/374), done.
mkdir -p build
gcc -c src/vocab_count.c -o build/vocab_count.o -lm -pthread -O3 -march=native -funroll-loops -Wall -Wextra -Wpedantic
gcc -c src/cooccur.c -o build/cooccur.o -lm -pthread -O3 -march=native -funroll-loops -Wall -Wextra -Wpedantic
[01m[Ksrc/cooccur.c:[m[K In function ‘[01m[Kmerge_files[m[K’:
  180 |         [01;35m[Kfread(&new, sizeof(CREC), 1, fid[i])[m[K;
      |         [01;35m[K^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~[m[K
  190 |     [01;35m[Kfread(&new, sizeof(CREC), 1, fid[i])[m[K;
      |     [01;35m[K^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~[m[K
  203 |         [01;35m[Kfread(&new, sizeof(CREC), 1,

In [None]:
# Define path
#import os
#NLP_REPO_PATH = '/home/ymamani/projects/code/embeddings_w2v_ft_glove/data'
#glove_model_path = os.path.join(NLP_REPO_PATH, "utils_nlp", "models", "glove")

# Execute shell commands
#!cd $glove_model_path && make

In [18]:
import os
# Previous output location, kept for reference:
#SAVE_FILES_PATH = '/home/ymamani/projects/code/embeddings_w2v_ft_glove/data/trained_word_embeddings'
SAVE_FILES_PATH = '/home/ymamani/projects/code/experimentos3/data/trained_word_embeddings'
# open(..., 'w') does not create missing parent directories, so make sure the
# output folder exists before anything is written into it.
os.makedirs(SAVE_FILES_PATH, exist_ok=True)
# Save our corpus as tokens delimited by spaces with new line characters in between sentences.
training_corpus_file_path = os.path.join(SAVE_FILES_PATH, "training-corpus-cleaned.txt")
with open(training_corpus_file_path, 'w', encoding='utf8') as file:
    file.writelines(" ".join(sent) + "\n" for sent in oracion_tokens)

In [19]:
# Define path
# Earlier checkout location, kept for reference:
#glove_model_path = '/home/ymamani/projects/code/embeddings_w2v_ft_glove/glove'
glove_model_path = '/home/ymamani/projects/code/experimentos3/glove'
vocab_count_exe_path = os.path.join(glove_model_path, "build", "vocab_count")
vocab_file_path = os.path.join(SAVE_FILES_PATH, "vocab.txt")
# Execute shell commands
# vocab_count reads the corpus on stdin ("<corpus") and its stdout is redirected
# into the vocabulary file ("> vocab"). Words seen fewer than 5 times are dropped
# (the output reports "Truncating vocabulary at min count 5").
# NOTE(review): the paths are interpolated unquoted, so a path containing spaces
# would break the command.
!$vocab_count_exe_path -min-count 5 -verbose 2 <$training_corpus_file_path> $vocab_file_path

BUILDING VOCABULARY
Processed 0 tokens.[11G100000 tokens.[11G200000 tokens.[0GProcessed 236760 tokens.
Counted 25016 unique words.
Truncating vocabulary at min count 5.
Using vocabulary of size 5391.



In [20]:
# Define path
cooccur_exe_path = os.path.join(glove_model_path, "build", "cooccur")
cooccurrence_file_path = os.path.join(SAVE_FILES_PATH, "cooccurrence.bin")
# Execute shell commands
# cooccur reads the corpus on stdin and its stdout is redirected into the binary
# co-occurrence file. It uses the vocabulary file written by vocab_count and a
# 5-token window (the output reports a symmetric context).
# NOTE(review): "-memory 32" is presumably a memory budget in GB for the tool —
# confirm it is appropriate for the machine running this notebook.
!$cooccur_exe_path -memory 32 -vocab-file $vocab_file_path -verbose 2 -window-size 5 <$training_corpus_file_path> $cooccurrence_file_path

COUNTING COOCCURRENCES
window size: 5
context: symmetric
max product: 98356909
overflow length: 304226850
Reading vocab from file "/home/ymamani/projects/code/experimentos3/data/trained_word_embeddings/vocab.txt"...loaded 5391 words.
Building lookup table...table contains 29062882 elements.
Processing token: 0[19G100000[19G200000[0GProcessed 236760 tokens.
Writing cooccurrences to disk.......2 files in total.
Merging cooccurrence files: processed 0 lines.[39G0 lines.[39G100000 lines.[39G200000 lines.[39G300000 lines.[39G400000 lines.[39G500000 lines.[39G600000 lines.[39G700000 lines.[39G800000 lines.[39G900000 lines.[39G1000000 lines.[0GMerging cooccurrence files: processed 1093837 lines.



In [21]:
# Define path
shuffle_exe_path = os.path.join(glove_model_path, "build", "shuffle")
cooccurrence_shuf_file_path = os.path.join(SAVE_FILES_PATH, "cooccurrence.shuf.bin")
# Execute shell commands
# shuffle reads the co-occurrence records on stdin and its stdout is redirected
# into the shuffled file that the training step consumes.
# NOTE(review): the output shows "Using random seed <number>" with no seed passed
# here, so the shuffle order presumably differs between runs — pass an explicit
# seed if reproducibility matters (TODO confirm the option name in the tool's help).
!$shuffle_exe_path -memory 32 -verbose 2 <$cooccurrence_file_path> $cooccurrence_shuf_file_path

Using random seed 1726766747
SHUFFLING COOCCURRENCES
array size: 2040109465
Shuffling by chunks: processed 0 lines.[22Gprocessed 1093837 lines.
Wrote 1 temporary file(s).
Merging temp files: processed 0 lines.[31G1093837 lines.[0GMerging temp files: processed 1093837 lines.



In [22]:
# Define path
glove_exe_path = os.path.join(glove_model_path, "build", "glove")
glove_vector_file_path = os.path.join(SAVE_FILES_PATH, "GloVe_vectors")
# Execute shell commands
# Train GloVe on the shuffled co-occurrences: 300-dimensional vectors (same size
# as the Word2Vec/FastText models above), 15 iterations, x_max 10.
# "-save-file" is a path prefix; the loader cell reads "GloVe_vectors.txt".
# NOTE(review): "-binary 2" presumably writes both text and binary outputs —
# confirm against the GloVe help text.
# NOTE(review): "-threads 8" is hard-coded, whereas the gensim models use
# multiprocessing.cpu_count() workers.
!$glove_exe_path -save-file $glove_vector_file_path -threads 8 -input-file $cooccurrence_shuf_file_path -x-max 10 -iter 15 -vector-size 300 -binary 2 -vocab-file $vocab_file_path -verbose 2

TRAINING MODEL
Read 1093837 lines.
Initializing parameters...Using random seed 1726766753
done.
vector size: 300
vocab size: 5391
x_max: 10.000000
alpha: 0.750000
09/19/24 - 12:25.54PM, iter: 001, cost: 0.055437
09/19/24 - 12:25.54PM, iter: 002, cost: 0.047176
09/19/24 - 12:25.55PM, iter: 003, cost: 0.039700
09/19/24 - 12:25.56PM, iter: 004, cost: 0.038113
09/19/24 - 12:25.56PM, iter: 005, cost: 0.037385
09/19/24 - 12:25.57PM, iter: 006, cost: 0.036717
09/19/24 - 12:25.58PM, iter: 007, cost: 0.035874
09/19/24 - 12:25.59PM, iter: 008, cost: 0.034901
09/19/24 - 12:25.59PM, iter: 009, cost: 0.033797
09/19/24 - 12:26.00PM, iter: 010, cost: 0.032539
09/19/24 - 12:26.01PM, iter: 011, cost: 0.031166
09/19/24 - 12:26.01PM, iter: 012, cost: 0.029687
09/19/24 - 12:26.02PM, iter: 013, cost: 0.028176
09/19/24 - 12:26.03PM, iter: 014, cost: 0.026693
09/19/24 - 12:26.03PM, iter: 015, cost: 0.025256


In [23]:
# Load the saved GloVe text vectors into a {word: [float, ...]} dictionary.
glove_wv = {}
glove_vector_txt_file_path = os.path.join(SAVE_FILES_PATH, "GloVe_vectors.txt")
with open(glove_vector_txt_file_path, encoding='utf-8') as f:
    for line in f:
        # Strip the line terminator and any trailing blanks before splitting, so a
        # stray trailing space does not reach float() and an empty line does not
        # create a bogus "\n" key with an empty vector.
        split_line = line.rstrip().split(" ")
        if len(split_line) < 2:
            continue  # blank line or a word without any component: nothing to store
        glove_wv[split_line[0]] = [float(i) for i in split_line[1:]]

In [24]:
# 1. Look up the embedding of one vocabulary word by using it as the dictionary key.
#    The label is derived from the key so the printed text cannot drift from the
#    word that is actually looked up (it used to say "apple" while reading "movie").
palabra_ejemplo = "movie"
print(f"Embedding for {palabra_ejemplo}:", glove_wv[palabra_ejemplo])

# 2. Inspect the vocabulary through the dictionary keys. The count is defined once
#    so the label and the slice always agree (it used to say 30 while slicing 20).
num_palabras_muestra = 20
print(f"\nFirst {num_palabras_muestra} vocabulary words:", list(glove_wv.keys())[:num_palabras_muestra])

Embedding for apple: [-0.235367, -0.194131, -0.337478, 0.069839, -0.525106, -1.209318, -0.125127, 0.380385, 0.204849, 0.337658, 0.786869, 0.045477, -0.271602, 0.398011, -1.151463, 0.350926, -0.241669, 0.158509, -1.120875, 0.695348, 0.099997, -0.706794, 0.036722, -0.225602, 0.35696, 0.161608, -0.376903, 0.03142, 0.542876, -0.447508, 0.356882, 0.825494, -0.196084, 0.09937, -0.480333, 0.035598, 0.989132, -0.073479, -0.007345, -0.095263, 0.30634, -0.322474, 0.277669, 0.160892, -0.084347, -0.641337, -0.400511, -0.027045, -0.356613, -0.418473, 0.485058, -1.056487, -0.082592, -0.1274, 0.043072, -0.071779, -0.008721, 0.251047, 0.414428, -0.783374, 0.573423, -0.375085, 0.416375, -0.029831, 0.041344, -0.406137, -0.53061, -0.004196, 0.010236, 0.100226, -0.398907, 0.369308, -0.407135, 0.142113, -0.563081, 0.829507, 0.282536, 0.166587, -0.431276, 0.154563, 0.112659, -0.925348, 0.031795, -0.565388, 0.568706, 0.024221, 0.251521, 0.059045, -0.501073, 0.323, 0.004241, 0.30676, -0.426005, 0.57374, 0.128

In [25]:
from datetime import datetime
# Record the wall-clock time at which the notebook finished.
# NOTE(review): the saved start timestamp at the top of the notebook is from a
# different date than this output, so the two cannot be subtracted to get a run time.
fecha_hora_actual_final = datetime.now()
print("Fecha y Hora final:", fecha_hora_actual_final)

Fecha y Hora final: 2024-09-19 12:26:24.778966
