# SST Dataset utilizando embeddings propios (IMDB 2k) generados con w2v, ft, GloVe y BERT

In [1]:
import numpy as np
import pandas as pd

import re
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources this notebook relies on. When a package is already
# present locally the call only reports that it is up to date.
nltk.download("wordnet")
nltk.download("stopwords")
nltk.download("punkt")

from gensim.models.word2vec import Word2Vec
from gensim.parsing.preprocessing import (
    strip_punctuation,
    strip_numeric,
    strip_short,
    stem_text,
    strip_multiple_whitespaces,
    remove_stopwords,
    STOPWORDS,
)

from bs4 import BeautifulSoup
from collections import defaultdict
#import networkx as nx
import matplotlib.pyplot as plt
#from pyvis.network import Network
import time

# Run pandas apply calls in parallel
from pandarallel import pandarallel  # source of the parallel_apply calls used below

pandarallel.initialize()  # set up the workers before the first parallel_apply call

[nltk_data] Downloading package wordnet to /home/ymamani/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ymamani/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/ymamani/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
# Reference for the SST TSV files: https://github.com/jiangqn/SST-preprocess

train_path_file_sst = '/home/ymamani/projects/data/movie_sst/train_sst.tsv'
val_path_file_sst = '/home/ymamani/projects/data/movie_sst/dev_sst.tsv'
test_path_file_sst = '/home/ymamani/projects/data/movie_sst/test_sst.tsv'

# Each split is a tab-separated file decoded as latin-1; read them in
# train / validation / test order.
ds_sst_train, ds_sst_val, ds_sst_test = (
    pd.read_csv(split_path, delimiter="\t", encoding="latin-1")
    for split_path in (train_path_file_sst, val_path_file_sst, test_path_file_sst)
)

In [3]:
# Matches any "<...>" span, i.e. an HTML-like tag together with whatever is inside the brackets.
html_tags_regex = r'<[^>]+>'
# Remove those spans from the raw 'sentence' column of every split.
for sst_split in (ds_sst_train, ds_sst_val, ds_sst_test):
    sst_split['sentence'] = sst_split['sentence'].str.replace(html_tags_regex, '', regex=True)

In [4]:
# Text-cleaning function
def clean_text(text):
    """Normalize a raw sentence into lower-case alphabetic tokens.

    Steps, in order: lower-case; drop URLs, ``@mentions`` and ``#hashtags``;
    drop ``[...]`` spans; drop every character that is not an ASCII letter,
    digit or whitespace; drop digits; collapse whitespace; drop one-character
    tokens; drop tokens of 20 or more characters.

    Args:
        text: Value to clean; it is converted with ``str()`` first.

    Returns:
        The cleaned string.
    """
    text = str(text).lower()

    # Remove URLs, then social-media style @mentions and #hashtags.
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)
    text = re.sub(r"\@\w+|\#\w+", "", text)

    # Remove bracketed spans such as "[...]". Raw string: '\[' in a plain
    # string literal is an invalid escape sequence.
    text = re.sub(r"\[[^]]*\]", "", text)
    # Remove special characters. The range must be A-Z: "A-z" also spans
    # the characters [ \ ] ^ _ ` and therefore let them through.
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)

    text = re.sub(r" +", " ", text)  # collapse runs of spaces
    text = strip_punctuation(text)  # replace remaining punctuation, if any
    text = strip_numeric(text)  # remove digits
    text = strip_multiple_whitespaces(text)
    text = strip_short(text, minsize=2)  # remove one-character tokens
    text = re.sub(r"\b\w{20,}\b", "", text)  # remove tokens of 20+ characters
    return text

print("Limpieza de las oraciones ...")

# Store the cleaned text next to the raw sentence in every split.
for sst_split in (ds_sst_train, ds_sst_val, ds_sst_test):
    sst_split['clean_sentence'] = sst_split['sentence'].parallel_apply(clean_text)



Limpieza de las oraciones ...


In [5]:
from nltk.stem import WordNetLemmatizer
# English stop words held in a set so the per-token membership test is a hash lookup.
stop_words = set(stopwords.words("english")) 
# One lemmatizer instance, reused by every remove_stopwords_text call.
lemmatizer = WordNetLemmatizer()

def remove_stopwords_text(text):
    """Lemmatize a cleaned sentence and drop English stop words.

    Characters that are neither word characters nor whitespace are removed,
    the text is lower-cased, every token is lemmatized twice (default
    part of speech, then as a verb) and tokens found in ``stop_words`` are
    discarded.

    Args:
        text: Sentence as a string.

    Returns:
        The remaining lemmas joined by single spaces.
    """
    # flags must be passed by keyword: the fourth positional argument of
    # re.sub is `count`, so re.UNICODE (== 32) used to cap the number of
    # substitutions at 32 instead of acting as a flag.
    text = re.sub(r'[^\w\s]', '', text, flags=re.UNICODE)
    text = text.lower()
    # split() without an argument ignores repeated/leading/trailing
    # whitespace, so no empty-string tokens reach the lemmatizer.
    tokens = [lemmatizer.lemmatize(token) for token in text.split()]
    tokens = [lemmatizer.lemmatize(token, "v") for token in tokens]
    return " ".join(token for token in tokens if token not in stop_words)

print("Stopwords and lemmatization de las oraciones ...")

# Derive the stop-word-free, lemmatized text from the cleaned sentence of every split.
for sst_split in (ds_sst_train, ds_sst_val, ds_sst_test):
    sst_split['sw_sentence'] = sst_split['clean_sentence'].parallel_apply(remove_stopwords_text)



Stopwords and lemmatization de las oraciones ...


In [6]:
import gensim
# Load the four embeddings stored as text (binary=False) word2vec-format files.
# NOTE(review): no separate "cn" graph embedding is loaded; the cells that build
# the "cn" sentence vectors use emb_wn_graph instead.
emb_wn_graph, emb_w2v, emb_ft, emb_bert = (
    gensim.models.KeyedVectors.load_word2vec_format(vector_file, binary=False)
    for vector_file in (
        'graph_model_embedding_wn_w5d300_le40nw300a09_2k.txt',
        'embedding_imdb2k_w2v.txt',
        'embedding_imdb2k_ft.txt',
        'embedding_imdb2k_bert.txt',
    )
)

from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

glove_file = 'GloVe_vectors.txt'
# Read the GloVe text file directly. glove2word2vec is deprecated in gensim 4
# (it emits a DeprecationWarning); no_header=True tells the loader that the
# file has no leading "<vocab_size> <dimensions>" line, so no temporary
# converted copy is needed.
emb_glove = KeyedVectors.load_word2vec_format(glove_file, binary=False, no_header=True)

  glove2word2vec(glove_file, word2vec_temp_file)


In [7]:
# Tokenization and document representation
def document_vector(doc, model):
    """Average the embedding vectors of the in-vocabulary words of ``doc``.

    Args:
        doc: Text whose whitespace-separated tokens are looked up.
        model: Embedding lookup supporting ``word in model`` and
            ``model[word]``. If it exposes ``vector_size``, that value sizes
            the fallback vector.

    Returns:
        The element-wise mean of the vectors of the known words, or a zero
        vector when no word of ``doc`` is in ``model``.
    """
    vectors = [model[word] for word in doc.split() if word in model]
    if vectors:
        return np.mean(vectors, axis=0)
    # Size the fallback like the model's own vectors so that rows can still be
    # stacked for embeddings that are not 300-dimensional; 300 remains the
    # default for lookups that do not expose vector_size.
    return np.zeros(getattr(model, "vector_size", 300))

In [8]:
# Build one averaged sentence vector per embedding for every training sentence.
# NOTE(review): "vector_sentence_cn" is computed from emb_wn_graph, so it
# duplicates "vector_sentence_wn".
for column_name, embedding in (
    ("vector_sentence_wn", emb_wn_graph),
    ("vector_sentence_cn", emb_wn_graph),
    ("vector_sentence_w2v", emb_w2v),
    ("vector_sentence_ft", emb_ft),
    ("vector_sentence_glove", emb_glove),
    ("vector_sentence_bert", emb_bert),
):
    ds_sst_train[column_name] = ds_sst_train["sw_sentence"].parallel_apply(
        lambda x: document_vector(x, embedding)
    )



In [9]:
# Build one averaged sentence vector per embedding for every test sentence.
# NOTE(review): "vector_sentence_cn" is computed from emb_wn_graph, so it
# duplicates "vector_sentence_wn".
for column_name, embedding in (
    ("vector_sentence_wn", emb_wn_graph),
    ("vector_sentence_cn", emb_wn_graph),
    ("vector_sentence_w2v", emb_w2v),
    ("vector_sentence_ft", emb_ft),
    ("vector_sentence_glove", emb_glove),
    ("vector_sentence_bert", emb_bert),
):
    ds_sst_test[column_name] = ds_sst_test["sw_sentence"].parallel_apply(
        lambda x: document_vector(x, embedding)
    )



In [10]:
# For each embedding: stack the per-sentence vectors of the training split
# into one 2-D array and pair it with the 'label' column.
(
    (x_ds_sst_train_wn, y_ds_sst_train_wn),
    (x_ds_sst_train_cn, y_ds_sst_train_cn),
    (x_ds_sst_train_w2v, y_ds_sst_train_w2v),
    (x_ds_sst_train_ft, y_ds_sst_train_ft),
    (x_ds_sst_train_glove, y_ds_sst_train_glove),
    (x_ds_sst_train_bert, y_ds_sst_train_bert),
) = (
    (np.vstack(ds_sst_train[vector_column]), ds_sst_train["label"])
    for vector_column in (
        "vector_sentence_wn",
        "vector_sentence_cn",
        "vector_sentence_w2v",
        "vector_sentence_ft",
        "vector_sentence_glove",
        "vector_sentence_bert",
    )
)

In [11]:
# For each embedding: stack the per-sentence vectors of the test split
# into one 2-D array and pair it with the 'label' column.
(
    (x_ds_sst_test_wn, y_ds_sst_test_wn),
    (x_ds_sst_test_cn, y_ds_sst_test_cn),
    (x_ds_sst_test_w2v, y_ds_sst_test_w2v),
    (x_ds_sst_test_ft, y_ds_sst_test_ft),
    (x_ds_sst_test_glove, y_ds_sst_test_glove),
    (x_ds_sst_test_bert, y_ds_sst_test_bert),
) = (
    (np.vstack(ds_sst_test[vector_column]), ds_sst_test["label"])
    for vector_column in (
        "vector_sentence_wn",
        "vector_sentence_cn",
        "vector_sentence_w2v",
        "vector_sentence_ft",
        "vector_sentence_glove",
        "vector_sentence_bert",
    )
)

In [None]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import (
    accuracy_score,
    accuracy_score,
    roc_auc_score,
    f1_score,
    classification_report,
    precision_score,
    recall_score,
)

In [13]:
def Evaluation(model, X_train, X_test, y_train, y_test, hypertuning=False):
    """Score a fitted classifier on the train and test data.

    ``model`` is used as-is for ``predict`` / ``predict_proba``, so it must
    already be fitted. ROC AUC is computed from column 1 of ``predict_proba``.

    Args:
        model: Estimator exposing ``predict`` and ``predict_proba``.
        X_train, y_train: Training features and labels.
        X_test, y_test: Test features and labels.
        hypertuning: Accepted but not used by this function.

    Returns:
        An 11-tuple: accuracy, precision, recall, F1 and ROC AUC on the
        training data, the same five metrics on the test data, and finally
        the mean ``cross_val_score`` (``cv=5``) on the training data.
    """

    def _score_split(features, labels):
        # Same order as the returned tuple: accuracy, precision, recall, F1, ROC AUC.
        predicted = model.predict(features)
        probabilities = model.predict_proba(features)
        return (
            accuracy_score(labels, predicted),
            precision_score(labels, predicted),
            recall_score(labels, predicted),
            f1_score(labels, predicted),
            roc_auc_score(labels, probabilities[:, 1]),
        )

    train_scores = _score_split(X_train, y_train)
    test_scores = _score_split(X_test, y_test)

    # Mean score over 5 cross-validation folds of the training data.
    cv_mean_score = cross_val_score(model, X_train, y_train, cv=5).mean()

    return (*train_scores, *test_scores, cv_mean_score)

In [14]:
def apply_models_with_default_paramters(X_train, X_test, y_train, y_test):
    """Fit a fixed set of classifiers and tabulate their train/test metrics.

    The models are SVM (polynomial kernel), random forest, k-nearest
    neighbours, XGBoost and logistic regression. Each one is fitted on the
    training data and scored with ``Evaluation``.

    Args:
        X_train, y_train: Training features and labels.
        X_test, y_test: Test features and labels.

    Returns:
        A DataFrame with one row per model: its name, the cross-validation
        mean, the five test metrics, the five train metrics and the
        train-minus-test F1 difference. Rows are sorted by test F1
        (descending), then by that difference (ascending).
    """
    models_default = [
        # random_state pins the shuffling SVC performs for its probability
        # estimates; without it the ROC AUC differs slightly between runs
        # on identical data.
        {
            "ModelNames": "SVM",
            "Model": SVC(kernel="poly", probability=True, random_state=42),
        },
        {
            "ModelNames": "RF",
            "Model": RandomForestClassifier(n_estimators=100, random_state=42),
        },
        {"ModelNames": "KNN", "Model": KNeighborsClassifier(n_neighbors=5)},
        {"ModelNames": "XGB", "Model": XGBClassifier()},
        {"ModelNames": "LR", "Model": LogisticRegression(max_iter=1000)},
    ]

    # One record per model; key order defines the column order of the result.
    records = []
    for entry in models_default:
        estimator = entry["Model"]
        estimator.fit(X_train, y_train)

        (
            accuracy_train,
            precision_train,
            recall_train,
            f1_train,
            roc_auc_train,
            accuracy_test,
            precision_test,
            recall_test,
            f1_test,
            roc_auc_test,
            cross_val,
        ) = Evaluation(estimator, X_train, X_test, y_train, y_test, False)

        records.append(
            {
                "Model_Name": entry["ModelNames"],
                "Cross validation mean": cross_val,
                "Accuracy on Test Set": accuracy_test,
                "Precision on Test Set": precision_test,
                "Recall on Test Set": recall_test,
                "F1_Score on Test Set": f1_test,
                "ROC_AUC_Score on Test Set": roc_auc_test,
                "Accuracy on Train Set": accuracy_train,
                "Precision on Train Set": precision_train,
                "Recall on Train Set": recall_train,
                "F1_Score on Train Set": f1_train,
                "ROC_AUC_Score on Train Set": roc_auc_train,
                # A large positive gap signals overfitting.
                "Difference of F1_Score on train and test": f1_train - f1_test,
            }
        )

    results = pd.DataFrame(records)

    return results.sort_values(
        by=["F1_Score on Test Set", "Difference of F1_Score on train and test"],
        ascending=[False, True],
    )

In [19]:
# Benchmark the default classifiers on the "wn" sentence vectors.
Results_wn = apply_models_with_default_paramters(x_ds_sst_train_wn, x_ds_sst_test_wn, y_ds_sst_train_wn, y_ds_sst_test_wn)
Results_wn

Unnamed: 0,Model_Name,Cross validation mean,F1_Score on Test Set,Accuracy on Test Set,Recall on Test Set,Precision on Test Set,ROC_AUC_Score on Test Set,F1_Score on Train Set,Accuracy on Train Set,Recall on Train Set,Precision on Train Set,ROC_AUC_Score on Train Set,Difference of F1_Score on train and test
0,SVM,0.752023,0.775251,0.766612,0.806381,0.746436,0.849444,0.896845,0.891329,0.90554,0.888315,0.960601,0.121594
4,LR,0.742919,0.765393,0.757276,0.793179,0.739487,0.835392,0.78497,0.77341,0.792798,0.777295,0.857242,0.019577
3,XGB,0.714017,0.730042,0.717738,0.764576,0.698492,0.811717,0.998614,0.998555,0.997784,0.999445,0.999994,0.268572
1,RF,0.691763,0.711346,0.699616,0.741474,0.68357,0.781503,0.998613,0.998555,0.997507,0.999722,0.999994,0.287268
2,KNN,0.662572,0.699609,0.662823,0.786579,0.629956,0.723334,0.818671,0.800145,0.86482,0.777197,0.878197,0.119062


In [20]:
# Benchmark the default classifiers on the "cn" sentence vectors.
# NOTE(review): the "cn" vectors are built from emb_wn_graph, the same
# embedding as "wn", so this run repeats the "wn" benchmark.
Results_cn = apply_models_with_default_paramters(x_ds_sst_train_cn, x_ds_sst_test_cn, y_ds_sst_train_cn, y_ds_sst_test_cn)
Results_cn

Unnamed: 0,Model_Name,Cross validation mean,F1_Score on Test Set,Accuracy on Test Set,Recall on Test Set,Precision on Test Set,ROC_AUC_Score on Test Set,F1_Score on Train Set,Accuracy on Train Set,Recall on Train Set,Precision on Train Set,ROC_AUC_Score on Train Set,Difference of F1_Score on train and test
0,SVM,0.752023,0.775251,0.766612,0.806381,0.746436,0.849445,0.896845,0.891329,0.90554,0.888315,0.960602,0.121594
4,LR,0.742919,0.765393,0.757276,0.793179,0.739487,0.835392,0.78497,0.77341,0.792798,0.777295,0.857242,0.019577
3,XGB,0.714017,0.730042,0.717738,0.764576,0.698492,0.811717,0.998614,0.998555,0.997784,0.999445,0.999994,0.268572
1,RF,0.691763,0.711346,0.699616,0.741474,0.68357,0.781503,0.998613,0.998555,0.997507,0.999722,0.999994,0.287268
2,KNN,0.662572,0.699609,0.662823,0.786579,0.629956,0.723334,0.818671,0.800145,0.86482,0.777197,0.878197,0.119062


In [21]:
# Benchmark the default classifiers on the "w2v" sentence vectors.
Results_w2v = apply_models_with_default_paramters(x_ds_sst_train_w2v, x_ds_sst_test_w2v, y_ds_sst_train_w2v, y_ds_sst_test_w2v)
Results_w2v

Unnamed: 0,Model_Name,Cross validation mean,F1_Score on Test Set,Accuracy on Test Set,Recall on Test Set,Precision on Test Set,ROC_AUC_Score on Test Set,F1_Score on Train Set,Accuracy on Train Set,Recall on Train Set,Precision on Train Set,ROC_AUC_Score on Train Set,Difference of F1_Score on train and test
0,SVM,0.62659,0.690498,0.624382,0.839384,0.586472,0.720734,0.706769,0.640029,0.831579,0.614534,0.705084,0.016271
4,LR,0.642197,0.683249,0.657331,0.740374,0.634307,0.723403,0.683035,0.649277,0.724377,0.646158,0.703723,-0.000214
1,RF,0.640896,0.653643,0.639758,0.680968,0.628426,0.697908,0.998613,0.998555,0.997507,0.999722,0.99999,0.34497
3,XGB,0.637572,0.648197,0.630423,0.682068,0.61753,0.689235,0.998613,0.998555,0.997507,0.999722,0.999994,0.350417
2,KNN,0.593064,0.607069,0.584843,0.642464,0.575369,0.61363,0.762644,0.743642,0.789474,0.737578,0.817212,0.155575


In [22]:
# Benchmark the default classifiers on the "ft" sentence vectors.
Results_ft = apply_models_with_default_paramters(x_ds_sst_train_ft, x_ds_sst_test_ft, y_ds_sst_train_ft, y_ds_sst_test_ft)
Results_ft

Unnamed: 0,Model_Name,Cross validation mean,F1_Score on Test Set,Accuracy on Test Set,Recall on Test Set,Precision on Test Set,ROC_AUC_Score on Test Set,F1_Score on Train Set,Accuracy on Train Set,Recall on Train Set,Precision on Train Set,ROC_AUC_Score on Train Set,Difference of F1_Score on train and test
0,SVM,0.656214,0.70655,0.687534,0.753575,0.665049,0.746656,0.695067,0.665896,0.729917,0.663394,0.722063,-0.011483
4,LR,0.65159,0.695879,0.679846,0.733773,0.661706,0.743466,0.687824,0.66026,0.717452,0.660546,0.716461,-0.008055
3,XGB,0.641618,0.669801,0.652938,0.705171,0.637811,0.714128,0.995695,0.99552,0.993075,0.998329,0.999947,0.325894
1,RF,0.658092,0.668831,0.663921,0.679868,0.658147,0.726933,0.995691,0.99552,0.992244,0.999163,0.999917,0.32686
2,KNN,0.608526,0.624481,0.602416,0.662266,0.590775,0.652561,0.762096,0.743497,0.787535,0.73825,0.817731,0.137615


In [17]:
# Benchmark the default classifiers on the "glove" sentence vectors.
Results_glove = apply_models_with_default_paramters(x_ds_sst_train_glove, x_ds_sst_test_glove, y_ds_sst_train_glove, y_ds_sst_test_glove)
Results_glove

Unnamed: 0,Model_Name,Cross validation mean,F1_Score on Test Set,Accuracy on Test Set,Recall on Test Set,Precision on Test Set,ROC_AUC_Score on Test Set,F1_Score on Train Set,Accuracy on Train Set,Recall on Train Set,Precision on Train Set,ROC_AUC_Score on Train Set,Difference of F1_Score on train and test
4,LR,0.671965,0.703173,0.681494,0.755776,0.657416,0.747265,0.708178,0.68237,0.738781,0.68001,0.742984,0.005005
0,SVM,0.601734,0.690546,0.591982,0.911991,0.55563,0.719062,0.740315,0.654191,0.944875,0.608564,0.79834,0.049769
1,RF,0.648988,0.678879,0.672707,0.693069,0.665259,0.726503,0.995691,0.99552,0.992244,0.999163,0.999938,0.316812
3,XGB,0.645665,0.675632,0.668863,0.690869,0.661053,0.725115,0.995693,0.99552,0.992521,0.998885,0.999947,0.320061
2,KNN,0.6,0.619604,0.588688,0.671067,0.575472,0.619179,0.763321,0.740029,0.803601,0.726885,0.813141,0.143717


In [15]:
# Benchmark the default classifiers on the "bert" sentence vectors.
Results_bert = apply_models_with_default_paramters(x_ds_sst_train_bert, x_ds_sst_test_bert, y_ds_sst_train_bert, y_ds_sst_test_bert)
Results_bert

Unnamed: 0,Model_Name,Cross validation mean,Accuracy on Test Set,Precision on Test Set,Recall on Test Set,F1_Score on Test Set,ROC_AUC_Score on Test Set,Accuracy on Train Set,Precision on Train Set,Recall on Train Set,F1_Score on Train Set,ROC_AUC_Score on Train Set,Difference of F1_Score on train and test
0,SVM,0.722254,0.72927,0.709677,0.774477,0.740663,0.812397,0.747254,0.749397,0.774515,0.761749,0.824307,0.021086
4,LR,0.729769,0.725426,0.706775,0.768977,0.736565,0.817422,0.757659,0.762565,0.777562,0.76999,0.835335,0.033426
3,XGB,0.705202,0.698517,0.683673,0.737074,0.70937,0.787943,0.998555,0.999445,0.997784,0.998614,0.999994,0.289244
1,RF,0.671387,0.672707,0.656971,0.720572,0.687303,0.751461,0.998555,0.999722,0.997507,0.998613,0.999993,0.31131
2,KNN,0.616474,0.624382,0.599294,0.746975,0.665034,0.666606,0.76315,0.735596,0.852355,0.789683,0.842428,0.124649
