# Semantic Evaluation - Experiment 05
The objective of this trial is to expand the SCA_index (i.e., Semantic Content Analysis Index) to a full word embedding, setting a subjective or objective load for each word.

Unsuccessfull. 2024.01.30

## Introduction

### Libraries

In [1]:
## Data analysis packages:
import pandas as pd
import numpy as np
from math import isnan  #Verifies if a given value is numerical.

In [3]:
## Visualization packages:
# import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

### Definitions

In [251]:
## Forcing Pandas to display any number of elements
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_seq_items', None)
pd.set_option('display.width', 2000)
pd.set_option('display.max_colwidth', None)

In [250]:
# ## Based on: https://stackoverflow.com/questions/25351968/how-can-i-display-full-non-truncated-dataframe-information-in-html-when-conver
# def print_full(x):
#     pd.set_option('display.max_rows', None)
#     pd.set_option('display.max_columns', None)
#     pd.set_option('display.width', 2000)
#     pd.set_option('display.float_format', '{:20,.2f}'.format)
#     pd.set_option('display.max_colwidth', None)
#     print(x)
#     pd.reset_option('display.max_rows')
#     pd.reset_option('display.max_columns')
#     pd.reset_option('display.width')
#     pd.reset_option('display.float_format')
#     pd.reset_option('display.max_colwidth')

## Exploring the SpaCy Word Embeddings: 
Also using Spacy library: https://spacy.io/
> !pip install -U spacy  
> !python -m spacy download en_core_web_sm  
> !python -m spacy download en_core_web_lg

Some instructions on how to use it:  
https://spacy.io/usage/spacy-101

In [252]:
## Importing SpaCy library:
import spacy

# Load English tokenizer, tagger, parser and NER
nlp = spacy.load("en_core_web_lg")

In [253]:
## We check below that this model has 514.157 keys and vectors, respectively.
nlp.meta['vectors']

{'width': 300,
 'vectors': 514157,
 'keys': 514157,
 'name': 'en_vectors',
 'mode': 'default'}

In [254]:
## Again, checking the number of keys.
nlp.vocab.vectors.n_keys

514157

In [255]:
## Getting the word embedding: data (i.e., the matrix containing the vector values for each word)
word_embedding = nlp.vocab.vectors.data

## Verifying the shape of the word embedding matrix:
word_embedding.shape

(514157, 300)

--- 
### Finding the words associated with the embedding:

In [12]:
## Extracting the words associated with each index:
index = nlp.vocab.vectors.keys()
words_associated = [nlp.vocab[i].text for i in index]

In [13]:
## Checking the word in position 514156, wich is "Lahouaiej":
words_associated[514156]

'Lahouaiej'

In [14]:
## Finding the respective row (index) for a given word:
rows = nlp.vocab.vectors.find(keys=["cat", "dog", "Lahouaiej"])
rows

array([  3201,   1147, 514156], dtype=int32)

---
## SCA - Glasgow Norms
* Read the SCA from Glasgow Norms;  
* Import F_s and F_o from the previous study;  
* Train the MLP classifier.

In [258]:
df_factors = pd.read_csv('../data/df_factors.csv', sep=';')
df_factors.head()

Unnamed: 0,words,F_Objectivity,F_Subjectivity,F_Context
0,abattoir,0.512527,0.380603,0.960466
1,abbey,0.714765,0.240456,0.696198
2,abbreviate,0.286952,0.171052,0.767043
3,abdicate,0.144736,0.3843,0.863127
4,abdication,0.167654,0.334086,0.896733


In [259]:
# ### Selecionando apenas as palavras no df_factors que atendam aos critérios:
# df_selected = df_factors.loc[((df_factors['F_Subjectivity'] > 0.75) | (df_factors['F_Subjectivity'] < 0.3)) & ((df_factors['F_Objectivity'] > 0.75) | (df_factors['F_Objectivity'] < 0.3))]


In [260]:
# df_factors = df_selected.copy()

In [261]:
SCA_words = [word for word in df_factors.words]

In [262]:
SCA_embedding_rows =  nlp.vocab.vectors.find(keys=SCA_words)

In [263]:
len(SCA_embedding_rows)

5553

> Separating the SCA-GlasgowNorms data into train and test:

In [238]:
from sklearn.model_selection import train_test_split

# Separar os dados em conjuntos de treino (70%) e teste (30%)
train_df, test_df = train_test_split(df_factors, test_size=0.2, random_state=42)

In [239]:
# Função para criar os conjuntos de treino e resposta
def create_data(dataframe):
    X = {}
    Y = {}
    
    for index, row in dataframe.iterrows():
        word = row['words']
        f_objectivity = row['F_Objectivity']
        f_subjectivity = row['F_Subjectivity']

        if word in nlp.vocab:
            indice = nlp.vocab.strings[word]
            vetor_embedding = word_embedding[index]
            X[word] = vetor_embedding
            Y[word] = {'F_Objectivity': f_objectivity, 'F_Subjectivity': f_subjectivity}

    return pd.DataFrame.from_dict(X, orient='index'), pd.DataFrame.from_dict(Y, orient='index')

In [240]:
# Creating train and test datasets:
X_train, Y_train = create_data(train_df)
X_test, Y_test = create_data(test_df)

# Exibir as dimensões dos conjuntos de treino e teste
print("Train data dimension:")
print("X_train:", X_train.shape)
print("Y_train:", Y_train.shape)

print("\nTest data dimension:")
print("X_test:", X_test.shape)
print("Y_test:", Y_test.shape)

Train data dimension:
X_train: (782, 300)
Y_train: (782, 2)

Test data dimension:
X_test: (191, 300)
Y_test: (191, 2)


In [241]:
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
oboe,-1.03200,0.98737,-5.25470,1.41810,-3.09390,-1.14310,0.51815,0.66743,-1.731400,2.72040,...,-0.93926,3.001900,3.078500,0.241190,-2.18510,-3.36460,-2.817300,2.26210,2.31840,-3.88840
edit,1.27200,1.36980,-4.32460,-3.27000,0.14935,3.62040,-0.42182,2.89660,-2.457500,-2.51630,...,0.32227,1.668600,-0.187530,-0.977240,3.42620,3.01240,-0.331750,1.04800,-8.55560,3.95100
monk,2.32180,-4.84560,0.91680,-1.55820,3.42590,-0.28098,2.62570,3.81500,-2.455100,-0.91494,...,-0.55232,1.882200,1.133800,1.954600,-0.89593,2.03250,3.088100,-5.67460,-1.80810,-0.83835
immense,-3.78210,-2.54130,-1.88610,-1.75230,5.28460,-3.69050,1.62720,-0.24383,3.868300,0.16730,...,1.96530,-4.923700,0.007899,-0.052437,-1.39950,-0.18180,0.010653,-5.11610,-0.31818,-0.91010
subtle,-1.91310,-1.14720,-2.06600,1.42700,2.76250,1.15930,1.03380,4.22500,-5.351700,-1.10080,...,0.08672,-1.949500,4.692300,0.052070,1.43810,-0.53350,-0.016207,-2.23830,-3.35880,-5.05740
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
dome,-3.85350,-0.42631,1.58950,-0.23799,7.57610,-1.98770,0.25155,2.83880,-1.446600,2.89630,...,-2.16160,-0.018642,-3.330400,0.158980,-4.20810,2.39790,0.507040,-3.39310,3.77300,-2.47780
ground,0.18496,3.13970,-4.45390,0.19329,-1.46760,2.24920,2.45650,3.35850,-1.918100,3.65060,...,1.16150,-1.545500,2.302200,-0.673940,-1.16270,-0.40462,-1.595300,-0.98753,-0.87880,0.35929
tinsel,-2.07730,-2.54090,0.29205,-2.45100,-1.54520,-2.29750,2.21560,0.68936,-0.306200,1.19000,...,1.49260,-0.276420,-0.339110,2.394800,-2.28260,-2.00760,1.488200,0.59898,-0.90295,-0.26682
pulley,0.65278,-3.02150,-0.83809,-2.14630,2.32530,-1.19310,0.86940,2.40750,0.034806,-0.39219,...,1.81510,-2.141800,-0.913120,-0.577410,-1.21540,0.18766,-1.125500,-0.15214,-4.52850,-1.58800


In [242]:
Y_train

Unnamed: 0,F_Objectivity,F_Subjectivity
oboe,0.820254,0.219737
edit,0.287283,0.238985
monk,0.847672,0.200527
immense,0.175470,0.778402
subtle,0.120231,0.288824
...,...,...
dome,0.847433,0.197189
ground,0.893482,0.267292
tinsel,0.843857,0.295516
pulley,0.832643,0.143870


#### Binarizing Y_train and Y_test
Once we run the first MLP model, the performance wasn't over 54%. 
In this Section, we will binarize the semantic factor values following the median values.

In [175]:
# Binarizing through list comprehension
Y_train['F_Objectivity'] = ['high' if f_objectivity >= 0.565 else 'low' for f_objectivity in Y_train['F_Objectivity']]
Y_train['F_Subjectivity'] = ['high' if f_subjectivity >= 0.392 else 'low' for f_subjectivity in Y_train['F_Subjectivity']]

Y_test['F_Objectivity'] = ['high' if f_objectivity >= 0.565 else 'low' for f_objectivity in Y_test['F_Objectivity']]
Y_test['F_Subjectivity'] = ['high' if f_subjectivity >= 0.392 else 'low' for f_subjectivity in Y_test['F_Subjectivity']]

In [220]:
# Binarizing through list comprehension
Y_train['F_Objectivity'] = [1 if f_objectivity >= 0.565 else 0 for f_objectivity in Y_train['F_Objectivity']]
Y_train['F_Subjectivity'] = [1 if f_subjectivity >= 0.392 else 0 for f_subjectivity in Y_train['F_Subjectivity']]

Y_test['F_Objectivity'] = [1 if f_objectivity >= 0.565 else 0 for f_objectivity in Y_test['F_Objectivity']]
Y_test['F_Subjectivity'] = [1 if f_subjectivity >= 0.392 else 0 for f_subjectivity in Y_test['F_Subjectivity']]

In [221]:
Y_train

Unnamed: 0,F_Objectivity,F_Subjectivity
oboe,1,0
edit,0,0
monk,1,0
immense,0,1
subtle,0,0
...,...,...
dome,1,0
ground,1,0
tinsel,1,0
pulley,1,0


---
### Training a MLP Classifier for word semantic content

In [75]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MultiLabelBinarizer

In [113]:
# Define a new MLP architecture
model = Sequential([
    Dense(256, activation='relu', input_shape=(300,)),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(2, activation='softmax')  # 2 neurons for binary classification with softmax activation
])

# Compile the model
model.compile(optimizer='adam',
              loss='binary_crossentropy',  # Use categorical_crossentropy for categorical data
              metrics=['accuracy'])

In [114]:
# Print model summary
model.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_39 (Dense)            (None, 256)               77056     
                                                                 
 dense_40 (Dense)            (None, 128)               32896     
                                                                 
 dense_41 (Dense)            (None, 64)                8256      
                                                                 
 dense_42 (Dense)            (None, 32)                2080      
                                                                 
 dense_43 (Dense)            (None, 2)                 66        
                                                                 
Total params: 120354 (470.13 KB)
Trainable params: 120354 (470.13 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


> Como converti a saída em dados categóricos, é preciso antes utilizar OneHotEncoder:

In [124]:
## Como já foram transformados anteriormente em [0,1], preciso apenas torná-los como lista:
Y_train_array = Y_train.to_numpy()
Y_test_array = Y_test.to_numpy()

X_train_array = X_train.to_numpy()
X_test_array = X_test.to_numpy()

# Print the first few elements to verify
print(Y_train_array[:5])  # Print the first 5 elements


[[1 1]
 [0 0]
 [0 1]
 [0 1]
 [0 1]]


In [79]:
# Convert multilabel categorical labels to binary vectors
multi_label_binarizer = MultiLabelBinarizer()
Y_train_encoded = multi_label_binarizer.fit_transform(Y_train)
Y_test_encoded = multi_label_binarizer.transform(Y_test)

In [85]:
len(Y_train_encoded)

2

In [86]:
Y_train_encoded

array([[1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1],
       [1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])

In [108]:
Y_train

Unnamed: 0,F_Objectivity,F_Subjectivity
airplane,1,1
coarse,0,0
persecution,0,1
moment,0,1
responsible,0,1
...,...,...
prawn,1,0
tweezers,1,0
university,1,1
wasteful,0,1


In [125]:
X_train_array

array([[ -3.9003  ,   3.9343  ,   0.65395 , ...,   0.11406 ,  -5.1079  ,
          1.122   ],
       [  2.0427  ,  -2.0691  ,  -1.591   , ...,  -3.8809  ,  -5.2335  ,
          3.8502  ],
       [  0.068316,   4.3433  ,  -5.5537  , ...,   2.2451  ,  -1.9138  ,
         -0.074928],
       ...,
       [ -1.0686  , -10.114   ,   4.0364  , ...,   6.1114  ,  -5.9124  ,
         10.933   ],
       [ -5.8985  ,  -2.8247  ,   0.85307 , ...,  -0.22011 ,   3.1951  ,
          0.082503],
       [  2.5277  ,   2.4585  ,   1.4097  , ...,  -3.3495  ,  -4.2642  ,
          0.89491 ]], dtype=float32)

In [30]:
Y_train

Unnamed: 0,F_Objectivity,F_Subjectivity
airplane,0.960395,0.463949
coarse,0.401438,0.265841
persecution,0.319377,0.678163
moment,0.139091,0.413798
responsible,0.289618,0.654414
...,...,...
prawn,0.952891,0.110865
tweezers,0.953419,0.193659
university,0.835052,0.695757
wasteful,0.200742,0.492842


In [128]:
# Train the model
history = model.fit(X_train_array, Y_train_array, epochs=50, batch_size=16, validation_split=0.2)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [129]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test_array, Y_test_array)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

Test Accuracy: 45.04%


In [65]:
# Treina o modelo
history = model.fit(X_train, Y_train,
                    epochs=50,
                    batch_size=32,
                    validation_split=0.1)  # Usamos parte dos dados de treino como validação

# Avalia o modelo com os dados de teste
loss, accuracy = model.evaluate(X_test.values, Y_test.values)
print(f"Acurácia do modelo nos dados de teste: {accuracy * 100:.2f}%")

Epoch 1/50


UnimplementedError: Graph execution error:

Detected at node categorical_crossentropy/Cast defined at (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main

  File "<frozen runpy>", line 88, in _run_code

  File "c:\Users\tiago\OneDrive - UNIVALI\PhD\atividades de pesquisa\semantic_similarity\.venv\Lib\site-packages\ipykernel_launcher.py", line 17, in <module>

  File "c:\Users\tiago\OneDrive - UNIVALI\PhD\atividades de pesquisa\semantic_similarity\.venv\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance

  File "c:\Users\tiago\OneDrive - UNIVALI\PhD\atividades de pesquisa\semantic_similarity\.venv\Lib\site-packages\ipykernel\kernelapp.py", line 739, in start

  File "c:\Users\tiago\OneDrive - UNIVALI\PhD\atividades de pesquisa\semantic_similarity\.venv\Lib\site-packages\tornado\platform\asyncio.py", line 205, in start

  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.11_3.11.2032.0_x64__qbz5n2kfra8p0\Lib\asyncio\base_events.py", line 607, in run_forever

  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.11_3.11.2032.0_x64__qbz5n2kfra8p0\Lib\asyncio\base_events.py", line 1922, in _run_once

  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.11_3.11.2032.0_x64__qbz5n2kfra8p0\Lib\asyncio\events.py", line 80, in _run

  File "c:\Users\tiago\OneDrive - UNIVALI\PhD\atividades de pesquisa\semantic_similarity\.venv\Lib\site-packages\ipykernel\kernelbase.py", line 542, in dispatch_queue

  File "c:\Users\tiago\OneDrive - UNIVALI\PhD\atividades de pesquisa\semantic_similarity\.venv\Lib\site-packages\ipykernel\kernelbase.py", line 531, in process_one

  File "c:\Users\tiago\OneDrive - UNIVALI\PhD\atividades de pesquisa\semantic_similarity\.venv\Lib\site-packages\ipykernel\kernelbase.py", line 437, in dispatch_shell

  File "c:\Users\tiago\OneDrive - UNIVALI\PhD\atividades de pesquisa\semantic_similarity\.venv\Lib\site-packages\ipykernel\ipkernel.py", line 359, in execute_request

  File "c:\Users\tiago\OneDrive - UNIVALI\PhD\atividades de pesquisa\semantic_similarity\.venv\Lib\site-packages\ipykernel\kernelbase.py", line 775, in execute_request

  File "c:\Users\tiago\OneDrive - UNIVALI\PhD\atividades de pesquisa\semantic_similarity\.venv\Lib\site-packages\ipykernel\ipkernel.py", line 446, in do_execute

  File "c:\Users\tiago\OneDrive - UNIVALI\PhD\atividades de pesquisa\semantic_similarity\.venv\Lib\site-packages\ipykernel\zmqshell.py", line 549, in run_cell

  File "c:\Users\tiago\OneDrive - UNIVALI\PhD\atividades de pesquisa\semantic_similarity\.venv\Lib\site-packages\IPython\core\interactiveshell.py", line 3051, in run_cell

  File "c:\Users\tiago\OneDrive - UNIVALI\PhD\atividades de pesquisa\semantic_similarity\.venv\Lib\site-packages\IPython\core\interactiveshell.py", line 3106, in _run_cell

  File "c:\Users\tiago\OneDrive - UNIVALI\PhD\atividades de pesquisa\semantic_similarity\.venv\Lib\site-packages\IPython\core\async_helpers.py", line 129, in _pseudo_sync_runner

  File "c:\Users\tiago\OneDrive - UNIVALI\PhD\atividades de pesquisa\semantic_similarity\.venv\Lib\site-packages\IPython\core\interactiveshell.py", line 3311, in run_cell_async

  File "c:\Users\tiago\OneDrive - UNIVALI\PhD\atividades de pesquisa\semantic_similarity\.venv\Lib\site-packages\IPython\core\interactiveshell.py", line 3493, in run_ast_nodes

  File "c:\Users\tiago\OneDrive - UNIVALI\PhD\atividades de pesquisa\semantic_similarity\.venv\Lib\site-packages\IPython\core\interactiveshell.py", line 3553, in run_code

  File "C:\Users\tiago\AppData\Local\Temp\ipykernel_30152\650847344.py", line 2, in <module>

  File "c:\Users\tiago\OneDrive - UNIVALI\PhD\atividades de pesquisa\semantic_similarity\.venv\Lib\site-packages\keras\src\utils\traceback_utils.py", line 65, in error_handler

  File "c:\Users\tiago\OneDrive - UNIVALI\PhD\atividades de pesquisa\semantic_similarity\.venv\Lib\site-packages\keras\src\engine\training.py", line 1807, in fit

  File "c:\Users\tiago\OneDrive - UNIVALI\PhD\atividades de pesquisa\semantic_similarity\.venv\Lib\site-packages\keras\src\engine\training.py", line 1401, in train_function

  File "c:\Users\tiago\OneDrive - UNIVALI\PhD\atividades de pesquisa\semantic_similarity\.venv\Lib\site-packages\keras\src\engine\training.py", line 1384, in step_function

  File "c:\Users\tiago\OneDrive - UNIVALI\PhD\atividades de pesquisa\semantic_similarity\.venv\Lib\site-packages\keras\src\engine\training.py", line 1373, in run_step

  File "c:\Users\tiago\OneDrive - UNIVALI\PhD\atividades de pesquisa\semantic_similarity\.venv\Lib\site-packages\keras\src\engine\training.py", line 1151, in train_step

  File "c:\Users\tiago\OneDrive - UNIVALI\PhD\atividades de pesquisa\semantic_similarity\.venv\Lib\site-packages\keras\src\engine\training.py", line 1209, in compute_loss

  File "c:\Users\tiago\OneDrive - UNIVALI\PhD\atividades de pesquisa\semantic_similarity\.venv\Lib\site-packages\keras\src\engine\compile_utils.py", line 277, in __call__

  File "c:\Users\tiago\OneDrive - UNIVALI\PhD\atividades de pesquisa\semantic_similarity\.venv\Lib\site-packages\keras\src\losses.py", line 143, in __call__

  File "c:\Users\tiago\OneDrive - UNIVALI\PhD\atividades de pesquisa\semantic_similarity\.venv\Lib\site-packages\keras\src\losses.py", line 270, in call

  File "c:\Users\tiago\OneDrive - UNIVALI\PhD\atividades de pesquisa\semantic_similarity\.venv\Lib\site-packages\keras\src\losses.py", line 2198, in categorical_crossentropy

Cast string to float is not supported
	 [[{{node categorical_crossentropy/Cast}}]] [Op:__inference_train_function_62846]

### Utilizando XGBoosting for multilabel:

### Utilizando PyCaret:

In [86]:
import pycaret.classification
import pycaret.regression
import pycaret.clustering

> Adjustin Y_train for use with PyCaret:

In [243]:
df = X_train.copy()

In [223]:
# Definir uma função para mapear os valores das colunas para os rótulos desejados
def map_labels(row):
    if row['F_Objectivity'] == 0 and row['F_Subjectivity'] == 1:
        return 'Latent'
    elif row['F_Objectivity'] == 0 and row['F_Subjectivity'] == 0:
        return 'Contextual'
    elif row['F_Objectivity'] == 1 and row['F_Subjectivity'] == 0:
        return 'Manifest'
    elif row['F_Objectivity'] == 1 and row['F_Subjectivity'] == 1:
        return 'Perceptual'

In [224]:
# Aplicar a função de mapeamento para criar a nova coluna "Target"
df['target'] = Y_train.apply(map_labels, axis=1)

In [151]:
## Adding F_Subjectivity and F_Objectivity to df:
df['F_Subjectivity'] = Y_train['F_Subjectivity']
df['F_Objectivity'] = Y_train['F_Objectivity']

In [244]:
df['target'] = Y_train['F_Subjectivity']

In [245]:
# Exibir as primeiras linhas do DataFrame resultante para verificar
df.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,target
oboe,-1.032,0.98737,-5.2547,1.4181,-3.0939,-1.1431,0.51815,0.66743,-1.7314,2.7204,...,3.0019,3.0785,0.24119,-2.1851,-3.3646,-2.8173,2.2621,2.3184,-3.8884,0.219737
edit,1.272,1.3698,-4.3246,-3.27,0.14935,3.6204,-0.42182,2.8966,-2.4575,-2.5163,...,1.6686,-0.18753,-0.97724,3.4262,3.0124,-0.33175,1.048,-8.5556,3.951,0.238985
monk,2.3218,-4.8456,0.9168,-1.5582,3.4259,-0.28098,2.6257,3.815,-2.4551,-0.91494,...,1.8822,1.1338,1.9546,-0.89593,2.0325,3.0881,-5.6746,-1.8081,-0.83835,0.200527
immense,-3.7821,-2.5413,-1.8861,-1.7523,5.2846,-3.6905,1.6272,-0.24383,3.8683,0.1673,...,-4.9237,0.007899,-0.052437,-1.3995,-0.1818,0.010653,-5.1161,-0.31818,-0.9101,0.778402
subtle,-1.9131,-1.1472,-2.066,1.427,2.7625,1.1593,1.0338,4.225,-5.3517,-1.1008,...,-1.9495,4.6923,0.05207,1.4381,-0.5335,-0.016207,-2.2383,-3.3588,-5.0574,0.288824
fork,-2.8959,1.6072,0.35612,1.5022,1.3448,-4.4988,3.4471,1.432,1.385,0.31227,...,-4.1298,-0.18791,3.1915,-3.3862,0.11851,3.9532,0.42752,-0.66962,-0.99807,0.16664
racket,-3.9477,-3.024,1.9944,1.7417,7.5082,-0.40407,3.6541,-3.2722,4.8815,0.80714,...,2.2372,-6.0917,5.1172,-2.5129,-1.298,4.7691,-5.3468,-1.4423,2.9098,0.269917
miracle,-2.2535,-2.0237,-1.4117,2.399,6.4206,0.79994,0.12129,3.8419,-3.1209,-0.79811,...,0.032762,-1.2072,-2.1061,-2.4119,0.90826,-0.55318,-3.275,0.57154,0.63203,0.759893
van,0.20043,3.6917,-3.075,4.2336,2.1911,2.9321,-2.0681,6.8698,-3.3532,4.7387,...,0.49304,3.4682,0.09294,-4.62,-0.37087,1.0184,-0.75697,3.2175,-3.7679,0.156111
peanut,-3.9852,-3.1125,0.44393,1.4091,0.069728,2.0615,4.8625,1.8118,0.71466,-0.57344,...,-1.5885,-5.1016,-1.8941,-3.1816,3.3496,2.2247,3.57,2.2925,-1.7872,0.089743


In [104]:
## Descartando a última coluna:
df_last = df.iloc[:,:-1]
df_last.head(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,F_Subjectivity
periodical,-4.0295,1.4873,5.2789,1.5726,3.4575,-0.89522,5.6927,-3.2107,-3.6499,4.6471,...,0.30626,3.8057,0.7459,-1.5794,3.8898,-2.6885,-3.5452,-0.61754,2.148,0.15774


In [105]:
df_scnd_last = df.iloc[:, :-2].join(df.iloc[:, -1])
df_scnd_last.head(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,F_Objectivity
periodical,-4.0295,1.4873,5.2789,1.5726,3.4575,-0.89522,5.6927,-3.2107,-3.6499,4.6471,...,0.30626,3.8057,0.7459,-1.5794,3.8898,-2.6885,-3.5452,-0.61754,2.148,0.31695


### Pycaret Regression: 300 inputs and 1 numerical outputs

In [248]:
exp_regr = pycaret.regression.setup(df, target='target', session_id=123)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,target
2,Target type,Regression
3,Original data shape,"(782, 301)"
4,Transformed data shape,"(782, 301)"
5,Transformed train set shape,"(547, 301)"
6,Transformed test set shape,"(235, 301)"
7,Numeric features,300
8,Preprocess,True
9,Imputation type,simple


In [249]:
exp_regr.compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lasso,Lasso Regression,0.1035,0.032,0.1772,-0.0225,0.1234,0.4751,0.006
en,Elastic Net,0.1035,0.032,0.1772,-0.0225,0.1234,0.4751,0.006
dummy,Dummy Regressor,0.1035,0.032,0.1772,-0.0225,0.1234,0.4751,0.006
llar,Lasso Least Angle Regression,0.1035,0.032,0.1772,-0.0225,0.1234,0.4751,0.006
br,Bayesian Ridge,0.1043,0.0322,0.1778,-0.0329,0.1239,0.4796,0.01
et,Extra Trees Regressor,0.1229,0.0368,0.1907,-0.2109,0.1349,0.5888,0.233
rf,Random Forest Regressor,0.1262,0.0378,0.1936,-0.2493,0.137,0.5979,1.879
ada,AdaBoost Regressor,0.1235,0.038,0.194,-0.2549,0.1371,0.5984,0.135
lightgbm,Light Gradient Boosting Machine,0.1269,0.0379,0.1937,-0.2673,0.1376,0.6054,0.179
knn,K Neighbors Regressor,0.1304,0.0399,0.1984,-0.3006,0.1408,0.5924,0.008


In [100]:
exp_regr.compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lasso,Lasso Regression,0.1549,0.035,0.1869,-0.003,0.1311,0.5698,0.033
en,Elastic Net,0.1549,0.035,0.1869,-0.003,0.1311,0.5698,0.03
dummy,Dummy Regressor,0.1549,0.035,0.1869,-0.003,0.1311,0.5698,0.03
llar,Lasso Least Angle Regression,0.1549,0.035,0.1869,-0.003,0.1311,0.5698,0.028
br,Bayesian Ridge,0.1552,0.0351,0.1873,-0.0076,0.1314,0.5705,0.032
omp,Orthogonal Matching Pursuit,0.1552,0.0352,0.1875,-0.0094,0.1315,0.57,0.029
lr,Linear Regression,0.1562,0.0357,0.1887,-0.0228,0.1324,0.5732,0.031
lar,Least Angle Regression,0.156,0.0357,0.1887,-0.0233,0.1324,0.5724,0.029
ridge,Ridge Regression,0.1562,0.0357,0.1888,-0.0239,0.1325,0.5725,0.03
ada,AdaBoost Regressor,0.1579,0.036,0.1895,-0.0318,0.1339,0.6074,0.146


### Pycaret Clustering

In [None]:
exp_cluster = pycaret.clustering.setup()

### Pycaret Classification

In [226]:
## Fazendo um experimento:
exp_class = pycaret.classification.setup(df, target='target', session_id=9088)

Unnamed: 0,Description,Value
0,Session id,9088
1,Target,target
2,Target type,Multiclass
3,Target mapping,"Contextual: 0, Latent: 1, Manifest: 2, Perceptual: 3"
4,Original data shape,"(782, 301)"
5,Transformed data shape,"(782, 301)"
6,Transformed train set shape,"(547, 301)"
7,Transformed test set shape,"(235, 301)"
8,Numeric features,300
9,Preprocess,True


In [227]:
exp_class.compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
qda,Quadratic Discriminant Analysis,0.7386,0.2,0.7386,0.5456,0.6276,0.0,0.0,0.01
dummy,Dummy Classifier,0.7386,0.2,0.7386,0.5456,0.6276,0.0,0.0,0.007
lightgbm,Light Gradient Boosting Machine,0.7294,0.1862,0.7294,0.5725,0.63,0.0083,0.0125,0.614
et,Extra Trees Classifier,0.7276,0.2,0.7276,0.5532,0.6249,-0.0101,-0.0237,0.036
rf,Random Forest Classifier,0.7148,0.2215,0.7148,0.556,0.6219,-0.0149,-0.0273,0.097
gbc,Gradient Boosting Classifier,0.6875,0.1878,0.6875,0.5839,0.6238,0.021,0.0338,2.944
knn,K Neighbors Classifier,0.6821,0.2147,0.6821,0.5754,0.6203,0.0103,0.0159,0.012
lr,Logistic Regression,0.5812,0.2311,0.5812,0.5921,0.5847,0.0248,0.0248,0.084
dt,Decision Tree Classifier,0.5759,0.2162,0.5759,0.6089,0.5889,0.0565,0.0575,0.026
svm,SVM - Linear Kernel,0.5575,0.0,0.5575,0.5884,0.5695,0.0144,0.0143,0.009


#### New Test

In [130]:
## Selecionar apenas os elementos do SCA que possuem fatores < 0.25 e > 0.75:
new_SCA_words = [word for word, subjectivity, objectivity in zip(df_factors['words'], df_factors['F_Subjectivity'], df_factors['F_Objectivity']) if (subjectivity > 0.75 or subjectivity < 0.25) and (objectivity > 0.75 or objectivity < 0.25)]


In [132]:
len(new_SCA_words)

851

In [133]:
new_SCA_embedding_rows =  nlp.vocab.vectors.find(keys=new_SCA_words)

In [134]:
len(new_SCA_embedding_rows)

851

> Separating the SCA-GlasgowNorms data into train and test:

In [None]:
# Separar os dados em conjuntos de treino (70%) e teste (30%)
train_df, test_df = train_test_split(df_factors, test_size=0.2, random_state=42)

In [None]:
# Função para criar os conjuntos de treino e resposta
def create_data(dataframe):
    X = {}
    Y = {}
    
    for index, row in dataframe.iterrows():
        word = row['words']
        f_objectivity = row['F_Objectivity']
        f_subjectivity = row['F_Subjectivity']

        if word in nlp.vocab:
            indice = nlp.vocab.strings[word]
            vetor_embedding = word_embedding[index]
            X[word] = vetor_embedding
            Y[word] = {'F_Objectivity': f_objectivity, 'F_Subjectivity': f_subjectivity}

    return pd.DataFrame.from_dict(X, orient='index'), pd.DataFrame.from_dict(Y, orient='index')

In [None]:
# Creating train and test datasets:
X_train, Y_train = create_data(train_df)
X_test, Y_test = create_data(test_df)

# Exibir as dimensões dos conjuntos de treino e teste
print("Train data dimension:")
print("X_train:", X_train.shape)
print("Y_train:", Y_train.shape)

print("\nTest data dimension:")
print("X_test:", X_test.shape)
print("Y_test:", Y_test.shape)

Train data dimension:
X_train: (3757, 300)
Y_train: (3757, 2)

Test data dimension:
X_test: (923, 300)
Y_test: (923, 2)
