# Semantic Similarity - Experiment 01
The objective of this trial is to expand the SCA_index (i.e., Semantic Content Analysis Index) to a full word embedding, setting a subjective or objective load for each word.

## Introduction

### Libraries

In [1]:
## Data analysis packages:
import pandas as pd
import numpy as np
from math import isnan  #Verifies if a given value is numerical.

In [2]:
## Visualization packages:
# import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

### Definitions

In [3]:
## Forcing Pandas to display any number of elements
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_seq_items', None)
pd.set_option('display.width', 2000)
pd.set_option('display.max_colwidth', None)

In [4]:
# ## Based on: https://stackoverflow.com/questions/25351968/how-can-i-display-full-non-truncated-dataframe-information-in-html-when-conver
# def print_full(x):
#     pd.set_option('display.max_rows', None)
#     pd.set_option('display.max_columns', None)
#     pd.set_option('display.width', 2000)
#     pd.set_option('display.float_format', '{:20,.2f}'.format)
#     pd.set_option('display.max_colwidth', None)
#     print(x)
#     pd.reset_option('display.max_rows')
#     pd.reset_option('display.max_columns')
#     pd.reset_option('display.width')
#     pd.reset_option('display.float_format')
#     pd.reset_option('display.max_colwidth')

## Exploring the SpaCy Word Embeddings: 
Also using Spacy library: https://spacy.io/
> !pip install -U spacy  
> !python -m spacy download en_core_web_sm  
> !python -m spacy download en_core_web_lg

Some instructions on how to use it:  
https://spacy.io/usage/spacy-101

In [5]:
## Importing SpaCy library:
import spacy

# Load English tokenizer, tagger, parser and NER
nlp = spacy.load("en_core_web_lg")




In [6]:
## We check below that this model has 514.157 keys and vectors, respectively.
nlp.meta['vectors']

{'width': 300,
 'vectors': 514157,
 'keys': 514157,
 'name': 'en_vectors',
 'mode': 'default'}

In [7]:
## Again, checking the number of keys.
nlp.vocab.vectors.n_keys

514157

In [8]:
## Getting the word embedding: data (i.e., the matrix containing the vector values for each word)
word_embedding = nlp.vocab.vectors.data

## Verifying the shape of the word embedding matrix:
word_embedding.shape

(514157, 300)

--- 
### Finding the words associated with the embedding:

In [39]:
## Extracting the words associated with each index:
index = nlp.vocab.vectors.keys()
words_associated = [nlp.vocab[i].text for i in index]

In [10]:
## Checking the word in position 514156, wich is "Lahouaiej":
words_associated[514156]

'Lahouaiej'

In [11]:
## Finding the respective row (index) for a given word:
rows = nlp.vocab.vectors.find(keys=["cat", "dog", "Lahouaiej"])
rows

array([  3201,   1147, 514156], dtype=int32)

---
## SCA - Glasgow Norms
* Read the SCA from Glasgow Norms;  
* Import F_s and F_o from the previous study;  
* Train the MLP classifier.

In [25]:
df_factors = pd.read_csv('../data/df_factors.csv', sep=';')
df_factors.head()

Unnamed: 0,words,F_Objectivity,F_Subjectivity,F_Context
0,abattoir,0.512527,0.380603,0.960466
1,abbey,0.714765,0.240456,0.696198
2,abbreviate,0.286952,0.171052,0.767043
3,abdicate,0.144736,0.3843,0.863127
4,abdication,0.167654,0.334086,0.896733


In [26]:
# ### Selecionando apenas as palavras no df_factors que atendam aos critérios:
# df_selected = df_factors.loc[((df_factors['F_Subjectivity'] > 0.75) | (df_factors['F_Subjectivity'] < 0.3)) & ((df_factors['F_Objectivity'] > 0.75) | (df_factors['F_Objectivity'] < 0.3))]


In [27]:
# df_factors = df_selected.copy()

In [28]:
SCA_words = [word for word in df_factors.words]

In [29]:
SCA_embedding_rows =  nlp.vocab.vectors.find(keys=SCA_words)

In [30]:
len(SCA_embedding_rows)

5553

> Separating the SCA-GlasgowNorms data into train and test:

In [31]:
from sklearn.model_selection import train_test_split

# Separar os dados em conjuntos de treino (70%) e teste (30%)
train_df, test_df = train_test_split(df_factors, test_size=0.2, random_state=42)

In [19]:
# Função para criar os conjuntos de treino e resposta
def create_data(dataframe):
    X = {}
    Y = {}
    
    for index, row in dataframe.iterrows():
        word = row['words']
        f_objectivity = row['F_Objectivity']
        f_subjectivity = row['F_Subjectivity']

        if word in nlp.vocab:
            indice = nlp.vocab.strings[word]
            vetor_embedding = word_embedding[index]
            X[word] = vetor_embedding
            Y[word] = {'F_Objectivity': f_objectivity, 'F_Subjectivity': f_subjectivity}

    return pd.DataFrame.from_dict(X, orient='index'), pd.DataFrame.from_dict(Y, orient='index')

In [45]:
# Função para criar os conjuntos de treino e resposta
def create_data_debug(dataframe):
    X = {}
    Y = {}
    
    for index, row in dataframe.iterrows():
        word = row['words']
        f_objectivity = row['F_Objectivity']
        f_subjectivity = row['F_Subjectivity']

        if word in nlp.vocab:
            idx = nlp.vocab.strings[word]
            vetor_embedding = nlp.vocab[idx].vector
            X[word] = vetor_embedding
            Y[word] = {'F_Objectivity': f_objectivity, 'F_Subjectivity': f_subjectivity}
            print(f'-- DEBUG:\nIndex: {index}\nIndice: {idx}\nWord: {word}\n\n')

    return pd.DataFrame.from_dict(X, orient='index'), pd.DataFrame.from_dict(Y, orient='index')

In [44]:
nlp.vocab.strings['pudding']

3403131918490088723

In [None]:
nlp.vocab[3403131918490088723].vector

In [46]:
## Debugging:
Xbug_test, Ybug_test = create_data_debug(test_df)

-- DEBUG:
Index: 3865
Indice: 3403131918490088723
Word: pudding


-- DEBUG:
Index: 2801
Indice: 720313458719916916
Word: letter


-- DEBUG:
Index: 230
Indice: 13605738838253397229
Word: aroused


-- DEBUG:
Index: 3361
Indice: 13398675276606405380
Word: only


-- DEBUG:
Index: 2924
Indice: 4855781600843238028
Word: lunching


-- DEBUG:
Index: 1964
Indice: 527094160297002804
Word: folk


-- DEBUG:
Index: 3575
Indice: 10686358792985228098
Word: persist


-- DEBUG:
Index: 3221
Indice: 14565421918427191397
Word: national


-- DEBUG:
Index: 2022
Indice: 11216980060904606566
Word: freeze


-- DEBUG:
Index: 5146
Indice: 5714931530947501195
Word: trim


-- DEBUG:
Index: 439
Indice: 922631143178789688
Word: betray


-- DEBUG:
Index: 2759
Indice: 12601672956933186127
Word: lawyer


-- DEBUG:
Index: 373
Indice: 3119909018940161727
Word: bass


-- DEBUG:
Index: 4882
Indice: 16190201684402242524
Word: symptom


-- DEBUG:
Index: 4035
Indice: 6003327276716044531
Word: relate


-- DEBUG:
Index: 576
Ind

In [48]:
# Creating train and test datasets:
X_train, Y_train = create_data_debug(train_df)
X_test, Y_test = create_data_debug(test_df)

# Exibir as dimensões dos conjuntos de treino e teste
print("Train data dimension:")
print("X_train:", X_train.shape)
print("Y_train:", Y_train.shape)

print("\nTest data dimension:")
print("X_test:", X_test.shape)
print("Y_test:", Y_test.shape)

-- DEBUG:
Index: 3565
Indice: 16752602526984977479
Word: periodical


-- DEBUG:
Index: 2572
Indice: 3166149224535108880
Word: intellect


-- DEBUG:
Index: 1253
Indice: 6807689115566132187
Word: culture


-- DEBUG:
Index: 1657
Indice: 4467559365180225526
Word: emancipation


-- DEBUG:
Index: 831
Indice: 17889400642917665921
Word: chasm


-- DEBUG:
Index: 2784
Indice: 17067105940613322182
Word: legend


-- DEBUG:
Index: 1335
Indice: 5441004699069252923
Word: decree


-- DEBUG:
Index: 2846
Indice: 2025327972831713556
Word: linear


-- DEBUG:
Index: 238
Indice: 14002191346934931615
Word: arse


-- DEBUG:
Index: 2617
Indice: 5481122766517169115
Word: jaguar


-- DEBUG:
Index: 3768
Indice: 18237595148359232214
Word: practice


-- DEBUG:
Index: 2505
Indice: 7936449425443571492
Word: impulse


-- DEBUG:
Index: 3240
Indice: 14014441641201808195
Word: neglect


-- DEBUG:
Index: 2974
Indice: 15691136892911553773
Word: mansion


-- DEBUG:
Index: 1001
Indice: 4303796670756400024
Word: combat


-- D

In [49]:
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299
periodical,-1.627200,-0.782380,1.666100,0.64415,5.802500,-3.86060,2.11450,2.246400,0.11389,-0.57345,5.73660,2.68970,-2.9695,2.05920,1.17530,2.126600,4.70350,2.73360,-3.58720,1.118700,-0.33484,-1.407100,-4.75610,2.23020,-2.05900,-0.144110,-1.90420,-3.21300,-0.34878,1.15970,-1.340600,0.47455,0.76463,0.62292,-2.395900,-0.086495,1.158500,-0.680910,1.14940,1.32050,1.89910,-0.035937,-0.36217,2.45830,1.10640,2.30780,-0.640620,0.28873,-1.60050,-2.39470,-2.21640,3.73500,-0.78451,-0.50572,-2.403900,1.987700,-2.20640,2.12540,2.011200,-3.42740,0.386360,-0.990080,-2.24970,1.62110,3.09440,0.64471,1.432100,-0.62388,0.867700,1.11960,-1.925100,-2.90010,-2.830300,-2.88330,-0.33352,3.17870,-1.47500,1.181100,-1.86000,0.123780,-1.649600,-0.14419,-0.56384,1.13990,2.22990,3.66950,-1.08090,-2.70110,2.361700,-2.088400,-0.71105,1.610800,3.72370,-1.2607,2.37400,-1.998000,1.720000,1.45120,-0.399680,4.17960,2.69910,2.52750,2.08680,1.219600,-1.02390,3.43800,3.123000,-3.86070,1.48550,-4.18480,3.79030,0.88335,-2.85300,-0.98828,1.94630,1.176900,-1.83060,2.754000,-1.781200,-1.47680,0.84724,-4.41080,1.74440,1.40660,1.85590,-4.2074,-0.42883,-2.28530,2.61080,-2.44520,-1.565800,0.60370,2.07170,2.74070,0.50984,0.82838,-1.914900,-1.22880,0.70129,-3.16500,-0.403730,-1.53140,-0.564260,1.63540,3.8474,-0.94437,-0.99321,-0.052637,0.75979,3.275000,2.39540,0.05155,0.777010,1.33140,-1.79120,0.54084,-0.39345,-0.945120,-1.19330,-1.52460,-1.97940,1.4443,1.13990,3.445800,-2.80270,-1.21350,-6.362900,1.643800,-0.24045,0.16754,1.418400,0.92152,2.77860,0.41264,2.33820,3.16470,1.26470,1.10720,-3.23810,-1.25910,-0.83185,-1.605900,0.57993,-2.11800,-0.54878,0.76078,-0.074174,0.30904,-0.28318,1.759700,-0.83301,-2.57780,-3.72420,0.58209,3.82490,1.47940,-2.24200,-2.071500,0.82211,3.48490,-1.61720,-4.01410,-0.16727,-2.02820,7.086500,0.84765,-3.49450,1.27650,2.29560,0.475230,2.31430,-0.93912,0.030626,-1.464700,-0.42383,1.79430,-0.78176,-3.80790,0.45426,0.584070,-0.151750,2.42480,-3.875900,-0.19335,-3.51550,-1.390700,-1.231900,0.48635,1.660200,0.84975,4.61750,-0.81367,0.47439,1.677400,1.73930,4.31650,0.14046,1.41820,-1.87130,-1.494800,-1.02730,-1.264000,2.511700,2.146300,-1.05200,2.575400,-2.890900,2.070600,-0.56698,2.65870,2.59990,-4.95480,-2.07780,-0.67968,-0.62967,-2.59250,-1.57080,-2.25200,-0.10327,1.25720,-2.79040,2.89630,3.159800,2.28880,3.2515,-0.41810,2.101200,2.73860,-0.14320,-0.355010,-1.738700,1.54270,1.733800,-2.47970,1.041200,1.845200,-0.012340,-1.63990,-1.290000,-0.055819,3.78240,0.870420,3.85640,0.43643,0.13927,-2.903700,0.46992,0.76273,0.524870,-2.51860,2.55550,-1.15170,-1.55030,-2.03400,-2.33070,-0.35606,0.241200,-5.179000,-3.23970,-1.40730
intellect,-0.154690,1.676800,-0.351630,-0.57609,2.088700,2.61040,0.66495,2.229900,-4.99240,-1.65200,4.49340,3.32340,-1.5071,2.74590,1.54030,1.224500,1.71150,0.29566,-1.08720,-0.314340,1.09350,0.442640,-4.21780,-2.21770,-0.33491,-2.498900,-2.99290,-0.89613,-1.53340,2.52890,0.994780,0.24772,-0.47069,-0.42523,-2.946600,1.334300,0.589560,1.654400,2.08020,1.19020,-2.59160,2.190500,-1.19950,1.60120,-1.75390,1.97450,0.721130,-1.73170,0.54115,1.38490,-2.34060,2.23350,0.62044,-3.26140,-0.643240,-0.215590,-0.46420,3.68690,1.071700,-0.75127,3.244400,-0.049207,-2.92030,-3.47130,1.82240,3.72050,-2.088000,-6.61280,0.493980,3.00960,-0.572920,1.05430,-0.859930,1.67740,-3.97830,0.76590,-4.66610,3.192000,-0.93724,1.523100,-5.450300,0.74619,-0.54236,0.17702,0.88874,1.17650,-3.30020,-3.28040,1.087400,-0.005989,1.20840,1.451000,2.50880,-4.6394,-1.26170,-2.229100,-1.274400,-1.04070,-0.515950,-1.29090,2.07050,1.01100,3.28590,2.592700,-0.65749,3.79270,1.998900,-3.40520,0.15880,-0.95836,2.60320,-1.63630,-5.72650,1.48430,-0.53107,1.280700,-2.64150,-0.031421,-1.829600,-1.29770,-3.53430,-3.55110,-0.33288,-1.62280,1.65600,-2.6977,3.36630,-3.93610,4.78580,-1.45580,-0.869150,0.36156,4.53630,0.78411,-1.70530,2.83580,-2.682600,1.97320,1.30520,-0.81652,-1.535800,-0.94404,-0.372750,1.95850,1.9799,-1.33740,-3.64900,1.803300,0.10371,1.118000,1.87990,2.81200,-0.332130,1.00180,-2.54640,5.11280,2.43620,0.839770,-2.14260,-2.03840,-0.78856,0.4313,-0.77638,1.155100,-0.42149,-2.81420,-3.375400,0.614590,0.58055,-3.21500,-0.130490,-1.48030,4.28630,0.20696,4.24150,-0.23174,1.07430,-0.61895,-1.75000,-2.75500,-0.95037,0.070558,2.13730,-1.89640,-0.16132,0.34528,2.171200,-0.13851,2.09230,2.389700,2.02180,-1.24390,1.62920,-2.75260,1.08400,-0.29743,-4.04890,1.055500,2.78150,1.25040,0.56411,-2.20210,-1.69230,-1.03570,0.658580,0.25058,-3.30520,0.70577,0.52892,-3.168700,-1.19750,-0.43625,-1.373400,2.668900,-1.11050,2.49660,-0.66450,-1.17620,0.27200,-0.732460,0.216170,3.67900,1.160800,1.96280,0.60815,3.470100,1.750700,1.96270,3.680300,-2.89880,1.73320,-3.51050,-1.72720,1.511200,1.07170,1.29860,1.79330,0.95399,0.73600,-0.330690,-0.92509,-1.287000,0.005891,0.672720,-1.70060,-0.046672,-1.993700,-1.189500,1.28810,-0.36031,-0.33003,-2.70100,-3.99520,-0.92792,-0.47960,-3.57270,2.68390,0.32132,0.07164,-1.88370,1.05480,2.58480,1.543700,2.13180,2.1394,2.03950,-0.766690,2.02490,-1.17640,-0.846090,2.358600,0.51567,-0.579650,0.51374,-0.070151,0.607870,2.861000,-1.52260,-2.962800,2.607100,0.41263,0.704090,2.59140,1.27620,2.01980,-2.431600,2.20100,0.79349,2.710500,1.50770,0.51742,-1.79910,-1.50010,-3.83950,1.14140,0.38565,1.783600,-0.127660,-5.47600,5.23490
culture,-0.025891,-2.009300,0.494080,-0.33639,5.294000,-0.38846,3.17880,2.941100,-2.58270,-1.90800,9.93210,2.57700,-2.7695,2.40870,-0.93724,3.569700,4.01520,1.24150,-6.63620,1.338100,-0.58018,1.412300,-5.63670,-2.22540,-1.49760,-0.790420,-4.58810,-2.72850,-3.01050,-2.88340,0.886020,-2.87880,-0.71606,-2.92560,-3.411100,1.245400,-0.974340,-0.065676,-2.58170,-4.85310,-1.81980,-0.065377,-1.06350,-1.21150,-2.37400,5.15970,-0.036392,0.75706,2.40150,2.61130,-4.64370,7.00890,1.27430,-0.34365,0.155760,-3.304200,-0.58156,-0.38113,2.718400,-1.26620,3.574200,-1.545200,4.28390,-5.63260,3.92600,5.60770,-5.617400,-3.16180,1.724900,3.25270,-2.147000,0.19087,0.129230,1.89770,-0.55995,0.87206,-7.33190,3.662600,-2.45240,-1.228900,-5.418100,4.58060,1.81350,-2.51080,4.35600,5.18560,-1.80650,-4.31080,1.419100,-2.903500,1.14010,0.420130,3.38390,-6.9062,-0.81909,0.021181,1.714300,-0.13658,0.056538,-1.29300,2.67110,1.10230,3.96860,1.357700,-2.47190,6.29400,4.162000,-2.92320,1.48740,-4.60240,0.45782,-0.88730,-3.65720,-1.78900,4.08540,3.396500,-5.64910,-4.797700,-2.016400,-4.27010,-0.56682,-2.66340,3.53200,-3.05910,-0.82543,-10.1040,2.17150,-4.79580,4.00200,-4.54470,-3.034200,0.90221,5.07690,0.36526,-3.50820,0.91573,-2.528100,2.55330,0.20130,0.25602,-0.099994,-2.35600,-1.326100,7.01660,6.2343,-1.75840,-7.96250,1.276100,-1.96440,1.754200,-0.67633,2.30410,0.216890,-1.94510,-0.22362,3.96070,4.08070,6.144700,-1.96020,-1.81160,-4.26070,2.6072,3.97100,2.424100,-1.12290,-7.11140,-2.139500,-2.172900,0.48087,-0.48392,0.543610,-1.75280,2.10890,3.60080,1.22520,0.60290,6.61360,2.73010,-4.03740,-3.27810,-4.63570,0.240240,0.90178,-9.09790,-3.67030,-1.59940,1.590200,0.42520,2.16350,-0.728710,-1.26980,-4.58810,3.81620,-0.48525,4.92570,-0.20912,-6.26470,2.050500,-1.15220,1.77700,0.49617,0.23389,-4.36010,0.17055,5.299000,-1.52680,-5.88510,-2.81700,0.14140,-4.961700,4.10160,-0.68934,-4.344400,-1.096100,3.00740,0.43708,-3.21700,-3.41920,3.10250,0.175950,-1.273400,7.31250,-1.175500,2.09870,-0.26209,2.058100,-0.098894,2.09790,3.488400,0.17538,5.83040,-0.15934,-0.19243,-0.833850,0.48663,2.24560,-0.14880,1.16130,-0.20422,-1.027500,-5.71580,1.804600,-1.066400,1.981100,-2.31230,-0.236490,-2.342300,1.143500,0.20301,1.61580,-2.51160,-2.39530,-3.57160,0.16571,3.17390,-2.61400,-0.41361,1.91390,0.10456,-3.97950,1.58700,4.07450,7.139200,0.87387,1.6085,2.17180,-2.344000,4.60760,-2.14800,2.672800,3.026700,1.00370,6.393100,-1.55260,2.381300,-0.519570,8.008300,-2.47130,-3.721400,-1.216200,2.73980,2.088700,3.85770,0.13715,0.28561,0.699800,3.24230,-1.49210,0.092843,0.15072,6.35320,-0.75585,-0.47891,-2.78880,-0.65771,-4.51980,1.242700,-1.250500,-7.25270,5.32900
emancipation,-3.593900,-0.710900,2.035000,2.18820,2.327500,0.66056,2.73950,0.424270,1.17990,-0.65433,5.16570,0.46039,-2.0579,0.70445,0.36773,2.295300,1.86580,4.62370,-3.41510,-2.199300,0.21437,0.053364,-1.46410,2.18700,1.33930,0.998360,-0.91223,0.33866,-1.50080,1.15420,0.700000,1.53670,-0.27007,-1.96330,-2.191300,0.186620,3.972300,-1.362400,-0.83770,-2.46570,-0.27012,-1.781800,-1.99920,2.76880,-2.40490,1.39090,-0.588020,0.60950,0.29728,-0.95512,-4.55100,3.12170,-1.19070,-2.68970,-0.022625,0.291730,-0.64829,0.52194,-1.159500,-1.82420,-1.181400,-2.711800,0.38731,-0.68361,2.79870,0.10960,-0.562670,-1.87880,1.958400,1.33660,-0.656860,-1.65140,-1.772600,-0.46708,-2.67600,1.39790,-0.79687,3.565100,-3.07810,1.277100,-0.216810,1.88260,-0.87186,0.86288,0.68938,1.99030,-4.22980,-3.47770,2.047200,-0.330290,1.26850,-0.219880,2.37120,-4.0231,-0.19215,1.861500,-2.830800,2.28260,0.746800,1.09940,3.59450,-0.14420,4.45280,2.605300,-3.56310,3.98320,2.740700,-1.42860,-0.26001,-2.60270,2.32010,0.27439,-3.68150,0.42722,2.06820,-0.227510,-0.31495,-1.633200,-1.586600,-1.54400,-0.85545,-4.61010,0.42561,-0.37434,-0.66904,-1.8518,2.21440,-2.17700,1.60470,-5.03290,-4.913600,1.70130,3.38390,1.19980,-0.85484,2.33130,0.046068,0.03594,0.42032,-0.17111,-2.165400,0.19060,1.327600,2.37620,1.2239,1.72810,-3.59750,-1.746600,2.26110,1.372400,1.00670,2.67910,0.012013,1.29500,-1.51490,3.01670,-0.70644,-1.282700,-1.90060,-2.85490,-1.73340,-1.1747,-0.32036,3.397900,0.25931,-1.40360,-4.786100,-0.578550,1.71910,-1.49570,0.026414,-0.63771,1.59070,-0.70014,3.84290,0.69901,2.15640,1.20940,-1.32800,-0.97633,-0.98791,-1.126700,0.58835,3.11560,-0.36938,1.35430,-3.623900,2.21050,3.76670,0.522340,-1.23490,-2.15050,2.11280,0.13715,2.06560,-0.49369,-3.92640,-0.538980,1.31620,0.48772,0.42021,-1.03570,-1.56240,-1.86160,1.368700,-1.20730,-0.39899,1.94360,-0.74404,-1.024600,-0.17769,-0.13308,-4.302600,1.671600,0.18445,1.96200,0.61542,-1.51150,3.16760,0.976680,-3.430400,0.99340,-0.239890,0.66352,-0.70664,-0.508860,0.099583,1.72000,-0.600780,-0.45151,4.19120,-2.18170,-0.48550,1.869300,3.08770,1.79100,0.49664,0.90111,-0.16035,-0.370410,1.75720,1.986600,1.179800,0.842560,-0.70874,-2.143500,-2.193500,1.925100,0.81953,0.66142,-0.26772,-3.31380,-3.40180,-1.10330,2.25250,-2.95100,1.93140,-2.60100,0.40183,0.43561,-1.75290,2.79980,4.230700,3.56460,3.1861,0.79113,0.160230,-0.56664,-0.67909,1.063200,0.058384,-0.87996,0.137170,-3.74850,2.179400,0.681880,-0.031669,-3.23550,-0.039727,0.842840,-0.10720,0.056777,1.46410,-1.07080,-0.07976,-1.468400,1.13720,1.50810,0.416920,-0.92111,0.46328,-0.22063,-1.37600,2.01930,1.47420,-0.93661,0.339360,-4.583200,-1.69060,-0.82634
chasm,2.750600,2.573200,1.016900,-2.14450,-1.060800,1.80230,3.07630,3.057700,-1.46700,1.42200,0.70305,-1.97780,2.9831,0.36193,2.52700,-1.948600,-1.25480,0.51131,1.62750,-2.613300,0.58493,-1.803500,2.46640,-5.31790,-0.50935,-1.294100,-0.65024,-2.92210,0.95135,4.97530,-1.826700,-0.55654,2.49200,-1.12020,0.339700,-0.561840,-2.837500,0.588370,0.74502,1.85070,2.81040,-0.549920,1.26800,-0.81937,-0.55677,1.66870,-2.035500,-0.91479,-0.19931,1.64840,-0.63491,3.40740,0.53539,2.66370,5.122700,-1.884100,3.20670,4.81970,1.614400,3.62620,0.987940,0.956470,-0.84016,-1.17320,0.72473,0.15449,-1.810300,1.01140,3.335300,-0.22606,0.044205,1.26410,0.230430,-0.16398,-0.12312,-0.57496,-0.14546,3.340400,0.46135,0.005658,-1.337800,-1.57200,4.16080,-1.26620,3.56330,0.32614,0.77551,1.39830,-0.948420,-2.047900,1.76320,-2.040900,0.54665,-1.1183,-1.02900,-5.396700,-0.072316,1.71340,2.097800,-3.86850,0.53684,4.64100,0.94240,-0.920300,1.20590,-4.50880,0.029027,-1.29970,-3.05030,0.70353,-2.42910,-0.22757,-2.12470,1.79200,0.10570,2.729100,0.92573,-0.235050,2.104500,2.24490,0.31430,0.20011,-2.77360,1.66740,-0.67716,-1.4961,-0.72889,0.38276,1.23860,2.67100,-0.966700,-0.72480,0.53563,1.73520,-0.99791,0.24672,0.154150,2.44150,-0.27611,1.39880,-2.366800,0.70963,-0.036663,2.61300,1.7017,-1.10840,-3.59690,0.287030,-2.23330,0.213010,0.76537,0.75961,-0.216280,0.61621,-0.74944,1.75730,0.94366,0.396800,0.97275,0.60177,1.63480,1.2012,-1.12590,0.657470,-2.53780,-1.38280,3.541600,-0.207880,-0.38460,0.73433,-0.814550,1.99140,3.02940,-2.15540,-0.23910,-0.93230,-1.48080,-2.41170,0.19378,-0.89883,2.00230,-0.683490,2.30590,6.06410,-2.53060,-2.66340,1.516300,3.23530,0.74947,-2.040800,1.72680,0.11092,1.62050,-1.62180,0.97280,-2.60500,0.41622,1.600300,-0.73288,2.87940,1.92910,1.80330,-0.90962,0.95697,-2.309700,0.88276,2.15630,-2.97900,-0.60915,2.719300,-0.53551,0.31742,-4.203100,0.739750,1.01990,1.12210,-2.10050,2.18220,-2.24410,-3.917300,0.051043,-1.53780,0.112810,3.63420,-1.22350,0.688780,2.968800,1.66070,-1.092700,1.52060,-1.12890,-2.34390,1.06860,-0.673900,-0.51437,-1.77700,-2.80930,-0.85966,1.03230,2.320200,2.12760,1.292600,1.912900,2.265500,-1.33760,-0.390130,-0.588680,0.028383,2.57520,-0.33339,1.31490,-0.30675,1.89310,1.11620,-0.92958,2.48760,-2.39580,3.20550,0.80498,-1.20660,2.27700,2.27880,2.767900,-0.77083,-1.9490,0.12276,-2.089900,-1.59570,-0.76543,2.218800,0.601270,-1.97700,-0.632140,0.56501,1.716500,0.083645,0.408870,-2.23580,-3.022200,-1.513700,-2.29610,0.275880,-0.32983,0.78036,0.24884,0.524830,-1.78930,1.46540,-2.705100,1.32230,-1.68800,0.71314,3.44560,-1.67040,2.73970,1.79250,-1.171300,0.825910,1.09540,-0.33145
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
prawn,-1.104300,-2.707300,-0.040561,0.26686,-0.001637,-2.23060,-0.22431,0.000931,-3.39360,0.79063,1.27040,-1.07850,4.0531,0.67150,2.11690,-2.671500,2.42790,1.05880,-0.45178,-1.957400,-1.84740,0.457250,-0.24633,-0.33081,0.80056,-2.540900,-1.50370,1.73170,0.41756,-1.06280,-0.759530,-1.01910,0.35531,1.69750,0.031765,-2.213700,-0.835700,-2.115100,-2.24870,-1.28940,-0.27724,-0.688070,-1.22270,1.25360,3.23680,-2.09900,1.358700,0.41670,-1.36030,0.40274,-2.15990,1.77110,3.16060,-1.10020,-0.047590,0.389680,-4.11870,-0.72922,3.945300,-1.10940,0.091196,-3.118800,0.60101,0.41856,-0.12499,-1.04300,-2.139500,0.79306,4.476200,-0.60113,-1.159700,-0.47491,0.876180,0.12360,0.55055,0.57404,-0.41902,-2.232700,1.61490,-0.670920,-0.231260,1.32260,1.16620,1.64360,0.77525,-1.45310,4.45290,-0.24775,-5.076800,-0.592100,4.22130,1.223100,0.06833,-1.3037,0.82144,-0.382020,-1.615600,-4.27030,1.169000,-2.67750,-0.76433,-2.87180,-1.15110,-2.639100,-2.32520,0.84838,-4.056700,2.26470,-0.10721,-0.93210,1.58070,2.37090,0.13026,1.07810,-1.45340,1.160300,1.76380,0.472660,-1.862400,1.36450,2.19590,1.58800,-0.91232,-0.22428,-2.60760,1.3206,0.60568,-3.32510,1.68120,0.49812,0.005416,-1.00250,4.08640,-0.59756,2.99110,-0.34123,-2.445600,-3.01660,-2.31230,-2.21710,-2.207500,-1.84500,-2.935300,-1.22150,-1.0298,0.11511,2.25820,1.921700,-2.38190,-2.914300,0.33902,2.02470,1.600300,0.22631,1.83640,-2.89190,4.53980,0.649300,-2.97270,-1.13010,0.66959,-3.0716,0.86142,-0.035147,0.85396,-0.45759,0.491200,1.853100,0.16183,1.77550,2.534600,-1.61130,-0.19102,-1.21700,-2.99390,1.33810,1.03430,-3.44960,0.19574,1.90480,-0.26760,1.720100,-0.94370,-0.77787,0.79945,-1.45430,0.876900,0.26753,-0.36387,-0.093211,-1.32240,1.48300,0.26998,2.50130,2.98360,0.20873,-1.59740,0.039919,-1.25460,1.11380,-0.29331,0.10032,1.44510,0.77492,-0.859970,-2.31500,1.13370,0.58997,1.57200,1.535600,1.09310,4.73890,1.367200,-1.365400,2.43540,-0.22668,0.32661,1.39960,-0.17115,-2.220800,1.620800,-0.04317,0.019486,1.16710,-2.54490,-3.576900,-0.755350,0.93861,0.745220,0.99256,0.45533,-1.65400,3.83300,0.874210,-2.10910,0.69480,0.13602,-0.25346,-2.25260,-0.461470,0.57045,-0.844940,-1.812400,3.517500,-2.49160,1.544200,-0.110820,-1.169600,0.75190,-1.04560,-0.60534,0.73169,-1.51630,0.36447,-0.43996,-2.98020,1.30050,5.71540,-0.43361,-0.10413,2.24360,-0.65180,2.253600,1.25310,-2.1737,2.05000,-0.966150,-1.37290,-2.06260,1.236500,1.385500,0.74181,-0.747560,0.53471,-0.478210,-1.642500,4.299100,1.95100,2.398200,0.755330,1.32540,-0.999460,-3.18630,-0.46555,-0.85569,1.996800,1.08150,-2.33210,-3.177300,1.32630,2.81550,-2.56200,1.02140,-2.52040,-1.98950,1.48670,1.859700,-0.720330,-1.17220,4.63340
tweezers,-1.051400,2.791100,1.661000,2.09520,-0.197260,-1.18820,-2.49820,0.762750,-3.10200,0.28440,1.25780,-0.25551,-2.4484,1.66040,2.60270,-0.786900,-0.27374,0.45025,-1.19510,-1.397300,1.83930,0.777190,-0.42865,-1.18360,-2.51810,1.663300,0.51111,-2.17110,1.66160,-1.41530,-0.719240,-1.32160,0.96204,0.57908,-1.569400,0.488770,-0.080231,1.210400,0.93008,0.63568,3.39440,-1.291600,-1.21000,0.93155,0.67805,-0.88822,-0.324650,-0.43579,1.67340,-1.13810,0.13689,-0.21652,1.59280,0.58053,-2.240600,0.736970,-1.40510,1.10800,1.239700,1.87910,1.446200,-0.969960,-0.41381,-0.75209,-1.38560,-1.91540,-1.743300,-0.84920,-1.219800,0.61595,0.246810,-0.16661,1.111400,0.73979,-0.87424,1.03820,-0.96422,0.099101,1.95790,-1.929300,0.243220,0.37107,0.29520,-1.26250,0.98840,-1.67420,2.04930,0.86087,0.289030,0.409750,-0.26276,2.195400,0.58195,-1.5595,-0.57985,0.362730,1.877900,0.66343,1.662200,-3.43360,-4.07940,-0.26883,1.29730,0.935120,1.80660,-0.40820,-3.347100,-0.75727,0.29333,-2.88540,1.21160,-1.92670,1.05800,-2.10070,1.49010,1.954800,-2.14730,-1.565200,1.276700,-0.54121,-2.84620,1.98850,-1.76640,-0.82500,0.44079,1.5465,-1.18060,1.10080,0.99759,0.66480,0.541960,0.27648,3.28520,-0.77516,0.79350,-0.89177,-1.323600,-2.61820,0.56128,2.99470,1.031700,-0.75328,-0.960240,-1.54360,1.3853,0.71152,1.96390,1.367000,-0.14363,0.307950,-0.76938,1.50040,-2.753400,0.12923,0.14465,-0.19616,3.39110,0.014322,-0.22051,-0.19223,-0.96584,1.2381,1.55070,1.055800,-0.51918,1.74750,0.449510,-0.075654,0.81403,1.85680,-0.574880,-0.42613,-0.30243,1.99100,1.73850,-0.92640,1.15100,0.15304,-1.40080,0.29378,-2.23010,-1.678300,0.16599,-2.13930,-0.18528,-0.28358,-2.073900,0.39420,2.32540,1.753600,-0.78371,2.47380,-0.17944,1.23790,-0.97951,-0.23136,1.25360,-1.462400,-0.74346,1.68660,-1.52400,-0.37083,-0.04727,1.84810,-3.210200,0.78113,-1.99610,0.21307,0.76154,-0.161610,0.36111,1.56480,2.045600,-0.579510,0.24752,-1.91470,1.79970,-1.48600,-0.64946,0.092398,0.033733,-1.01340,-0.005258,0.16369,0.20604,-0.475930,0.925110,1.09510,0.449390,-1.30950,-3.82180,-1.66720,2.09050,-2.444100,-1.02850,0.35641,0.33633,0.67849,1.06070,0.024615,-1.26350,-0.885400,0.511510,1.377200,-0.65389,-0.896320,-0.052985,1.622000,-1.51960,-0.14711,-0.25025,-0.30835,-0.37862,-1.04880,-2.88280,-1.68220,1.25040,-0.36352,-1.33040,-0.78464,0.64079,1.25230,-0.378400,0.07206,2.8684,-0.67813,-0.051556,0.66535,-1.88780,-0.096846,1.567400,-0.13019,0.812810,0.43700,-0.997170,0.205930,-2.795000,0.22152,1.164000,0.400630,-2.67430,-1.743600,-2.21190,-1.44910,0.48884,0.163980,0.42230,-0.10075,-2.088400,0.12695,0.86796,-0.44270,1.51110,0.88290,-2.94660,0.53499,0.968740,-0.909550,-0.77088,-1.42830
university,-1.409000,0.093909,0.172970,-0.25512,2.648800,2.68320,3.31220,3.572100,-0.04109,0.20994,6.32210,2.16870,-3.5058,1.47520,-4.75480,3.350800,-0.33362,1.60770,-3.55040,1.577800,2.69070,-1.828200,-1.44860,4.05240,1.51340,-0.738930,-2.27990,-0.59881,0.17003,0.86724,-1.115300,0.97831,-0.11701,-4.63420,-0.425280,2.721900,2.783400,-0.977430,-1.82040,1.00540,-2.38000,1.019100,-0.43979,1.32090,-0.55085,-0.71298,1.985400,-3.22160,0.32454,0.96552,-5.33280,2.39640,-2.49370,-1.27780,2.410100,0.017914,-1.67100,0.46102,1.483300,-1.13750,2.062600,2.129600,-1.38010,-0.36057,3.11800,1.25480,-1.632900,-0.88696,3.423400,3.12200,-2.172100,-0.22962,-0.053071,-2.72480,1.52750,-0.24999,-6.84280,2.715800,-2.68020,0.001401,-4.277600,1.41080,1.22410,-2.00500,2.74760,4.41640,-2.25800,-2.51410,3.461400,-4.544500,0.40168,-0.418510,2.38540,-2.1468,-0.55905,3.212200,-1.798900,1.05870,3.614100,1.81860,2.08940,0.98631,0.56596,0.070605,-4.00980,0.86601,2.498700,-3.34370,-1.88030,-2.28510,4.58290,0.34637,-0.75228,-0.15655,1.39970,-0.090387,-0.74818,0.579160,-3.092900,-3.87350,3.25160,-3.07060,-0.12363,-0.99010,-0.81342,-4.1108,0.80477,2.56820,2.48870,-4.93190,-1.997500,-1.44020,0.92411,4.37520,2.14460,-0.26910,-3.525400,1.33400,1.61900,0.13575,2.022400,-0.24849,2.476500,3.55800,2.6428,1.87540,-3.49840,-0.129610,1.12580,0.949480,3.34290,0.17419,0.255350,-0.85691,-2.23920,-1.23240,2.32680,3.497000,-2.71310,-2.84180,-3.79680,-4.4961,0.41794,2.915800,0.12558,-1.38420,-0.594160,0.940440,1.20770,-3.37770,1.535100,0.80680,-0.32871,3.07940,1.68720,3.19920,-0.38148,-3.12800,-5.55640,-4.20330,-4.83770,-0.460500,0.56142,1.18600,0.71141,0.21683,0.587450,-1.91980,3.42750,2.464100,0.54541,-4.45570,-0.35613,0.10274,1.62080,0.83618,-3.12180,0.039807,1.17790,3.81150,0.85812,1.15590,0.91935,-0.11443,5.923900,-2.99090,-0.12038,2.07070,0.78058,-3.073900,2.90390,-0.36320,0.031618,3.516100,-1.17930,0.40910,1.33100,-3.55970,1.24180,0.163920,-0.427040,0.54761,-1.957000,1.70520,-3.01380,-0.581950,0.931020,2.97690,-0.001848,2.54760,5.06570,-1.46890,-0.12631,2.299300,1.11340,-0.72946,2.33190,-2.60930,-6.44370,-0.425730,-3.91270,3.703600,1.511700,2.848100,-2.20460,1.375500,-1.203600,-0.865640,0.33426,1.15710,1.86250,-1.48870,-4.59200,-0.71449,-2.72820,-0.64239,1.12360,-2.95370,0.29098,-0.82635,-0.83189,2.93170,-0.025029,1.47910,3.3588,-0.72842,-2.406700,2.57350,-2.46410,-0.865740,-5.400700,4.47350,-0.045084,-2.22420,-0.505390,-3.249100,2.548300,-0.50388,-1.994300,2.232200,4.77230,1.011600,2.72900,-3.88070,1.42500,0.087203,6.92320,3.19050,-3.446400,-4.56790,3.26230,-1.17040,-1.25060,-0.83331,0.25252,0.19663,1.934500,-0.546330,0.59882,0.67005
wasteful,-1.105000,-0.335100,-1.827500,-3.49660,1.209300,1.20280,0.13930,-1.561000,-0.51685,-0.91023,-1.17200,1.41780,-1.3183,-0.41318,-1.32020,-0.065525,0.44967,-3.58220,-1.34340,-0.089962,0.45934,2.268900,1.43170,-0.39841,-0.62190,-0.160290,-0.58802,-0.35059,-1.65700,1.35900,0.054986,-1.76320,2.74640,-0.35222,-0.161130,-3.106100,0.872900,1.180900,1.79850,4.34930,-2.12080,-0.815080,1.12240,-2.91550,-3.86080,3.60060,1.589700,-4.24100,-1.52270,1.69500,2.84760,0.57630,0.05843,-4.25040,-1.141400,2.924100,-0.77971,1.24690,-0.054012,1.33850,1.601400,0.720690,-0.23659,-3.25930,-1.71380,1.42530,-0.036378,-1.99000,2.606700,0.12317,2.770300,0.81775,-0.476030,0.54408,1.46390,-0.85801,0.45131,1.308900,2.01630,3.325400,-0.062706,-0.18580,1.31710,-2.75250,0.98363,-2.13360,-1.08230,0.31999,1.393000,0.663660,-1.55320,-0.013998,-0.32254,-3.0208,1.49210,-1.863100,4.849900,2.53550,-1.579400,-0.76569,-2.18390,2.18990,0.78856,2.838100,-2.02300,2.54270,-2.298400,-1.74060,0.01312,-0.41795,0.28606,0.33239,1.26100,-1.43130,-0.13631,6.166700,-2.28340,-1.147200,-0.003134,-0.13650,-1.28380,-2.66780,-3.31710,0.85749,0.93906,-1.6213,0.11316,-0.38535,-0.89591,-2.41900,-2.960400,1.20210,0.24789,-0.94692,-1.11340,-0.33343,0.369070,-3.02930,1.39670,-1.62660,-0.837390,-1.51920,0.824220,0.98789,-3.1735,0.17848,-2.80420,-1.275600,-0.22956,0.300370,-3.66690,3.46120,0.836550,1.55180,-2.86200,0.43425,4.20240,-0.378450,-1.89990,-2.92280,3.79100,-1.0364,-0.44754,0.038782,-2.31370,0.72862,-1.389000,0.078435,1.02510,0.36570,2.302700,2.99070,2.15750,-3.05700,-0.20469,-1.55610,-0.48963,2.21570,-0.13542,-0.87399,-0.54787,1.536900,2.87570,-0.85629,-0.85608,1.57460,-1.748700,-0.83955,-0.22999,3.595800,-2.61620,3.69480,-4.11620,-2.31480,-3.19620,0.69048,-2.87030,-0.571620,-1.86030,-0.22516,-1.51820,1.70160,1.76680,0.34031,-0.063556,-0.67667,-1.28090,-1.95720,2.07360,-0.076503,-0.62076,1.30930,1.876000,-0.092753,-0.49408,0.67805,-1.62550,-0.37743,0.40012,0.701570,0.306580,0.90685,1.348900,-0.46080,-1.74560,-0.044984,2.033900,1.08190,-0.113160,-0.80531,1.58630,-0.86387,-0.91331,0.019052,2.12610,-0.14218,-0.69877,-0.79824,-0.34580,1.620800,-0.73612,-0.068276,-1.358700,-1.631200,-2.51900,-4.497000,-1.405400,-0.801590,-0.73764,2.13100,2.63070,0.35094,-1.48070,1.23020,2.16190,-4.10750,-0.69514,2.25390,-2.37720,1.02170,1.74560,2.06350,0.065202,-0.42910,1.7553,3.05110,-1.182600,0.90868,-3.67790,-0.381250,0.312420,-2.48710,-0.164910,2.47460,0.008135,2.393200,1.508000,-1.11210,0.031565,3.545900,-0.23326,-1.590300,-2.89540,-0.62763,1.56390,-1.607500,1.48040,0.54725,-1.614600,0.79069,1.19360,-0.92690,2.68170,-0.71914,-1.27960,0.42766,-1.082500,0.035378,-3.72800,1.17320


In [50]:
Y_train

Unnamed: 0,F_Objectivity,F_Subjectivity
periodical,0.316950,0.157740
intellect,0.166311,0.756693
culture,0.368721,0.672379
emancipation,0.126689,0.368884
chasm,0.450759,0.459234
...,...,...
prawn,0.952891,0.110865
tweezers,0.953419,0.193659
university,0.835052,0.695757
wasteful,0.200742,0.492842


> Comparing X_test with Xbug_test

In [51]:
X_test == Xbug_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299
pudding,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
letter,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
aroused,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
only,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
lunching,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
pass,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
stark,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
decay,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
deceit,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True


#### Binarizing Y_train and Y_test
Once we run the first MLP model, the performance wasn't over 54%. 
In this Section, we will binarize the semantic factor values following the median values.

In [23]:
# Binarizing through list comprehension
Y_train['F_Objectivity'] = ['high' if f_objectivity >= 0.565 else 'low' for f_objectivity in Y_train['F_Objectivity']]
Y_train['F_Subjectivity'] = ['high' if f_subjectivity >= 0.392 else 'low' for f_subjectivity in Y_train['F_Subjectivity']]

Y_test['F_Objectivity'] = ['high' if f_objectivity >= 0.565 else 'low' for f_objectivity in Y_test['F_Objectivity']]
Y_test['F_Subjectivity'] = ['high' if f_subjectivity >= 0.392 else 'low' for f_subjectivity in Y_test['F_Subjectivity']]

In [52]:
# Binarizing through list comprehension
Y_train['F_Objectivity'] = [1 if f_objectivity >= 0.565 else 0 for f_objectivity in Y_train['F_Objectivity']]
Y_train['F_Subjectivity'] = [1 if f_subjectivity >= 0.392 else 0 for f_subjectivity in Y_train['F_Subjectivity']]

Y_test['F_Objectivity'] = [1 if f_objectivity >= 0.565 else 0 for f_objectivity in Y_test['F_Objectivity']]
Y_test['F_Subjectivity'] = [1 if f_subjectivity >= 0.392 else 0 for f_subjectivity in Y_test['F_Subjectivity']]

In [53]:
Y_train

Unnamed: 0,F_Objectivity,F_Subjectivity
periodical,0,0
intellect,0,1
culture,0,1
emancipation,0,0
chasm,0,1
...,...,...
prawn,1,0
tweezers,1,0
university,1,1
wasteful,0,1


---
### Training a MLP Classifier for word semantic content

In [54]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MultiLabelBinarizer

In [55]:
# Define a new MLP architecture
model = Sequential([
    Dense(256, activation='relu', input_shape=(300,)),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(2, activation='softmax')  # 2 neurons for binary classification with softmax activation
])

# Compile the model
model.compile(optimizer='adam',
              loss='binary_crossentropy',  # Use categorical_crossentropy for categorical data
              metrics=['accuracy'])





In [56]:
# Print model summary
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 256)               77056     
                                                                 
 dense_1 (Dense)             (None, 128)               32896     
                                                                 
 dense_2 (Dense)             (None, 64)                8256      
                                                                 
 dense_3 (Dense)             (None, 32)                2080      
                                                                 
 dense_4 (Dense)             (None, 2)                 66        
                                                                 
Total params: 120354 (470.13 KB)
Trainable params: 120354 (470.13 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


> Como converti a saída em dados categóricos, é preciso antes utilizar OneHotEncoder:

In [57]:
## Como já foram transformados anteriormente em [0,1], preciso apenas torná-los como lista:
Y_train_array = Y_train.to_numpy()
Y_test_array = Y_test.to_numpy()

X_train_array = X_train.to_numpy()
X_test_array = X_test.to_numpy()

# Print the first few elements to verify
print(Y_train_array[:5])  # Print the first 5 elements


[[0 0]
 [0 1]
 [0 1]
 [0 0]
 [0 1]]


In [58]:
# Convert multilabel categorical labels to binary vectors
multi_label_binarizer = MultiLabelBinarizer()
Y_train_encoded = multi_label_binarizer.fit_transform(Y_train)
Y_test_encoded = multi_label_binarizer.transform(Y_test)

In [59]:
len(Y_train_encoded)

2

In [60]:
Y_train_encoded

array([[1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1],
       [1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])

In [61]:
Y_train

Unnamed: 0,F_Objectivity,F_Subjectivity
periodical,0,0
intellect,0,1
culture,0,1
emancipation,0,0
chasm,0,1
...,...,...
prawn,1,0
tweezers,1,0
university,1,1
wasteful,0,1


In [62]:
X_train_array

array([[-1.6272  , -0.78238 ,  1.6661  , ..., -5.179   , -3.2397  ,
        -1.4073  ],
       [-0.15469 ,  1.6768  , -0.35163 , ..., -0.12766 , -5.476   ,
         5.2349  ],
       [-0.025891, -2.0093  ,  0.49408 , ..., -1.2505  , -7.2527  ,
         5.329   ],
       ...,
       [-1.409   ,  0.093909,  0.17297 , ..., -0.54633 ,  0.59882 ,
         0.67005 ],
       [-1.105   , -0.3351  , -1.8275  , ...,  0.035378, -3.728   ,
         1.1732  ],
       [-0.92249 ,  2.2266  , -7.5058  , ...,  2.9505  , -5.3231  ,
         2.5729  ]], dtype=float32)

In [63]:
Y_train

Unnamed: 0,F_Objectivity,F_Subjectivity
periodical,0,0
intellect,0,1
culture,0,1
emancipation,0,0
chasm,0,1
...,...,...
prawn,1,0
tweezers,1,0
university,1,1
wasteful,0,1


In [64]:
# Train the model
history = model.fit(X_train_array, Y_train_array, epochs=50, batch_size=16, validation_split=0.2)

Epoch 1/50


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [65]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test_array, Y_test_array)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

Test Accuracy: 77.36%


In [66]:
# Treina o modelo
history = model.fit(X_train, Y_train,
                    epochs=50,
                    batch_size=32,
                    validation_split=0.1)  # Usamos parte dos dados de treino como validação

# Avalia o modelo com os dados de teste
loss, accuracy = model.evaluate(X_test.values, Y_test.values)
print(f"Acurácia do modelo nos dados de teste: {accuracy * 100:.2f}%")

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Acurácia do modelo nos dados de teste: 76.71%


### Utilizando XGBoosting for multilabel:

### Utilizando PyCaret:

In [67]:
import pycaret.classification
import pycaret.regression
import pycaret.clustering

> Adjustin Y_train for use with PyCaret:

In [84]:
df = X_train.copy()

In [81]:
# Definir uma função para mapear os valores das colunas para os rótulos desejados
def map_labels(row):
    if row['F_Objectivity'] == 0 and row['F_Subjectivity'] == 1:
        return 'Latent'
    elif row['F_Objectivity'] == 0 and row['F_Subjectivity'] == 0:
        return 'Contextual'
    elif row['F_Objectivity'] == 1 and row['F_Subjectivity'] == 0:
        return 'Manifest'
    elif row['F_Objectivity'] == 1 and row['F_Subjectivity'] == 1:
        return 'Perceptual'

In [86]:
# Aplicar a função de mapeamento para criar a nova coluna "Target"
df['target'] = Y_train.apply(map_labels, axis=1)

In [71]:
## Adding F_Subjectivity and F_Objectivity to df:
df['F_Subjectivity'] = Y_train['F_Subjectivity']
df['F_Objectivity'] = Y_train['F_Objectivity']

In [72]:
df['target'] = Y_train['F_Subjectivity']

In [87]:
# Exibir as primeiras linhas do DataFrame resultante para verificar
df.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,target
periodical,-1.6272,-0.78238,1.6661,0.64415,5.8025,-3.8606,2.1145,2.2464,0.11389,-0.57345,...,-1.1517,-1.5503,-2.034,-2.3307,-0.35606,0.2412,-5.179,-3.2397,-1.4073,Contextual
intellect,-0.15469,1.6768,-0.35163,-0.57609,2.0887,2.6104,0.66495,2.2299,-4.9924,-1.652,...,-1.7991,-1.5001,-3.8395,1.1414,0.38565,1.7836,-0.12766,-5.476,5.2349,Latent
culture,-0.025891,-2.0093,0.49408,-0.33639,5.294,-0.38846,3.1788,2.9411,-2.5827,-1.908,...,-0.75585,-0.47891,-2.7888,-0.65771,-4.5198,1.2427,-1.2505,-7.2527,5.329,Latent
emancipation,-3.5939,-0.7109,2.035,2.1882,2.3275,0.66056,2.7395,0.42427,1.1799,-0.65433,...,-0.22063,-1.376,2.0193,1.4742,-0.93661,0.33936,-4.5832,-1.6906,-0.82634,Contextual
chasm,2.7506,2.5732,1.0169,-2.1445,-1.0608,1.8023,3.0763,3.0577,-1.467,1.422,...,0.71314,3.4456,-1.6704,2.7397,1.7925,-1.1713,0.82591,1.0954,-0.33145,Latent
legend,1.8293,-1.7252,5.1394,-2.1605,-1.0839,0.60729,4.5257,1.5189,5.9501,-1.8531,...,-0.35684,2.8417,2.2139,-0.22289,-2.0552,-0.70992,3.7021,-2.1626,-0.92589,Latent
decree,-3.3282,-2.8391,-1.438,0.24277,2.3709,-2.005,0.58508,0.32332,3.6652,-2.5349,...,5.5971,-1.1944,1.4372,1.8781,2.1233,2.1412,-0.40861,1.2343,1.2787,Contextual
linear,-3.7085,4.7745,-2.1743,2.8154,3.549,-1.6867,2.0505,4.0183,-0.76764,1.3478,...,2.4437,4.66,-2.2629,1.1615,-1.2295,-0.43797,-1.6751,-3.7896,-1.7731,Contextual
arse,2.0757,-0.026898,-0.060543,2.459,-2.3987,1.2346,-3.5634,-0.69466,-0.87055,0.73186,...,0.49877,3.1281,-2.8107,-1.1068,2.3014,-0.60199,1.3736,0.26794,3.2829,Perceptual
jaguar,0.58436,2.3705,-1.796,-2.6764,-0.31655,0.94561,-0.086297,0.01113,2.467,1.8037,...,0.2704,3.2865,0.46756,1.3544,1.6206,-1.5607,-0.99583,-4.0979,-1.0307,Perceptual


In [74]:
## Descartando a última coluna:
df_last = df.iloc[:,:-1]
df_last.head(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299,target,F_Subjectivity
periodical,-1.6272,-0.78238,1.6661,0.64415,5.8025,-3.8606,2.1145,2.2464,0.11389,-0.57345,5.7366,2.6897,-2.9695,2.0592,1.1753,2.1266,4.7035,2.7336,-3.5872,1.1187,-0.33484,-1.4071,-4.7561,2.2302,-2.059,-0.14411,-1.9042,-3.213,-0.34878,1.1597,-1.3406,0.47455,0.76463,0.62292,-2.3959,-0.086495,1.1585,-0.68091,1.1494,1.3205,1.8991,-0.035937,-0.36217,2.4583,1.1064,2.3078,-0.64062,0.28873,-1.6005,-2.3947,-2.2164,3.735,-0.78451,-0.50572,-2.4039,1.9877,-2.2064,2.1254,2.0112,-3.4274,0.38636,-0.99008,-2.2497,1.6211,3.0944,0.64471,1.4321,-0.62388,0.8677,1.1196,-1.9251,-2.9001,-2.8303,-2.8833,-0.33352,3.1787,-1.475,1.1811,-1.86,0.12378,-1.6496,-0.14419,-0.56384,1.1399,2.2299,3.6695,-1.0809,-2.7011,2.3617,-2.0884,-0.71105,1.6108,3.7237,-1.2607,2.374,-1.998,1.72,1.4512,-0.39968,4.1796,2.6991,2.5275,2.0868,1.2196,-1.0239,3.438,3.123,-3.8607,1.4855,-4.1848,3.7903,0.88335,-2.853,-0.98828,1.9463,1.1769,-1.8306,2.754,-1.7812,-1.4768,0.84724,-4.4108,1.7444,1.4066,1.8559,-4.2074,-0.42883,-2.2853,2.6108,-2.4452,-1.5658,0.6037,2.0717,2.7407,0.50984,0.82838,-1.9149,-1.2288,0.70129,-3.165,-0.40373,-1.5314,-0.56426,1.6354,3.8474,-0.94437,-0.99321,-0.052637,0.75979,3.275,2.3954,0.05155,0.77701,1.3314,-1.7912,0.54084,-0.39345,-0.94512,-1.1933,-1.5246,-1.9794,1.4443,1.1399,3.4458,-2.8027,-1.2135,-6.3629,1.6438,-0.24045,0.16754,1.4184,0.92152,2.7786,0.41264,2.3382,3.1647,1.2647,1.1072,-3.2381,-1.2591,-0.83185,-1.6059,0.57993,-2.118,-0.54878,0.76078,-0.074174,0.30904,-0.28318,1.7597,-0.83301,-2.5778,-3.7242,0.58209,3.8249,1.4794,-2.242,-2.0715,0.82211,3.4849,-1.6172,-4.0141,-0.16727,-2.0282,7.0865,0.84765,-3.4945,1.2765,2.2956,0.47523,2.3143,-0.93912,0.030626,-1.4647,-0.42383,1.7943,-0.78176,-3.8079,0.45426,0.58407,-0.15175,2.4248,-3.8759,-0.19335,-3.5155,-1.3907,-1.2319,0.48635,1.6602,0.84975,4.6175,-0.81367,0.47439,1.6774,1.7393,4.3165,0.14046,1.4182,-1.8713,-1.4948,-1.0273,-1.264,2.5117,2.1463,-1.052,2.5754,-2.8909,2.0706,-0.56698,2.6587,2.5999,-4.9548,-2.0778,-0.67968,-0.62967,-2.5925,-1.5708,-2.252,-0.10327,1.2572,-2.7904,2.8963,3.1598,2.2888,3.2515,-0.4181,2.1012,2.7386,-0.1432,-0.35501,-1.7387,1.5427,1.7338,-2.4797,1.0412,1.8452,-0.01234,-1.6399,-1.29,-0.055819,3.7824,0.87042,3.8564,0.43643,0.13927,-2.9037,0.46992,0.76273,0.52487,-2.5186,2.5555,-1.1517,-1.5503,-2.034,-2.3307,-0.35606,0.2412,-5.179,-3.2397,-1.4073,0,0


In [75]:
df_scnd_last = df.iloc[:, :-2].join(df.iloc[:, -1])
df_scnd_last.head(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299,target,F_Objectivity
periodical,-1.6272,-0.78238,1.6661,0.64415,5.8025,-3.8606,2.1145,2.2464,0.11389,-0.57345,5.7366,2.6897,-2.9695,2.0592,1.1753,2.1266,4.7035,2.7336,-3.5872,1.1187,-0.33484,-1.4071,-4.7561,2.2302,-2.059,-0.14411,-1.9042,-3.213,-0.34878,1.1597,-1.3406,0.47455,0.76463,0.62292,-2.3959,-0.086495,1.1585,-0.68091,1.1494,1.3205,1.8991,-0.035937,-0.36217,2.4583,1.1064,2.3078,-0.64062,0.28873,-1.6005,-2.3947,-2.2164,3.735,-0.78451,-0.50572,-2.4039,1.9877,-2.2064,2.1254,2.0112,-3.4274,0.38636,-0.99008,-2.2497,1.6211,3.0944,0.64471,1.4321,-0.62388,0.8677,1.1196,-1.9251,-2.9001,-2.8303,-2.8833,-0.33352,3.1787,-1.475,1.1811,-1.86,0.12378,-1.6496,-0.14419,-0.56384,1.1399,2.2299,3.6695,-1.0809,-2.7011,2.3617,-2.0884,-0.71105,1.6108,3.7237,-1.2607,2.374,-1.998,1.72,1.4512,-0.39968,4.1796,2.6991,2.5275,2.0868,1.2196,-1.0239,3.438,3.123,-3.8607,1.4855,-4.1848,3.7903,0.88335,-2.853,-0.98828,1.9463,1.1769,-1.8306,2.754,-1.7812,-1.4768,0.84724,-4.4108,1.7444,1.4066,1.8559,-4.2074,-0.42883,-2.2853,2.6108,-2.4452,-1.5658,0.6037,2.0717,2.7407,0.50984,0.82838,-1.9149,-1.2288,0.70129,-3.165,-0.40373,-1.5314,-0.56426,1.6354,3.8474,-0.94437,-0.99321,-0.052637,0.75979,3.275,2.3954,0.05155,0.77701,1.3314,-1.7912,0.54084,-0.39345,-0.94512,-1.1933,-1.5246,-1.9794,1.4443,1.1399,3.4458,-2.8027,-1.2135,-6.3629,1.6438,-0.24045,0.16754,1.4184,0.92152,2.7786,0.41264,2.3382,3.1647,1.2647,1.1072,-3.2381,-1.2591,-0.83185,-1.6059,0.57993,-2.118,-0.54878,0.76078,-0.074174,0.30904,-0.28318,1.7597,-0.83301,-2.5778,-3.7242,0.58209,3.8249,1.4794,-2.242,-2.0715,0.82211,3.4849,-1.6172,-4.0141,-0.16727,-2.0282,7.0865,0.84765,-3.4945,1.2765,2.2956,0.47523,2.3143,-0.93912,0.030626,-1.4647,-0.42383,1.7943,-0.78176,-3.8079,0.45426,0.58407,-0.15175,2.4248,-3.8759,-0.19335,-3.5155,-1.3907,-1.2319,0.48635,1.6602,0.84975,4.6175,-0.81367,0.47439,1.6774,1.7393,4.3165,0.14046,1.4182,-1.8713,-1.4948,-1.0273,-1.264,2.5117,2.1463,-1.052,2.5754,-2.8909,2.0706,-0.56698,2.6587,2.5999,-4.9548,-2.0778,-0.67968,-0.62967,-2.5925,-1.5708,-2.252,-0.10327,1.2572,-2.7904,2.8963,3.1598,2.2888,3.2515,-0.4181,2.1012,2.7386,-0.1432,-0.35501,-1.7387,1.5427,1.7338,-2.4797,1.0412,1.8452,-0.01234,-1.6399,-1.29,-0.055819,3.7824,0.87042,3.8564,0.43643,0.13927,-2.9037,0.46992,0.76273,0.52487,-2.5186,2.5555,-1.1517,-1.5503,-2.034,-2.3307,-0.35606,0.2412,-5.179,-3.2397,-1.4073,0,0


### Pycaret Regression: 300 inputs and 1 numerical outputs

In [88]:
exp_regr = pycaret.regression.setup(df, target='target', session_id=123)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,target
2,Target type,Regression
3,Original data shape,"(3757, 301)"
4,Transformed data shape,"(3757, 301)"
5,Transformed train set shape,"(2629, 301)"
6,Transformed test set shape,"(1128, 301)"
7,Numeric features,300
8,Preprocess,True
9,Imputation type,simple


In [89]:
exp_regr.compare_models()

[]

In [100]:
exp_regr.compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lasso,Lasso Regression,0.1549,0.035,0.1869,-0.003,0.1311,0.5698,0.033
en,Elastic Net,0.1549,0.035,0.1869,-0.003,0.1311,0.5698,0.03
dummy,Dummy Regressor,0.1549,0.035,0.1869,-0.003,0.1311,0.5698,0.03
llar,Lasso Least Angle Regression,0.1549,0.035,0.1869,-0.003,0.1311,0.5698,0.028
br,Bayesian Ridge,0.1552,0.0351,0.1873,-0.0076,0.1314,0.5705,0.032
omp,Orthogonal Matching Pursuit,0.1552,0.0352,0.1875,-0.0094,0.1315,0.57,0.029
lr,Linear Regression,0.1562,0.0357,0.1887,-0.0228,0.1324,0.5732,0.031
lar,Least Angle Regression,0.156,0.0357,0.1887,-0.0233,0.1324,0.5724,0.029
ridge,Ridge Regression,0.1562,0.0357,0.1888,-0.0239,0.1325,0.5725,0.03
ada,AdaBoost Regressor,0.1579,0.036,0.1895,-0.0318,0.1339,0.6074,0.146


### Pycaret Clustering

In [None]:
exp_cluster = pycaret.clustering.setup()

### Pycaret Classification

In [90]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,target
periodical,-1.627200,-0.782380,1.666100,0.64415,5.802500,-3.86060,2.11450,2.246400,0.11389,-0.57345,...,-1.15170,-1.55030,-2.03400,-2.33070,-0.35606,0.241200,-5.179000,-3.23970,-1.40730,Contextual
intellect,-0.154690,1.676800,-0.351630,-0.57609,2.088700,2.61040,0.66495,2.229900,-4.99240,-1.65200,...,-1.79910,-1.50010,-3.83950,1.14140,0.38565,1.783600,-0.127660,-5.47600,5.23490,Latent
culture,-0.025891,-2.009300,0.494080,-0.33639,5.294000,-0.38846,3.17880,2.941100,-2.58270,-1.90800,...,-0.75585,-0.47891,-2.78880,-0.65771,-4.51980,1.242700,-1.250500,-7.25270,5.32900,Latent
emancipation,-3.593900,-0.710900,2.035000,2.18820,2.327500,0.66056,2.73950,0.424270,1.17990,-0.65433,...,-0.22063,-1.37600,2.01930,1.47420,-0.93661,0.339360,-4.583200,-1.69060,-0.82634,Contextual
chasm,2.750600,2.573200,1.016900,-2.14450,-1.060800,1.80230,3.07630,3.057700,-1.46700,1.42200,...,0.71314,3.44560,-1.67040,2.73970,1.79250,-1.171300,0.825910,1.09540,-0.33145,Latent
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
prawn,-1.104300,-2.707300,-0.040561,0.26686,-0.001637,-2.23060,-0.22431,0.000931,-3.39360,0.79063,...,-2.56200,1.02140,-2.52040,-1.98950,1.48670,1.859700,-0.720330,-1.17220,4.63340,Manifest
tweezers,-1.051400,2.791100,1.661000,2.09520,-0.197260,-1.18820,-2.49820,0.762750,-3.10200,0.28440,...,-0.44270,1.51110,0.88290,-2.94660,0.53499,0.968740,-0.909550,-0.77088,-1.42830,Manifest
university,-1.409000,0.093909,0.172970,-0.25512,2.648800,2.68320,3.31220,3.572100,-0.04109,0.20994,...,-1.17040,-1.25060,-0.83331,0.25252,0.19663,1.934500,-0.546330,0.59882,0.67005,Perceptual
wasteful,-1.105000,-0.335100,-1.827500,-3.49660,1.209300,1.20280,0.13930,-1.561000,-0.51685,-0.91023,...,-0.92690,2.68170,-0.71914,-1.27960,0.42766,-1.082500,0.035378,-3.72800,1.17320,Latent


In [91]:
## Fazendo um experimento:
exp_class = pycaret.classification.setup(df, target='target', session_id=9088)

Unnamed: 0,Description,Value
0,Session id,9088
1,Target,target
2,Target type,Multiclass
3,Target mapping,"Contextual: 0, Latent: 1, Manifest: 2, Perceptual: 3"
4,Original data shape,"(3757, 301)"
5,Transformed data shape,"(3757, 301)"
6,Transformed train set shape,"(2629, 301)"
7,Transformed test set shape,"(1128, 301)"
8,Numeric features,300
9,Preprocess,True


In [92]:
exp_class.compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.6736,0.8904,0.6736,0.6632,0.6546,0.5408,0.5483,2.671
lda,Linear Discriminant Analysis,0.6721,0.8801,0.6721,0.6697,0.6691,0.5483,0.5493,0.064
ridge,Ridge Classifier,0.6691,0.0,0.6691,0.6599,0.6558,0.5373,0.5421,0.026
gbc,Gradient Boosting Classifier,0.6596,0.8855,0.6596,0.6462,0.6442,0.5234,0.5286,15.672
lr,Logistic Regression,0.6489,0.858,0.6489,0.6482,0.6466,0.5174,0.5185,0.377
rf,Random Forest Classifier,0.6337,0.8569,0.6337,0.6165,0.5929,0.4764,0.4933,0.409
knn,K Neighbors Classifier,0.6329,0.8297,0.6329,0.6184,0.6046,0.48,0.4909,0.03
svm,SVM - Linear Kernel,0.6314,0.0,0.6314,0.6287,0.6243,0.4911,0.4941,0.059
et,Extra Trees Classifier,0.625,0.864,0.625,0.6174,0.5649,0.4577,0.4844,0.108
qda,Quadratic Discriminant Analysis,0.6185,0.7886,0.6185,0.6541,0.5176,0.441,0.4861,0.041


#### New Test

In [130]:
## Selecionar apenas os elementos do SCA que possuem fatores < 0.25 e > 0.75:
new_SCA_words = [word for word, subjectivity, objectivity in zip(df_factors['words'], df_factors['F_Subjectivity'], df_factors['F_Objectivity']) if (subjectivity > 0.75 or subjectivity < 0.25) and (objectivity > 0.75 or objectivity < 0.25)]


In [132]:
len(new_SCA_words)

851

In [133]:
new_SCA_embedding_rows =  nlp.vocab.vectors.find(keys=new_SCA_words)

In [134]:
len(new_SCA_embedding_rows)

851

> Separating the SCA-GlasgowNorms data into train and test:

In [None]:
# Separar os dados em conjuntos de treino (70%) e teste (30%)
train_df, test_df = train_test_split(df_factors, test_size=0.2, random_state=42)

In [None]:
# Função para criar os conjuntos de treino e resposta
def create_data(dataframe):
    X = {}
    Y = {}
    
    for index, row in dataframe.iterrows():
        word = row['words']
        f_objectivity = row['F_Objectivity']
        f_subjectivity = row['F_Subjectivity']

        if word in nlp.vocab:
            indice = nlp.vocab.strings[word]
            vetor_embedding = word_embedding[index]
            X[word] = vetor_embedding
            Y[word] = {'F_Objectivity': f_objectivity, 'F_Subjectivity': f_subjectivity}

    return pd.DataFrame.from_dict(X, orient='index'), pd.DataFrame.from_dict(Y, orient='index')

In [None]:
# Creating train and test datasets:
X_train, Y_train = create_data(train_df)
X_test, Y_test = create_data(test_df)

# Exibir as dimensões dos conjuntos de treino e teste
print("Train data dimension:")
print("X_train:", X_train.shape)
print("Y_train:", Y_train.shape)

print("\nTest data dimension:")
print("X_test:", X_test.shape)
print("Y_test:", Y_test.shape)

Train data dimension:
X_train: (3757, 300)
Y_train: (3757, 2)

Test data dimension:
X_test: (923, 300)
Y_test: (923, 2)
