In [1]:
import pandas as pd

# File name of each split inside the Hugging Face dataset repository.
splits = dict(train="train_df.csv", validation="val_df.csv", test="test_df.csv")

# Only the training split is read here, straight from the `hf://` dataset path.
df = pd.read_csv(
    filepath_or_buffer="hf://datasets/Sp1786/multiclass-sentiment-analysis-dataset/"
    + splits["train"]
)

In [2]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,id,text,label,sentiment
0,9536,"Cooking microwave pizzas, yummy",2,positive
1,6135,Any plans of allowing sub tasks to show up in ...,1,neutral
2,17697,"I love the humor, I just reworded it. Like sa...",2,positive
3,14182,naw idk what ur talkin about,1,neutral
4,17840,That sucks to hear. I hate days like that,0,negative


In [12]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used to vectorise the text column
encoder = SentenceTransformer("all-mpnet-base-v2")

# One embedding vector per row of `text`
embeddings = encoder.encode(df["text"].tolist())

# Store every vector as a plain Python list so it fits in a single DataFrame cell
df["embedding"] = embeddings.tolist()

In [13]:
# Preview the first five rows, now including the `embedding` column.
df.head(n=5)

Unnamed: 0,id,text,label,sentiment,embedding
0,9536,"Cooking microwave pizzas, yummy",2,positive,"[-0.0736706405878067, 0.01039914507418871, -0...."
1,6135,Any plans of allowing sub tasks to show up in ...,1,neutral,"[-0.03147735074162483, 0.002035659970715642, -..."
2,17697,"I love the humor, I just reworded it. Like sa...",2,positive,"[0.03868205472826958, 0.09765569865703583, -0...."
3,14182,naw idk what ur talkin about,1,neutral,"[-0.009805279783904552, -0.007695794105529785,..."
4,17840,That sucks to hear. I hate days like that,0,negative,"[-0.001938657253049314, 0.029627734795212746, ..."


In [14]:
# Carve the data into three parts: train, test and conformal-prediction sets
from sklearn.model_selection import train_test_split

# First hold out 30% of the rows, then cut that hold-out in half so the
# test and conformal-prediction sets each get roughly 15% of the data.
X_train, X_holdout = train_test_split(df, test_size=0.3, random_state=42)
X_test, X_cp = train_test_split(X_holdout, test_size=0.5, random_state=42)

In [15]:
# Pull the `label` column out of each split to serve as the target.
y_train = X_train["label"]
y_test = X_test["label"]
y_cp = X_cp["label"]

In [16]:
# Remove the target column from the three feature frames.
# Reassigning (rather than `inplace=True`) avoids mutating frames that were
# sliced out of `df`, and `errors="ignore"` lets this cell be re-run after the
# column is already gone instead of raising KeyError.
# NOTE(review): `sentiment` looks like the text form of `label` and is left in
# these frames — confirm it is excluded before they are used as model features.
X_train = X_train.drop(columns=["label"], errors="ignore")
X_test = X_test.drop(columns=["label"], errors="ignore")
X_cp = X_cp.drop(columns=["label"], errors="ignore")

In [18]:
# Spot-check one randomly chosen training row.
X_train.sample(n=1)

Unnamed: 0,id,text,sentiment,embedding
28526,6370,just because you`re a superstar it doesn`t mea...,neutral,"[-0.021009523421525955, 0.07494054734706879, 0..."


In [23]:
# Persist the frame as CSV without the index, terminating rows with "\n".
df.to_csv(
    path_or_buf="../../DATA/train_data.csv",
    lineterminator="\n",
    index=False,
)

In [3]:
# Reload the saved CSV using the same "\n" line terminator it was written with.
df = pd.read_csv(
    filepath_or_buffer="../../DATA/train_data.csv",
    lineterminator="\n",
)

In [4]:
# Check the column dtypes of the frame reloaded from CSV.
df.dtypes

id            int64
text         object
label         int64
sentiment    object
embedding    object
dtype: object

In [5]:
import ast

# After the CSV round-trip each embedding is the string form of a Python list;
# parse it back into a real list. Values that are not strings are passed
# through untouched, so re-running this cell does not raise ValueError from
# `ast.literal_eval` being handed an already-parsed list.
df["embedding"] = df["embedding"].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

In [6]:
# Look at the first row's embedding after parsing.
df["embedding"].iloc[0]

[-0.0736706405878067,
 0.01039914507418871,
 -0.010018889792263508,
 -0.030519705265760422,
 0.00807702261954546,
 0.044372837990522385,
 -0.0073929824866354465,
 0.06213642656803131,
 -0.03863219544291496,
 0.022956782951951027,
 -0.03392069786787033,
 0.022982768714427948,
 0.01052404660731554,
 0.02917753905057907,
 -0.00532034644857049,
 0.02931918203830719,
 -0.009797734208405018,
 -0.0029640402644872665,
 0.007386019453406334,
 0.03174268454313278,
 -0.04579973593354225,
 -0.0025213034823536873,
 0.007614090573042631,
 -0.029536966234445572,
 -0.03337959945201874,
 -0.017690274864435196,
 -0.007512160576879978,
 0.04746999591588974,
 0.004457591567188501,
 -0.028459645807743073,
 0.0006771224434487522,
 0.040869809687137604,
 -0.04382946342229843,
 -0.020092355087399483,
 1.264824163627054e-06,
 0.01142472680658102,
 0.011785716749727726,
 -0.008841365575790405,
 -0.02251448482275009,
 0.018754787743091583,
 0.05524468794465065,
 0.007697438821196556,
 -0.040401048958301544,
 0.0

In [33]:
# Save the frame as Parquet, leaving the index out of the file.
df.to_parquet(
    path="../../DATA/train_data.parquet",
    index=False,
)

In [7]:
# Reload the training frame from the Parquet file.
df = pd.read_parquet(
    path="../../DATA/train_data.parquet",
)

In [8]:
# Check the column dtypes of the frame reloaded from Parquet.
df.dtypes

id            int64
text         object
label         int64
sentiment    object
embedding    object
dtype: object

In [9]:
# Eyeball ten randomly chosen rows.
df.sample(n=10)

Unnamed: 0,id,text,label,sentiment,embedding
27061,21372,Is home from MARCO ISLAND and missing it and m...,0,negative,"[-0.05310976877808571, -0.017842255532741547, ..."
10727,10769,"_01 here is winter, but not snow",1,neutral,"[-0.04018426686525345, 0.022156665101647377, -..."
24721,15138,_Thompson u may need this pill...i think u are...,0,negative,"[-0.024111008271574974, 0.052062585949897766, ..."
4409,20395,join the club dougie i have a cold too x,0,negative,"[-0.02195223607122898, 0.02986782416701317, 0...."
6440,964,werd. that`s very true,1,neutral,"[0.00881123822182417, -0.00661575049161911, 0...."
6454,14262,Such a beautiful day...wish my bike hadn`t bee...,2,positive,"[-0.0574975311756134, 0.06314639002084732, -0...."
11208,657,Very helpful for me to list out my works,2,positive,"[-0.024653641507029533, -0.021444518119096756,..."
14401,187,Have to create an account just to check out fu...,0,negative,"[-0.042708463966846466, 0.03947445750236511, 0..."
17166,9665,So off work about to do what I don`t know I`m ...,0,negative,"[-0.04608186334371567, -0.04728476330637932, -..."
15681,4547,I paid for the premium app and still get mashe...,1,neutral,"[0.043469637632369995, 0.021658027544617653, -..."


In [12]:
# Look at the first row's embedding as loaded from Parquet.
df["embedding"].iloc[0]

array([-7.36706406e-02,  1.03991451e-02, -1.00188898e-02, -3.05197053e-02,
        8.07702262e-03,  4.43728380e-02, -7.39298249e-03,  6.21364266e-02,
       -3.86321954e-02,  2.29567830e-02, -3.39206979e-02,  2.29827687e-02,
        1.05240466e-02,  2.91775391e-02, -5.32034645e-03,  2.93191820e-02,
       -9.79773421e-03, -2.96404026e-03,  7.38601945e-03,  3.17426845e-02,
       -4.57997359e-02, -2.52130348e-03,  7.61409057e-03, -2.95369662e-02,
       -3.33795995e-02, -1.76902749e-02, -7.51216058e-03,  4.74699959e-02,
        4.45759157e-03, -2.84596458e-02,  6.77122443e-04,  4.08698097e-02,
       -4.38294634e-02, -2.00923551e-02,  1.26482416e-06,  1.14247268e-02,
        1.17857167e-02, -8.84136558e-03, -2.25144848e-02,  1.87547877e-02,
        5.52446879e-02,  7.69743882e-03, -4.04010490e-02,  5.35446405e-02,
        6.26359368e-03, -1.34290420e-02,  3.02479863e-02,  4.29128073e-02,
       -4.30533923e-02, -2.09250003e-02, -4.53251507e-03, -5.38195409e-02,
       -7.96645060e-02, -

In [14]:
from xgboost import XGBClassifier

# Classifier with the library's default hyperparameters, fitted on the
# per-row embedding vectors to predict the `label` column.
model = XGBClassifier()
model.fit(X=df["embedding"].tolist(), y=df["label"])

In [20]:
# Draw a 10% sample of the rows. The seed matches the `random_state=42` used
# for the train/test splits, so the sample (which is later written to disk)
# is the same on every run.
df_sample = df.sample(frac=0.1, random_state=42)

In [21]:
# Display the sampled frame.
df_sample

Unnamed: 0,id,text,label,sentiment,embedding
6172,8511,Easy and usefully and helpfully,2,positive,"[0.014361497946083546, 0.008050103671848774, -..."
12107,3255,Hairspray in hair + lighter&bong = new haircut,1,neutral,"[0.012690233998000622, 0.03290047124028206, -0..."
3153,9553,8 month old African grey parrot sad sale - Re...,0,negative,"[0.03990549221634865, 0.059928037226200104, -0..."
17573,8302,They follow my,1,neutral,"[0.016326026991009712, 0.037099115550518036, -..."
14119,9818,fellow twitterers. if anyone one has a leather...,1,neutral,"[0.04978827387094498, 0.03639218211174011, 0.0..."
...,...,...,...,...,...
6100,1070,"While everything else about this app is good, ...",0,negative,"[-0.01664506271481514, 0.0037492115516215563, ..."
12637,13967,Well this is just lovely. I am completely fla...,2,positive,"[-0.04822392016649246, 0.03628288209438324, -0..."
20234,4349,Why I gotta be all that?... I sad now,0,negative,"[0.07446438819169998, 0.01700625754892826, -0...."
26413,4586,Starting to get annoyed with socialscope. Need...,0,negative,"[0.020712219178676605, 0.06034257635474205, -8..."


In [22]:
# Write the sampled frame to its own Parquet file, leaving the index out.
df_sample.to_parquet(
    path="../../DATA/sample_data.parquet",
    index=False,
)