In [72]:
import pandas as pd 
import numpy as np

# Load the dataset; later cells rely on its `airline_sentiment` and `cleaned_text` columns.
df = pd.read_csv("../data/data.csv")

In [73]:
import nlpaug
import nlpaug.augmenter.word as naw

import nltk
# Fetch the NLTK resources requested by this notebook (quiet mode: no download log).
for nltk_resource in ("averaged_perceptron_tagger_eng", "wordnet", "omw-1.4"):
    nltk.download(nltk_resource, quiet=True)

# WordNet synonym augmenter, configured with aug_min=1 and aug_max=3.
aug_synonym = naw.SynonymAug(aug_src="wordnet", aug_min=1, aug_max=3)

# NOTE(review): despite its name this is also a SynonymAug (not an insertion
# augmenter), and it is not referenced by any other cell visible here.
aug_insert = naw.SynonymAug(aug_src="wordnet", aug_min=1, aug_max=2, aug_p=0.3)

In [74]:
# Null count of the text column. Its value is not displayed: a cell only
# renders its last expression.
df["cleaned_text"].isna().sum()
# Non-null count per column; a gap between the columns reveals missing values.
df.count()

airline_sentiment    14601
cleaned_text         14580
dtype: int64

In [75]:
# Drop every row that has a missing value in any column. `df` is rebound to the
# result instead of being mutated with `inplace=True`.
df = df.dropna()

# The null count used to be a bare expression whose value was never displayed
# (only a cell's last expression is rendered); make the check explicit instead.
assert df["cleaned_text"].isnull().sum() == 0, "cleaned_text still contains nulls"

# Non-null count per column; both columns should now report the same number.
df.count()


airline_sentiment    14580
cleaned_text         14580
dtype: int64

In [76]:
def safe_augment(text, augmenter):
    """Return an augmented variant of ``text``, falling back to ``text`` itself.

    Args:
        text: The input to augment (the notebook passes ``cleaned_text`` strings).
        augmenter: Any object exposing ``augment(text)``; the result may be a
            string or a list of strings (the first element is used).

    Returns:
        The augmented string when the augmenter produced a non-empty string
        that differs from ``text``; otherwise ``text`` unchanged.

    Warns:
        RuntimeWarning: when the augmenter raises. The failure is still treated
        as best-effort (the original text is returned), but it is no longer
        silent: a systematic failure would otherwise turn every "augmented"
        row into an exact copy of its source without any visible sign.
    """
    import warnings  # function-scope import: the cell needs no extra top-level import

    try:
        candidate = augmenter.augment(text)
        # A list result is reduced to its first element; an empty list means "no result".
        if isinstance(candidate, list):
            candidate = candidate[0] if candidate else None
        # Accept only a real, non-empty change.
        if isinstance(candidate, str) and candidate and candidate != text:
            return candidate
    except Exception as exc:
        # The message carries only the exception type, so repeated failures of
        # the same kind produce identical warnings rather than one per row text.
        warnings.warn(
            f"safe_augment: augmenter raised {type(exc).__name__}; keeping the original text",
            RuntimeWarning,
        )
    return text

banner = "=" * 60
print(banner)
print("AUGMENTING MINORITY CLASSES")
print(banner)

# Partition the frame by sentiment label.
sentiment = df["airline_sentiment"]
negative_df = df[sentiment == "negative"]
neutral_df = df[sentiment == "neutral"]
positive_df = df[sentiment == "positive"]

# The size of the negative class is the target; the other two classes are
# topped up by the difference.
max_count = len(negative_df)
neutral_needed = max_count - len(neutral_df)
positive_needed = max_count - len(positive_df)

print(f"Targets: negative={max_count}, add neutral={neutral_needed}, add positive={positive_needed}")

def build_augmented(class_df, needed, augmenter, seed):
    """Create ``needed`` extra rows for one class by resampling and augmenting.

    Rows are drawn from ``class_df`` with replacement (``random_state=seed``)
    and each ``cleaned_text`` is passed through ``safe_augment``. Because
    ``safe_augment`` falls back to the original text, some returned rows may be
    exact copies of their source row.

    Args:
        class_df: Rows of a single class; must provide the
            ``airline_sentiment`` and ``cleaned_text`` columns.
        needed: Number of rows to produce; zero or negative yields an empty frame.
        augmenter: Augmenter object forwarded to ``safe_augment``.
        seed: Random state for the row sampling.
            NOTE(review): this only fixes which rows are drawn; whether the
            augmenter's own randomness is seeded cannot be told from here.

    Returns:
        A DataFrame with columns ``airline_sentiment`` and ``cleaned_text`` and
        a fresh 0..n-1 index.
    """
    output_columns = ["airline_sentiment", "cleaned_text"]

    # Nothing to add for this class: hand back an empty frame with the same columns.
    if needed <= 0:
        return pd.DataFrame(columns=output_columns)

    resampled = class_df.sample(n=needed, replace=True, random_state=seed).copy()
    resampled["cleaned_text"] = resampled["cleaned_text"].apply(safe_augment, args=(augmenter,))
    return resampled[output_columns].reset_index(drop=True)

# Build the missing rows for each minority class; each class gets its own seed.
augmented_parts = [
    build_augmented(class_df, needed, aug_synonym, seed=class_seed)
    for class_df, needed, class_seed in (
        (neutral_df, neutral_needed, 42),
        (positive_df, positive_needed, 43),
    )
]
neutral_aug, positive_aug = augmented_parts

# Stack both sets of new rows under a single fresh index.
augmented_needed = pd.concat(augmented_parts, ignore_index=True)

print("Augmented added:")
print(augmented_needed["airline_sentiment"].value_counts())

AUGMENTING MINORITY CLASSES
Targets: negative=9151, add neutral=6071, add positive=6802
Augmented added:
airline_sentiment
positive    6802
neutral     6071
Name: count, dtype: int64


In [77]:
# Preview the rows produced by the augmentation step (pandas renders a
# truncated head/tail view for a frame of this size).
augmented_needed

Unnamed: 0,airline_sentiment,cleaned_text
0,neutral,daytime travel
1,neutral,moldiness 445pm nonstop bosbna flight stop fee...
2,neutral,flight 4315 n231wn hack prior flight of stairs
3,neutral,cancelled flightled flights 525 pm today due w...
4,neutral,so
...,...,...
12868,positive,thank
12869,positive,awe inspiring thank
12870,positive,genus bos everything current thanks be
12871,positive,flight 2954 dallas grand junction


In [78]:
# Preview the original (non-augmented) rows for comparison with the augmented ones.
df

Unnamed: 0,airline_sentiment,cleaned_text
0,neutral,said
1,positive,plus youve added commercials experience tacky
2,neutral,didnt today must mean need take another trip
3,negative,really aggressive blast obnoxious entertainmen...
4,negative,really big bad thing
...,...,...
14596,positive,thank got different flight chicago
14597,negative,leaving 20 minutes late flight warnings commun...
14598,neutral,please bring american airlines
14599,negative,money change flight dont answer phones suggest...


In [79]:
# Rows per sentiment class in the original data (before adding augmented rows).
df.groupby("airline_sentiment")["airline_sentiment"].count()

airline_sentiment
negative    9151
neutral     3080
positive    2349
Name: airline_sentiment, dtype: int64

In [80]:
# Rows per sentiment class among the augmented rows only; each should equal
# the corresponding "needed" figure computed earlier.
augmented_needed.groupby("airline_sentiment")["airline_sentiment"].count()

airline_sentiment
neutral     6071
positive    6802
Name: airline_sentiment, dtype: int64

In [81]:
# Concatenate the original rows with the augmented rows to obtain the balanced
# dataset, then shuffle all rows with a fixed seed and renumber the index.
df_aug = (
    pd.concat([df, augmented_needed], axis=0)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [82]:
# Number of rows that exactly repeat an earlier row across all columns.
# Augmentation falls back to the unchanged text when it fails or changes
# nothing, and rows are sampled with replacement, so repeats are possible.
df_aug.duplicated().sum()

np.int64(2880)

In [83]:
# Rows per sentiment class after augmentation; the three classes should now be equal.
df_aug.groupby("airline_sentiment")["airline_sentiment"].count()


airline_sentiment
negative    9151
neutral     9151
positive    9151
Name: airline_sentiment, dtype: int64

<h1 style="color:orange;">Generate embeddings</h1>



In [None]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used below to encode the `cleaned_text` column.
EMBEDDING_MODEL_NAME = "all-MiniLM-L12-v2"
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

In [None]:
# model2 = SentenceTransformer("all-mpnet-base-v2") 

# cardiffnlp/twitter-roberta-base-sentiment: gave weak results


In [93]:
# Non-null count per column of the balanced frame.
# NOTE(review): the recorded output already lists an `embedding` column, yet
# that column is only assigned in a later cell. This looks like a re-run on a
# kernel that still held the column; after Restart & Run All expect only
# `airline_sentiment` and `cleaned_text` here.
df_aug.count()

airline_sentiment    27453
cleaned_text         27453
embedding            27453
dtype: int64

In [94]:
# Encode every cleaned text with the sentence-embedding model, requesting
# normalized vectors and a progress bar.
texts_to_embed = df_aug["cleaned_text"].tolist()
embedding = model.encode(texts_to_embed, normalize_embeddings=True, show_progress_bar=True)

Batches:   0%|          | 0/858 [00:00<?, ?it/s]

In [95]:
# Inspect the embedding matrix (one row per text; the recorded run shows
# shape (27453, 384) with dtype float32).
embedding

array([[ 0.013154  ,  0.00518841, -0.08946048, ...,  0.0473549 ,
        -0.06022904, -0.03324136],
       [-0.03579652,  0.04009912,  0.02993601, ...,  0.01039924,
        -0.03022531,  0.03018399],
       [ 0.02799473,  0.02059757, -0.02282682, ...,  0.10877176,
        -0.01468947, -0.06644722],
       ...,
       [-0.05137814,  0.06362412, -0.01940895, ..., -0.04517402,
        -0.03786246,  0.00171394],
       [-0.04977728, -0.04378831,  0.04175003, ..., -0.03448128,
         0.03421117, -0.02427115],
       [ 0.03904052, -0.00398473,  0.02052151, ...,  0.00371986,
        -0.10326481,  0.03546768]], shape=(27453, 384), dtype=float32)

In [96]:
# Turn the embedding matrix into plain nested Python lists (one list per row)
# so the column holds standard lists instead of numpy arrays, for ChromaDB
# compatibility.
embedding_rows = embedding.tolist()
df_aug["embedding"] = embedding_rows


In [97]:
# Persist the balanced frame, including the `embedding` column, as a pickle.
# NOTE: only unpickle this file from a trusted location; loading a pickle can
# execute arbitrary code.
df_aug.to_pickle("../data/embedded_data.pkl")