In [1]:
import pandas as pd 
import numpy as np

df = pd.read_csv("../data/data.csv")

In [2]:
import nlpaug
import nlpaug.augmenter.word as naw

import nltk
nltk.download('averaged_perceptron_tagger_eng', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)
aug_synonym = naw.SynonymAug(aug_src='wordnet', aug_min=1, aug_max=3)

aug_insert = naw.SynonymAug(aug_src='wordnet', aug_min=1, aug_max=2, aug_p=0.3)

In [20]:
def augment_text(text, augmenter, num_augmentations=1):
    augmented_texts = []

    for _ in range(num_augmentations):
        try:
            aug = augmenter.augment(text)

            # If augmenter returns a list, take first element
            if isinstance(aug, list):
                aug = aug[0] if len(aug) > 0 else None

            if aug and isinstance(aug, str) and aug != text:
                augmented_texts.append(aug)
                print(augmented_texts)
        except Exception:
            continue

    return augmented_texts


print("=" * 60)
print("AUGMENTING MINORITY CLASSES")
print("=" * 60)

negative_df = df[df['airline_sentiment'] == 'negative']
neutral_df = df[df['airline_sentiment'] == 'neutral']
positive_df = df[df['airline_sentiment'] == 'positive']

max_count = len(negative_df)

neutral_needed = max_count - len(neutral_df)
positive_needed = max_count - len(positive_df)

print(f"\nAugmentation targets:")
print(f"  ‚Ä¢ Neutral class:  {len(neutral_df):,} ‚Üí {max_count:,} (need {neutral_needed:,} more)")
print(f"  ‚Ä¢ Positive class: {len(positive_df):,} ‚Üí {max_count:,} (need {positive_needed:,} more)")

augmented_data = []

print(f"\n‚è≥ Augmenting neutral class...")
neutral_samples = neutral_df.sample(n=neutral_needed, replace=True, random_state=42)
for idx, row in enumerate(neutral_samples.itertuples(), 1):
    if idx % 500 == 0:
        print(f"   Processed {idx}/{neutral_needed}...")
    
    augmenter = aug_synonym 
    aug_texts = augment_text(row.text, augmenter, num_augmentations=1)
    
    if aug_texts:
        augmented_data.append({
            'airline_sentiment': row.airline_sentiment,
            'text': f"{row.text}"
        })

print(f"‚úì Neutral class augmented: {len([d for d in augmented_data if d['airline_sentiment'] == 'neutral'])} new samples")

print(f"\n‚è≥ Augmenting positive class...")
positive_samples = positive_df.sample(n=positive_needed, replace=True, random_state=42)
for idx, row in enumerate(positive_samples.itertuples(), 1):
    if idx % 500 == 0:
        print(f"   Processed {idx}/{positive_needed}...")
    
    augmenter = aug_synonym 
    aug_texts = augment_text(row.text, augmenter, num_augmentations=1)
    
    if aug_texts:
        augmented_data.append({
            'airline_sentiment': row.airline_sentiment,
            'text': f"{row.text}"
        })

print(f"‚úì Positive class augmented: {len([d for d in augmented_data if d['airline_sentiment'] == 'positive'])} new samples")

augmented_needed = pd.DataFrame(augmented_data)



AUGMENTING MINORITY CLASSES

Augmentation targets:
  ‚Ä¢ Neutral class:  3,091 ‚Üí 9,157 (need 6,066 more)
  ‚Ä¢ Positive class: 2,353 ‚Üí 9,157 (need 6,804 more)

‚è≥ Augmenting neutral class...
["@ united + UA has been rolling out improvements too, we think. Hoping they ' ll catch up soon? Meantime, ANA represent our druthers!"]
["@ SouthwestAir exercise you have a pair of ticket to the @ Imaginedragons show in Atlanta? ! I ' d making love to go! # DestinationDragons"]
["@ SouthwestAir Plz consider customers you ' re losing to # AA for üö´ the KC ‚Üî Ô∏è OKC calculate. # okcdirects # OKC # MCI # flights # okcprofessionals"]
['@ SouthwestAir hey southwest! Can I realise @ Imaginedragons in Atlanta? My supporter hold never been there, loves them, and he is from Taiwan!']
['@ JetBlue How can I check that? I do non believe I was given a credit for that automatically. : /']
['@ JetBlue Anywhere warm cause its freeze in NYC']
["@ united Can you help me get a flight out tonight to Houston?

In [21]:
augmented_needed

Unnamed: 0,airline_sentiment,text
0,neutral,@united + UA has been rolling out improvements...
1,neutral,@SouthwestAir do you have a pair of tickets to...
2,neutral,@SouthwestAir Plz consider customers you're lo...
3,neutral,@SouthwestAir hey southwest! Can I see @Imagin...
4,neutral,@JetBlue How can I check that? I do not believ...
...,...,...
12865,positive,@SouthwestAir LUV Ya Too!!!! I will sing a so...
12866,positive,"@VirginAmerica has getaway deals through May, ..."
12867,positive,@SouthwestAir already booked my tickets for Au...
12868,positive,@SouthwestAir great cabin and flight crew this...


In [22]:
df = df[["airline_sentiment","text"]]

df.groupby("airline_sentiment")["airline_sentiment"].count()

airline_sentiment
negative    9157
neutral     3091
positive    2353
Name: airline_sentiment, dtype: int64

In [23]:
augmented_needed.groupby("airline_sentiment")["airline_sentiment"].count()

airline_sentiment
neutral     6066
positive    6804
Name: airline_sentiment, dtype: int64

In [24]:
# concate the df with augmented_needed to make new balanced data 
df_aug = pd.concat([df,augmented_needed], axis=0)

In [25]:
df_aug.groupby("airline_sentiment")["airline_sentiment"].count()


airline_sentiment
negative    9157
neutral     9157
positive    9157
Name: airline_sentiment, dtype: int64

In [26]:
from nltk.corpus import stopwords
import nltk

# T√©l√©chargez les ressources NLTK n√©cessaires si ce n'est pas d√©j√† fait
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ycode\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ycode\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ycode\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [27]:

df_aug["text_normalized"] = (
df_aug["text"]
.str.lower()                                   # minuscules
.str.replace(r'http\S+|www\S+', '', regex=True) # suppression des URLs
.str.replace(r'@\w+', '', regex=True)           # suppression des mentions
.str.replace(r'#\w+', '', regex=True)           # suppression des hashtags
.str.replace(r'[^a-z0-9\s]', '', regex=True)    # ponctuation & caract√®res sp√©ciaux
.str.replace(r'\s+', ' ', regex=True)           # espaces multiples ‚Üí 1 espace
.str.strip()                                    # suppression espaces d√©but/fin
.str.strip()
)

print(df_aug["text_normalized"])


0                                                what said
1        plus youve added commercials to the experience...
2        i didnt today must mean i need to take another...
3        its really aggressive to blast obnoxious enter...
4                  and its a really big bad thing about it
                               ...                        
12865    luv ya too i will sing a song for yall when i ...
12866    has getaway deals through may from 59 oneway l...
12867    already booked my tickets for august 20th30th ...
12868    great cabin and flight crew this morning on a ...
12869    thank you for your help adam and to the awesom...
Name: text_normalized, Length: 27471, dtype: object


In [28]:
stop_words = set(stopwords.words("english"))

def stop_words_remover(text): 
    tokens = text.split()
    tokens_filter = [word for word in tokens if word not in stop_words]
    
    return ' '.join(tokens_filter)

In [29]:
df_aug["text_normalized"] = df_aug["text_normalized"].apply(lambda x: stop_words_remover(x))

<h1 style="color:orange;"> Generate embedding ! </h1>



In [30]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L12-v2")

In [31]:
# better embedding model (but it's more heavy) 
# model2 = SentenceTransformer("all-mpnet-base-v2")


In [32]:
df_aug

Unnamed: 0,airline_sentiment,text,text_normalized
0,neutral,@VirginAmerica What @dhepburn said.,said
1,positive,@VirginAmerica plus you've added commercials t...,plus youve added commercials experience tacky
2,neutral,@VirginAmerica I didn't today... Must mean I n...,didnt today must mean need take another trip
3,negative,@VirginAmerica it's really aggressive to blast...,really aggressive blast obnoxious entertainmen...
4,negative,@VirginAmerica and it's a really big bad thing...,really big bad thing
...,...,...,...
12865,positive,@SouthwestAir LUV Ya Too!!!! I will sing a so...,luv ya sing song yall finally get plane back n...
12866,positive,"@VirginAmerica has getaway deals through May, ...",getaway deals may 59 oneway lots cool cities
12867,positive,@SouthwestAir already booked my tickets for Au...,already booked tickets august 20th30th cant wa...
12868,positive,@SouthwestAir great cabin and flight crew this...,great cabin flight crew morning great smile ha...


In [45]:
embedding = model.encode(
    df_aug["text"].tolist(),
    normalize_embeddings=True,
    show_progress_bar=True
)

Batches:   0%|          | 0/859 [00:00<?, ?it/s]

In [46]:
embedding

array([[ 0.0030891 ,  0.08647377, -0.0453333 , ..., -0.06762812,
        -0.00764808, -0.02906491],
       [-0.01704391,  0.0413912 ,  0.05309898, ..., -0.07246983,
         0.00061238, -0.03143211],
       [ 0.01765142, -0.00182621,  0.05536732, ..., -0.08177378,
        -0.11430358, -0.04604586],
       ...,
       [ 0.04800796, -0.0421735 ,  0.03474437, ...,  0.00460419,
        -0.06315816, -0.03838864],
       [ 0.02445298, -0.01317603,  0.08007919, ...,  0.05164472,
        -0.07421819, -0.04779695],
       [ 0.02990052, -0.05761112,  0.0138815 , ..., -0.02579542,
        -0.05957169, -0.06932101]], shape=(27471, 384), dtype=float32)

In [47]:
# Convert numpy arrays to standard lists for ChromaDB compatibility

df_aug["embedding"] = embedding.tolist()


In [48]:
df_aug.to_pickle("../data/embedded_data.pkl")