#### Import libraries

In [1]:
import pandas as pd

#### Merge and Handle 2 file data
- Drop child_alone column because it's all 0 values
- Remove duplicated values

In [7]:
disaster_categories_path = '../origin_data/disaster_categories.csv'
disaster_messages_path = '../origin_data/disaster_messages.csv'

In [8]:
disaster_categories = pd.read_csv(disaster_categories_path)
disaster_messages = pd.read_csv(disaster_messages_path)

In [9]:
df = pd.merge(disaster_messages, disaster_categories, on='id', how='inner')

In [10]:
categories = df['categories'].str.split(pat=';', expand=True)

In [11]:
row = categories.iloc[0]
category_colnames = row.apply(lambda x: x[:-2])

In [12]:
categories.columns = category_colnames

In [13]:
for column in categories.columns:
  categories[column] = categories[column].astype(str).str[-1]

  categories[column] = categories[column].astype(int)

In [14]:
df.drop('categories', axis=1, inplace=True)

In [15]:
df = pd.concat([df, categories], axis=1)

In [16]:
df.drop_duplicates(inplace=True)

In [17]:
df.drop('child_alone', axis=1, inplace=True)

#### Data Augmentation
- Use textattack library
- Generate into 6 small augment data file them combite to 1 file

In [17]:
from textattack.augmentation import WordNetAugmenter, EmbeddingAugmenter, EasyDataAugmenter

In [23]:
# Create Augmenters
wordnet_augmenter = WordNetAugmenter()
embedding_augmenter = EmbeddingAugmenter()
easy_augmenter = EasyDataAugmenter()

# Augment each sentence
augmentation_count_per_sentence = 5

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [33]:
# Create dictionary of augment data
def generate_augmented_data(df, wordnet_augmenter, embedding_augmenter, easy_augmenter):
    for index, row in df.iterrows():
        if len(row['message']) > 200:
            continue
        
        original_sentence = row['message']
        new_sentences = set()

        wordnet_sentences = wordnet_augmenter.augment(original_sentence)
        new_sentences.update(wordnet_sentences[:augmentation_count_per_sentence])

        embedding_sentences = embedding_augmenter.augment(original_sentence)
        new_sentences.update(embedding_sentences[:augmentation_count_per_sentence])

        easy_sentences = easy_augmenter.augment(original_sentence)
        new_sentences.update(easy_sentences[:augmentation_count_per_sentence])

        for sentence in new_sentences:
            new_row = row.copy()
            new_row['message'] = sentence
            yield new_row.to_dict()

        print(f"Original sentence {index + 1}/{df.index[-1]}: Added {len(new_sentences)} augmented sentences.")

In [None]:
augmented_data = generate_augmented_data(df.iloc[:3600,:], wordnet_augmenter, embedding_augmenter, easy_augmenter)

augmented_df = pd.DataFrame(augmented_data)
augmented_df.to_csv('./span_data/augmented_data_00.csv', index=False, encoding='utf-8')

print(f"Generated {len(augmented_df)} augmented rows.")

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Original sentence 1/3600: Added 6 augmented sentences.
Original sentence 2/3600: Added 6 augmented sentences.
Original sentence 3/3600: Added 6 augmented sentences.
Original sentence 4/3600: Added 6 augmented sentences.
Original sentence 5/3600: Added 6 augmented sentences.
Original sentence 6/3600: Added 6 augmented sentences.
Original sentence 7/3600: Added 6 augmented sentences.
Original sentence 8/3600: Added 6 augmented sentences.
Original sentence 9/3600: Added 6 augmented sentences.
Original sentence 10/3600: Added 6 augmented sentences.
Original sentence 11/3600: Added 6 augmented sentences.
Original sentence 12/3600: Added 6 augmented sentences.
Original sentence 13/3600: Added 6 augmented sentences.
Original sentence 14/3600: Added 6 augmented sentences.
Original sentence 15/3600: Added 6 augmented sentences.
Original sentence 16/3600: Added 6 augmented sentences.
Original sentence 17/3600: Added 6 augmented sentences.
Original sentence 18/3600: Added 6 augmented sentences.
O

In [None]:
augmented_data = generate_augmented_data(df.iloc[3600:7200,:], wordnet_augmenter, embedding_augmenter, easy_augmenter)

augmented_df = pd.DataFrame(augmented_data)
augmented_df.to_csv('./span_data/augmented_data_01.csv', index=False, encoding='utf-8')

print(f"Generated {len(augmented_df)} augmented rows.")

Original sentence 3620/7200: Added 6 augmented sentences.
Original sentence 3621/7200: Added 6 augmented sentences.
Original sentence 3622/7200: Added 6 augmented sentences.
Original sentence 3623/7200: Added 6 augmented sentences.
Original sentence 3624/7200: Added 6 augmented sentences.
Original sentence 3625/7200: Added 6 augmented sentences.
Original sentence 3626/7200: Added 6 augmented sentences.
Original sentence 3627/7200: Added 6 augmented sentences.
Original sentence 3628/7200: Added 6 augmented sentences.
Original sentence 3629/7200: Added 6 augmented sentences.
Original sentence 3630/7200: Added 6 augmented sentences.
Original sentence 3631/7200: Added 4 augmented sentences.
Original sentence 3632/7200: Added 6 augmented sentences.
Original sentence 3633/7200: Added 6 augmented sentences.
Original sentence 3634/7200: Added 6 augmented sentences.
Original sentence 3635/7200: Added 6 augmented sentences.
Original sentence 3636/7200: Added 6 augmented sentences.
Original sente

In [None]:
augmented_data = generate_augmented_data(df.iloc[7200:10800,:], wordnet_augmenter, embedding_augmenter, easy_augmenter)

# Tạo DataFrame từ generator và lưu vào file CSV
augmented_df = pd.DataFrame(augmented_data)
augmented_df.to_csv('./span_data/augmented_data_02.csv', index=False, encoding='utf-8')

print(f"Generated {len(augmented_df)} augmented rows.")

Original sentence 7259/10800: Added 6 augmented sentences.
Original sentence 7260/10800: Added 6 augmented sentences.
Original sentence 7261/10800: Added 6 augmented sentences.
Original sentence 7262/10800: Added 6 augmented sentences.
Original sentence 7263/10800: Added 6 augmented sentences.
Original sentence 7264/10800: Added 6 augmented sentences.
Original sentence 7265/10800: Added 6 augmented sentences.
Original sentence 7266/10800: Added 6 augmented sentences.
Original sentence 7267/10800: Added 6 augmented sentences.
Original sentence 7268/10800: Added 6 augmented sentences.
Original sentence 7269/10800: Added 6 augmented sentences.
Original sentence 7270/10800: Added 6 augmented sentences.
Original sentence 7271/10800: Added 6 augmented sentences.
Original sentence 7272/10800: Added 6 augmented sentences.
Original sentence 7273/10800: Added 6 augmented sentences.
Original sentence 7274/10800: Added 6 augmented sentences.
Original sentence 7275/10800: Added 6 augmented sentence

In [28]:
augmented_data = generate_augmented_data(df.iloc[10800:14400,:], wordnet_augmenter, embedding_augmenter, easy_augmenter)

augmented_df = pd.DataFrame(augmented_data)
augmented_df.to_csv('./span_data/augmented_data_03.csv', index=False, encoding='utf-8')

print(f"Generated {len(augmented_df)} augmented rows.")

Original sentence 10868/14505: Added 6 augmented sentences.
Original sentence 10870/14505: Added 6 augmented sentences.
Original sentence 10871/14505: Added 6 augmented sentences.
Original sentence 10872/14505: Added 6 augmented sentences.
Original sentence 10873/14505: Added 6 augmented sentences.
Original sentence 10874/14505: Added 6 augmented sentences.
Original sentence 10875/14505: Added 6 augmented sentences.
Original sentence 10876/14505: Added 6 augmented sentences.
Original sentence 10877/14505: Added 6 augmented sentences.
Original sentence 10878/14505: Added 6 augmented sentences.
Original sentence 10879/14505: Added 6 augmented sentences.
Original sentence 10882/14505: Added 6 augmented sentences.
Original sentence 10883/14505: Added 6 augmented sentences.
Original sentence 10884/14505: Added 6 augmented sentences.
Original sentence 10885/14505: Added 6 augmented sentences.
Original sentence 10886/14505: Added 6 augmented sentences.
Original sentence 10889/14505: Added 6 a

In [34]:
augmented_data = generate_augmented_data(df.iloc[14400:18000,:], wordnet_augmenter, embedding_augmenter, easy_augmenter)

augmented_df = pd.DataFrame(augmented_data)
augmented_df.to_csv('./span_data/augmented_data_04.csv', index=False, encoding='utf-8')

print(f"Generated {len(augmented_df)} augmented rows.")

Original sentence 14507/18126: Added 6 augmented sentences.
Original sentence 14508/18126: Added 6 augmented sentences.
Original sentence 14510/18126: Added 6 augmented sentences.
Original sentence 14511/18126: Added 6 augmented sentences.
Original sentence 14513/18126: Added 6 augmented sentences.
Original sentence 14514/18126: Added 6 augmented sentences.
Original sentence 14515/18126: Added 6 augmented sentences.
Original sentence 14516/18126: Added 6 augmented sentences.
Original sentence 14517/18126: Added 6 augmented sentences.
Original sentence 14520/18126: Added 6 augmented sentences.
Original sentence 14521/18126: Added 6 augmented sentences.
Original sentence 14522/18126: Added 6 augmented sentences.
Original sentence 14525/18126: Added 6 augmented sentences.
Original sentence 14527/18126: Added 6 augmented sentences.
Original sentence 14528/18126: Added 6 augmented sentences.
Original sentence 14529/18126: Added 6 augmented sentences.
Original sentence 14530/18126: Added 6 a

In [35]:
augmented_data = generate_augmented_data(df.iloc[18000:21600,:], wordnet_augmenter, embedding_augmenter, easy_augmenter)

augmented_df = pd.DataFrame(augmented_data)
augmented_df.to_csv('./span_data/augmented_data_05.csv', index=False, encoding='utf-8')

print(f"Generated {len(augmented_df)} augmented rows.")

Original sentence 18128/21752: Added 6 augmented sentences.
Original sentence 18129/21752: Added 6 augmented sentences.
Original sentence 18131/21752: Added 6 augmented sentences.
Original sentence 18133/21752: Added 6 augmented sentences.
Original sentence 18134/21752: Added 6 augmented sentences.
Original sentence 18135/21752: Added 6 augmented sentences.
Original sentence 18137/21752: Added 6 augmented sentences.
Original sentence 18138/21752: Added 6 augmented sentences.
Original sentence 18141/21752: Added 6 augmented sentences.
Original sentence 18142/21752: Added 6 augmented sentences.
Original sentence 18144/21752: Added 6 augmented sentences.
Original sentence 18147/21752: Added 6 augmented sentences.
Original sentence 18148/21752: Added 6 augmented sentences.
Original sentence 18150/21752: Added 6 augmented sentences.
Original sentence 18151/21752: Added 6 augmented sentences.
Original sentence 18152/21752: Added 6 augmented sentences.
Original sentence 18153/21752: Added 6 a

In [36]:
augmented_data = generate_augmented_data(df.iloc[21600:,:], wordnet_augmenter, embedding_augmenter, easy_augmenter)

augmented_df = pd.DataFrame(augmented_data)
augmented_df.to_csv('./span_data/augmented_data_06.csv', index=False, encoding='utf-8')

print(f"Generated {len(augmented_df)} augmented rows.")

Original sentence 21754/26385: Added 6 augmented sentences.
Original sentence 21755/26385: Added 6 augmented sentences.
Original sentence 21757/26385: Added 6 augmented sentences.
Original sentence 21758/26385: Added 6 augmented sentences.
Original sentence 21759/26385: Added 6 augmented sentences.
Original sentence 21760/26385: Added 6 augmented sentences.
Original sentence 21761/26385: Added 6 augmented sentences.
Original sentence 21762/26385: Added 6 augmented sentences.
Original sentence 21763/26385: Added 6 augmented sentences.
Original sentence 21765/26385: Added 6 augmented sentences.
Original sentence 21768/26385: Added 6 augmented sentences.
Original sentence 21769/26385: Added 6 augmented sentences.
Original sentence 21771/26385: Added 6 augmented sentences.
Original sentence 21772/26385: Added 6 augmented sentences.
Original sentence 21774/26385: Added 6 augmented sentences.
Original sentence 21775/26385: Added 6 augmented sentences.
Original sentence 21776/26385: Added 6 a

#### Combite it into 1

In [2]:
df0 = pd.read_csv('./span_data/augmented_data_00.csv')
df1 = pd.read_csv('./span_data/augmented_data_01.csv')
df2 = pd.read_csv('./span_data/augmented_data_02.csv')
df3 = pd.read_csv('./span_data/augmented_data_03.csv')
df4 = pd.read_csv('./span_data/augmented_data_04.csv')
df5 = pd.read_csv('./span_data/augmented_data_05.csv')
df6 = pd.read_csv('./span_data/augmented_data_06.csv')

  df2 = pd.read_csv('./span_data/augmented_data_02.csv')
  df3 = pd.read_csv('./span_data/augmented_data_03.csv')


In [3]:
dfs = [df0,df1,df2,df3,df4,df5,df6]

In [4]:
au_df = pd.concat(dfs, axis=0, ignore_index=True)

In [5]:
au_df.shape

(129665, 39)

In [18]:
df.shape

(26216, 39)

- Change 2s value in related column to 1s

In [18]:
au_df['related'] = au_df['related'].map(lambda x: 1 if x==2 else x)
df['related'] = df['related'].map(lambda x: 1 if x==2 else x)

- Save df

In [20]:
au_df.to_csv('../disaster_train.csv', index=False)
df.to_csv('../disaster_test.csv', index=False)