In [1]:
import re
import emoji
import langid
import random
import pandas as pd
import nlpaug.augmenter.word as naw

In [2]:
# Back-translation augmenter, built once at module level and reused by
# augment_tweets(). The two checkpoints are passed to nlpaug as the forward
# ("from") and return ("to") translation models; judging by their names they
# are Hebrew->English and English->Hebrew.
_BACK_TRANSLATION_MODELS = {
    'from_model_name': 'tiedeman/opus-mt-he-en',
    'to_model_name': 'tiedeman/opus-mt-en-he',
}
back_translation_aug = naw.BackTranslationAug(**_BACK_TRANSLATION_MODELS)

In [6]:
# Every emoji string known to the `emoji` package, longest first.
# clean_tweet() removes them in this order, so longer sequences are stripped
# before any shorter emoji they may contain.
ALL_EMOJIS = list(reversed(sorted(emoji.EMOJI_DATA.keys(), key=len)))

# Raw string literals keep regex escapes such as `\-` from being parsed as
# (invalid) Python string escapes; the compiled patterns are unchanged.
# NOTE(review): the trailing `?` makes the whole group optional, so this
# pattern also matches the empty string at every position -- confirm that is
# intended before using it with search()/findall().
REG_HASHTAG = re.compile(r'(#[a-zA-Z0-9]+)?')
# Matches an `@` followed by one or more ASCII letters, digits, `-` or `_`.
REG_USERNAME = re.compile(r'(@[a-zA-Z0-9\-_]+)')

def clean_tweet(tweet):
    """Return `tweet` with mentions, emojis and known boilerplate removed.

    Steps, applied in order:
      1. decode `&amp;` to `&`;
      2. delete everything matched by REG_USERNAME (``@name`` mentions);
      3. tidy up punctuation/whitespace left behind and drop `&gt;`;
      4. delete every emoji listed in ALL_EMOJIS;
      5. delete a few fixed Hebrew credit-line strings;
      6. strip leading/trailing whitespace.

    Each substitution is a single pass, exactly as listed; nothing is
    repeated until stable.
    """
    text = tweet.replace('&amp;', '&')
    text = REG_USERNAME.sub('', text)

    # Order matters: later replacements see the output of earlier ones.
    tidy_rules = (
        (' , ', ', '),
        (' &  ', ' '),
        ('  & ', ' '),
        ('  ', ' '),
        (',,', ','),
        ('&gt;', ''),
    )
    for old, new in tidy_rules:
        text = text.replace(old, new)

    for symbol in ALL_EMOJIS:
        text = text.replace(symbol, '')

    # Fixed boilerplate strings (apparently photo credits) to drop verbatim.
    boilerplate = (
        'קובי גדעון/לע״מ',
        'צילום: קובי גדעון, לע״מ',
        'צילום: קובי גדעון לע"מ',
        'צילום: קובי גדעון/ לע״מ',
    )
    for credit in boilerplate:
        text = text.replace(credit, '')

    return text.strip()


def clean_tweets():
    """Load the raw tweets CSV, clean each tweet and keep the Hebrew ones.

    Reads the `tweets` column of 'dataset/yairlapid_tweets.csv', runs every
    entry through clean_tweet(), and keeps only the cleaned texts that langid
    classifies as 'he'. Prints how many tweets were kept.

    Returns:
        list: the cleaned tweets that passed the language filter, in file
        order.
    """
    frame = pd.read_csv('dataset/yairlapid_tweets.csv')

    # Clean lazily, then filter on the language of the *cleaned* text.
    cleaned_texts = (clean_tweet(raw) for raw in frame['tweets'])
    tweets = [
        text for text in cleaned_texts
        if langid.classify(text)[0] == 'he'
    ]

    print(f'successfuly cleaned {len(tweets)} tweets')
    return tweets


def augment_tweets(tweets):
    """Augment `tweets`, write the combined dataset to CSV and return it.

    Three augmented copies of the input are produced: random word swap,
    random word deletion, and back-translation via the module-level
    `back_translation_aug`. The originals and the three copies are
    concatenated, shuffled in place, and written to 'cleaned_tweets.csv'
    under a single `tweets` column.

    Args:
        tweets: list of tweet strings. It is concatenated with the augmenter
            outputs, so those are assumed to be lists as well.

    Returns:
        list: the shuffled, augmented dataset that was written to disk.
    """
    aug_swap = naw.RandomWordAug(action="swap")
    aug_delete = naw.RandomWordAug(action="delete")

    tweets_aug_swap = aug_swap.augment(tweets)
    tweets_aug_del = aug_delete.augment(tweets)
    tweets_aug_translate = back_translation_aug.augment(tweets)

    dataset = tweets + tweets_aug_swap + tweets_aug_del + tweets_aug_translate
    random.shuffle(dataset)

    print(f'augmented dataset into {len(dataset)} tweets')

    # utf-8-sig writes a BOM at the start of the file.
    new_tdf = pd.DataFrame(dataset, columns=['tweets'])
    new_tdf.to_csv('cleaned_tweets.csv', encoding='utf-8-sig', index=False)

    # Previously nothing was returned, so main() always handed back None
    # even though it assigns and returns this function's result.
    return dataset
    
def main():
    """Run the pipeline: clean the tweets, then augment them.

    Returns whatever augment_tweets() returns for the cleaned tweets.
    """
    return augment_tweets(clean_tweets())

# Run the whole pipeline when the cell executes; `dataset` holds main()'s
# return value.
dataset = main()

successfuly cleaned 3299 tweets
augmented dataset into 13196 tweets
