In [None]:
import pandas as pd
import gdown

In [None]:
# Google Drive location of the tweets CSV and the local file name it is saved under.
url = "https://drive.google.com/uc?id=16rfiy-WrqBVBsrmE5VZk-Czk10wMAAmF"
output = "tweets.csv"

# Fetch the file; quiet=False leaves gdown's own output visible in the cell.
gdown.download(url=url, output=output, quiet=False)


In [None]:
# Load the downloaded CSV into a DataFrame, decoding it explicitly as UTF-8.
tweets = pd.read_csv(filepath_or_buffer='tweets.csv', encoding='utf-8')

# Preview the first ten rows.
tweets.head(n=10)

# Text preprocessing

### Cleaning text

In [None]:
# Discard every row whose 'text' value is missing (pandas defaults spelled out).
tweets = tweets.dropna(axis='index', how='any', subset=['text'])

In [None]:
def _lowercase_if_str(value):
    """Return ``value`` lower-cased when it is a string, otherwise unchanged."""
    if isinstance(value, str):
        return value.lower()
    return value


# Start the cleaned column from a lower-cased copy of the raw text.
tweets['text_clean'] = tweets["text"].map(_lowercase_if_str)
tweets.head(10)

### Removing URLs

In [None]:
import re

# Matches an http:// or https:// link up to (not including) the next whitespace.
url_pattern = re.compile(r'https?://\S+')


def remove_urls(text):
    """Delete every http(s) URL from ``text``.

    Parameters
    ----------
    text : str or any
        The value to clean.

    Returns
    -------
    str or any
        ``text`` with each URL replaced by the empty string.  Surrounding
        whitespace is left as is, so a URL between two spaces leaves both
        spaces behind.  Non-string values (e.g. NaN) are returned unchanged,
        matching how the other cleaning steps of this notebook treat them,
        instead of raising ``TypeError``.
    """
    if not isinstance(text, str):
        return text
    return url_pattern.sub('', text)


# Run the URL stripper over every cleaned tweet, then store the result back.
text_without_urls = tweets['text_clean'].apply(remove_urls)
tweets['text_clean'] = text_without_urls
tweets.head(10)

### Replacing emojis and emoticons with their meanings

In [None]:
import emoji
def _emoji_to_label(chars, data):
    """Return the bracketed English name of one emoji, padded with spaces.

    ``emoji.replace_emoji`` calls this with the matched emoji (``chars``) and
    its data dict (``data``).  The colons surrounding the name (for example
    ':thumbs_up:') are stripped so the output cannot contain sequences such as
    ':]' or ':p' that the emoticon-replacement step further down could mistake
    for emoticons.  The label is space-padded so it does not fuse with
    neighbouring words or other labels once punctuation is removed.
    """
    return f" [{data['en'].strip(':')}] "


def replace_emojis(text):
    """Replace every emoji in ``text`` with its label; pass non-strings through."""
    if not isinstance(text, str):
        return text
    return emoji.replace_emoji(text, replace=_emoji_to_label)


tweets['text_clean'] = tweets['text_clean'].apply(replace_emojis)
tweets.head(10)

In [None]:
from emot.emo_unicode import EMOTICONS_EMO
import re

# Compiled emoticon regexes, keyed by the tuple of emoticons they were built from.
_EMOTICON_REGEX_CACHE = {}


def _emoticon_alternative(emoticon):
    """Return a regex fragment for one emoticon that cannot match inside a word."""
    fragment = re.escape(emoticon)
    # An emoticon that starts or ends with a letter/digit (e.g. "xp", ":p", "d:")
    # must not be glued to further word characters; otherwise ordinary words
    # that merely contain those letters would be rewritten.
    if emoticon[0].isalnum():
        fragment = r"(?<!\w)" + fragment
    if emoticon[-1].isalnum():
        fragment += r"(?!\w)"
    return fragment


def _emoticon_regex(table):
    """Return one compiled alternation of all emoticons in ``table`` (None if empty)."""
    cache_key = tuple(table)
    if cache_key not in _EMOTICON_REGEX_CACHE:
        # Longest first, so that e.g. ":))" wins over its own prefix ":)".
        ordered = sorted((e for e in cache_key if e), key=len, reverse=True)
        _EMOTICON_REGEX_CACHE[cache_key] = (
            re.compile("|".join(map(_emoticon_alternative, ordered))) if ordered else None
        )
    return _EMOTICON_REGEX_CACHE[cache_key]


def replace_emoticons(text, emoticons=None):
    """Replace each emoticon in ``text`` with its space-padded meaning.

    Parameters
    ----------
    text : str or any
        The value to process.  Non-string values are returned unchanged.
    emoticons : mapping of str to str, optional
        Emoticon -> meaning table.  Defaults to ``EMOTICONS_EMO``.

    Returns
    -------
    str or any
        ``text`` with every emoticon replaced by ``" <meaning> "``.

    Notes
    -----
    * All emoticons are matched in a single pass, longest first, so a short
      emoticon can no longer consume the beginning of a longer one and the
      inserted meanings are never re-scanned.
    * The meaning is inserted literally (a backslash in it is not treated as
      a regex escape).
    * Matching is case-sensitive.  NOTE(review): the notebook lower-cases the
      text before this step, so any upper-case entries of the table cannot
      match at that point.
    """
    if not isinstance(text, str):
        return text
    table = EMOTICONS_EMO if emoticons is None else emoticons
    pattern = _emoticon_regex(table)
    if pattern is None:
        return text
    return pattern.sub(lambda match: f" {table[match.group(0)]} ", text)

# Substitute emoticons in every cleaned tweet, then store the result back.
text_without_emoticons = tweets['text_clean'].apply(replace_emoticons)
tweets['text_clean'] = text_without_emoticons
tweets.head(10)

### Removing characters that are neither word characters nor whitespace

In [None]:
# Delete every character that is neither a word character (\w) nor whitespace (\s).
text_without_symbols = tweets['text_clean'].replace(to_replace=r'[^\w\s]', value='', regex=True)
tweets['text_clean'] = text_without_symbols
tweets.head(10)

### Tokenization

In [None]:
from transformers import CamembertTokenizer
from tqdm import tqdm
# Register tqdm's pandas integration so that Series gain `.progress_apply`.
tqdm.pandas()

# Name of the pretrained checkpoint whose tokenizer is loaded.
CAMEMBERT_CHECKPOINT = "camembert/camembert-base"
tokenizer = CamembertTokenizer.from_pretrained(CAMEMBERT_CHECKPOINT)

def tokenize_camembert(text):
    """Tokenize ``text`` with the notebook-level CamemBERT ``tokenizer``.

    Returns whatever ``tokenizer.tokenize`` produces for ``text``.
    """
    pieces = tokenizer.tokenize(text)
    return pieces

# Tokenize every cleaned tweet (with a progress bar) and keep the result in its own column.
token_lists = tweets['text_clean'].progress_apply(tokenize_camembert)
tweets['text_token'] = token_lists
tweets.head(10)

### Removing stopwords

In [None]:
# NOTE: stopword removal is currently disabled; the code below is kept for reference only.
#stopwords = [x.strip() for x in open('stop_word_fr.txt').readlines()]
#tweets['text_token'] = tweets['text_token'].apply(lambda x: [word for word in x if word not in stopwords])
#tweets.head(10)

In [None]:
# Persist the preprocessed frame.  index=True (the pandas default) is spelled out:
# the row index is written as the first, unnamed column of the CSV.
tweets.to_csv('tweets_prepro.csv', index=True)