In [1]:
import os

import numpy as np
import pandas as pd
from helpers import tokenize_tweet

result_dir = "results"
tweets = pd.read_csv(os.path.join(result_dir, "10k_tweets_nl.csv"), index_col=False)
tweets.head()

Unnamed: 0.1,Unnamed: 0,Datetime,Tweet Id,Text,Username,Url
0,0,2023-06-26 13:11:32+00:00,1673318075465121792,Dus wij sturen wapens en geld naar de Oekraïne...,PMG_RIP,https://twitter.com/PMG_RIP/status/16733180754...
1,1,2023-06-26 13:11:31+00:00,1673318072550039553,Lumineon V $9 \nPWE $1 \nBMWT $3 https://t.co/...,TattooedPokemon,https://twitter.com/TattooedPokemon/status/167...
2,2,2023-06-26 13:11:31+00:00,1673318072457871360,Loodswezen Antwerpen wordt culinair centrum 23...,de_beyn,https://twitter.com/de_beyn/status/16733180724...
3,3,2023-06-26 13:11:31+00:00,1673318072365744129,"nvm, sis is here &gt;&lt; https://t.co/vgojxqaiAK",alx_yeonie,https://twitter.com/alx_yeonie/status/16733180...
4,4,2023-06-26 13:11:31+00:00,1673318071446953991,Mijn therapeut vond mijn jurk mooi vandaag ☺️,roosferrero,https://twitter.com/roosferrero/status/1673318...


In [2]:
def cleaning_up(word) -> str:
    word = word.lower()

    # Skip accounts
    if word.startswith("@"):
        return ""
    # Skip urls
    if word.startswith(r"https://"):
        return ""
    return word

In [3]:
all_words = {}

for tweet in tweets.Text:
    words = tokenize_tweet(tweet)
    for word in words:
        word = cleaning_up(word)
        # Skip emtpy strings
        if len(word) == 0:
            continue
        all_words[word] = all_words.get(word, 0) + 1

In [4]:
print(f"In {len(tweets)} tweets we found {len(all_words)} unique words")

In 10000 tweets we found 29757 unique words


In [9]:
all_words_df = pd.DataFrame(all_words.items(), columns=["word", "occurrence"])
number_of_words = np.sum(all_words_df.occurrence)
all_words_df.reset_index(drop=True)
all_words_df = all_words_df.sort_values("occurrence", ascending=False)
all_words_df["percentage"] = np.round(all_words_df.occurrence / number_of_words, 6)
all_words_df["cumulative"] = all_words_df.percentage.cumsum()
all_words_df.to_csv(os.path.join(result_dir, "all_words.csv"), index=False)
all_words_df.head(n=25)

Unnamed: 0,word,occurrence,percentage,cumulative
7,de,5053,0.03369,0.03369
24,van,2886,0.019242,0.052932
110,het,2675,0.017835,0.070767
4,en,2655,0.017702,0.088469
18,in,2654,0.017695,0.106164
112,een,2642,0.017615,0.123779
57,is,2000,0.013335,0.137114
14,op,1635,0.010901,0.148015
71,voor,1443,0.009621,0.157636
85,ik,1407,0.009381,0.167017
