In [30]:
import os

import pandas as pd

# Read the scraped tweets; index_col=False stops pandas from using the first
# CSV column as the row index.
csv_path = os.path.join("results", "10k_tweets_nl.csv")
tweets = pd.read_csv(csv_path, index_col=False)
tweets.head()

Unnamed: 0.1,Unnamed: 0,Datetime,Tweet Id,Text,Username,Url
0,0,2023-06-26 12:30:18+00:00,1673307697712619520,"Tekken 7, Street Fighter 3",TheRealDeljoo,https://twitter.com/TheRealDeljoo/status/16733...
1,1,2023-06-26 12:30:17+00:00,1673307695334432772,Avond4daagse wordt Avond3daagse in Annen - ht...,schakelgieten,https://twitter.com/schakelgieten/status/16733...
2,2,2023-06-26 12:30:17+00:00,1673307693811920896,"🎶🎤Suzanne, Suzanne, Suzanne....😍\nGoedemiddag ...",PietjePool1,https://twitter.com/PietjePool1/status/1673307...
3,3,2023-06-26 12:30:16+00:00,1673307689382817793,182 leerlingen van elf lagere scholen uit Diks...,hnvwvl,https://twitter.com/hnvwvl/status/167330768938...
4,4,2023-06-26 12:30:15+00:00,1673307687201693701,Ik zet me in tegen hitteleed bij dieren. Want ...,BrobbelC,https://twitter.com/BrobbelC/status/1673307687...


In [42]:
def cleaning_up(word: str) -> str:
    """Normalise a single token taken from a tweet.

    The token is lower-cased. Account mentions (tokens starting with ``@``)
    and links (tokens starting with ``http://`` or ``https://``) are
    discarded.

    Args:
        word: One raw token from a tweet.

    Returns:
        The lower-cased token, or an empty string when the token must be
        skipped by the caller.
    """
    word = word.lower()
    # Skip accounts
    if word.startswith("@"):
        return ""
    # Skip urls, whether plain http or https (startswith accepts a tuple).
    if word.startswith(("http://", "https://")):
        return ""
    return word

In [34]:
# Frequency table: cleaned token -> number of occurrences over all tweets.
all_words = {}

for tweet in tweets.Text:
    # Tokens are separated by single spaces. cleaning_up() returns "" for
    # tokens that must be ignored, and filter(None, ...) drops those.
    for word in filter(None, map(cleaning_up, tweet.split(" "))):
        all_words[word] = all_words.get(word, 0) + 1

In [35]:
# Summary: number of rows in `tweets` versus number of distinct keys in `all_words`.
print(f"In {len(tweets)} tweets we found {len(all_words)} unique words")

In 1000 tweets we found 5961 unique words


In [36]:
# Vocabulary ordered from most to least frequent. The sort is stable, so
# words with equal counts keep their first-seen order.
all_words_sorted = dict(
    sorted(all_words.items(), key=lambda pair: pair[1], reverse=True)
)
all_words_sorted

{'de': 409,
 'en': 246,
 'een': 244,
 'van': 241,
 'het': 229,
 'in': 219,
 'is': 157,
 'op': 142,
 'voor': 134,
 'dat': 134,
 'met': 114,
 'je': 110,
 'te': 107,
 'ik': 106,
 'die': 94,
 'niet': 91,
 'bij': 68,
 'zijn': 65,
 'maar': 63,
 'om': 59,
 'naar': 57,
 'ook': 56,
 'dit': 56,
 'als': 55,
 'er': 54,
 'aan': 54,
 '-': 47,
 'nog': 44,
 'nu': 43,
 'ze': 42,
 'al': 42,
 'over': 41,
 'kan': 40,
 'meer': 38,
 'we': 38,
 'wat': 36,
 'dan': 35,
 'geen': 34,
 'of': 33,
 'deze': 32,
 ',': 31,
 'wordt': 30,
 'door': 30,
 'wil': 29,
 'via': 29,
 'wel': 27,
 'zo': 26,
 'uit': 25,
 'me': 25,
 'dus': 24,
 'weer': 24,
 'na': 24,
 'mensen': 24,
 'moet': 24,
 'hoe': 23,
 'omg': 22,
 'hij': 22,
 'jullie': 21,
 'toch': 21,
 'worden': 21,
 'heb': 20,
 'mij': 20,
 'iemand': 20,
 'mijn': 20,
 'gaan': 19,
 'hebben': 19,
 'heeft': 19,
 'jaar': 18,
 'gaat': 18,
 'willen': 18,
 'zou': 18,
 'kunnen': 17,
 'nieuwe': 17,
 'goed': 17,
 'waar': 17,
 'fvd': 17,
 'ben': 17,
 'hun': 16,
 'tegen': 16,
 'echt': 16

In [43]:
# Per tweet: the corpus-wide count of its least frequent token.
all_tweets = {}

for tweet, tweet_id in zip(tweets.Text, tweets["Tweet Id"]):
    # Same tokenisation as the counting pass: split on single spaces, clean
    # each token, and drop the empty strings cleaning_up() returns.
    kept_words = filter(None, map(cleaning_up, tweet.split(" ")))
    word_counts = (all_words.get(kept, len(tweets)) for kept in kept_words)
    # len(tweets) seeds the minimum, so it is also the result for a tweet
    # in which no token survives cleaning.
    rarest_word = min([len(tweets), *word_counts])
    all_tweets[tweet_id] = {
        "Id": tweet_id,
        "Content": tweet,
        "Rarest word": rarest_word,
    }

In [45]:
# Tweets ordered by the count of their rarest word, highest first. The sort
# is stable, so tweets with equal values keep their insertion order.
all_tweets_sorted = dict(
    sorted(
        all_tweets.items(),
        key=lambda entry: entry[1]["Rarest word"],
        reverse=True,
    )
)
all_tweets_sorted

{1673307510164574208: {'Id': 1673307510164574208,
  'Content': 'omg omg omg',
  'Rarest word': 22},
 1673306318558617603: {'Id': 1673306318558617603,
  'Content': 'OMG OMG OMG',
  'Rarest word': 22},
 1673306253366525952: {'Id': 1673306253366525952,
  'Content': 'OMG OMG OMG',
  'Rarest word': 22},
 1673305319588372480: {'Id': 1673305319588372480,
  'Content': 'omg  omg omg omg omg omg omg',
  'Rarest word': 22},
 1673306479644884998: {'Id': 1673306479644884998,
  'Content': 'Deze gaat naar @Univers09206495',
  'Rarest word': 18},
 1673307424973881347: {'Id': 1673307424973881347,
  'Content': 'Waar is @KustawBessems…?👇🏻',
  'Rarest word': 17},
 1673307609078611968: {'Id': 1673307609078611968,
  'Content': 'FvD ligt onder vuur! Linkse Kamerleden willen de partij verbieden. Teken nu de petitie om dit te stoppen: https://t.co/u2UeqVe6kD',
  'Rarest word': 9},
 1673307450945019905: {'Id': 1673307450945019905,
  'Content': 'FvD ligt onder vuur! Linkse Kamerleden willen de partij verbieden. 