In [46]:
import os

import pandas as pd

tweets = pd.read_csv(os.path.join("results", "10k_tweets_nl.csv"), index_col=False)
tweets.head()

Unnamed: 0.1,Unnamed: 0,Datetime,Tweet Id,Text,Username,Url
0,0,2023-06-26 13:11:32+00:00,1673318075465121792,Dus wij sturen wapens en geld naar de Oekraïne...,PMG_RIP,https://twitter.com/PMG_RIP/status/16733180754...
1,1,2023-06-26 13:11:31+00:00,1673318072550039553,Lumineon V $9 \nPWE $1 \nBMWT $3 https://t.co/...,TattooedPokemon,https://twitter.com/TattooedPokemon/status/167...
2,2,2023-06-26 13:11:31+00:00,1673318072457871360,Loodswezen Antwerpen wordt culinair centrum 23...,de_beyn,https://twitter.com/de_beyn/status/16733180724...
3,3,2023-06-26 13:11:31+00:00,1673318072365744129,"nvm, sis is here &gt;&lt; https://t.co/vgojxqaiAK",alx_yeonie,https://twitter.com/alx_yeonie/status/16733180...
4,4,2023-06-26 13:11:31+00:00,1673318071446953991,Mijn therapeut vond mijn jurk mooi vandaag ☺️,roosferrero,https://twitter.com/roosferrero/status/1673318...


In [47]:
def cleaning_up(word) -> str:
    word = word.lower()
    # Skip accounts
    if word.startswith("@"):
        return ""
    # Skip urls
    if word.startswith(r"https://"):
        return ""
    return word

In [48]:
all_words = {}

for tweet in tweets.Text:
    for word in tweet.split(" "):
        word = cleaning_up(word)
        # Skip emtpy strings
        if len(word) == 0:
            continue
        all_words[word] = all_words.get(word, 0) + 1

In [49]:
print(f"In {len(tweets)} tweets we found {len(all_words)} unique words")

In 10000 tweets we found 39276 unique words


In [50]:
all_words_sorted = {k: v for k, v in sorted(all_words.items(), key=lambda item: -item[1])}
all_words_sorted

{'de': 4914,
 'van': 2851,
 'een': 2582,
 'in': 2576,
 'en': 2565,
 'het': 2521,
 'is': 1841,
 'op': 1530,
 'voor': 1385,
 'met': 1297,
 'ik': 1243,
 'dat': 1234,
 'je': 1213,
 'te': 1150,
 'niet': 1010,
 'die': 1007,
 'zijn': 748,
 'om': 677,
 'maar': 660,
 'er': 651,
 'aan': 624,
 'bij': 599,
 'als': 592,
 'dit': 562,
 'naar': 554,
 'wat': 541,
 '-': 537,
 'over': 534,
 'nog': 515,
 'ook': 514,
 'ze': 458,
 'of': 451,
 'dan': 429,
 'meer': 419,
 'uit': 389,
 'we': 380,
 'nu': 375,
 'al': 375,
 'deze': 368,
 'door': 357,
 'zo': 351,
 'geen': 349,
 'kan': 301,
 'mijn': 300,
 'weer': 291,
 'wel': 288,
 'heeft': 285,
 'via': 284,
 'na': 254,
 'heb': 250,
 'mensen': 245,
 'ben': 239,
 'omg': 239,
 'moet': 234,
 'hebben': 233,
 '|': 231,
 'was': 230,
 'wordt': 229,
 'nieuwe': 227,
 ',': 227,
 'gaat': 221,
 'hoe': 219,
 'wil': 213,
 'tot': 211,
 'onze': 209,
 'kunnen': 207,
 'hier': 206,
 'jaar': 199,
 'gaan': 198,
 'echt': 196,
 'toch': 194,
 'hij': 192,
 'me': 186,
 'zou': 183,
 'jullie':

In [51]:
all_tweets = {}

for tweet, tweet_id in zip(tweets.Text, tweets["Tweet Id"]):
    rarest_word = len(tweets) # Nothing can occur more then in every tweet
    for word in tweet.split(" "):
        word = cleaning_up(word)
        # Skip emtpy strings
        if len(word) == 0:
            continue
        rarest_word = min(rarest_word, all_words.get(word, len(tweets)))
    all_tweets[tweet_id] = {
        "Id": tweet_id,
        "Content": tweet,
        "Rarest word": rarest_word
    }

In [52]:
all_tweets_sorted = {k: v for k, v in sorted(all_tweets.items(), key=lambda item: -item[1]["Rarest word"])}
all_tweets_sorted

{1673317591228776450: {'Id': 1673317591228776450,
  'Content': 'OMG OMG OMG',
  'Rarest word': 239},
 1673316422582239232: {'Id': 1673316422582239232,
  'Content': 'OMG OMG OMG',
  'Rarest word': 239},
 1673316150942507008: {'Id': 1673316150942507008,
  'Content': 'OMG OMG OMG',
  'Rarest word': 239},
 1673315632597676033: {'Id': 1673315632597676033,
  'Content': 'OMG OMG OMG',
  'Rarest word': 239},
 1673315438191624192: {'Id': 1673315438191624192,
  'Content': 'OMG OMG OMG',
  'Rarest word': 239},
 1673315396504649728: {'Id': 1673315396504649728,
  'Content': 'OMG OMG OMG',
  'Rarest word': 239},
 1673314836216791040: {'Id': 1673314836216791040,
  'Content': 'OMG OMG OMG OMG',
  'Rarest word': 239},
 1673313768166813696: {'Id': 1673313768166813696,
  'Content': 'OMG OMG OMG',
  'Rarest word': 239},
 1673311765768339461: {'Id': 1673311765768339461,
  'Content': 'OMG OMG OMG',
  'Rarest word': 239},
 1673310678612860928: {'Id': 1673310678612860928,
  'Content': 'OMG OMG OMG OMG OMG OMG