In [1]:
import os

import numpy as np
import pandas as pd
from helpers import tokenize_tweet

# Location of the scraped tweet dump, relative to the notebook's working directory.
result_dir = "results"
tweets_csv_path = os.path.join(result_dir, "10k_tweets_nl.csv")

# index_col=False: do not use the first CSV column as the frame's index.
tweets = pd.read_csv(tweets_csv_path, index_col=False)

# Preview the first rows as the cell output.
tweets.head()

Unnamed: 0.1,Unnamed: 0,Datetime,Tweet Id,Text,Username,Url
0,0,2023-06-26 13:11:32+00:00,1673318075465121792,Dus wij sturen wapens en geld naar de Oekraïne...,PMG_RIP,https://twitter.com/PMG_RIP/status/16733180754...
1,1,2023-06-26 13:11:31+00:00,1673318072550039553,Lumineon V $9 \nPWE $1 \nBMWT $3 https://t.co/...,TattooedPokemon,https://twitter.com/TattooedPokemon/status/167...
2,2,2023-06-26 13:11:31+00:00,1673318072457871360,Loodswezen Antwerpen wordt culinair centrum 23...,de_beyn,https://twitter.com/de_beyn/status/16733180724...
3,3,2023-06-26 13:11:31+00:00,1673318072365744129,"nvm, sis is here &gt;&lt; https://t.co/vgojxqaiAK",alx_yeonie,https://twitter.com/alx_yeonie/status/16733180...
4,4,2023-06-26 13:11:31+00:00,1673318071446953991,Mijn therapeut vond mijn jurk mooi vandaag ☺️,roosferrero,https://twitter.com/roosferrero/status/1673318...


In [2]:
def cleaning_up(word: str) -> str:
    """Normalise a single token before it is counted.

    The token is lower-cased. Tokens that are account mentions (start with
    "@") or URLs (start with "http://" or "https://") are replaced by an
    empty string so the caller can skip them.

    Args:
        word: The token to normalise.

    Returns:
        The lower-cased token, or "" when the token is a mention or a URL.
    """
    word = word.lower()

    # Mentions and links carry no vocabulary information. Both URL schemes are
    # matched so plain-http links are dropped as well as https ones.
    if word.startswith(("@", "http://", "https://")):
        return ""
    return word

In [3]:
# Maps every cleaned token to the number of times it occurs over all tweets.
all_words = {}

for tweet_text in tweets.Text:
    cleaned_tokens = (cleaning_up(token) for token in tokenize_tweet(tweet_text))
    for token in cleaned_tokens:
        # cleaning_up returns "" for tokens that should not be counted.
        if token:
            all_words[token] = all_words.get(token, 0) + 1

In [4]:
# Report the corpus size next to the size of the vocabulary built from it.
tweet_count = len(tweets)
unique_word_count = len(all_words)
print(f"In {tweet_count} tweets we found {unique_word_count} unique words")

In 10000 tweets we found 29757 unique words


In [5]:
# Frequency table with one row per distinct word, most frequent first.
all_words_df = pd.DataFrame(all_words.items(), columns=["word", "occurrence"])
number_of_words = np.sum(all_words_df.occurrence)

# reset_index returns a new frame, so its result has to be assigned. It is
# applied after sorting so the index matches the frequency rank.
all_words_df = all_words_df.sort_values("occurrence", ascending=False).reset_index(drop=True)

# "percentage" is stored as a fraction of all counted words (occurrence / total).
all_words_df["percentage"] = np.round(all_words_df.occurrence / number_of_words, 6)

# The cumulative share is derived from the exact cumulative counts and rounded
# once. Summing the already-rounded percentages would accumulate rounding error
# over every row and the last value would drift away from 1.0.
all_words_df["cumulative"] = np.round(
    all_words_df.occurrence.cumsum() / number_of_words, 6
)

all_words_df.to_csv(os.path.join(result_dir, "all_words.csv"), index=False)

In [8]:
# Per-tweet summary keyed by tweet id: how frequent the tweet's least frequent
# word is, and how many distinct tokens the tweet contains.
all_tweets = {}

# Starting value and upper cap for "Rarest word"; also used for words that are
# missing from all_words.
frequency_cap = len(tweets)

for tweet_id, tweet in zip(tweets["Tweet Id"], tweets.Text):
    words = tokenize_tweet(tweet)

    # Corpus-wide counts of every token that survives cleaning_up.
    kept_tokens = [token for token in map(cleaning_up, words) if token]
    frequencies = [all_words.get(token, frequency_cap) for token in kept_tokens]

    # A tweet without any kept token keeps the cap as its value.
    rarest_word = min([frequency_cap, *frequencies])

    all_tweets[tweet_id] = {
        "Id": tweet_id,
        "Content": tweet,
        "Rarest word": rarest_word,
        # Counted on the tokens as returned by tokenize_tweet, before cleaning_up.
        "Unique words": len(set(words)),
    }

In [9]:
# Order the tweets so that those whose rarest word is most frequent come first.
# Negating the count sorts descending while keeping ties in insertion order.
all_tweets_sorted = dict(
    sorted(
        all_tweets.items(),
        key=lambda entry: -entry[1]["Rarest word"],
    )
)
all_tweets_sorted

{1673298398408212483: {'Id': 1673298398408212483,
  'Content': 'Op op 😨 https://t.co/yVvWbGeEnu',
  'Rarest word': 1635,
  'Unique words': 1},
 1673311082176454658: {'Id': 1673311082176454658,
  'Content': 'Zo Zo Zo 🤟',
  'Rarest word': 486,
  'Unique words': 1},
 1673310813116059649: {'Id': 1673310813116059649,
  'Content': 'Zo Zo Zo🔥❤️',
  'Rarest word': 486,
  'Unique words': 1},
 1673317246993661953: {'Id': 1673317246993661953,
  'Content': 'Zo dan.',
  'Rarest word': 478,
  'Unique words': 2},
 1673298820417847296: {'Id': 1673298820417847296,
  'Content': 'Dit wel ! 🙏',
  'Rarest word': 329,
  'Unique words': 2},
 1673317824272519168: {'Id': 1673317824272519168,
  'Content': 'Omg omg omg~~~~~~ https://t.co/L6Jo2yZBdc',
  'Rarest word': 255,
  'Unique words': 1},
 1673317591228776450: {'Id': 1673317591228776450,
  'Content': 'OMG OMG OMG',
  'Rarest word': 255,
  'Unique words': 1},
 1673316422582239232: {'Id': 1673316422582239232,
  'Content': 'OMG OMG OMG',
  'Rarest word': 255,
