In [1]:
import os

import numpy as np
import pandas as pd
from helpers import tokenize_tweet
from tqdm import tqdm
# Folder that holds the input tweets and receives the CSVs written by this notebook.
result_dir = "results"

# Read the tweet sample and preview the first rows.
tweets_path = os.path.join(result_dir, "10k_tweets_nl.csv")
tweets = pd.read_csv(tweets_path, index_col=False)
tweets.head()

Unnamed: 0.1,Unnamed: 0,Datetime,Tweet Id,Text,Username,Url
0,0,2023-06-26 13:11:32+00:00,1673318075465121792,Dus wij sturen wapens en geld naar de Oekraïne...,PMG_RIP,https://twitter.com/PMG_RIP/status/16733180754...
1,1,2023-06-26 13:11:31+00:00,1673318072550039553,Lumineon V $9 \nPWE $1 \nBMWT $3 https://t.co/...,TattooedPokemon,https://twitter.com/TattooedPokemon/status/167...
2,2,2023-06-26 13:11:31+00:00,1673318072457871360,Loodswezen Antwerpen wordt culinair centrum 23...,de_beyn,https://twitter.com/de_beyn/status/16733180724...
3,3,2023-06-26 13:11:31+00:00,1673318072365744129,"nvm, sis is here &gt;&lt; https://t.co/vgojxqaiAK",alx_yeonie,https://twitter.com/alx_yeonie/status/16733180...
4,4,2023-06-26 13:11:31+00:00,1673318071446953991,Mijn therapeut vond mijn jurk mooi vandaag ☺️,roosferrero,https://twitter.com/roosferrero/status/1673318...


# Naive word count

In [2]:
def cleaning_up(word: str) -> str:
    """Normalize a single tweet token for counting.

    The token is lower-cased. Account mentions (``@name``) and URLs
    (``http://`` or ``https://``) are dropped, and a leading ``#`` is
    stripped from hashtags.

    Args:
        word: One raw token from a tweet.

    Returns:
        The cleaned token, or an empty string when the token should be
        skipped by the caller.
    """
    word = word.lower()

    # Skip accounts
    if word.startswith("@"):
        return ""
    # Skip urls, both plain and TLS links
    if word.startswith(("http://", "https://")):
        return ""

    # Count a hashtag as the word it tags
    if word.startswith("#"):
        return word[1:]
    return word

In [3]:
# Maps each cleaned token to the number of times it occurs across all tweets.
all_words = {}

for tweet_text in tweets.Text:
    cleaned_tokens = (cleaning_up(raw_token) for raw_token in tokenize_tweet(tweet_text))
    for cleaned in cleaned_tokens:
        # cleaning_up returns an empty string for tokens that must be skipped
        if cleaned:
            all_words[cleaned] = all_words.get(cleaned, 0) + 1

In [4]:
# Report the number of tweets and the number of distinct keys in all_words.
print(f"In {len(tweets)} tweets we found {len(all_words)} unique words")

In 10000 tweets we found 28660 unique words


In [5]:
# Frequency table: one row per unique word, most frequent first.
# The index keeps the order in which words were first seen.
all_words_df = pd.DataFrame(all_words.items(), columns=["word", "occurrence"])
number_of_words = all_words_df.occurrence.sum()
all_words_df = all_words_df.sort_values("occurrence", ascending=False)

# Share of all counted words; despite the column name this is a fraction, not a percent.
word_share = all_words_df.occurrence / number_of_words
all_words_df["percentage"] = np.round(word_share, 6)
# Accumulate the unrounded shares so rounding error does not build up row after row.
all_words_df["cumulative"] = np.round(word_share.cumsum(), 6)

all_words_df.to_csv(os.path.join(result_dir, "all_words.csv"), index=False)
all_words_df.head(n=25)

Unnamed: 0,word,occurrence,percentage,cumulative
7,de,5053,0.03369,0.03369
24,van,2886,0.019242,0.052932
110,het,2675,0.017835,0.070767
4,en,2655,0.017702,0.088469
18,in,2654,0.017695,0.106164
112,een,2642,0.017615,0.123779
57,is,2000,0.013335,0.137114
14,op,1635,0.010901,0.148015
71,voor,1443,0.009621,0.157636
85,ik,1408,0.009388,0.167024


# Spacy lemma word count
Many of the words are filler; we want nouns, verbs, and adjectives. For this we use spaCy.

In [6]:
import spacy

# Dutch spaCy pipeline used for lemmatization and part-of-speech tagging.
model_name = "nl_core_news_lg"
try:
    nlp = spacy.load(model_name)
except OSError:
    # Loading failed, presumably because the model package is not installed yet.
    print(f"downloading the model {model_name}")
    spacy.cli.download(model_name)
    # Load the freshly downloaded model; without this `nlp` stays undefined
    # and the following cells fail with a NameError.
    # NOTE(review): a kernel restart may still be needed after the download - confirm.
    nlp = spacy.load(model_name)

In [7]:
doc = nlp("Er zijn 3 appels op de twee tafels")

# Show the annotations spaCy assigns to every token of the example sentence.
for token in doc:
    annotations = (
        token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
        token.shape_, token.is_alpha, token.is_stop,
    )
    print(*annotations)

# Print the lower-cased lemma of every token that is not a hashtag,
# a stop word, or a number.
for token in doc:
    is_filtered = (
        token.text.startswith("#")
        or token.is_stop
        or token.pos_ == "NUM"
    )
    if not is_filtered:
        print(token.lemma_.lower())

Er er ADV VNW|aanw|adv-pron|stan|red|3|getal advmod Xx True True
zijn zijn VERB WW|pv|tgw|mv ROOT xxxx True True
3 3 NUM TW|hoofd|prenom|stan nummod d False False
appels appel NOUN N|soort|mv|basis nsubj xxxx True False
op op ADP VZ|init case xx True True
de de DET LID|bep|stan|rest det xx True True
twee twee NUM TW|hoofd|prenom|stan nummod xxxx True False
tafels tafel NOUN N|soort|mv|basis obl xxxx True False
appel
tafel


In [8]:
# Maps each lower-cased lemma to the number of times it occurs across all tweets.
all_lemmas = {}

for tweet in tqdm(tweets.Text):
    words = tokenize_tweet(tweet)
    # Re-join the tokens so spaCy analyses the tweet as a single text.
    doc = nlp(" ".join(words))

    for token in doc:
        # Skip hashtags
        if token.text.startswith("#"):
            continue

        # Skip stop words
        if token.is_stop:
            continue

        # Skip numbers
        if token.pos_ == "NUM":
            continue

        lemma = token.lemma_.lower()

        # Skip empty and whitespace-only lemmas (e.g. "", " ", "\n")
        if not lemma.strip():
            continue

        all_lemmas[lemma] = all_lemmas.get(lemma, 0) + 1


100%|██████████| 10000/10000 [00:40<00:00, 249.67it/s]


In [9]:
# Report the number of tweets and the number of distinct keys in all_lemmas.
print(f"In {len(tweets)} tweets we found {len(all_lemmas)} unique lemma")

In 10000 tweets we found 25430 unique lemma


In [10]:
# Frequency table: one row per unique lemma, most frequent first.
# The index keeps the order in which lemmas were first seen.
all_lemmas_df = pd.DataFrame(all_lemmas.items(), columns=["word", "occurrence"])
number_of_lemmas = all_lemmas_df.occurrence.sum()
all_lemmas_df = all_lemmas_df.sort_values("occurrence", ascending=False)

# Share of all counted lemmas; despite the column name this is a fraction, not a percent.
lemma_share = all_lemmas_df.occurrence / number_of_lemmas
all_lemmas_df["percentage"] = np.round(lemma_share, 6)
# Accumulate the unrounded shares so rounding error does not build up row after row.
all_lemmas_df["cumulative"] = np.round(lemma_share.cumsum(), 6)

all_lemmas_df.to_csv(os.path.join(result_dir, "all_lemmas.csv"), index=False)
all_lemmas_df.head(n=25)

Unnamed: 0,word,occurrence,percentage,cumulative
66,gaan,625,0.007549,0.007549
291,komen,339,0.004095,0.011644
255,s,327,0.00395,0.015594
390,nieuw,325,0.003926,0.01952
173,zien,316,0.003817,0.023337
570,via,312,0.003769,0.027106
273,goed,312,0.003769,0.030875
224,mens,305,0.003684,0.034559
814,jaar,297,0.003587,0.038146
480,maken,262,0.003165,0.041311


# Compare results


In [11]:
# Number of strings that occur both as a lemma and as a naive word.
len(set(all_lemmas_df.word).intersection(all_words_df.word))

21912

In [12]:
# Lemmas that never appear verbatim among the naive word counts.
new_lemmas = set(all_lemmas_df.word) - set(all_words_df.word)
print(f"{len(new_lemmas)} new lemmas that where not words")

# Preview the most frequent of those lemmas (the frame is sorted by occurrence).
is_new_lemma = all_lemmas_df.word.isin(new_lemmas)
all_lemmas_df[is_new_lemma].head(25)

3518 new lemmas that where not words


Unnamed: 0,word,occurrence,percentage,cumulative
9920,rtlnieuw,24,0.00029,0.331248
8570,teletek,23,0.000278,0.342668
265,zoninstraling,21,0.000254,0.35754
6161,zek,20,0.000242,0.362052
6012,letlen,20,0.000242,0.364714
1750,huidig,20,0.000242,0.364956
906,verdenken,19,0.000229,0.373853
1153,zonnepane,16,0.000193,0.401525
4148,aanhouden,13,0.000157,0.4433
8490,betreffen,12,0.000145,0.448434


In [13]:
# Naive word counts for every word beginning with "rtlnieuw".
all_words_df[all_words_df["word"].str.startswith("rtlnieuw")]

Unnamed: 0,word,occurrence,percentage,cumulative
11383,rtlnieuws,30,0.0002,0.55789


In [14]:
# Lemma counts for every lemma beginning with "rtlnieuw".
all_lemmas_df[all_lemmas_df["word"].str.startswith("rtlnieuw")]

Unnamed: 0,word,occurrence,percentage,cumulative
9920,rtlnieuw,24,0.00029,0.331248
14441,rtlnieuws,6,7.2e-05,0.582055


In [15]:
# Naive words that do not occur anywhere in the lemma table.
dropped_words = set(all_words_df.word) - set(all_lemmas_df.word)
print(f"{len(dropped_words)} new words that where not lemmas")

# Preview the most frequent of those words (the frame is sorted by occurrence).
is_dropped_word = all_words_df.word.isin(dropped_words)
all_words_df[is_dropped_word].head(25)

6748 new words that where not lemmas


Unnamed: 0,word,occurrence,percentage,cumulative
7,de,5053,0.03369,0.03369
24,van,2886,0.019242,0.052932
110,het,2675,0.017835,0.070767
57,is,2000,0.013335,0.137114
14,op,1635,0.010901,0.148015
85,ik,1408,0.009388,0.167024
155,met,1320,0.008801,0.175825
23,dat,1293,0.008621,0.184446
246,je,1258,0.008388,0.192834
116,niet,1143,0.007621,0.208169
