In [11]:
import os

import numpy as np
import pandas as pd
from helpers import tokenize_tweet
from tqdm import tqdm
result_dir = "results"
tweets = pd.read_csv(os.path.join(result_dir, "10k_tweets_nl.csv"), index_col=False)
tweets.head()

Unnamed: 0.1,Unnamed: 0,Datetime,Tweet Id,Text,Username,Url
0,0,2023-06-26 13:11:32+00:00,1673318075465121792,Dus wij sturen wapens en geld naar de Oekraïne...,PMG_RIP,https://twitter.com/PMG_RIP/status/16733180754...
1,1,2023-06-26 13:11:31+00:00,1673318072550039553,Lumineon V $9 \nPWE $1 \nBMWT $3 https://t.co/...,TattooedPokemon,https://twitter.com/TattooedPokemon/status/167...
2,2,2023-06-26 13:11:31+00:00,1673318072457871360,Loodswezen Antwerpen wordt culinair centrum 23...,de_beyn,https://twitter.com/de_beyn/status/16733180724...
3,3,2023-06-26 13:11:31+00:00,1673318072365744129,"nvm, sis is here &gt;&lt; https://t.co/vgojxqaiAK",alx_yeonie,https://twitter.com/alx_yeonie/status/16733180...
4,4,2023-06-26 13:11:31+00:00,1673318071446953991,Mijn therapeut vond mijn jurk mooi vandaag ☺️,roosferrero,https://twitter.com/roosferrero/status/1673318...


# Naive word count

In [31]:
def cleaning_up(word) -> str:
    word = word.lower()

    # Skip accounts
    if word.startswith("@"):
        return ""
    # Skip urls
    if word.startswith(r"https://"):
        return ""
    
    if word.startswith("#"):
        return word[1:]
    return word

In [32]:
all_words = {}

for tweet in tweets.Text:
    words = tokenize_tweet(tweet)
    for word in words:
        word = cleaning_up(word)
        # Skip emtpy strings
        if len(word) == 0:
            continue
        all_words[word] = all_words.get(word, 0) + 1

In [33]:
print(f"In {len(tweets)} tweets we found {len(all_words)} unique words")

In 10000 tweets we found 28660 unique words


In [34]:
all_words_df = pd.DataFrame(all_words.items(), columns=["word", "occurrence"])
number_of_words = np.sum(all_words_df.occurrence)
all_words_df.reset_index(drop=True)
all_words_df = all_words_df.sort_values("occurrence", ascending=False)
all_words_df["percentage"] = np.round(all_words_df.occurrence / number_of_words, 6)
all_words_df["cumulative"] = all_words_df.percentage.cumsum()
all_words_df.to_csv(os.path.join(result_dir, "all_words.csv"), index=False)
all_words_df.head(n=25)

Unnamed: 0,word,occurrence,percentage,cumulative
7,de,5053,0.03369,0.03369
24,van,2886,0.019242,0.052932
110,het,2675,0.017835,0.070767
4,en,2655,0.017702,0.088469
18,in,2654,0.017695,0.106164
112,een,2642,0.017615,0.123779
57,is,2000,0.013335,0.137114
14,op,1635,0.010901,0.148015
71,voor,1443,0.009621,0.157636
85,ik,1408,0.009388,0.167024


# Spacy lemma word count
A lot of the words are filler, we want nouns, verbs, and adjectives. For this we use spacy

In [35]:
import spacy
model_name = "nl_core_news_lg"
try:
    nlp = spacy.load(model_name)
except OSError:
    print(f"downloading the model {model_name}")
    spacy.cli.download(model_name)

In [36]:
doc = nlp("Mijn appel is op de tafel")

for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)

Mijn mijn PRON VNW|bez|det|stan|vol|1|ev|prenom|zonder|agr nmod:poss Xxxx True True
appel appel NOUN N|soort|ev|basis|zijd|stan nsubj xxxx True False
is zijn AUX WW|pv|tgw|ev cop xx True True
op op ADP VZ|init case xx True True
de de DET LID|bep|stan|rest det xx True True
tafel tafel NOUN N|soort|ev|basis|zijd|stan ROOT xxxx True False


In [37]:
doc = nlp("Mijn #appels is op de tafel")

for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)

Mijn mijn PRON VNW|bez|det|stan|vol|1|ev|prenom|zonder|agr ROOT Xxxx True True
# # PUNCT LET punct # False False
appels appel NOUN N|soort|mv|basis nsubj xxxx True False
is zijn AUX WW|pv|tgw|ev cop xx True True
op op ADP VZ|init case xx True True
de de DET LID|bep|stan|rest det xx True True
tafel tafel NOUN N|soort|ev|basis|zijd|stan ROOT xxxx True False


In [38]:
all_lemmas = {}

for tweet in tqdm(tweets.Text):
    words = tokenize_tweet(tweet)
    doc = nlp(" ".join(words))

    for token in doc:
        # Don't break up hashtags
        if token.text.startswith("#"):
           continue

        lemma = token.lemma_.lower()
        all_lemmas[lemma] = all_lemmas.get(lemma, 0) + 1

100%|██████████| 10000/10000 [01:01<00:00, 162.89it/s]

In 10000 tweets we found 26269 unique lemma





In [40]:
print(f"In {len(tweets)} tweets we found {len(all_lemmas)} unique lemma")

In 10000 tweets we found 26269 unique lemma


In [39]:
all_lemmas_df = pd.DataFrame(all_lemmas.items(), columns=["word", "occurrence"])
number_of_lemmas = np.sum(all_lemmas_df.occurrence)
all_lemmas_df.reset_index(drop=True)
all_lemmas_df = all_lemmas_df.sort_values("occurrence", ascending=False)
all_lemmas_df["percentage"] = np.round(all_lemmas_df.occurrence / number_of_lemmas, 6)
all_lemmas_df["cumulative"] = all_lemmas_df.percentage.cumsum()
all_lemmas_df.to_csv(os.path.join(result_dir, "all_lemmas.csv"), index=False)
all_lemmas_df.head(n=25)

Unnamed: 0,word,occurrence,percentage,cumulative
7,de,5090,0.033911,0.033911
57,zijn,3544,0.023611,0.057522
24,van,2886,0.019227,0.076749
4,en,2675,0.017822,0.094571
107,het,2675,0.017822,0.112393
18,in,2657,0.017702,0.130095
109,een,2593,0.017275,0.14737
14,op,1635,0.010893,0.158263
70,voor,1457,0.009707,0.16797
83,ik,1408,0.009381,0.177351


# Grammar correction

In [1]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("cabir40/t5-dutch-grammar-correction")

model = AutoModelForSeq2SeqLM.from_pretrained("cabir40/t5-dutch-grammar-correction")

  from .autonotebook import tqdm as notebook_tqdm
Downloading (…)lve/main/config.json: 100%|██████████| 777/777 [00:00<?, ?B/s] 
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading pytorch_model.bin: 100%|██████████| 990M/990M [15:23<00:00, 1.07MB/s] 


In [9]:
input_text = "Mn kat zitten op de gront."
input_ids = tokenizer(input_text, return_tensors="pt").input_ids

outputs = model.generate(input_ids)
print(tokenizer.decode(outputs[0]))

<pad> Mn kat zit op de grond.</s>


In [12]:
corrected_tweets = []

for tweet in tqdm(tweets.Text[:30]):
    input_ids = tokenizer(tweet, return_tensors="pt").input_ids

    outputs = model.generate(input_ids)
    corrected_tweets.append(tokenizer.decode(outputs[0]))

  0%|          | 12/10000 [00:30<6:57:10,  2.51s/it]


KeyboardInterrupt: 

In [13]:
corrected_tweets

['<pad> Dus wij sturen wapens en geld naar de Oekraïne om vervolgens de North',
 '<pad> $9 PWE $1 BMWT $3 ht',
 '<pad> Brugge verraste 26/6/2023.</s>',
 '<pad> &gt; https://t.co/',
 '<pad> Mijn therapeut vond mijn jurk mooi vandaag ☺️.</s>',
 '<pad> Tegenwoordig worden namen van componisten tegenwoordig voor de raarste dingen gebruikt.',
 '<pad> Ik haat jullie zo erg!</s>',
 '<pad> 3 https://t.co/KAha6',
 '<pad> DRAGGENIUS.</s>',
 '<pad> Ik heb nogsteeds mn weetabix die ik voor ramadan',
 '<pad> Geitenbaard (Aruncus) #watgroeit ',
 '<pad> De lach van de euro https://t.co/']