In [46]:
import numpy as np
# import the texts as strings
from data.booknl import bookNL
from data.booken import bookEN

In [47]:
with open("data/nl/book2.txt") as file:
    bookNL = file.read()

with open("data/en/book2.txt",) as file:
    bookEN = file.read()

In [48]:
index_lookup = {
    "a": 0, "b": 1, "c": 2, "d": 3, "e": 4, "f": 5, "g": 6, "h": 7, "i": 8, "j": 9,
    "k": 10, "l": 11, "m": 12, "n": 13, "o": 14, "p": 15, "q": 16, "r": 17, "s": 18,
    "t": 19,"u": 20, "v": 21, "w": 22, "x": 23, "y": 24, "z": 25, " ": 26, "&": 27
}

unique_characters = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "@", ":", "-",
"!", "/", ".", ",", "(", ")", "[", "]", "{", "}", ";", "?", "'", '"', "*", "#", ]


In [49]:
def pre_procces(text: str) -> str:

    text = text.casefold()
    text = text.replace("_", "")
    text = text.replace(u"\n", "")
    text = text.replace(u"\xa0", "")

    for char in unique_characters:
        text = text.replace(char, "&")    
    
    return text

In [50]:
def getFreqMatrix(text: str) -> np.array:
    freq = {}

    for i in range(len(text)-1):
        bigramm = f"{text[i]}{text[i+1]}"
        freq[f"{bigramm}"] = freq[f"{bigramm}"]+1 if bigramm in freq else 1
        matrix = to_freq_matrix(freq)
    return matrix

In [51]:
def to_freq_matrix(freqdict: dict) -> np.array:
    freqmatrix = np.zeros((28, 28), dtype=int)
    for bigramm in freqdict:
        char1, char2 = bigramm
        freqmatrix[index_lookup[char1 if char1 in index_lookup else "&"]][index_lookup[char2 if char2 in index_lookup else "&"]] = freqdict[bigramm]
    return freqmatrix

In [52]:
def predict_language(sentence: str):
    """ prediction model of with language the given sentence is writen in.
    Predicts between English and Dutch. 

    Args:
        sentence (str): Given sentence to predict the language of.
    """
    # generate matrix for sentence
    matrix_sentence = getFreqMatrix(sentence)

    diff_score_nl = (matrix_bookNL * matrix_sentence).sum()
    diff_score_en = (matrix_bookEN * matrix_sentence).sum()

    print(f"NL score: {diff_score_nl}")
    print(f"EN score: {diff_score_en}")

In [63]:
# procces the english book and generate a freq matrix for it
print("Pre-proccesing english book")
bookEN = pre_procces(bookEN)

print("generating Matrix")
matrix_bookEN = getFreqMatrix(bookEN)

print("Normalizing Matrix")
matrix_bookEN = np.divide(matrix_bookEN, matrix_bookEN.sum())

print("Done")

Pre-proccesing english book
generating Matrix
Normalizing Matrix
Done


In [65]:
# procces the dutch book and generate a freq matrix for it
print("Pre-proccesing dutch book")
bookNL = pre_procces(bookNL)

print("generating Matrix")
matrix_bookNL = getFreqMatrix(bookNL)

print("Normalizing Matrix")
matrix_bookNL = np.divide(matrix_bookNL, matrix_bookNL.sum())

print("Done")

Pre-proccesing dutch book
generating Matrix
Normalizing Matrix
Done


In [66]:
predict_language("""And so it was indeed! She was now only ten inches high, and her face
                        brightened up at the thought that she was now the right size for going
                        through the little door into that lovely garden.""")

NL score: 1.338444571413386
EN score: 2.142334054986002


In [67]:
predict_language("""What a curious feeling!" said Alice. "I must be shutting up like a
telescope!""")

NL score: 0.3147689977278465
EN score: 0.3957689251306416


In [68]:
predict_language("""In de dagen van dominé Willems was dat nog zoo niet""")

NL score: 0.5300176723049735
EN score: 0.26266191555259777


In [69]:
predict_language("""I have already told you what I think of the functionaries
     in this department, but have not spoken sufficiently of the
     respectable bishop, M. Bourlier.""")

NL score: 0.8676915717721467
EN score: 1.154399975289714


In [70]:
predict_language("""Zijn scheepsvolk had bij toeval
vernomen, dat de galei door twee andere schepen beschermd werd, en
daardoor was het zoo beangstigd geworden, dat het weigerde den tocht
voort te zetten""")

NL score: 1.8442365697125926
EN score: 1.212915617181511
