In [152]:
import numpy as np

In [254]:
with open("data/nl/book2.txt") as file:
    bookNL = file.read()

with open("data/en/book2.txt",) as file:
    bookEN = file.read()

In [255]:
index_lookup = {
    "a": 0, "b": 1, "c": 2, "d": 3, "e": 4, "f": 5, "g": 6, "h": 7, "i": 8, "j": 9,
    "k": 10, "l": 11, "m": 12, "n": 13, "o": 14, "p": 15, "q": 16, "r": 17, "s": 18,
    "t": 19,"u": 20, "v": 21, "w": 22, "x": 23, "y": 24, "z": 25, " ": 26, "&": 27
}

unique_characters = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "@", ":", "-",
"!", "/", ",", "(", ")", "[", "]", "{", "}", ";", "?", "'", '"', "*", "#", ]


In [256]:
def pre_procces(text: str) -> str:

    text = text.casefold()
    text = text.replace("_", "")
    text = text.replace(u"\n", "")
    text = text.replace(u"\xa0", "")

    for char in unique_characters:
        text = text.replace(char, "&")    
    
    return text

In [257]:
def getFreqMatrix(text: str) -> np.array:
    freq = {}

    for i in range(len(text)-1):
        bigramm = f"{text[i]}{text[i+1]}"
        freq[f"{bigramm}"] = freq[f"{bigramm}"]+1 if bigramm in freq else 1
        matrix = to_freq_matrix(freq)
        
    return matrix

In [258]:
def to_freq_matrix(freqdict: dict) -> np.array:
    freqmatrix = np.zeros((28, 28), dtype=int)
    for bigramm in freqdict:
        char1, char2 = bigramm
        freqmatrix[index_lookup[char1 if char1 in index_lookup else "&"]][index_lookup[char2 if char2 in index_lookup else "&"]] = freqdict[bigramm]
        
    return freqmatrix

In [259]:
def predict_language(sentence: str) -> int:
    """ prediction model of with language the given sentence is writen in.
    Predicts between English and Dutch. 

    Args:
        sentence (str): Given sentence to predict the language of.

    Return:
        predicted language(int): 0(English) or 1(Dutch)
    """
    # generate matrix for sentence
    sentence = pre_procces(sentence)
    matrix_sentence = getFreqMatrix(sentence)

    diff_score_nl = (matrix_bookNL * matrix_sentence).sum()
    diff_score_en = (matrix_bookEN * matrix_sentence).sum()

    if diff_score_en > diff_score_nl:
        return 0
    else: return 1

In [260]:

def genMatrix(book: str):
    # procces the  book and generate a freq matrix for it
    print("Pre-proccesing book")
    book = pre_procces(book)

    print("generating Matrix")
    matrix_book = getFreqMatrix(book)

    print("Normalizing Matrix")
    matrix_book = np.divide(matrix_book, matrix_book.sum())

    print("Done")
    return matrix_book

In [261]:
matrix_bookNL = genMatrix(bookNL)

Pre-proccesing book
generating Matrix
Normalizing Matrix
Done


In [262]:
matrix_bookEN = genMatrix(bookEN)

Pre-proccesing book
generating Matrix
Normalizing Matrix
Done


In [267]:
def test_model(test_set: list) -> None:
    """This function tests the model with a set of unseen sentences.

    Args:
        test_set (list): list of sentences in dtring format with a target value
    """
    accuracy = []
    for sentence_set in test_set:
        if len(sentence_set[0]) > 10:
            target = sentence_set[1]
            output = predict_language(sentence=sentence_set[0])
            if output == target:
                accuracy.append(1)
            else: accuracy.append(0)
        
    print(f"Accuracy Score: {(sum(accuracy)/len(accuracy))*100}%")
            

In [268]:
def gen_test_set(file_path: str, target: int):
    test_set = ""
    # generate a test set
    with open(file_path) as file:
        test_set = file.read()
        test_set = pre_procces(test_set)
        test_set = test_set.split(".")
        test_set = list(zip(test_set, [target]*len(test_set)))

    return test_set

In [271]:
test_set_en = gen_test_set("data/en/book1.txt", 0)
print(f"Lenght of data set: {len(test_set_en)}")
test_model(test_set_en)

Lenght of data set: 401
Accuracy Score: 97.98488664987406%


In [272]:
test_set_nl = gen_test_set("data/nl/book3.txt", 1)
print(f"Lenght of data set: {len(test_set_nl)}")
test_model(test_set_nl)


Lenght of data set: 1462
Accuracy Score: 99.57356076759062%


In [234]:
predict_language("""And so it was indeed! She was now only ten inches high, and her face
                        brightened up at the thought that she was now the right size for going
                        through the little door into that lovely garden.""")

0

In [235]:
predict_language("""What a curious feeling!" said Alice. "I must be shutting up like a
telescope!""")

0

In [236]:
predict_language("""In de dagen van dominé Willems was dat nog zoo niet""")

1

In [237]:
predict_language("""I have already told you what I think of the functionaries
     in this department, but have not spoken sufficiently of the
     respectable bishop, M. Bourlier.""")

0

In [238]:
predict_language("""Zijn scheepsvolk had bij toeval
vernomen, dat de galei door twee andere schepen beschermd werd, en
daardoor was het zoo beangstigd geworden, dat het weigerde den tocht
voort te zetten""")

1