In [25]:
from collections import Counter
import urllib.request
from lxml import etree
 
import numpy as np
import pandas as pd
 
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from sklearn import model_selection

In [2]:
# Characters a word may consist of: lowercase a-z, the letters ä and ö, and the hyphen.
# get_features() uses this string, in this order, as the feature columns.
alphabet="abcdefghijklmnopqrstuvwxyzäö-"
# Set form of the same characters, used for membership tests in contains_valid_chars().
alphabet_set = set(alphabet)

In [10]:
# Returns a list of Finnish words
def load_finnish(load_from_net=False):
    """Return the words of the Kotus Finnish word list.

    Parameters
    ----------
    load_from_net : bool, default False
        If True, download the XML document from ``finnish_url``; otherwise
        read it from the local file ``kotus.txt``.

    Returns
    -------
    list
        The text of every ``/kotus-sanalista/st/s`` element, in document
        order. An element without text contributes ``None``.
    """
    finnish_url="https://www.cs.helsinki.fi/u/jttoivon/dap/data/kotus-sanalista_v1/kotus-sanalista_v1.xml"
    filename="kotus.txt"
    # Both branches hand lxml the raw bytes. etree.XML() refuses a str that
    # carries an XML encoding declaration, so the download must not be decoded
    # to text first; the parser picks the encoding up from the document itself.
    if load_from_net:
        with urllib.request.urlopen(finnish_url) as data:
            doc = data.read()
    else:
        with open(filename, "rb") as data:
            doc = data.read()
    tree = etree.XML(doc)
    s_elements = tree.xpath('/kotus-sanalista/st/s')
    return [s.text for s in s_elements]

In [19]:
# Returns a list of English words
def load_english():
    """Return the English words read from ``words.txt`` as a list.

    Each line of the UTF-8 file becomes one entry with trailing whitespace
    (including the newline) removed; blank lines become empty strings.
    A real list is returned rather than a lazy ``map`` object, so the result
    can be iterated more than once and supports ``len()`` and indexing.
    """
    with open("words.txt", encoding="utf-8") as data:
        return [line.rstrip() for line in data]

In [20]:
# Returns feature matrix
def get_features(a):
    """Build the character-count feature matrix for the words in ``a``.

    Row i describes the i-th word: one column per character of ``alphabet``
    (in alphabet order) holding how often that character occurs in the word.
    Characters that do not occur count as 0. The result is a NumPy array.
    """
    char_counts = [Counter(word) for word in a]
    counts_frame = pd.DataFrame(data=char_counts, index=a, columns=list(alphabet))
    # A character missing from a word comes out as NaN; make it a zero count.
    return counts_frame.fillna(0).to_numpy()

In [21]:
# Returns True, if all the chars are valid
def contains_valid_chars(s):
    """Tell whether ``s`` consists solely of characters from ``alphabet_set``.

    Returns True when every character of ``s`` is in the set (therefore also
    for an empty ``s``); returns False at the first character that is not.
    """
    return all(char in alphabet_set for char in s)

In [22]:
def get_features_and_labels():
    """Build the feature matrix and label vector for the word classifier.

    Finnish words are lowercased and kept when they contain only valid
    characters. English words are kept when their first character is
    lowercase (which drops capitalised entries) and their lowercased form
    contains only valid characters. Empty or missing entries are skipped in
    both lists.

    Returns
    -------
    features_all : numpy.ndarray
        Output of ``get_features`` for the kept Finnish words followed by
        the kept English words.
    labels_all : numpy.ndarray
        One label per row of ``features_all``: 0 for Finnish, 1 for English.
    """
    words_fin_ = []
    for word in load_finnish():
        # An element without text yields None; there is nothing to classify.
        if not word:
            continue
        word = word.lower()
        if contains_valid_chars(word):
            words_fin_.append(word)

    words_eng_ = []
    for word in load_english():
        # Check for emptiness first: a blank line gives "" and word[0]
        # would otherwise raise IndexError.
        if not word or not word[0].islower():
            continue
        lowered = word.lower()
        if contains_valid_chars(lowered):
            words_eng_.append(lowered)

    features_all = get_features(words_fin_ + words_eng_)
    labels_all = np.hstack([[0]*len(words_fin_), [1]*len(words_eng_)])

    return features_all, labels_all

In [23]:
def word_classification():
    """Cross-validate a multinomial naive Bayes classifier on the word data.

    The features and labels come from ``get_features_and_labels``. The data
    is split into 5 shuffled folds with a fixed random state, and the array
    of per-fold scores computed by ``cross_val_score`` is returned.
    """
    features, labels = get_features_and_labels()
    folds = model_selection.KFold(n_splits=5, shuffle=True, random_state=0)
    return cross_val_score(MultinomialNB(), features, labels, cv=folds)

In [27]:
# Print the scores returned by word_classification(), one per cross-validation fold
print(word_classification())

[0.89370104 0.89678673 0.89758288 0.89685042 0.89643642]
