In [10]:
from collections import Counter
import urllib.request
from lxml import etree
 
import numpy as np
 
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from sklearn import model_selection

In [11]:
# Characters a word may consist of: a-z, the Finnish letters ä and ö, and the hyphen.
alphabet = "abcdefghijklmnopqrstuvwxyzäö-"
# The same characters as a set, for membership tests.
alphabet_set = {symbol for symbol in alphabet}

In [12]:
def load_finnish(load_from_net=False):
    """Load the Kotus Finnish word list and return its entries.

    The word list is an XML document; the text of every element matched by
    the XPath ``/kotus-sanalista/st/s`` is collected.

    Parameters
    ----------
    load_from_net : bool, optional
        When true, download the XML from the course server instead of
        reading the local copy at ``src/kotus-sanalista_v1.xml``.
        Defaults to ``False``, which was previously hard-coded.

    Returns
    -------
    list
        The ``.text`` of each matched ``s`` element, in document order.
    """
    finnish_url = "https://www.cs.helsinki.fi/u/jttoivon/dap/data/kotus-sanalista_v1/kotus-sanalista_v1.xml"
    filename = "src/kotus-sanalista_v1.xml"
    if load_from_net:
        with urllib.request.urlopen(finnish_url) as data:
            # Keep the payload as bytes, exactly like the local-file branch:
            # lxml refuses a decoded str that still carries an XML encoding
            # declaration, and can detect the encoding from bytes itself.
            doc = data.read()
    else:
        with open(filename, "rb") as data:
            doc = data.read()
    tree = etree.XML(doc)
    s_elements = tree.xpath('/kotus-sanalista/st/s')
    return [s.text for s in s_elements]

In [13]:
def load_english():
    """Read the English word list from ``src/words``.

    Returns
    -------
    list of str
        One entry per line of the file, with trailing whitespace
        (including the newline) removed.
    """
    with open("src/words", encoding="utf-8") as data:
        # Build a real list rather than a one-shot ``map`` iterator, so the
        # result can be measured, indexed and iterated more than once, just
        # like the list returned by ``load_finnish``.
        return [line.rstrip() for line in data]

In [14]:
def get_features(a):
    """Turn an iterable of words into a dense matrix of character counts.

    A character-level ``CountVectorizer`` is built with the module-level
    ``alphabet`` as its fixed vocabulary, applied to ``a``, and the sparse
    result is converted to a dense array before being returned.
    """
    from sklearn.feature_extraction.text import CountVectorizer

    vectorizer = CountVectorizer(analyzer="char", vocabulary=alphabet)
    return vectorizer.transform(a).toarray()

In [15]:
def contains_valid_chars(s):
    """Return True when every character of ``s`` belongs to ``alphabet_set``.

    An empty ``s`` has no offending character and therefore yields True.
    """
    return all(symbol in alphabet_set for symbol in s)

In [16]:
def get_features_and_labels():
    """Build the feature matrix and label vector for the language classifier.

    Finnish words (from ``load_finnish``) are lowercased and kept only when
    all their characters are in ``alphabet_set``. English words (from
    ``load_english``) are first restricted to those whose first character is
    lowercase, then lowercased and filtered against ``alphabet_set`` in the
    same way. Both word lists are converted with ``get_features``.

    Returns
    -------
    X : numpy.ndarray
        Finnish feature rows stacked on top of the English feature rows.
    y : numpy.ndarray
        Integer labels aligned with the rows of ``X``:
        0 for Finnish words, 1 for English words.
    """
    print("Processing finnish words")
    finnish = [word.lower() for word in load_finnish()]
    finnish = [word for word in finnish if alphabet_set.issuperset(word)]
    finnish_features = get_features(np.array(finnish))

    print("Processing english words")
    # word[:1] rather than word[0]: a blank line in the word file yields an
    # empty string, which must be skipped instead of raising IndexError.
    english = [word for word in load_english() if word[:1].islower()]
    english = [word.lower() for word in english]
    english = [word for word in english if alphabet_set.issuperset(word)]
    english_features = get_features(np.array(english))

    # The two languages need distinct labels; labelling both as 0 would give
    # the classifier a single-class target.
    labels = np.concatenate((
        np.zeros(len(finnish), dtype=int),
        np.ones(len(english), dtype=int),
    ))
    print("Both english and finnish words processed")

    # vstack appends rows, so the two matrices may have different row counts
    # (np.stack would require identical shapes and fail here).
    features = np.vstack((finnish_features, english_features))
    print(finnish_features.shape, english_features.shape, features.shape)
    print("Exiting the function")
    return features, labels

In [17]:
def word_classification():
    """Cross-validate a multinomial naive Bayes Finnish/English classifier.

    Features and labels come from ``get_features_and_labels``. The model is
    evaluated with 5-fold cross-validation using a shuffled ``KFold`` split
    with a fixed seed, so results are repeatable.

    Returns
    -------
    numpy.ndarray
        The per-fold scores returned by ``cross_val_score`` (also printed).
    """
    X, y = get_features_and_labels()

    # The splitter must actually be handed to cross_val_score; constructing
    # it without using it leaves the shuffle and the seed without effect.
    folds = model_selection.KFold(n_splits=5, shuffle=True, random_state=0)

    model = MultinomialNB()
    scores = cross_val_score(model, X, y, cv=folds)

    print(scores)
    return scores

In [19]:
# Continue downloading the wordlist...

# print("Accuracy scores are:", word_classification())