In [1]:
import os
import urllib
from zipfile import ZipFile


def download_wili():
    """
    Downloads WiLI-2018 if not downloaded.

    Does nothing when the directory 'data/WiLI-2018' already exists.
    Otherwise the zip archive is fetched into 'data/' (created if
    missing), extracted to 'data/WiLI-2018', and then deleted.
    """
    # A bare `import urllib` does not load the `request` submodule;
    # import it explicitly instead of relying on some other package
    # having imported it already.
    import urllib.request

    if os.path.isdir('data/WiLI-2018'):
        return

    # urlretrieve does not create missing parent directories.
    os.makedirs('data', exist_ok=True)
    urllib.request.urlretrieve(
        'https://zenodo.org/record/841984/files/wili-2018.zip',
        'data/wili-2018.zip')
    with ZipFile('data/wili-2018.zip', 'r') as zf:
        zf.extractall('data/WiLI-2018')
    os.remove('data/wili-2018.zip')


def load_wili(include=None, data_dir='data/WiLI-2018'):
    """
    Loads WiLI-2018 data, using provided train/test split. If
    'include' parameter is None, all labels (languages) will be
    loaded. If it's a list of strings, only instances with
    matching labels will be preserved. Returns a list of training
    label/sentence tuples and a list of testing label/sentence
    tuples.

    'data_dir' is the directory holding x_train.txt, y_train.txt,
    x_test.txt and y_test.txt. Every line is stripped of
    surrounding whitespace. Raises ValueError if 'include' is
    neither None nor a list of strings; this is checked before any
    file is read.
    """
    # Validate first so a bad argument fails fast instead of after
    # four corpus files have been read.
    if include is None:
        wanted = None
    elif isinstance(include, list) and all(isinstance(i, str) for i in include):
        # A set gives constant-time membership tests per instance.
        wanted = set(include)
    else:
        raise ValueError("'include' parameter must be None or list of strs.")

    train = zip(_read_stripped_lines(os.path.join(data_dir, 'y_train.txt')),
                _read_stripped_lines(os.path.join(data_dir, 'x_train.txt')))
    test = zip(_read_stripped_lines(os.path.join(data_dir, 'y_test.txt')),
               _read_stripped_lines(os.path.join(data_dir, 'x_test.txt')))

    if wanted is None:
        return list(train), list(test)
    return ([(label, sent) for label, sent in train if label in wanted],
            [(label, sent) for label, sent in test if label in wanted])


def _read_stripped_lines(path):
    """
    Returns the lines of the text file at 'path', each stripped of
    surrounding whitespace.
    """
    # Decode explicitly as UTF-8: the corpus is multilingual and the
    # platform default encoding is not UTF-8 everywhere.
    with open(path, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f]


def save_fasttext(train, test,
                  trainfile='data/WiLI-2018/wili.train.txt',
                  testfile='data/WiLI-2018/wili.test.txt'):
    """
    Transforms a list of training label/sentence tuples and a
    list of testing label/sentence tuples to the format
    required by fastText and saves them to disk.

    Each instance becomes one line of the form
    '__label__<label> <sentence>'. Lines are separated by a
    newline, with no trailing newline after the last one. Files
    are written as UTF-8; existing files are overwritten.
    """
    _write_fasttext(train, trainfile)
    _write_fasttext(test, testfile)


def _write_fasttext(pairs, path):
    """
    Writes label/sentence pairs to 'path', one
    '__label__<label> <sentence>' line per pair.
    """
    # Encode explicitly as UTF-8 so the result does not depend on
    # the platform default encoding.
    with open(path, 'w', encoding='utf-8') as f:
        f.write('\n'.join('__label__' + label + ' ' + sent
                          for label, sent in pairs))

        
# Fetch the corpus (skipped when data/WiLI-2018 is already present).
download_wili()

# Languages to keep; see labels.csv in the WiLI data for all
# available label codes.
LANGUAGES = [
    'eng', 'spa', 'fra', 'ita', 'por', 'deu', 'nld',
    'dan', 'nno', 'fin',
    'ces', 'bul', 'pol', 'ron', 'slk', 'hun', 'rus',
]

# Load only the selected languages and write them out in fastText format.
train, test = load_wili(LANGUAGES)
save_fasttext(train, test)

In [2]:
import fasttext

# Hyperparameters for the supervised fastText classifier. Meanings
# below follow the fastText documentation.
hyperparams = dict(
    lr=0.5,      # learning rate
    epoch=30,    # number of training epochs
    minCount=3,  # minimal number of word occurrences
    dim=50,      # size of the word vectors
    maxn=3,      # max length of character n-grams
    # wordNgrams is left at its default; a run with wordNgrams=3
    # is disabled here.
)
model = fasttext.train_supervised(
    'data/WiLI-2018/wili.train.txt', **hyperparams)

# Evaluate on the test split. fastText documents the result as
# (number of samples, precision@1, recall@1).
model.test('data/WiLI-2018/wili.test.txt')

(8500, 0.9841176470588235, 0.9841176470588235)