In [1]:
import info9

from pathlib import Path

import pandas as pd
import numpy as np


In [2]:
LAMBDA = 1
IOB_LABELS = ["O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]

ftrain = Path("../data/eng.train.conll")
ftest = Path("../data/eng.testa.conll")

In [3]:
DICT_LABELS = {l : i for (i,l) in enumerate(IOB_LABELS)}

In [4]:
def make_bag_of_word(fname, min_val=10):
    """
    Renvoie un dictionnaire avec les mots les plus fréquents JUSTE AVANT un mot
    marqué autre chose qu'I-PER. On compte les mots sur les max_row premières
    lignes du fichier, en excluant les lignes marquées "-DOCSTART-", et on renvoie
    un dictionnaire contenant tous les mots avec au moins min_val occurences.
    """
    df = pd.read_csv(fname, sep=" ", names=["word", "L1", "L2", "L3"])
    df.drop(df[df.word == "-DOCSTART-"].index, inplace=True)
    
    word_count = {w: 0 for w in df.word.unique()}
    
    precedw = "."
    for index, row in df.iterrows():
        if(row["L3"] != "O"):
            word_count[precedw] += 1
        precedw = row.word.tolow
    return {x[0] : i for (i,x) in enumerate(
                sorted(
                    filter(lambda x: x[1] > min_val, word_count.items()),
                    key=lambda x: x[1], reverse=True
                )
              )
           }

In [5]:
BAG_OF_WORDS = make_bag_of_word(ftrain, min_val=3)
print(len(BAG_OF_WORDS))
BAG_OF_WORDS

1053


{'.': 0,
 ',': 1,
 '(': 2,
 'the': 3,
 'of': 4,
 '-': 5,
 'in': 6,
 'and': 7,
 ')': 8,
 ':': 9,
 ' ': 10,
 '1': 11,
 'The': 12,
 'to': 13,
 'beat': 14,
 "'s": 15,
 'a': 16,
 '0': 17,
 '--': 18,
 '2': 19,
 '3': 20,
 'said': 21,
 'at': 22,
 '4': 23,
 'by': 24,
 'from': 25,
 'New': 26,
 'with': 27,
 'for': 28,
 'AT': 29,
 'South': 30,
 'on': 31,
 'Minister': 32,
 'President': 33,
 'NEW': 34,
 'that': 35,
 '6': 36,
 '2.': 37,
 '3.': 38,
 '1.': 39,
 'v': 40,
 'U.S.': 41,
 '5': 42,
 'against': 43,
 'United': 44,
 'told': 45,
 '1996-08-28': 46,
 'National': 47,
 '7': 48,
 'b': 49,
 'In': 50,
 '1/2': 51,
 '4.': 52,
 '5.': 53,
 '6.': 54,
 '/': 55,
 'West': 56,
 '1996-08-27': 57,
 'as': 58,
 'Czech': 59,
 'between': 60,
 'Mark': 61,
 'St': 62,
 'former': 63,
 'Michael': 64,
 'Hong': 65,
 '1996-08-29': 66,
 'World': 67,
 '9': 68,
 '7.': 69,
 'Sri': 70,
 '1996-08-26': 71,
 'an': 72,
 'John': 73,
 'SAN': 74,
 '8.': 75,
 '1996-08-22': 76,
 'A': 77,
 'over': 78,
 'vs.': 79,
 'European': 80,
 'Paul': 

In [None]:
def word_vector(word):
    """
    Renvoie une représentation du mot `word` par un vecteur encodant:
    - sa longueur
    - si sa première lettre est une majuscule
    - si sa deuxième lettre est une majuscule
    """
    v = np.zeros(3)
    v[0] = len(word)
    if len(word) > 0:
        v[1] = word[0].isupper()
    if len(word) > 1:
        v[2] = word[1].isupper()
    return v

def bow_vect(word_preced):
    v = np.zeros(len(BAG_OF_WORDS))
    if word_preced in BAG_OF_WORDS:
        v[BAG_OF_WORDS[word_preced]] = 1
    return v

In [None]:
N_DIMS_ORIG = 3 * 3 + len(BAG_OF_WORDS)
projecter = info9.RandomProjection(N_DIMS_ORIG, 64, "Gaussian")

def make_dataset(fname, binary=False, label_of_interest=4):
    df = pd.read_csv(fname, sep=" ", names=["word", "L1", "L2", "L3"])
    df.drop(df[df.word == "-DOCSTART-"].index, inplace=True)
    
    true_labels = np.array([DICT_LABELS[l] for l in df["L3"] if str(l) != "nan"])
    if (binary):
        true_labels = (true_labels == label_of_interest).astype(int)
    datapoints = np.zeros((df.shape[0], N_DIMS_ORIG))
    # words contient 3 mots :
    #    [<mot précédent> <mot en traitement> <mot suivant>]
    # vects contient les représentations par `word_vect` de ces trois mots
    words = ["", ".", df.loc[0]["word"]]
    vects = [word_vector(w) for w in words]

    i = 0
    for index, row in df.loc[2:].iterrows():
        #  Lecture du mot suivant et insertion dans les tableaux
        nextw = str(row["word"])
        words = *words[1:], nextw
        vects = *vects[1:], word_vector(nextw)
        # Les mots vides n'ont pas de label : ne pas les mettre dans les datapoints
        if(words[1] != ""):
            # [<lg/maj mot preced> <lg/maj mot> <lg/maj mot suiv> <bag of words mot preced>]
            datapoints[i] = np.concatenate([*vects, bow_vect(words[0])])
    return info9.Dataset(datapoints, true_labels)

In [None]:
dataset_train = make_dataset(ftrain, binary=True, label_of_interest=4)
dataset_test = make_dataset(ftest, binary=True,label_of_interest=4)
print(dataset_train.show(False))

In [None]:
classifier = info9.LogisticReg(dataset_train, LAMBDA, 0.5)
classifier.fit_sgd_rmsprop(1)

In [None]:
confusion_matrix = classifier.estimate_all(dataset_test)

In [None]:
print("Full confusion matrix after multilabel k-nn classification:\n")
print(confusion_matrix.PrintEvaluation())

## With word embedding

In [None]:
# Load glove word embedding dataset
EMBEDDINGS_DICT = {}
with open("glove_pretrained/glove.6B.50d.txt", 'r') as f:
    for line in f:
        values = line.split()
        word = values[0]
        EMBEDDINGS_DICT[word] = np.asarray(values[1:], "float32")

In [None]:
EMBEDDINGS_DICT.get("to")

In [None]:
N_ROWS_TRAIN = 100000

DIM_EMBEDDING = len(next(iter(EMBEDDINGS_DICT.values())))
N_DIMS_ORIG = 3 * 3 + 3 * DIM_EMBEDDING

def make_dataset_word_embedding(fname, binary=False, label_of_interest=4):
    df = pd.read_csv(fname, sep=" ", names=["word", "L1", "L2", "L3"])
    df.drop(df[df.word == "-DOCSTART-"].index, inplace=True)
    
    true_labels = np.array([DICT_LABELS[l] for l in df["L3"] if str(l) != "nan"])
    if (binary):
        true_labels = (true_labels == label_of_interest).astype(int)
    datapoints = np.zeros((N_ROWS_TRAIN, N_DIMS_ORIG))
    # words contient 3 mots :
    #    [<mot précédent> <mot en traitement> <mot suivant>]
    # vects contient les représentations par `word_vect` de ces trois mots
    words = ["", ".", df.loc[0]["word"]]
    vects = [word_vector(w) for w in words]

    i = 0
    for index, row in df.loc[2:].iterrows():
        #  Lecture du mot suivant et insertion dans les tableaux
        nextw = str(row["word"])
        words = *words[1:], nextw
        vects = *vects[1:], word_vector(nextw)
        # Les mots vides n'ont pas de label : ne pas les mettre dans les datapoints
        if(words[1] != ""):
            # [<lg/maj mot preced> <lg/maj mot> <lg/maj mot suiv> <bag of words mot preced>]
            embeddings = [EMBEDDINGS_DICT.get(words[i].lower(), np.zeros((DIM_EMBEDDING,))) for i in range(3)]
            datapoints[i] = np.concatenate([*vects, *embeddings])
            i += 1
            if i >= N_ROWS_TRAIN:
                break
    return info9.Dataset(datapoints, true_labels)

In [None]:
dataset_train = make_dataset_word_embedding(ftrain, binary=False, label_of_interest=4)
dataset_test = make_dataset_word_embedding(ftest, binary=False,label_of_interest=4)
print(dataset_train.show(False))

In [None]:
classifier = info9.LogisticRegMultinomial(dataset_train, 30, IOB_LABELS)
classifier.fit_sgd(1,0.1)
classifier.J()

In [None]:
confusion_matrix = classifier.estimate_all(dataset_test)

In [None]:
print("Full confusion matrix :\n")
print(confusion_matrix.PrintEvaluation())

In [None]:
classifier.estimate(dataset_train.getInstance(223))