In [1]:
import info9

from pathlib import Path

import pandas as pd
import numpy as np


In [62]:
# K is passed as the first argument to info9.KnnClassificationBinary
# (presumably the number of neighbours -- TODO confirm against info9).
K = 10

# Entity tags; a tag's position in this list is used as its integer label.
LABELS = ["O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]
# Row caps for dataset construction (100000 was also tried for training).
# NOTE(review): N_ROWS_TEST is not referenced anywhere in the visible cells.
N_ROWS_TRAIN = 20000
N_ROWS_TEST  = 10000

# Paths of the files used to build the training and the evaluation datasets.
ftrain = Path("../data/eng.train.conll")
ftest = Path("../data/eng.testa.conll")

In [3]:
# Reverse lookup for LABELS: tag string -> integer position in the list.
DICT_LABELS = dict(zip(LABELS, range(len(LABELS))))

In [37]:
def make_bag_of_word(fname, max_row, min_val=10):
    """
    Build a vocabulary of the words that most often directly precede a tagged word.

    The file is read as space-separated columns (word, L1, L2, L3) and rows
    whose word is "-DOCSTART-" are discarded. For every remaining row whose L3
    tag is anything other than "O", the word of the previous remaining row is
    counted once. The very first row is treated as if it followed ".".

    Parameters
    ----------
    fname : str or path-like
        File to read.
    max_row : int
        Only rows located within the first `max_row` parsed rows of the file
        are scanned ("-DOCSTART-" rows count towards this limit even though
        they are skipped).
    min_val : int, default 10
        A word is kept only if it was counted strictly more than `min_val`
        times.

    Returns
    -------
    dict
        Maps each kept word to its rank, 0 being the most frequent. Words with
        equal counts are ranked in order of first appearance in the file.
    """
    df = pd.read_csv(fname, sep=" ", names=["word", "L1", "L2", "L3"])
    df = df[df.word != "-DOCSTART-"]

    # Seed the counters in order of first appearance: the sort below is stable,
    # so this is what decides the rank of words with equal counts.
    word_count = {w: 0 for w in df.word.unique()}

    # Index labels are still the positions in the parsed file, so this keeps
    # the rows that fall inside the first `max_row` rows of the file.
    scanned = df[df.index < max_row]

    precedw = "."
    for word, tag in zip(scanned["word"], scanned["L3"]):
        if tag != "O":
            # `.get` because the "." seed is not guaranteed to be a word of the file.
            word_count[precedw] = word_count.get(precedw, 0) + 1
        precedw = word

    frequent = [(w, count) for w, count in word_count.items() if count > min_val]
    frequent.sort(key=lambda item: item[1], reverse=True)
    return {w: rank for rank, (w, _) in enumerate(frequent)}

In [38]:
# Build the vocabulary of predecessor words from the training file, keeping
# only words counted strictly more than 3 times, then display it.
BAG_OF_WORDS = make_bag_of_word(ftrain, max_row=N_ROWS_TRAIN, min_val=3)
BAG_OF_WORDS

{'.': 0,
 ',': 1,
 '(': 2,
 'the': 3,
 'of': 4,
 '-': 5,
 'in': 6,
 'and': 7,
 ')': 8,
 ':': 9,
 ' ': 10,
 '1': 11,
 'The': 12,
 'to': 13,
 'beat': 14,
 "'s": 15,
 '0': 16,
 'a': 17,
 '2': 18,
 '--': 19,
 '3': 20,
 'said': 21,
 '4': 22,
 'at': 23,
 'by': 24,
 'from': 25,
 'New': 26,
 'with': 27,
 'for': 28,
 'AT': 29,
 'South': 30,
 'Minister': 31,
 'President': 32,
 'on': 33,
 'NEW': 34,
 '6': 35,
 'that': 36,
 '2.': 37,
 '3.': 38,
 '1.': 39,
 'v': 40,
 'U.S.': 41,
 '5': 42,
 'against': 43,
 'United': 44,
 'told': 45,
 '1996-08-28': 46,
 'National': 47,
 '7': 48,
 'b': 49,
 'In': 50,
 '1/2': 51,
 '4.': 52,
 '5.': 53,
 '6.': 54,
 '/': 55,
 'West': 56,
 '1996-08-27': 57,
 'Czech': 58,
 'between': 59,
 'Mark': 60,
 'as': 61,
 'St': 62,
 'former': 63,
 '9': 64,
 'World': 65,
 'Michael': 66,
 'Sri': 67,
 '1996-08-26': 68,
 'an': 69,
 '7.': 70,
 'Hong': 71,
 '1996-08-22': 72,
 'John': 73,
 'A': 74,
 'vs.': 75,
 '1996-08-29': 76,
 'European': 77,
 'over': 78,
 'SAN': 79,
 '8.': 80,
 'Paul': 

In [9]:
def word_vector(word):
    """
    Encode `word` as a 3-component float vector:
    - [0] the length of the word
    - [1] 1.0 if its first character is uppercase, else 0.0
    - [2] 1.0 if its second character is uppercase, else 0.0

    A character that does not exist (word too short) counts as not uppercase.
    """
    # Slicing yields "" when the position is out of range, and "".isupper() is False.
    first_is_upper = word[:1].isupper()
    second_is_upper = word[1:2].isupper()
    return np.array([len(word), first_is_upper, second_is_upper], dtype=float)

def bow_vect(word_preced):
    """
    One-hot encode `word_preced` against BAG_OF_WORDS.

    Returns a vector with one slot per entry of BAG_OF_WORDS; the slot stored
    for `word_preced` is set to 1, and the vector is all zeros when the word is
    not in BAG_OF_WORDS.
    """
    onehot = np.zeros(len(BAG_OF_WORDS))
    slot = BAG_OF_WORDS.get(word_preced)
    if slot is not None:
        onehot[slot] = 1
    return onehot

In [41]:
# Width of one raw feature row: 3 words (previous, current, next) times the
# 3 components returned by word_vector, plus one slot per BAG_OF_WORDS entry.
N_DIMS_ORIG = 3 * 3 + len(BAG_OF_WORDS)
# Presumably a Gaussian random projection from N_DIMS_ORIG down to 64
# dimensions -- TODO confirm against info9.RandomProjection.
projecter = info9.RandomProjection(N_DIMS_ORIG, 64, "Gaussian")

def make_dataset(fname, binary=False, label_of_interest=4, n_rows=None):
    """
    Build an info9.Dataset of projected word features from a tagged file.

    The file is read as space-separated columns (word, L1, L2, L3) and rows
    whose word is "-DOCSTART-" are dropped. Each datapoint describes one word
    through the concatenation of
    [word_vector(previous), word_vector(current), word_vector(next),
    bow_vect(previous)], which is then passed through the module-level
    `projecter`.

    Parameters
    ----------
    fname : str or path-like
        File to read.
    binary : bool, default False
        When true, labels become 1 where they equal `label_of_interest` and 0
        elsewhere; otherwise they stay the integer positions from DICT_LABELS.
    label_of_interest : int, default 4
        Integer label singled out when `binary` is true.
    n_rows : int or None, default None
        Maximum number of datapoints and labels to produce. None means
        N_ROWS_TRAIN, which is what this function always used before the
        parameter existed.

    Returns
    -------
    info9.Dataset
        Built from the projected datapoints and the label array.

    Notes
    -----
    The datapoint array is allocated with `n_rows` rows up front; if the file
    yields fewer datapoints, the remaining rows stay all-zero.
    NOTE(review): labels are taken from every row with a non-missing L3 tag,
    while datapoints are produced by the sliding window below; the two are
    assumed to line up row for row -- confirm on the actual data files.
    """
    if n_rows is None:
        n_rows = N_ROWS_TRAIN

    df = pd.read_csv(fname, sep=" ", names=["word", "L1", "L2", "L3"])
    df.drop(df[df.word == "-DOCSTART-"].index, inplace=True)

    true_labels = np.array([DICT_LABELS[l] for l in df["L3"] if str(l) != "nan"])[:n_rows]
    if binary:
        true_labels = (true_labels == label_of_interest).astype(int)
    datapoints = np.zeros((n_rows, N_DIMS_ORIG))
    # `words` holds a window of three words:
    #    [<previous word> <word being processed> <next word>]
    # `vects` holds the word_vector representation of those three words.
    # NOTE(review): the window is seeded with the row labelled 0 and the loop
    # starts at label 2, so the row labelled 1 is never read -- presumably a
    # separator line; confirm.
    words = ["", ".", df.loc[0]["word"]]
    vects = [word_vector(w) for w in words]

    i = 0
    for _, row in df.loc[2:].iterrows():
        # Read the next word and slide both windows forward by one position.
        nextw = str(row["word"])
        words = *words[1:], nextw
        vects = *vects[1:], word_vector(nextw)
        # Empty words carry no label: do not emit a datapoint for them.
        if words[1] != "":
            # [<len/caps previous> <len/caps current> <len/caps next> <bag of words previous>]
            datapoints[i] = np.concatenate([*vects, bow_vect(words[0])])
            i += 1
            if i >= n_rows:
                break
    return info9.Dataset(projecter.project(datapoints), true_labels)

In [42]:
# Binary task: label_of_interest=4 is the position of "I-PER" in LABELS.
# Both datasets are sized by N_ROWS_TRAIN, the constant make_dataset reads.
dataset_train = make_dataset(ftrain, binary=True, label_of_interest=4)
dataset_test = make_dataset(ftest, binary=True,label_of_interest=4)
# Print what Dataset.show(False) returns (a one-line summary in the saved output).
print(dataset_train.show(False))

Dataset with 200000 samples, and 64 dimensions.



In [43]:
# Binary k-NN classifier over the training dataset.
# NOTE(review): the meaning of the third argument (0.2) is not visible in this
# notebook -- presumably a decision threshold; confirm against info9.
classifier = info9.KnnClassificationBinary(K, dataset_train, 0.2)

In [44]:
# Run the classifier over the whole test dataset; the result is used below as
# a confusion matrix.
confusion_matrix = classifier.estimate_all(dataset_test)

In [45]:
# NOTE(review): the header says "multilabel" although the classifier above is
# KnnClassificationBinary and the labels were built with binary=True.
print("Full confusion matrix after multilabel k-nn classification:\n")
print(confusion_matrix.PrintEvaluation())

Full confusion matrix after multilabel k-nn classification:

		Predicted
		0	1
Actual	0	183638	3877
	1	12125	360

Error rate		0.08001
False alarm rate	0.0206757
Detection rate		0.0288346
F-score			0.0430571
Precision		0.0849658



## With word embedding

In [38]:
# Load the pre-trained GloVe word embeddings into a dict: token -> float32 vector.
# Each line of the file is "<token> <coef> <coef> ...", separated by whitespace.
EMBEDDINGS_DICT = {}
# The encoding is stated explicitly so that parsing does not depend on the
# platform's locale default (the GloVe 6B text files are UTF-8).
with open("glove_pretrained/glove.6B.50d.txt", 'r', encoding="utf-8") as f:
    for line in f:
        values = line.split()
        if not values:
            # Blank line (e.g. a trailing newline): nothing to index.
            continue
        word = values[0]
        EMBEDDINGS_DICT[word] = np.asarray(values[1:], "float32")

In [47]:
# Sanity check: display the embedding stored for the token "to"
# (dict.get returns None if the token is absent).
EMBEDDINGS_DICT.get("to")

array([ 0.68047 , -0.039263,  0.30186 , -0.17792 ,  0.42962 ,  0.032246,
       -0.41376 ,  0.13228 , -0.29847 , -0.085253,  0.17118 ,  0.22419 ,
       -0.10046 , -0.43653 ,  0.33418 ,  0.67846 ,  0.057204, -0.34448 ,
       -0.42785 , -0.43275 ,  0.55963 ,  0.10032 ,  0.18677 , -0.26854 ,
        0.037334, -2.0932  ,  0.22171 , -0.39868 ,  0.20912 , -0.55725 ,
        3.8826  ,  0.47466 , -0.95658 , -0.37788 ,  0.20869 , -0.32752 ,
        0.12751 ,  0.088359,  0.16351 , -0.21634 , -0.094375,  0.018324,
        0.21048 , -0.03088 , -0.19722 ,  0.082279, -0.09434 , -0.073297,
       -0.064699, -0.26044 ], dtype=float32)

In [63]:
# Embedding width, read off the first vector stored in EMBEDDINGS_DICT.
DIM_EMBEDDING = len(next(iter(EMBEDDINGS_DICT.values())))
# Width of one raw feature row: 3 words times the 3 components returned by
# word_vector, plus one embedding each for the previous and the next word.
# NOTE: this rebinds N_DIMS_ORIG and projecter, which make_dataset (the
# bag-of-words variant defined earlier) reads as globals at call time; calling
# make_dataset after this cell would use these new values.
N_DIMS_ORIG = 3 * 3 + 2 * DIM_EMBEDDING

# Presumably a Gaussian random projection from N_DIMS_ORIG down to 64
# dimensions -- TODO confirm against info9.RandomProjection.
projecter = info9.RandomProjection(N_DIMS_ORIG, 64, "Gaussian")

def make_dataset_word_embedding(fname, binary=False, label_of_interest=4, n_rows=None):
    """
    Build an info9.Dataset of projected word features using word embeddings.

    The file is read as space-separated columns (word, L1, L2, L3) and rows
    whose word is "-DOCSTART-" are dropped. Each datapoint describes one word
    through the concatenation of
    [word_vector(previous), word_vector(current), word_vector(next),
    embedding(previous), embedding(next)], which is then passed through the
    module-level `projecter`. Embeddings are looked up in EMBEDDINGS_DICT by
    the lowercased word; a word that is not found contributes a zero vector of
    length DIM_EMBEDDING.

    Parameters
    ----------
    fname : str or path-like
        File to read.
    binary : bool, default False
        When true, labels become 1 where they equal `label_of_interest` and 0
        elsewhere; otherwise they stay the integer positions from DICT_LABELS.
    label_of_interest : int, default 4
        Integer label singled out when `binary` is true.
    n_rows : int or None, default None
        Maximum number of datapoints and labels to produce. None means
        N_ROWS_TRAIN, which is what this function always used before the
        parameter existed.

    Returns
    -------
    info9.Dataset
        Built from the projected datapoints and the label array.

    Notes
    -----
    The datapoint array is allocated with `n_rows` rows up front; if the file
    yields fewer datapoints, the remaining rows stay all-zero.
    NOTE(review): labels are taken from every row with a non-missing L3 tag,
    while datapoints are produced by the sliding window below; the two are
    assumed to line up row for row -- confirm on the actual data files.
    """
    if n_rows is None:
        n_rows = N_ROWS_TRAIN

    df = pd.read_csv(fname, sep=" ", names=["word", "L1", "L2", "L3"])
    df.drop(df[df.word == "-DOCSTART-"].index, inplace=True)

    true_labels = np.array([DICT_LABELS[l] for l in df["L3"] if str(l) != "nan"])[:n_rows]
    if binary:
        true_labels = (true_labels == label_of_interest).astype(int)
    datapoints = np.zeros((n_rows, N_DIMS_ORIG))
    # `words` holds a window of three words:
    #    [<previous word> <word being processed> <next word>]
    # `vects` holds the word_vector representation of those three words.
    # NOTE(review): the window is seeded with the row labelled 0 and the loop
    # starts at label 2, so the row labelled 1 is never read -- presumably a
    # separator line; confirm.
    words = ["", ".", df.loc[0]["word"]]
    vects = [word_vector(w) for w in words]

    i = 0
    for _, row in df.loc[2:].iterrows():
        # Read the next word and slide both windows forward by one position.
        nextw = str(row["word"])
        words = *words[1:], nextw
        vects = *vects[1:], word_vector(nextw)
        # Empty words carry no label: do not emit a datapoint for them.
        if words[1] != "":
            # [<len/caps previous> <len/caps current> <len/caps next>
            #  <embedding previous> <embedding next>]
            embeddings = [
                EMBEDDINGS_DICT.get(words[pos].lower(), np.zeros((DIM_EMBEDDING,)))
                for pos in (0, 2)
            ]
            datapoints[i] = np.concatenate([*vects, *embeddings])
            i += 1
            if i >= n_rows:
                break
    return info9.Dataset(projecter.project(datapoints), true_labels)

In [64]:
# Same binary task as before (label_of_interest=4 is the position of "I-PER"
# in LABELS), now with the embedding-based features. This rebinds
# dataset_train / dataset_test from the bag-of-words experiment.
dataset_train = make_dataset_word_embedding(ftrain, binary=True, label_of_interest=4)
dataset_test = make_dataset_word_embedding(ftest, binary=True,label_of_interest=4)
# Print what Dataset.show(False) returns.
# NOTE(review): the saved output of this cell is an "IOPub data rate exceeded"
# notice, i.e. the printed text was too large for the notebook server.
print(dataset_train.show(False))

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [65]:
# Binary k-NN classifier over the embedding-based training dataset.
# NOTE(review): the meaning of the third argument (0.2) is not visible in this
# notebook -- presumably a decision threshold; confirm against info9.
classifier = info9.KnnClassificationBinary(K, dataset_train, 0.2)

In [66]:
# Run the classifier over the whole embedding-based test dataset; the result is
# used below as a confusion matrix.
confusion_matrix = classifier.estimate_all(dataset_test)

In [67]:
# NOTE(review): the header says "multilabel" although the classifier above is
# KnnClassificationBinary and the labels were built with binary=True.
print("Full confusion matrix after multilabel k-nn classification:\n")
print(confusion_matrix.PrintEvaluation())

Full confusion matrix after multilabel k-nn classification:

		Predicted
		0	1
Actual	0	18027	716
	1	1126	131

Error rate		0.0921
False alarm rate	0.0382009
Detection rate		0.104216
F-score			0.124525
Precision		0.154664

