# 0 Import

In [124]:
import multiprocessing
import warnings

import gensim
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from gensim.parsing.preprocessing import remove_stopwords, preprocess_string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, zero_one_loss
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier, ClassifierChain
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

np.set_printoptions(threshold=10000,suppress=True)
warnings.filterwarnings('ignore')

# 1. Data Analyse

In [None]:
# Read the dataset from a CSV file; the path is relative to the working directory.
dataset_path = 'PubMed-multi-label-dataset.csv'
data = pd.read_csv(dataset_path, sep=',')
# Number of rows, followed by a preview of the first three.
print(len(data))
data.head(3)

In [None]:
# Keep the last 14 columns of the frame (presumably the 14 label indicators
# announced in the "Sorties" section -- confirm against the CSV header).
labels = data.iloc[:, -14:]
print("count of number of 1 per Label")
# Per-column number of entries equal to 1.
(labels == 1).sum()

In [None]:
# Pairwise correlation between the label columns, drawn with the explicit
# figure/axes interface so the plot carries a title, tick labels and a colour
# scale and can be read on its own.
label_corr = labels.corr()
fig, ax = plt.subplots(figsize=(8, 8))
heatmap = ax.matshow(label_corr)
ax.set_xticks(range(len(label_corr.columns)))
ax.set_xticklabels(label_corr.columns, rotation=90)
ax.set_yticks(range(len(label_corr.index)))
ax.set_yticklabels(label_corr.index)
ax.set_title("Label correlation matrix")
fig.colorbar(heatmap, ax=ax);

# 2. Modelisation

Pour modéliser notre problème d'apprentissage supervisé, on définit notre plan d'attaque.

### Entrées

- abstractText 
- Potentiellement Title & meshMajor

### Sorties

- Les 14 labels

### Evaluations

- F1-score en raison du déséquilibre de la donnée.

### Approche

- Oversampling ou undersampling

# 3. Clean Data

In [None]:
# One text per row: abstract followed by title. The explicit space separator
# matters: without it the last token of the abstract and the first token of the
# title are glued into a single word that no vocabulary will recognise.
corpus = data['abstractText'].astype(str) + ' ' + data['Title'].astype(str)
corpus

In [None]:
# Clean every document in a single pass: strip stop words, run gensim's
# preprocess_string with its default filters (it yields a list of tokens),
# then glue the tokens back into one space-separated string.
corpus = corpus.apply(
    lambda text: ' '.join(preprocess_string(remove_stopwords(text)))
)
corpus

In [None]:
# Same cleaning pipeline as for the corpus, applied to the titles alone:
# stop-word removal, gensim default preprocessing, tokens re-joined with spaces.
title = data['Title'].astype(str).apply(
    lambda text: ' '.join(preprocess_string(remove_stopwords(text)))
)
title

# 4. Apprentissage du Word2vec

In [15]:
# Tokenise each cleaned document into a list of words; this list-of-lists is
# what the Word2Vec model is trained on.
corpus_w2v = corpus.apply(gensim.utils.simple_preprocess)

In [None]:
# Number of CPUs reported by the OS; used to size the Word2Vec worker pool.
# (`multiprocessing` is imported with the other modules at the top of the notebook.)
cores = multiprocessing.cpu_count()
cores

In [17]:
# Dimensionality of the learned word vectors.
model_size = 100
# Build the vocabulary from the tokenised corpus and run the initial training.
model = gensim.models.Word2Vec(
    corpus_w2v,
    vector_size=model_size,
    sg=0,          # 0 selects CBOW (1 would select skip-gram)
    window=5,
    min_count=2,   # drop words that occur fewer than 2 times
    # Leave one core free, but never request 0 workers on a single-core machine.
    workers=max(1, cores - 1),
)

In [None]:
# Run the 100 additional passes in ONE call. Calling train() in a loop with
# epochs=1 restarts the learning-rate decay (alpha -> min_alpha) on every call,
# so the rate zig-zags 100 times instead of decaying once over the whole run.
# The result is stored so the returned counters are not echoed as cell output.
train_stats = model.train(corpus_w2v, total_examples=len(corpus_w2v), epochs=100)

In [None]:
# Persist the trained model to disk, then show how many words it knows.
w2v_path = './Word2vec_entraine.h5'
model.save(w2v_path)
len(model.wv.index_to_key)

In [None]:
# Display the cleaned corpus again before it is used to fit the TF-IDF vectoriser.
corpus

In [None]:
# Fit TF-IDF on the cleaned corpus only to learn the vocabulary and its IDF
# weights; fit() is sufficient because the transformed matrix was never used.
tfidf = TfidfVectorizer()
tfidf.fit(corpus)
# Map every vocabulary term to its inverse-document-frequency weight.
tfidf_dict = dict(zip(tfidf.get_feature_names_out(), tfidf.idf_))
# Preview a handful of entries instead of dumping the whole vocabulary.
dict(list(tfidf_dict.items())[:10])

In [None]:
# Reload the Word2Vec model from the file written by the save cell.
model = gensim.models.Word2Vec.load('./Word2vec_entraine.h5')
def encode_sentence(sentence: str):
    """Encode a sentence as an IDF-weighted sum of its Word2Vec word vectors.

    Each whitespace-separated word that exists in the Word2Vec vocabulary
    contributes its vector, scaled by the word's IDF weight from ``tfidf_dict``
    (weight 1 when the word has no IDF entry). Words unknown to Word2Vec are
    skipped. The result is a sum, not an average, so it grows with the number
    of known words.

    Relies on the notebook-level ``model`` (Word2Vec) and ``tfidf_dict``.

    Args:
        sentence: Text whose words are separated by whitespace.

    Returns:
        A list of ``model.vector_size`` floats; all zeros when no word of the
        sentence is in the Word2Vec vocabulary.
    """
    vec = np.zeros(model.vector_size)
    known_words = model.wv.key_to_index
    # split() without an argument also copes with repeated spaces, tabs and newlines.
    for word in sentence.split():
        # Explicit vocabulary check instead of a bare `except: pass`, so that
        # genuine errors are no longer silently swallowed.
        if word not in known_words:
            continue
        vec += tfidf_dict.get(word, 1.0) * model.wv[word]
    return vec.tolist()

# Quick manual check of the encoder on a short, already-preprocessed sample sentence.
encode_sentence("paraffin embed tissu section patient")

In [None]:
# Turn every cleaned title into its sentence vector (one list of floats per row).
title2 = title.map(encode_sentence)
title2

# 5. Comparaison

In [125]:
# Hold out half of the samples for evaluation. The fixed seed makes the split,
# and therefore every score computed from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    title2, labels, test_size=0.5, random_state=42
)

In [126]:
# Convert both feature splits to plain lists and then to NumPy arrays.
X_train, X_test = (np.array(split.tolist()) for split in (X_train, X_test))

In [127]:
def jesaispas(X_train, y_train, X_test, base_estimator, model, *args, **kwargs):
    """Fit a wrapper model around a base estimator and predict on the test set.

    Args:
        X_train: Training features, passed to ``fit``.
        y_train: Training targets, passed to ``fit``.
        X_test: Features to predict on.
        base_estimator: First positional argument given to ``model``.
        model: Callable (typically a class) building the wrapper; called as
            ``model(base_estimator, *args, **kwargs)``.
        *args: Extra positional arguments forwarded to ``model``.
        **kwargs: Extra keyword arguments forwarded to ``model``.

    Returns:
        Whatever ``predict(X_test)`` returns on the fitted wrapper.
    """
    wrapper = model(base_estimator, *args, **kwargs)
    wrapper.fit(X_train, y_train)
    predictions = wrapper.predict(X_test)
    return predictions

In [128]:
# Base classifiers to compare, keyed by the short name printed in the report.
base_estimator = {
    'KNN' : KNeighborsClassifier(n_neighbors=5),
    # Seeded so that the weight initialisation, and hence the scores, are reproducible.
    'MLP' : MLPClassifier(hidden_layer_sizes=(100, 100), random_state=42),
}

In [129]:
def _print_scores(header, y_true, y_pred):
    """Print a header, then micro F1, macro F1 and zero-one loss for one prediction set."""
    print(header, )
    print(f1_score(y_true, y_pred, average='micro'))
    print(f1_score(y_true, y_pred, average='macro'))
    print(zero_one_loss(y_true, y_pred))


def run_model(data, base_estimator, targets=None, test_size=0.5, random_state=42):
    """Compare MultiOutputClassifier and ClassifierChain for each base estimator.

    The data is split once into train and test parts; every base estimator is
    then wrapped in both multi-label strategies, fitted on the train part, and
    scored on the test part. Scores are printed, nothing is returned.

    Args:
        data: Series of feature vectors (one list of floats per row).
        base_estimator: Mapping from a display name to an estimator instance.
        targets: Label frame aligned with ``data``. Defaults to the
            notebook-level ``labels`` frame, as in the original behaviour.
        test_size: Fraction of samples held out for evaluation.
        random_state: Seed for the split, so repeated runs are comparable.
    """
    if targets is None:
        targets = labels

    X_train, X_test, y_train, y_test = train_test_split(
        data, targets, test_size=test_size, random_state=random_state
    )
    # Each row holds a list of floats; stack them into 2-D arrays.
    X_train = np.array(X_train.tolist())
    X_test = np.array(X_test.tolist())

    for key, value in base_estimator.items():
        print("##################", key, "##################")
        moc = jesaispas(X_train, y_train, X_test, value, MultiOutputClassifier)
        chain = jesaispas(X_train, y_train, X_test, value, ClassifierChain, order='random', random_state=42)

        _print_scores("----------MultiOutputClassifier----------", y_test, moc)
        _print_scores("----------ClassifierChain----------", y_test, chain)

    

In [None]:
# Run the comparison on the encoded titles with every configured base estimator.
run_model(title2, base_estimator)