In [11]:
import pandas as pd
import numpy as np
import gensim
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB , GaussianNB
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

In [12]:
# Dependency: gensim. If it is missing, install it once with: %pip install gensim

In [13]:
#import urllib.request

#url = "https://ftp.ncbi.nlm.nih.gov/pub/lu/Suppl/BioSentVec/BioWordVec_PubMed_MIMICIII_d200.vec.bin"
#output_file = "BioWordVec.bin"

#print("Downloading BioWordVec... This may take a while.")
#urllib.request.urlretrieve(url, output_file)
#print("Download complete!")

In [14]:
# Path to the BioWordVec embeddings file, stored in binary word2vec format.
BIOWORDVEC_PATH = "BioWordVec.bin"

# `limit` caps how many word vectors are read from the file (500,000 here),
# which bounds load time and memory use.
biowordvec_model = gensim.models.KeyedVectors.load_word2vec_format(
    BIOWORDVEC_PATH,
    binary=True,
    limit=500000,
)

In [15]:
# Used in place of scispaCy: tokenization is a plain whitespace split.

def sentence_to_vector(sentence, model):
    """Embed a sentence as the average of its known word vectors.

    The sentence is split on whitespace and every token found in ``model``
    contributes its vector; tokens missing from ``model`` are ignored.

    Args:
        sentence: Text to embed.
        model: Word-vector lookup supporting ``token in model``,
            ``model[token]`` and a ``vector_size`` attribute.

    Returns:
        The element-wise mean of the known token vectors, or a zero vector of
        length ``model.vector_size`` when no token is known.
    """
    known_vectors = []
    for token in sentence.split():
        if token in model:
            known_vectors.append(model[token])

    # No known token (including an empty sentence): fall back to all zeros.
    if not known_vectors:
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)
def load_data(file_path):
    """Read a tab-separated ``label<TAB>sentence`` file into a DataFrame.

    The file is read without a header row; its two columns are named
    ``label`` and ``sentence``. Any row with a missing value in either column
    (for example a line that contains no tab) is removed. The original row
    index is kept, so it can have gaps.

    Args:
        file_path: Path of the tab-separated text file.

    Returns:
        DataFrame with the columns ``label`` and ``sentence``.
    """
    frame = pd.read_csv(file_path, sep='\t', names=['label', 'sentence'])
    return frame.dropna()

**Preparation of the dataset**

In [16]:
# Locations of the three PubMed_20k_RCT splits (tab-separated text files).
train_path = 'pubmed-rct/PubMed_20k_RCT/train.txt'
test_path = 'pubmed-rct/PubMed_20k_RCT/test.txt'
dev_path = 'pubmed-rct/PubMed_20k_RCT/dev.txt'

# Load every split through the same helper so they are all parsed and
# cleaned identically.
df_train, df_test, df_dev = (
    load_data(split_path) for split_path in (train_path, test_path, dev_path)
)


In [17]:
# Print the test split; pandas abbreviates a long frame to its first and last rows.
print(df_test)

             label                                           sentence
1       BACKGROUND  This study analyzed liver function abnormaliti...
2          RESULTS  A post hoc analysis was conducted with the use...
3          RESULTS  Liver function tests ( LFTs ) were measured at...
4          RESULTS  Survival analyses were used to assess the asso...
5          RESULTS  The percentage of patients with abnormal LFTs ...
...            ...                                                ...
32630      RESULTS  There was a statistically significant between-...
32631      RESULTS  There were no statistically significant betwee...
32632      RESULTS  There was no significant association between s...
32633      RESULTS                 No adverse effects were reported .
32634  CONCLUSIONS  Performing a 6-week do-as-tolerated program of...

[30135 rows x 2 columns]


In [18]:
# DataFrame.size is the number of cells (rows x columns), not the number of
# rows: with two columns each value is twice the sentence count of the split.
print(df_train.size, df_dev.size, df_test.size, sep='\n')

360080
60424
60270


**Feature extraction: TF-IDF and CountVectorizer**


In [19]:
# Two bag-of-words feature extractors, both left at their default settings:
# TF-IDF weighted term frequencies and raw term counts.
vectorizer_tfidf, vectorizer_count = TfidfVectorizer(), CountVectorizer()


In [20]:

# Fit the TF-IDF vectorizer on the training sentences only, then reuse the
# fitted vectorizer unchanged for the test and dev splits.
X_train_tfidf = vectorizer_tfidf.fit_transform(df_train['sentence'])
X_test_tfidf, X_dev_tfidf = (
    vectorizer_tfidf.transform(split_df['sentence'])
    for split_df in (df_test, df_dev)
)

In [21]:
# Fit the count vectorizer on the training sentences only, then reuse the
# fitted vectorizer unchanged for the test and dev splits.
X_train_count = vectorizer_count.fit_transform(df_train['sentence'])
X_test_count, X_dev_count = (
    vectorizer_count.transform(split_df['sentence'])
    for split_df in (df_test, df_dev)
)



**Convert labels to numerical values**

In [22]:

# Encode the string labels as integer codes with ONE shared label -> code
# mapping taken from the training split. Deriving the categories separately
# for every split would silently shift the codes whenever a split lacks one of
# the classes, which would corrupt every metric computed against y_test/y_dev.
label_dtype = pd.CategoricalDtype(
    categories=df_train['label'].astype('category').cat.categories
)

y_train = df_train['label'].astype(label_dtype).cat.codes
y_test = df_test['label'].astype(label_dtype).cat.codes
y_dev = df_dev['label'].astype(label_dtype).cat.codes

# A label that never occurs in the training split is encoded as -1; stop here
# rather than evaluate against a class the models were never trained on.
if (y_test < 0).any() or (y_dev < 0).any():
    raise ValueError(
        "test/dev splits contain labels that do not occur in the training split"
    )


In [23]:
# Show the label categories of the training split; a label's integer code is
# its position in this index.
print(df_train['label'].astype('category').cat.categories)


Index(['BACKGROUND', 'CONCLUSIONS', 'METHODS', 'OBJECTIVE', 'RESULTS'], dtype='object')


**Train Naive Bayes with TF-IDF (optimised via grid search)**

In [24]:


# Candidate smoothing strengths for MultinomialNB. This grid is module-level
# and is reused by the CountVectorizer grid search further down.
param_grid = {'alpha': [0.1, 0.5, 1.0, 5.0, 10.0]}

# 5-fold cross-validated search on the TF-IDF training features, selecting the
# alpha with the best mean accuracy.
clf_tfidf = GridSearchCV(
    MultinomialNB(),
    param_grid,
    cv=5,
    scoring='accuracy',
).fit(X_train_tfidf, y_train)

# Estimator refitted on the full training split with the winning alpha.
best_tfidf = clf_tfidf.best_estimator_


**Vocabulary size for TF-IDF**

In [26]:
# Number of distinct terms in the vocabulary learned by the fitted TF-IDF vectorizer.
print("TF-IDF Vocabulary Size:", len(vectorizer_tfidf.vocabulary_))

TF-IDF Vocabulary Size: 57996


**Evaluate models**

In [27]:

# Evaluate the tuned TF-IDF Naive Bayes model on the test split.
print("=== TF-IDF Results ===")
y_pred_tfidf = best_tfidf.predict(X_test_tfidf)
print("Accuracy:", accuracy_score(y_test, y_pred_tfidf))
# sep="\n" puts each heading on its own line above the matrix/report.
print("Confusion Matrix:", confusion_matrix(y_test, y_pred_tfidf), sep="\n")
print("Classification Report:", classification_report(y_test, y_pred_tfidf), sep="\n")

=== TF-IDF Results ===
Accuracy: 0.7451136552181848
Confusion Matrix:
[[1891  782  518  356   74]
 [ 476 2962  296  104  733]
 [ 175  173 8606   56  887]
 [ 612  367  503  808   43]
 [  51  405 1052   18 8187]]
Classification Report:
              precision    recall  f1-score   support

           0       0.59      0.52      0.55      3621
           1       0.63      0.65      0.64      4571
           2       0.78      0.87      0.82      9897
           3       0.60      0.35      0.44      2333
           4       0.82      0.84      0.83      9713

    accuracy                           0.75     30135
   macro avg       0.69      0.65      0.66     30135
weighted avg       0.74      0.75      0.74     30135



**Train Naive Bayes with CountVectorizer**

In [28]:

# Same 5-fold alpha search as for TF-IDF, this time on the raw term counts.
# `param_grid` is the alpha grid defined in the TF-IDF training cell.
clf_count = GridSearchCV(
    MultinomialNB(),
    param_grid,
    cv=5,
    scoring='accuracy',
).fit(X_train_count, y_train)

# Estimator refitted on the full training split with the winning alpha.
best_count = clf_count.best_estimator_

In [29]:
# Number of distinct terms in the vocabulary learned by the fitted CountVectorizer.
print("CountVectorizer Vocabulary Size:", len(vectorizer_count.vocabulary_))

CountVectorizer Vocabulary Size: 57996


**Results**

In [14]:
# Evaluate the tuned CountVectorizer Naive Bayes model on the test split.
print("=== CountVectorizer Results ===")
y_pred_count = best_count.predict(X_test_count)
print("Accuracy:", accuracy_score(y_test, y_pred_count))
# sep="\n" puts each heading on its own line above the matrix/report.
print("Confusion Matrix:", confusion_matrix(y_test, y_pred_count), sep="\n")
print("Classification Report:", classification_report(y_test, y_pred_count), sep="\n")




=== CountVectorizer Results ===
Accuracy: 0.7702339472374315
Confusion Matrix:
[[2127  636  285  537   36]
 [ 507 3286  130  154  494]
 [ 256  197 8629  158  657]
 [ 615  251  203 1249   15]
 [  67  635 1062   29 7920]]
Classification Report:
              precision    recall  f1-score   support

           0       0.60      0.59      0.59      3621
           1       0.66      0.72      0.69      4571
           2       0.84      0.87      0.85      9897
           3       0.59      0.54      0.56      2333
           4       0.87      0.82      0.84      9713

    accuracy                           0.77     30135
   macro avg       0.71      0.71      0.71     30135
weighted avg       0.77      0.77      0.77     30135



**BioWordVec Feature Extraction**

In [31]:

# Turn every sentence into one averaged BioWordVec vector and stack the
# vectors row-wise into a dense (n_sentences, vector_size) matrix per split.
X_train_biovec = np.vstack(
    [sentence_to_vector(text, biowordvec_model) for text in df_train['sentence']]
)
X_test_biovec = np.vstack(
    [sentence_to_vector(text, biowordvec_model) for text in df_test['sentence']]
)
X_dev_biovec = np.vstack(
    [sentence_to_vector(text, biowordvec_model) for text in df_dev['sentence']]
)


**Vocabulary size for BioWordVec (pre-trained word embeddings)**

In [32]:

# Number of word vectors actually loaded into the BioWordVec model.
print("BioWordVec Vocabulary Size:", len(biowordvec_model.key_to_index))

BioWordVec Vocabulary Size: 500000


**Train Naive Bayes with BioWordVec**

In [33]:
# Gaussian Naive Bayes on the dense, real-valued sentence embeddings.
# No hyperparameter search is run here, so the fitted model is also exposed
# under the `best_model` / `best_biovec` names used by the evaluation cell.
clf = GaussianNB().fit(X_train_biovec, y_train)
best_model = best_biovec = clf

In [34]:

# Evaluate the BioWordVec Gaussian Naive Bayes model on the test split.
print("=== BioWordVec Results (Naive Bayes) ===")
y_pred_biovec = best_biovec.predict(X_test_biovec)
print("Accuracy:", accuracy_score(y_test, y_pred_biovec))
# sep="\n" puts each heading on its own line above the matrix/report.
print("Confusion Matrix:", confusion_matrix(y_test, y_pred_biovec), sep="\n")
print("Classification Report:", classification_report(y_test, y_pred_biovec), sep="\n")

=== BioWordVec Results (Naive Bayes) ===
Accuracy: 0.5935954869752779
Confusion Matrix:
[[ 937 1196  173 1237   78]
 [ 432 3105  169  596  269]
 [ 684  841 5618 1009 1745]
 [ 226  371  154 1513   69]
 [ 315 1345 1111  227 6715]]
Classification Report:
              precision    recall  f1-score   support

           0       0.36      0.26      0.30      3621
           1       0.45      0.68      0.54      4571
           2       0.78      0.57      0.66      9897
           3       0.33      0.65      0.44      2333
           4       0.76      0.69      0.72      9713

    accuracy                           0.59     30135
   macro avg       0.54      0.57      0.53     30135
weighted avg       0.64      0.59      0.60     30135



**Train LogisticRegression with BioWordVec**

In [19]:


# Logistic regression on the BioWordVec sentence embeddings.
# The model is fitted exactly once: a second .fit() call on the same data only
# retrains from scratch and doubles the run time.
# 'saga' is a stochastic solver, so random_state is fixed to make the run
# reproducible.
clf_biovec = LogisticRegression(
    C=1,
    solver='saga',
    max_iter=500,
    random_state=42,
)
clf_biovec.fit(X_train_biovec, y_train)

# NOTE: `best_biovec` and `y_pred_biovec` are rebound here and replace the
# Gaussian Naive Bayes values of the same names; the model comparison cell
# relies on them holding the logistic regression results.
best_biovec = clf_biovec

print("=== BioWordVec Results (LogisticRegression) ===")
y_pred_biovec = best_biovec.predict(X_test_biovec)
print("Accuracy:", accuracy_score(y_test, y_pred_biovec))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_biovec))
print("Classification Report:")
print(classification_report(y_test, y_pred_biovec))

=== BioWordVec Results (LogisticRegression) ===
Accuracy: 0.771163099386096
Confusion Matrix:
[[2031  642  348  537   63]
 [ 531 3130  269  103  538]
 [ 185  188 8708  100  716]
 [ 564  246  279 1216   28]
 [  57  451 1046    5 8154]]
Classification Report:
              precision    recall  f1-score   support

           0       0.60      0.56      0.58      3621
           1       0.67      0.68      0.68      4571
           2       0.82      0.88      0.85      9897
           3       0.62      0.52      0.57      2333
           4       0.86      0.84      0.85      9713

    accuracy                           0.77     30135
   macro avg       0.71      0.70      0.70     30135
weighted avg       0.77      0.77      0.77     30135



In [23]:

# Side-by-side test accuracy of the three feature pipelines.
# `y_pred_biovec` holds whichever BioWordVec model was evaluated last.
print("\n===== Model Comparison =====")
print(
    f"TF-IDF Accuracy with Naive Bayes: {accuracy_score(y_test, y_pred_tfidf):.4f}",
    f"CountVectorizer Accuracy with Naive Bayes: {accuracy_score(y_test, y_pred_count):.4f}",
    f"BioWordVec Accuracy with Logistic Regression: {accuracy_score(y_test, y_pred_biovec):.4f}",
    sep="\n",
)



===== Model Comparison =====
TF-IDF Accuracy with Naive Bayes: 0.7451
CountVectorizer Accuracy with Naive Bayes: 0.7702
BioWordVec Accuracy with Logistic Regression: 0.7712


In [None]:
# Hyperparameter optimisation for LogisticRegression: search over the inverse
# regularisation strength C.
param_grid_lr = {'C': [0.1, 0.5, 1]}

# 5-fold cross-validation, matching the other grid searches in this notebook.
# 50 folds would mean 150 long saga fits for three candidates and add little
# on a training split of this size. n_jobs=-1 spreads the fits over all CPU
# cores; it changes run time only, not the results.
# random_state makes the stochastic saga solver reproducible.
clf_biovec = GridSearchCV(
    LogisticRegression(solver='saga', max_iter=10000, random_state=42),
    param_grid_lr,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
clf_biovec.fit(X_train_biovec, y_train)

# Estimator refitted on the full training split with the best C.
best_biovec = clf_biovec.best_estimator_

print("=== BioWordVec Results (LogisticRegression Optimised) ===")
y_pred_biovec = best_biovec.predict(X_test_biovec)
print("Accuracy:", accuracy_score(y_test, y_pred_biovec))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_biovec))
print("Classification Report:")
print(classification_report(y_test, y_pred_biovec))