In [1]:
!pip install fasttext



In [2]:
import fasttext
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [3]:
import torch
# Pick the torch device: CUDA when torch reports it as available, CPU otherwise.
# NOTE(review): `device` is not referenced by any other cell visible here.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
device

device(type='cuda')

In [4]:
# Load the two input tables (comma is read_csv's default separator).
# `train_val_test` is the data that gets split below; `classification` is used
# as the target/stratification table.
# NOTE(review): both files are presumably row-aligned — TODO confirm.
train_val_test = pd.read_csv('processed_abs_train.csv')
classification = pd.read_csv('classification_Medline.csv')

In [5]:
# Step 1: hold out 15% of all rows as the test set, stratified on the labels.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    train_val_test,
    classification,
    test_size=0.15,
    random_state=42,
    stratify=classification,
)

# Step 2: carve the validation set out of the remaining 85%.
# 0.17647 ~= 0.15 / 0.85, so validation is also ~15% of the full data set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=0.17647,
    random_state=42,
    stratify=y_train_val,
)

In [6]:
# Dump each split to a file with no index and no header row; these files are
# read back later by fastText training and by get_embeddings.
for split_frame, split_path in (
    (X_train, 'train.train'),
    (X_test, 'test.test'),
    (X_val, 'val.val'),
):
    split_frame.to_csv(split_path, index=False, header=False)

In [7]:
# Train unsupervised skip-gram fastText vectors on the training split only,
# so the validation and test texts do not influence the embedding space.
# NOTE(review): confirm that `wordNgrams` has any effect for unsupervised
# (skipgram) training; it may only matter for supervised models.
model = fasttext.train_unsupervised(
    'train.train',
    model='skipgram',
    dim=700,
    epoch=10,
    wordNgrams=3,
    lr=0.03,
)

def get_embeddings(model, data_file):
    """Embed every line of a text file as the mean of its word vectors.

    Each line is split on whitespace, every token is looked up with
    ``model.get_word_vector`` and the resulting vectors are averaged, giving
    one row per line in file order.

    Lines that contain no tokens (blank lines) are mapped to an all-zero
    vector of the model's dimension. Without this guard ``np.mean`` of an
    empty list yields a scalar NaN (plus a RuntimeWarning), and the rows can
    no longer be stacked into a regular 2-D matrix.

    Args:
        model: Object exposing ``get_word_vector(word)`` and
            ``get_dimension()`` (e.g. a trained fastText model).
        data_file: Path to a UTF-8 text file, one document per line.

    Returns:
        ``np.ndarray`` of shape ``(number_of_lines, model.get_dimension())``;
        shape ``(0, model.get_dimension())`` for an empty file.
    """
    dim = model.get_dimension()
    embeddings = []
    with open(data_file, 'r', encoding='utf-8') as f:
        for line in f:
            # str.split() with no argument already ignores leading/trailing
            # whitespace, including the newline.
            words = line.split()
            if words:
                word_embeddings = [model.get_word_vector(word) for word in words]
                embeddings.append(np.mean(word_embeddings, axis=0))
            else:
                # Keep row alignment with the input file for blank lines.
                embeddings.append(np.zeros(dim, dtype=np.float32))
    if not embeddings:
        # Preserve the 2-D contract even when the file has no lines.
        return np.empty((0, dim), dtype=np.float32)
    return np.array(embeddings)

# Embed the three split files (train, then validation, then test) with the
# trained fastText model.
train_embeddings, val_embeddings, test_embeddings = (
    get_embeddings(model, split_file)
    for split_file in ('train.train', 'val.val', 'test.test')
)

# Sanity check: one row per document, one column per embedding dimension.
for caption, matrix in (
    ("Train embeddings shape:", train_embeddings),
    ("Validation embeddings shape:", val_embeddings),
    ("Test embeddings shape:", test_embeddings),
):
    print(caption, matrix.shape)

Train embeddings shape: (6072, 700)
Validation embeddings shape: (1302, 700)
Test embeddings shape: (1302, 700)


In [8]:
import xgboost as xgb
from sklearn.metrics import classification_report

# Baseline classifier on the averaged fastText embeddings.
# NOTE: this rebinds `model`, which until now referred to the fastText model
# trained in the previous cell; that object is no longer reachable by name.
model = xgb.XGBClassifier()
model.fit(pd.DataFrame(train_embeddings), y_train)

test_predictions = model.predict(pd.DataFrame(test_embeddings))
val_predictions = model.predict(pd.DataFrame(val_embeddings))

# First report: test split. Second report: validation split.
print(classification_report(y_test, test_predictions))
print(classification_report(y_val, val_predictions))

              precision    recall  f1-score   support

           0       0.94      0.97      0.95       789
           1       0.94      0.90      0.92       513

    accuracy                           0.94      1302
   macro avg       0.94      0.93      0.94      1302
weighted avg       0.94      0.94      0.94      1302

              precision    recall  f1-score   support

           0       0.94      0.94      0.94       789
           1       0.90      0.90      0.90       513

    accuracy                           0.92      1302
   macro avg       0.92      0.92      0.92      1302
weighted avg       0.92      0.92      0.92      1302



In [9]:
# Persist the embedding matrices as CSV (header row kept, index dropped) so
# they can be reused without retraining fastText.
for embedding_matrix, csv_name in (
    (train_embeddings, 'train_embeddings_fasttext.csv'),
    (test_embeddings, 'test_embeddings_fasttext.csv'),
    (val_embeddings, 'val_embeddings_fasttext.csv'),
):
    pd.DataFrame(embedding_matrix).to_csv(csv_name, index=False)