In [1]:
pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)
Using cached pybind11-2.13.6-py3-none-any.whl (243 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (pyproject.toml) ... [?25ldone
[?25h  Created wheel for fasttext: filename=fasttext-0.9.3-cp312-cp312-macosx_15_0_arm64.whl size=288243 sha256=3a2e31e0d048b0720a4add749e50ee542edb9eac86583def716f10a3c6581387
  Stored in directory: /Users/vi/Library/Caches/pip/wheels/20/27/95/a7baf1b435f1cbde017cabdf1e9688526d2b0e929255a359c6
Successfully built fasttext
Installing collected packages: pybind11, 

### Load Libraries

In [None]:
import io
import os
import tempfile

import fasttext
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

### Using FastText as Classification Baseline

In [None]:
# Prepare the data with a train/validation split.
# NOTE(review): `train` is not defined in any cell visible here; it has to be loaded
# in an earlier cell for Restart & Run All to succeed.
train_data, val_data = train_test_split(train, test_size=0.2, random_state=42)
# Work on independent copies so the column assignments below do not trigger
# SettingWithCopyWarning on the frames returned by the split.
train_data = train_data.copy()
val_data = val_data.copy()

# Combine all three text fields into one column and build the fastText training line
# ("__label__<topic> <text>") for each row.
text_columns = ['question_title', 'question_content', 'best_answer']
for split in (train_data, val_data):
    # Missing fields become empty strings instead of turning the whole row into NaN.
    cleaned = split[text_columns].fillna('').astype(str)
    combined = cleaned['question_title'] + ' ' + cleaned['question_content'] + ' ' + cleaned['best_answer']
    # fastText reads one example per line, so embedded line breaks (and any other
    # whitespace runs) are collapsed to single spaces.
    split['text'] = combined.str.replace(r'\s+', ' ', regex=True).str.strip()
    # A label is a single whitespace-delimited token; whitespace inside a topic name
    # would cut the label short, so it is replaced with underscores.
    label = '__label__' + split['topic_name'].astype(str).str.strip().str.replace(r'\s+', '_', regex=True)
    split['formatted'] = label + ' ' + split['text']

# fastText's train/test functions take a path to a UTF-8 text file, not a file-like
# object, so the formatted rows are written to temporary files that are removed as
# soon as training and evaluation are done.
with tempfile.TemporaryDirectory() as tmp_dir:
    train_path = os.path.join(tmp_dir, 'fasttext_train.txt')
    val_path = os.path.join(tmp_dir, 'fasttext_val.txt')
    for path, split in ((train_path, train_data), (val_path, val_data)):
        with open(path, 'w', encoding='utf-8') as handle:
            handle.write("\n".join(split['formatted'].tolist()) + "\n")

    # Train the fastText model
    # TODO: Consider tuning parameters
    model = fasttext.train_supervised(input=train_path, epoch=25, lr=1.0, wordNgrams=2)

    # Evaluate the model on validation data; result is (n_examples, precision@1, recall@1)
    result = model.test(val_path)

print(f"Validation Precision: {result[1]}")
print(f"Validation Recall: {result[2]}")
# With exactly one label per row and k=1, precision@1 is the fraction of rows
# classified correctly, which is why result[1] is reused as the accuracy.
print(f"Validation Accuracy: {result[1]}")

# Predict on test dataset

# TODO: Preprocessing test dataset into same format
# TODO: Other metrics

# Smoke-test the classifier on a single hand-written string.
text_sample = "Sample text for prediction"
prediction = model.predict(text_sample)
# predict() yields two parallel sequences: the labels and their scores.
label, confidence = prediction
print(f"Predicted label: {label[0]}, Confidence: {confidence[0]}")

### Using FastText for Word Embeddings

In [None]:
# Train an unsupervised fastText model for embeddings.
# Combine all text fields for training embeddings; missing fields become empty
# strings so the join below never sees a NaN.
train_data['text'] = (
    train_data['question_title'].fillna('').astype(str) + ' '
    + train_data['question_content'].fillna('').astype(str) + ' '
    + train_data['best_answer'].fillna('').astype(str)
)

# fastText's trainer takes a path to a UTF-8 text file, not a file-like object, so
# the corpus is written to a temporary file that is deleted once training finishes.
with tempfile.TemporaryDirectory() as tmp_dir:
    corpus_path = os.path.join(tmp_dir, 'fasttext_corpus.txt')
    with open(corpus_path, 'w', encoding='utf-8') as corpus_file:
        corpus_file.write("\n".join(train_data['text'].tolist()) + "\n")

    # Train fastText model for word embeddings (unsupervised, skip-gram)
    embedding_model = fasttext.train_unsupervised(input=corpus_path, model='skipgram')

#  Generate Sentence Embeddings for Each Sample
# Function to get sentence embedding by averaging word vectors
def get_sentence_embedding(text, model):
    """Return the mean word vector of the in-vocabulary words in ``text``.

    Args:
        text: Whitespace-separated string. Non-string values (e.g. NaN from a
            missing field) are treated as empty text.
        model: Trained fastText model, or any object exposing ``get_word_vector``,
            ``get_word_id`` and ``get_dimension``.

    Returns:
        1-D numpy array of length ``model.get_dimension()``. It is all zeros when
        none of the words in ``text`` is in the model vocabulary.
    """
    words = text.split() if isinstance(text, str) else []
    # get_word_id returns -1 for out-of-vocabulary words; this is a dictionary lookup
    # rather than a linear scan of the model.words list for every token.
    word_vectors = [model.get_word_vector(word) for word in words if model.get_word_id(word) >= 0]
    if word_vectors:
        return np.mean(word_vectors, axis=0)
    # Zero vector if no words are found. Size it from the model that was passed in,
    # not from a notebook-level global.
    return np.zeros(model.get_dimension())

# Generate embeddings for training and validation data.
# val_data['text'] is expected to exist already (it is built in the classification cell).
train_data['embedding'] = train_data['text'].apply(get_sentence_embedding, model=embedding_model)
val_data['embedding'] = val_data['text'].apply(get_sentence_embedding, model=embedding_model)

# Prepare feature matrices and labels for ML models: one row per sample,
# one column per embedding dimension.
X_train = np.vstack(train_data['embedding'].values)
X_val = np.vstack(val_data['embedding'].values)

# Integer labels for training and validation.
# NOTE(review): the classification cell labels rows with 'topic_name'; this assumes a
# separate integer 'topic' column exists in the data — confirm.
y_train = train_data['topic']
y_val = val_data['topic']

# Sanity check: features and labels must line up row for row.
assert X_train.shape[0] == len(y_train)
assert X_val.shape[0] == len(y_val)

# Train models:
# TODO: Try more models
# E.g. a Random Forest Classifier. The seed is fixed (same value as the data split)
# so the reported accuracy is reproducible across runs.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Evaluate on validation set
# TODO: more metrics
y_pred = clf.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print(f"Random Forest Validation Accuracy: {accuracy}")