In [9]:
from gensim.models.fasttext import load_facebook_vectors

### Download the word vectors from https://fasttext.cc/docs/en/english-vectors.html
Pre-trained word vectors:
- wiki-news-300d-1M.vec.zip: 1 million word vectors trained on Wikipedia 2017, UMBC webbase corpus and statmt.org news dataset (16B tokens).
- wiki-news-300d-1M-subword.vec.zip: 1 million word vectors trained with subword information on Wikipedia 2017, UMBC webbase corpus and statmt.org news dataset (16B tokens).
- crawl-300d-2M.vec.zip: 2 million word vectors trained on Common Crawl (600B tokens).
- crawl-300d-2M-subword.zip: 2 million word vectors trained with subword information on Common Crawl (600B tokens). **(currently used)**

In [10]:
# Read the pre-trained FastText vectors from Facebook's native .bin format.
# The result is bound to the global `model`, which the feature extractor uses.
model = load_facebook_vectors(
    "./word_models/crawl-300d-2M-subword/crawl-300d-2M-subword.bin"
)

In [11]:
### get word vectors ###
# model.get_vector('word')

2000000

In [21]:
import pandas as pd

In [11]:
# Directory scanned by the loader; every file in it is read as one dataset.
data_dir = "../datasets/text_classification/"

# Candidate models. The loader looks up a "<name>_answer" column for each one.
model_names = [
    "gptneox_20B",
    "gptj_6B",
    "fairseq_gpt_13B",
    "text-davinci-002",
    "text-curie-001",
    "gpt-3.5-turbo",
    "gpt-4",
    "j1-jumbo",
    "j1-grande",
    "j1-large",
    "xlarge",
    "medium",
]

# Column holding the reference answer that each model answer is compared to.
answer_column = "ref_answer"
# Column holding the query text.
query_name = "query"

In [49]:
import os
import numpy as np
from tqdm import tqdm

In [50]:
# Load the data from the CSV files
def get_query_only(text):
    """Return the query held on the second-to-last line of ``text``.

    The line is cut after its first ``:`` (a leading label such as ``Q:`` is
    dropped) and surrounding whitespace is removed. If the line has no colon,
    the whole stripped line is returned.

    Raises:
        IndexError: if ``text`` contains fewer than two newline-separated lines.
    """
    penultimate_line = text.split("\n")[-2]
    _, colon, remainder = penultimate_line.partition(":")
    # Without a colon there is no label to drop, so keep the entire line.
    return (remainder if colon else penultimate_line).strip()

# X collects one {'dataset', 'query'} record per CSV row; Y collects, for the
# same row, a {model_name: bool} dict telling whether that model's answer
# equals the reference answer.
X = []
Y = []

# os.listdir returns entries in arbitrary order; sorting makes the order of
# X / Y reproducible across runs and machines.
for fname in sorted(os.listdir(data_dir)):
    dataset_name, extension = os.path.splitext(fname)
    # Ignore anything that is not a CSV (e.g. .ipynb_checkpoints, .DS_Store),
    # which would otherwise crash read_csv or be parsed as garbage.
    if extension.lower() != ".csv":
        continue
    fpath = os.path.join(data_dir, fname)
    news_df = pd.read_csv(fpath)
    for _, row in news_df.iterrows():
        X.append({'dataset': dataset_name, 'query': row[query_name]})
        Y.append({k: row[f"{k}_answer"] == row[answer_column] for k in model_names})

In [51]:
# Turn each truthy/falsy correctness flag into an integer 1/0 label.
Y = [
    {name: int(bool(flag)) for name, flag in labels.items()}
    for labels in Y
]

In [85]:
def extract_features_word2vec(text):
    """Embed ``text`` as the mean of its per-token word vectors.

    The text is split on whitespace and every token is looked up in the
    global ``model`` (the FastText vectors loaded earlier in the notebook;
    despite the function name this is not a word2vec model).

    Args:
        text: the query string to embed.

    Returns:
        A 1-D array of length ``model.vector_size``. If no token yields a
        vector (empty or whitespace-only text, or every lookup fails), an
        all-zero vector is returned instead of the NaN scalar that
        ``np.mean([])`` would produce, so every sample has the same shape.
    """
    word_vecs = []
    # str.split() without arguments already strips and drops empty tokens.
    for word in text.split():
        try:
            word_vecs.append(model.get_vector(word))
        except KeyError:
            # Token has no vector in the model; leave it out of the average.
            # Only KeyError is skipped so that real errors still surface.
            continue
    if not word_vecs:
        # float32 is assumed to match the dtype of the model's vectors.
        return np.zeros(model.vector_size, dtype=np.float32)
    return np.mean(word_vecs, axis=0)

In [86]:
# Replace each raw query with its averaged word-vector embedding, keeping the
# dataset tag next to the features.
X_features_word2vec = [
    {
        'dataset': sample['dataset'],
        'features': extract_features_word2vec(sample['query']),
    }
    for sample in X
]

100%|██████████| 39419/39419 [00:00<00:00, 3867129.84it/s]


In [58]:
# Cache the (features, labels) pair on disk so it can be reloaded from
# "data_word2vec.pkl" without recomputing the embeddings.
import pickle
with open("data_word2vec.pkl", "wb") as f:
    pickle.dump((X_features_word2vec, Y), f)

In [42]:
# Restore the cached (features, labels) pair written by the save cell.
# NOTE: this rebinds X to the list of {'dataset', 'features'} dicts, so it no
# longer holds the raw {'dataset', 'query'} records built by the CSV loader.
# NOTE(security): pickle.load executes arbitrary code; only load a file that
# this notebook produced itself.
import pickle
with open("data_word2vec.pkl", "rb") as f:
    X, Y = pickle.load(f)

In [43]:
from helpers import split_train_test_random, split_train_test_dataset

In [44]:
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from xgboost import XGBClassifier
from sklearn.multioutput import MultiOutputClassifier

In [45]:
# Split features/labels into train and test parts with the project helper.
# NOTE(review): whether 0.99 is the train or the test fraction depends on
# helpers.split_train_test_random, which is not visible here — confirm, and
# check whether the helper takes a seed so the split is reproducible.
train_x, train_y, test_x, test_y = split_train_test_random(X, Y, 0.99)

In [46]:
# One independent XGBoost binary classifier per target column.
# NOTE(review): max_depth=100 with 1000 trees is a very high-capacity setting;
# confirm it is intentional rather than a leftover experiment.
clf = MultiOutputClassifier(
    estimator=XGBClassifier(
        n_estimators=1000,
        max_depth=100,
        n_jobs=-1,
    )
)
clf.fit(train_x, train_y)

In [47]:
# Hard 0/1 predictions and per-output class probabilities for the whole test
# set. The former `test_x[0:]` slice selected every row anyway, so it was a
# no-op left over from debugging on a subset.
y_pred = clf.predict(test_x)
y_score = clf.predict_proba(test_x)

In [48]:
# Model names used to label the report rows and to key the per-query scores.
# NOTE(review): presumably this must stay in the same order as the label
# columns the classifier was trained on — confirm against the split helper.
model_names = [
    "gptneox_20B",
    "gptj_6B",
    "fairseq_gpt_13B",
    "text-davinci-002",
    "text-curie-001",
    "gpt-3.5-turbo",
    "gpt-4",
    "j1-jumbo",
    "j1-grande",
    "j1-large",
    "xlarge",
    "medium",
]

In [49]:
from sklearn.metrics import classification_report, accuracy_score
# Per-model precision / recall / F1 on the test set.
# zero_division=0 yields the same numbers as the default ("warn" also scores
# undefined metrics as 0) but states the choice explicitly and avoids the
# UndefinedMetricWarning raised for samples with no true or predicted labels.
print(classification_report(test_y, y_pred, digits=3,
                            target_names=model_names,
                            zero_division=0
                            ))
# With multilabel targets accuracy_score is subset accuracy: a sample counts
# as correct only when every label is predicted correctly, so it is much
# lower than the per-label scores above.
print(accuracy_score(test_y, y_pred))

                  precision    recall  f1-score   support

     gptneox_20B      0.731     0.839     0.781     22793
         gptj_6B      0.726     0.802     0.762     22775
 fairseq_gpt_13B      0.723     0.808     0.763     23061
text-davinci-002      0.810     0.875     0.842     27221
  text-curie-001      0.685     0.784     0.731     21968
   gpt-3.5-turbo      0.779     0.874     0.823     24096
           gpt-4      0.801     0.870     0.834     27561
        j1-jumbo      0.761     0.857     0.806     25775
       j1-grande      0.741     0.860     0.796     24814
        j1-large      0.723     0.845     0.779     23495
          xlarge      0.764     0.875     0.816     25910
          medium      0.702     0.841     0.766     22351

       micro avg      0.747     0.846     0.794    291820
       macro avg      0.746     0.844     0.792    291820
    weighted avg      0.748     0.846     0.794    291820
     samples avg      0.612     0.671     0.609    291820

0.255810377

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [65]:
# Score a single, previously unseen query against every model.
# NOTE(review): this rebinds y_pred / y_score, which the evaluation cell also
# uses for the test set; re-run the prediction cell before re-running the
# report.
query = "What is the capital of France?"
query_features = extract_features_word2vec(query)
# query_features = X[0]['features']  # swap in a stored sample to sanity-check
y_pred = clf.predict([query_features])
y_score = clf.predict_proba([query_features])
# y_score is zipped with model_names, i.e. one probability array per model.
# [0] picks the single query row and [1] the second class column, presumably
# the probability of label 1 ("model answers correctly") — this assumes both
# classes were seen for every model during training.
score = {
    name: per_model_proba[0][1]
    for name, per_model_proba in zip(model_names, y_score)
}

In [66]:
# Show the per-model score dict computed for the example query.
score

{'gptneox_20B': 0.83844745,
 'gptj_6B': 0.74707603,
 'fairseq_gpt_13B': 0.9795002,
 'text-davinci-002': 0.9940054,
 'text-curie-001': 0.90561515,
 'gpt-3.5-turbo': 0.8935841,
 'gpt-4': 0.99221873,
 'j1-jumbo': 0.9869047,
 'j1-grande': 0.9963413,
 'j1-large': 0.8274226,
 'xlarge': 0.9966762,
 'medium': 0.115519546}