In [1]:
from transformers import BertModel, BertTokenizer



In [9]:
from gensim.models.fasttext import load_facebook_vectors

In [10]:
model = load_facebook_vectors(
    "./word_models/crawl-300d-2M-subword/crawl-300d-2M-subword.bin")

In [11]:
### get word vectors ###
# model.word_vec('word')

2000000

In [21]:
import pandas as pd

In [11]:
data_dir = "../datasets/text_classification/"
model_names = ["gptneox_20B", "gptj_6B", "fairseq_gpt_13B", "text-davinci-002", "text-curie-001",
               "gpt-3.5-turbo", "gpt-4", "j1-jumbo", "j1-grande", "j1-large", "xlarge", "medium"]
answer_column = "ref_answer"
query_name = "query"

In [49]:
import os
import numpy as np
from tqdm import tqdm

In [50]:
def get_query_only(text):
    query = text.split("\n")[-2]
    query = query[query.find(":") + 1:].strip()
    return query

X = []
Y = []

for fname in os.listdir(data_dir):
    fpath = os.path.join(data_dir, fname)
    news_df = pd.read_csv(fpath)
    for _, row in news_df.iterrows():
        X.append({'dataset': fname[:-4], 'query': row[query_name]})
        Y.append({k: row[f"{k}_answer"] == row[answer_column] for k in model_names})

In [51]:
Y = [{k: 1 if v else 0 for k, v in y.items()} for y in Y]

In [53]:
X[0]

{'dataset': 'agnews',
 'query': 'Please answer which category (World, Sports, Business or Sci/Tech) a provided news follows into.\n\nQ: Five-year ban for Blackburn fan One of the two Blackburn Rovers Football Club fans charged with public disorder for racially abusing Dwight Yorke has been handed a five-year ban.\nA: Sports\n\nQ: Major software pirates caught A multimillion-euro software piracy ring has been broken following synchronized raids in Athens and London yesterday, Attica police said.\nA: Sci/Tech\n\nQ: Loews to Buy Entergy-Koch Pipeline  NEW YORK (Reuters) - Conglomerate Loews Corp. &lt;A HREF="http://www.investor.reuters.com/FullQuote.aspx?ticker=LTR.N target=/stocks/quickinfo/fullquote"&gt;LTR.N&lt;/A&gt;  agreed to buy an 8,000-mile natural gas pipeline system from  Entergy-Koch LP for \\$1.14 billion on Monday, in a bid to cash  in on rising U.S. demand for natural gas.\nA: Business\n\nQ: Texas A amp;M Quarterback Finds Groove Once Again Reggie McNeal switched his jersey

In [85]:
def extract_features_word2vec(text):
    words = text.strip().split()
    word_vecs = []
    for word in words:
        try:
            word_vecs.append(model.get_vector(word.strip()))
        except Exception as ex:
            pass
    return np.mean(word_vecs, axis=0)

In [86]:
X_features_word2vec = list(tqdm([{'dataset': x['dataset'], 'features': extract_features_word2vec(x['query'])} for x in X]))

100%|██████████| 39419/39419 [00:00<00:00, 3867129.84it/s]


In [58]:
import pickle
with open("data_word2vec.pkl", mode="wb") as f:
    pickle.dump((X_features_word2vec, Y), f)

In [60]:
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from xgboost import XGBClassifier
from sklearn.multioutput import MultiOutputClassifier

In [67]:
def split_train_test(X, Y, test_size):
    # train = data[data['System'] != dataset]
    # test = data[data['System'] == dataset]
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=42)
    
    x_train = [x['features'] for x in x_train]
    y_train = [list(y.values()) for y in y_train]
    x_test = [x['features'] for x in x_test]
    y_test = [list(y.values()) for y in y_test]

    return x_train, y_train, x_test, y_test

In [88]:
train_x, train_y, test_x, test_y = split_train_test(X_features_word2vec, Y, 0.99)

In [89]:
clf = MultiOutputClassifier(estimator=XGBClassifier(n_jobs=-1, max_depth=10, n_estimators=1000))
clf.fit(train_x, train_y)

In [90]:
y_pred = clf.predict(test_x[0:])
y_score = clf.predict_proba(test_x[0:])

In [91]:
from sklearn.metrics import classification_report, accuracy_score
print(classification_report(test_y, y_pred, digits=3,
                            target_names=model_names
                            ))
print(accuracy_score(test_y, y_pred))

                  precision    recall  f1-score   support

     gptneox_20B      0.733     0.830     0.778     22793
         gptj_6B      0.721     0.809     0.763     22775
 fairseq_gpt_13B      0.724     0.809     0.764     23061
text-davinci-002      0.809     0.879     0.843     27221
  text-curie-001      0.685     0.782     0.730     21968
   gpt-3.5-turbo      0.780     0.868     0.822     24096
           gpt-4      0.799     0.878     0.837     27561
        j1-jumbo      0.756     0.865     0.807     25775
       j1-grande      0.742     0.855     0.794     24814
        j1-large      0.723     0.838     0.776     23495
          xlarge      0.765     0.882     0.819     25910
          medium      0.699     0.843     0.764     22351

       micro avg      0.746     0.847     0.793    291820
       macro avg      0.745     0.845     0.791    291820
    weighted avg      0.747     0.847     0.794    291820
     samples avg      0.613     0.672     0.610    291820

0.251479820

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [92]:
bert_tokenizer = BertTokenizer.from_pretrained("./word_models/bert_uncased_L-4_H-256_A-4/")
bert_model = BertModel.from_pretrained("./word_models/bert_uncased_L-4_H-256_A-4/")

Some weights of the model checkpoint at ./word_models/bert_uncased_L-4_H-256_A-4/ were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [121]:
q = X[0]['query']
inputs = bert_tokenizer([q, q], return_tensors='pt', max_length=512, truncation=True)

In [128]:
len(bert_model(**inputs)['last_hidden_state'][:, 0, :].detach().tolist()[0])

256

In [115]:
def extract_features_bert(texts):
    inputs = bert_tokenizer(texts, return_tensors='pt', max_length=512, truncation=True)
    return bert_model(**inputs)['last_hidden_state'][:, 0, :].detach().tolist()

In [None]:
X_bert = []
for i in tqdm(range(0, len(X) // 10 + 1)):
    texts = [x['query'] for x in X[i: (i + 1) * 10]]
    X_bert.extend(extract_features_bert(texts))

In [108]:
bert_tokenizer.decode(inputs['input_ids'][0])

'[CLS] please answer which category ( world, sports, business or sci / tech ) a provided news follows into. q : five - year ban for blackburn fan one of the two blackburn rovers football club fans charged with public disorder for racially abusing dwight yorke has been handed a five - year ban. a : sports q : major software pirates caught a multimillion - euro software piracy ring has been broken following synchronized raids in athens and london yesterday, attica police said. a : sci / tech q : loews to buy entergy - koch pipeline new york ( reuters ) - conglomerate loews corp. & lt ; a href = " http : / / www. investor. reuters. com / fullquote. aspx? ticker = ltr. n target = / stocks / quickinfo / fullquote " & gt ; ltr. n & lt ; / a & gt ; agreed to buy an 8, 000 - mile natural gas pipeline system from entergy - koch lp for \\ $ 1. 14 billion on monday, in a bid to cash in on rising u. s. demand for natural gas. a : business q : texas a amp ; m quarterback finds groove once again reg

In [1]:
import pickle
with open("data_word2vec.pkl", mode="rb") as f:
    X, Y = pickle.load(f)

In [2]:
from helpers import split_train_test_random, split_train_test_dataset



In [3]:
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from xgboost import XGBClassifier
from sklearn.multioutput import MultiOutputClassifier

In [4]:
train_x, train_y, test_x, test_y = split_train_test_dataset(X, Y, 'coqa')

In [8]:
clf = MultiOutputClassifier(estimator=XGBClassifier(n_jobs=-1, max_depth=10, n_estimators=100))
clf.fit(train_x, train_y)

In [9]:
y_pred = clf.predict(test_x[0:])
y_score = clf.predict_proba(test_x[0:])

In [12]:
from sklearn.metrics import classification_report, accuracy_score
print(classification_report(test_y, y_pred, digits=3,
                            target_names=model_names
                            ))
print(accuracy_score(test_y, y_pred))

                  precision    recall  f1-score   support

     gptneox_20B      0.207     0.744     0.324      1674
         gptj_6B      0.180     0.326     0.232      1495
 fairseq_gpt_13B      0.217     0.800     0.341      1709
text-davinci-002      0.260     0.955     0.409      2089
  text-curie-001      0.188     0.764     0.301      1503
   gpt-3.5-turbo      0.083     0.986     0.153       659
           gpt-4      0.270     0.933     0.419      2167
        j1-jumbo      0.248     0.642     0.358      1999
       j1-grande      0.258     0.441     0.326      2045
        j1-large      0.224     0.773     0.348      1796
          xlarge      0.217     0.952     0.354      1746
          medium      0.177     0.213     0.193      1430

       micro avg      0.211     0.712     0.326     20312
       macro avg      0.211     0.711     0.313     20312
    weighted avg      0.222     0.712     0.329     20312
     samples avg      0.212     0.346     0.231     20312

0.0


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
from torch import nn