In [1]:
from gensim.models.fasttext import load_facebook_vectors
import pandas as pd
import os
import numpy as np
from tqdm import tqdm
import pickle

### Download the word vector from https://fasttext.cc/docs/en/english-vectors.html
Pre-trained word vectors:
- wiki-news-300d-1M.vec.zip: 1 million word vectors trained on Wikipedia 2017, UMBC webbase corpus and statmt.org news dataset (16B tokens).
- wiki-news-300d-1M-subword.vec.zip: 1 million word vectors trained with subword infomation on Wikipedia 2017, UMBC webbase corpus and statmt.org news dataset (16B tokens).
- crawl-300d-2M.vec.zip: 2 million word vectors trained on Common Crawl (600B tokens).
- crawl-300d-2M-subword.zip: 2 million word vectors trained with subword information on Common Crawl (600B tokens) (** current one **)

In [61]:
# Load the FastText model from the file
model = load_facebook_vectors(
    "./word_models/crawl-300d-2M-subword/crawl-300d-2M-subword.bin")

In [None]:
### get word vectors ###
# model.word_vec('word')

In [2]:
data_dir = "../datasets/text_classification/"
model_names = ["gptneox_20B", "gptj_6B", "fairseq_gpt_13B", "text-davinci-002", "text-curie-001",
               "gpt-3.5-turbo", "gpt-4", "j1-jumbo", "j1-grande", "j1-large", "xlarge", "medium"]
answer_column = "ref_answer"
query_name = "content"

In [65]:
# single dataset preprocess
datasets = ['agnews', 'coqa', 'headlines', 'overruling', 'sciq']

for dataset in datasets:
    job_data = pd.read_csv(f"{data_dir}/{dataset}.csv")
    X = []
    Y = []
    cost = []
    for _, row in job_data.iterrows():
        X.append({'dataset': dataset, 'query': row[query_name]})
        Y.append({k: row[f"{k}_answer"] == row[answer_column] for k in model_names})
        cost.append({k: row[f"{k}_cost"] for k in model_names})
    X_features_word2vec = [{'dataset': x['dataset'], 'features': extract_features_word2vec(x['query'])} for x in X]
    Y = [{k: 1 if v else 0 for k, v in y.items()} for y in Y]
    print(dataset, len(job_data))
    with open(f"{dataset}_content_word2vec.pkl", mode="wb") as f:
        pickle.dump((X_features_word2vec, Y, cost), f)

agnews 7600
coqa 7982
headlines 10000
overruling 2160
sciq 11677


In [66]:
data_dir = "../datasets/log_parsing/"
model_names = ["j2_mid", "j2_ultra", "Mixtral_8x7B", "llama2_7b", "llama2_13b",
               "llama2_70b", "Yi_34B", "Yi_6B"]
answer_column = "ref_answer"
query_name = "content"

In [68]:
logs_df = pd.read_csv(f"{data_dir}/log_parsing.csv")
X = []
Y = []
cost = []
for _, row in logs_df.iterrows():
    X.append({'dataset': row["dataset"], 'query': row[query_name]})
    Y.append({k: row[f"{k}_answer"] == row[answer_column] for k in model_names})
    cost.append({k: row[f"{k}_cost"] for k in model_names})
X_features_word2vec = [{'dataset': x['dataset'], 'features': extract_features_word2vec(x['query'])} for x in X]
Y = [{k: 1 if v else 0 for k, v in y.items()} for y in Y]
print(len(logs_df))
with open(f"logs_df_{query_name}_word2vec.pkl", mode="wb") as f:
    pickle.dump((X_features_word2vec, Y, cost), f)

32000


In [None]:
# Load the data from the CSV files
def get_query_only(text):
    query = text.split("\n")[-2]
    query = query[query.find(":") + 1:].strip()
    return query

X = []
Y = []
cost = []
for fname in os.listdir(data_dir):
    fpath = os.path.join(data_dir, fname)
    news_df = pd.read_csv(fpath)
    for _, row in news_df.iterrows():
        X.append({'dataset': fname[:-4], 'query': row[query_name]})
        Y.append({k: row[f"{k}_answer"] == row[answer_column] for k in model_names})
        cost.append({k: row[f"{k}_cost"] for k in model_names})

In [None]:
X[0]

In [None]:
Y = [{k: 1 if v else 0 for k, v in y.items()} for y in Y]

In [47]:
def extract_features_word2vec(text):
    words = text.strip().split()
    word_vecs = []
    for word in words:
        try:
            word_vecs.append(model.get_vector(word.strip()))
        except Exception as ex:
            pass
    return np.mean(word_vecs, axis=0)

In [None]:
X_features_word2vec = [{'dataset': x['dataset'], 'features': extract_features_word2vec(x['query'])} for x in X]

In [None]:
# Save the data to a file
import pickle
with open("content_word2vec.pkl", mode="wb") as f:
    pickle.dump((X_features_word2vec, Y, cost), f)

In [1]:
import pickle
file_name = "content"  # content
with open(f"{file_name}_word2vec.pkl", mode="rb") as f:
    X, Y, cost = pickle.load(f)

In [13]:
import pickle
file_name = "content"  # content
with open(f"logs_df_{file_name}_word2vec.pkl", mode="rb") as f:
    X, Y, cost = pickle.load(f)

In [3]:
datasets = ['agnews', 'coqa', 'headlines', 'overruling', 'sciq']
dataset = "agnews"
# for dataset in datasets:
with open(f"{dataset}_word2vec.pkl", mode="rb") as f:
    X, Y, cost = pickle.load(f)

In [4]:
from helpers import split_train_test_random, split_train_test_dataset, split_train_test_

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from xgboost import XGBClassifier
from sklearn.multioutput import MultiOutputClassifier

In [6]:
data_dict = {}
for i in range(len(X)):
    infor_dict = {}
    infor_dict['features'] = X[i]['features']
    infor_dict['label'] = Y[i]
    infor_dict['cost'] = cost[i]
    data_dict[i] = infor_dict

In [7]:
train_data, test_data = train_test_split(data_dict, test_size=0.99, random_state=42)
train_x, train_y, test_x, test_y = split_train_test_(train_data, test_data)

76

In [15]:
label = pd.DataFrame([x['label'] for x in train_data])
score = label.sum(axis=0) / len(label)
score

gptneox_20B         0.513158
gptj_6B             0.618421
fairseq_gpt_13B     0.776316
text-davinci-002    0.934211
text-curie-001      0.723684
gpt-3.5-turbo       0.868421
gpt-4               0.894737
j1-jumbo            0.894737
j1-grande           0.828947
j1-large            0.763158
xlarge              0.868421
medium              0.565789
dtype: float64

In [6]:
train_x = [x['features'] for x in train_data]
train_y = [list(y['label'].values()) for y in train_data]
test_x = [x['features'] for x in test_data]
test_y = [list(y['label'].values()) for y in test_data]

In [None]:
# train_x, train_y, test_x, test_y = split_train_test_random(X, Y, 0.99)

In [None]:
cost = [x['cost'] for x in test_data]
cost

In [7]:
clf = MultiOutputClassifier(estimator=XGBClassifier(n_jobs=-1, max_depth=100, n_estimators=1000))
clf.fit(train_x, train_y)

In [8]:
y_pred = clf.predict(test_x[0:])
y_score = clf.predict_proba(test_x[0:])

In [9]:
model_names = ["gptneox_20B", "gptj_6B", "fairseq_gpt_13B", "text-davinci-002", "text-curie-001",
               "gpt-3.5-turbo", "gpt-4", "j1-jumbo", "j1-grande", "j1-large", "xlarge", "medium"]

In [10]:
from sklearn.metrics import classification_report, accuracy_score
print(classification_report(test_y, y_pred, digits=3,
                            target_names=model_names
                            ))
print(accuracy_score(test_y, y_pred))

                  precision    recall  f1-score   support

     gptneox_20B      0.704     0.760     0.731     22808
         gptj_6B      0.716     0.740     0.728     22789
 fairseq_gpt_13B      0.688     0.779     0.731     23065
text-davinci-002      0.779     0.872     0.823     27220
  text-curie-001      0.679     0.765     0.720     21978
   gpt-3.5-turbo      0.765     0.832     0.797     24110
           gpt-4      0.797     0.877     0.835     27568
        j1-jumbo      0.739     0.847     0.789     25783
       j1-grande      0.707     0.854     0.774     24818
        j1-large      0.690     0.822     0.750     23498
          xlarge      0.767     0.849     0.806     25923
          medium      0.691     0.776     0.731     22372

       micro avg      0.729     0.818     0.771    291932
       macro avg      0.727     0.815     0.768    291932
    weighted avg      0.730     0.818     0.771    291932
     samples avg      0.623     0.656     0.602    291932

0.205970531

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(train_x, train_y)
y_pred_lr = model.predict(test_x[0:])
# print(classification_report(test_y, y_pred_lr, digits=3,target_names=model_names))
print(accuracy_score(test_y, y_pred_lr))

ValueError: Classification metrics can't handle a mix of multilabel-indicator and continuous-multioutput targets

In [None]:
# for a new query
query = "What is the capital of France?"
query_features = extract_features_word2vec(query)
# query_features = X[0]['features'] # to test
y_pred = clf.predict([query_features])
y_score = clf.predict_proba([query_features])
# print(y_pred)
# print(y_score)
score = {k: v[0][1] for k, v in zip(model_names, y_score)}

In [None]:
score

In [None]:
job_data = pd.read_csv("../datasets/text_queries.csv")
X_features_word2vec = job_data['query'].apply(extract_features_word2vec)


In [None]:
Y = []
for _, row in job_data.iterrows():
    Y.append({k: row[f"{k}_answer"] == row[answer_column] for k in model_names})

Y = [{k: 1 if v else 0 for k, v in y.items()} for y in Y]

In [None]:
Y

In [None]:
job_data["content"].head(5)

In [4]:
import os
import pandas as pd
from prediction.prediction_model import *
datasets = ['logs'] #'overruling', 'agnews', 'coqa', 'headlines', 'sciq']
for dataset in datasets:
    test_data_size = 0.99
    print(f"Processing {dataset} dataset")
    data_dir = f"../datasets/text_classification"
    save_dir = f"output/text_classification/query_{test_data_size}/{dataset}_{test_data_size}"
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    # if dataset == "text_classification":
    # model_list = ['gptneox_20B', 'gptj_6B', 'fairseq_gpt_13B', 'text-davinci-002', 'text-curie-001', 'gpt-3.5-turbo',
    #                   'gpt-4', 'j1-jumbo', 'j1-grande', 'j1-large', 'xlarge', 'medium']
    # elif dataset == "log_parsing":
    model_names = ["j2_mid", "j2_ultra", "Mixtral_8x7B", "llama2_7b", "llama2_13b", "llama2_70b", "Yi_34B", "Yi_6B"]

    df_pre_accuracy, df_true_accuracy, df_cost = data_preprocess(data_dir, dataset, model_names, test_size=test_data_size)

ImportError: cannot import name 'logging' from 'huggingface_hub' (C:\Users\LYue7\anaconda3\envs\llms\lib\site-packages\huggingface_hub\__init__.py)