In [12]:
from gensim.models.fasttext import load_facebook_vectors
import pandas as pd
import os
import numpy as np
from tqdm import tqdm
import pickle

### Download the word vector from https://fasttext.cc/docs/en/english-vectors.html
Pre-trained word vectors:
- wiki-news-300d-1M.vec.zip: 1 million word vectors trained on Wikipedia 2017, UMBC webbase corpus and statmt.org news dataset (16B tokens).
- wiki-news-300d-1M-subword.vec.zip: 1 million word vectors trained with subword infomation on Wikipedia 2017, UMBC webbase corpus and statmt.org news dataset (16B tokens).
- crawl-300d-2M.vec.zip: 2 million word vectors trained on Common Crawl (600B tokens).
- crawl-300d-2M-subword.zip: 2 million word vectors trained with subword information on Common Crawl (600B tokens) (** current one **)

In [61]:
# Load the FastText model from the file
model = load_facebook_vectors(
    "./word_models/crawl-300d-2M-subword/crawl-300d-2M-subword.bin")

In [None]:
### get word vectors ###
# model.word_vec('word')

In [64]:
data_dir = "../datasets/text_classification/"
model_names = ["gptneox_20B", "gptj_6B", "fairseq_gpt_13B", "text-davinci-002", "text-curie-001",
               "gpt-3.5-turbo", "gpt-4", "j1-jumbo", "j1-grande", "j1-large", "xlarge", "medium"]
answer_column = "ref_answer"
query_name = "content"

In [65]:
# single dataset preprocess
datasets = ['agnews', 'coqa', 'headlines', 'overruling', 'sciq']

for dataset in datasets:
    job_data = pd.read_csv(f"{data_dir}/{dataset}.csv")
    X = []
    Y = []
    cost = []
    for _, row in job_data.iterrows():
        X.append({'dataset': dataset, 'query': row[query_name]})
        Y.append({k: row[f"{k}_answer"] == row[answer_column] for k in model_names})
        cost.append({k: row[f"{k}_cost"] for k in model_names})
    X_features_word2vec = [{'dataset': x['dataset'], 'features': extract_features_word2vec(x['query'])} for x in X]
    Y = [{k: 1 if v else 0 for k, v in y.items()} for y in Y]
    print(dataset, len(job_data))
    with open(f"{dataset}_content_word2vec.pkl", mode="wb") as f:
        pickle.dump((X_features_word2vec, Y, cost), f)

agnews 7600
coqa 7982
headlines 10000
overruling 2160
sciq 11677


In [66]:
data_dir = "../datasets/log_parsing/"
model_names = ["j2_mid", "j2_ultra", "Mixtral_8x7B", "llama2_7b", "llama2_13b",
               "llama2_70b", "Yi_34B", "Yi_6B"]
answer_column = "ref_answer"
query_name = "content"

In [68]:
logs_df = pd.read_csv(f"{data_dir}/log_parsing.csv")
X = []
Y = []
cost = []
for _, row in logs_df.iterrows():
    X.append({'dataset': row["dataset"], 'query': row[query_name]})
    Y.append({k: row[f"{k}_answer"] == row[answer_column] for k in model_names})
    cost.append({k: row[f"{k}_cost"] for k in model_names})
X_features_word2vec = [{'dataset': x['dataset'], 'features': extract_features_word2vec(x['query'])} for x in X]
Y = [{k: 1 if v else 0 for k, v in y.items()} for y in Y]
print(len(logs_df))
with open(f"logs_df_{query_name}_word2vec.pkl", mode="wb") as f:
    pickle.dump((X_features_word2vec, Y, cost), f)

32000


In [None]:
# Load the data from the CSV files
def get_query_only(text):
    query = text.split("\n")[-2]
    query = query[query.find(":") + 1:].strip()
    return query

X = []
Y = []
cost = []
for fname in os.listdir(data_dir):
    fpath = os.path.join(data_dir, fname)
    news_df = pd.read_csv(fpath)
    for _, row in news_df.iterrows():
        X.append({'dataset': fname[:-4], 'query': row[query_name]})
        Y.append({k: row[f"{k}_answer"] == row[answer_column] for k in model_names})
        cost.append({k: row[f"{k}_cost"] for k in model_names})

In [None]:
X[0]

In [None]:
Y = [{k: 1 if v else 0 for k, v in y.items()} for y in Y]

In [47]:
def extract_features_word2vec(text):
    words = text.strip().split()
    word_vecs = []
    for word in words:
        try:
            word_vecs.append(model.get_vector(word.strip()))
        except Exception as ex:
            pass
    return np.mean(word_vecs, axis=0)

In [None]:
X_features_word2vec = [{'dataset': x['dataset'], 'features': extract_features_word2vec(x['query'])} for x in X]

In [None]:
# Save the data to a file
import pickle
with open("content_word2vec.pkl", mode="wb") as f:
    pickle.dump((X_features_word2vec, Y, cost), f)

In [54]:
import pickle
file_name = "content"  # content
with open(f"{file_name}_word2vec.pkl", mode="rb") as f:
    X, Y, cost = pickle.load(f)

In [13]:
import pickle
file_name = "content"  # content
with open(f"logs_df_{file_name}_word2vec.pkl", mode="rb") as f:
    X, Y, cost = pickle.load(f)

In [59]:
datasets = ['agnews', 'coqa', 'headlines', 'overruling', 'sciq']

for dataset in datasets:
    with open(f"{dataset}_word2vec.pkl", mode="rb") as f:
        X, Y, cost = pickle.load(f)
        print(dataset, X[0])

agnews {'dataset': 'agnews', 'features': array([-1.30344694e-02, -1.71714611e-02,  3.59053947e-02,  9.45682544e-03,
       -6.83385832e-03, -1.14706364e-02,  7.46293142e-02,  1.38591073e-04,
       -1.10429747e-03, -6.84385234e-03,  1.14365750e-04,  1.24658190e-03,
        4.02127393e-03,  7.75221502e-04,  8.24482762e-04, -2.10659509e-03,
       -3.21886898e-03,  9.17145167e-04, -1.02229360e-02,  8.23147688e-03,
        1.58348419e-02,  2.45972374e-03,  3.48740746e-03, -6.38847612e-03,
       -1.12849195e-02,  1.09226899e-02, -2.41349563e-02, -6.36212807e-03,
       -8.88290070e-03,  1.10646226e-02, -9.04017501e-03, -1.20396102e-02,
       -1.22197147e-03, -3.74525925e-03,  7.11274659e-03,  1.63371989e-03,
       -2.08996539e-03, -4.62592230e-04,  2.30028480e-03,  9.79553256e-03,
       -8.91225506e-03, -6.39058426e-02,  3.88412457e-03,  7.47943204e-03,
       -5.72390668e-03,  5.64492941e-02, -1.15413945e-02,  6.75583375e-04,
       -1.17010726e-02,  8.81383382e-03,  5.74846659e-03, -

In [2]:
from helpers import split_train_test_random, split_train_test_dataset, split_train_test_

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from xgboost import XGBClassifier
from sklearn.multioutput import MultiOutputClassifier

In [15]:
data_dict = {}
for i in range(len(X)):
    infor_dict = {}
    infor_dict['features'] = X[i]['features']
    infor_dict['label'] = Y[i]
    infor_dict['cost'] = cost[i]
    data_dict[i] = infor_dict

In [22]:
train_data, test_data = train_test_split(data_dict, test_size=0.99, random_state=42)
train_x, train_y, test_x, test_y = split_train_test_(train_data, test_data)

NameError: name 'split_train_test_' is not defined

In [19]:
data_dict[0]

{'features': array([-1.88731775e-03,  6.03284407e-03,  4.41974476e-02, -5.60682407e-03,
        -5.42015117e-03, -3.77124026e-02,  4.36003953e-02,  1.05917575e-02,
         2.48097070e-02, -6.00890629e-03,  9.94577073e-03,  1.48976343e-02,
        -1.78611800e-02,  2.31782682e-02,  1.63917919e-03,  4.03230404e-03,
         4.14691260e-03, -8.79830099e-04,  6.12845505e-03,  1.96187347e-02,
         2.34913342e-02,  1.85788609e-02, -9.59020667e-03, -7.09624123e-03,
         2.98187975e-02,  9.20136087e-03, -4.06162031e-02, -8.55865516e-03,
        -3.32335159e-02, -1.95528986e-03,  2.03704857e-03, -2.49226000e-02,
         3.42395972e-03,  1.28019126e-02,  5.81075363e-02, -1.87113397e-02,
        -1.26151983e-02, -2.46851649e-02,  1.29973138e-04, -2.06318758e-02,
        -1.26796989e-02, -8.22913125e-02,  7.12522771e-03, -5.40555734e-03,
         9.08060279e-03,  4.95495163e-02, -6.23691175e-03, -2.23871768e-02,
        -7.85248075e-03, -1.13578362e-03, -2.38721613e-02,  8.73606093e-03,


In [None]:
train_x = [x['features'] for x in train_data]
train_y = [list(y['label'].values()) for y in train_data]
test_x = [x['features'] for x in test_data]
test_y = [list(y['label'].values()) for y in test_data]

In [None]:
# train_x, train_y, test_x, test_y = split_train_test_random(X, Y, 0.99)

In [None]:
cost = [x['cost'] for x in test_data]
cost

In [None]:
clf = MultiOutputClassifier(estimator=XGBClassifier(n_jobs=-1, max_depth=100, n_estimators=1000))
clf.fit(train_x, train_y)

In [None]:
y_pred = clf.predict(test_x[0:])
y_score = clf.predict_proba(test_x[0:])

In [None]:
model_names = ["gptneox_20B", "gptj_6B", "fairseq_gpt_13B", "text-davinci-002", "text-curie-001",
               "gpt-3.5-turbo", "gpt-4", "j1-jumbo", "j1-grande", "j1-large", "xlarge", "medium"]

In [None]:
from sklearn.metrics import classification_report, accuracy_score
print(classification_report(test_y, y_pred, digits=3,
                            target_names=model_names
                            ))
print(accuracy_score(test_y, y_pred))

In [None]:
# for a new query
query = "What is the capital of France?"
query_features = extract_features_word2vec(query)
# query_features = X[0]['features'] # to test
y_pred = clf.predict([query_features])
y_score = clf.predict_proba([query_features])
# print(y_pred)
# print(y_score)
score = {k: v[0][1] for k, v in zip(model_names, y_score)}

In [None]:
score

In [None]:
job_data = pd.read_csv("../datasets/text_queries.csv")
X_features_word2vec = job_data['query'].apply(extract_features_word2vec)


In [None]:
Y = []
for _, row in job_data.iterrows():
    Y.append({k: row[f"{k}_answer"] == row[answer_column] for k in model_names})

Y = [{k: 1 if v else 0 for k, v in y.items()} for y in Y]

In [None]:
Y

In [None]:
job_data["content"].head(5)

In [4]:
import os
import pandas as pd
from prediction.prediction_model import *
datasets = ['logs'] #'overruling', 'agnews', 'coqa', 'headlines', 'sciq']
for dataset in datasets:
    test_data_size = 0.99
    print(f"Processing {dataset} dataset")
    data_dir = f"../datasets/text_classification"
    save_dir = f"output/text_classification/query_{test_data_size}/{dataset}_{test_data_size}"
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    # if dataset == "text_classification":
    # model_list = ['gptneox_20B', 'gptj_6B', 'fairseq_gpt_13B', 'text-davinci-002', 'text-curie-001', 'gpt-3.5-turbo',
    #                   'gpt-4', 'j1-jumbo', 'j1-grande', 'j1-large', 'xlarge', 'medium']
    # elif dataset == "log_parsing":
    model_names = ["j2_mid", "j2_ultra", "Mixtral_8x7B", "llama2_7b", "llama2_13b", "llama2_70b", "Yi_34B", "Yi_6B"]

    df_pre_accuracy, df_true_accuracy, df_cost = data_preprocess(data_dir, dataset, model_names, test_size=test_data_size)

ImportError: cannot import name 'logging' from 'huggingface_hub' (C:\Users\LYue7\anaconda3\envs\llms\lib\site-packages\huggingface_hub\__init__.py)