## 利用Embedding，训练机器学习模型

In [None]:
!wget https://github.com/aceimnorstuvwxz/toutiao-text-classfication-dataset/raw/master/toutiao_cat_data.txt.zip

In [None]:
!unzip ./toutiao_cat_data.txt.zip
!mv ./toutiao_cat_data.txt data/

In [1]:
import pandas as pd
import tiktoken
import openai
import os
import backoff

openai.api_key = os.environ.get("OPENAI_API_KEY")

# embedding model parameters
embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base"  # this the encoding for text-embedding-ada-002
max_tokens = 8000  # the maximum for text-embedding-ada-002 is 8191


# import data/toutiao_cat_data.txt as a pandas dataframe
df = pd.read_csv(os.environ.get("JUPYTER_HOME") + '/data/toutiao_cat_data.txt', sep='_!_', names=['id', 'code', 'category', 'title', 'keywords'])
df = df.fillna("")
df["combined"] = (
    "标题: " + df.title.str.strip() + "; 关键字: " + df.keywords.str.strip()
)

print("Lines of text before filtering: ", len(df))

encoding = tiktoken.get_encoding(embedding_encoding)
# omit reviews that are too long to embed
df["n_tokens"] = df.combined.apply(lambda x: len(encoding.encode(x)))
df = df[df.n_tokens <= max_tokens]

print("Lines of text after filtering: ", len(df))


  df = pd.read_csv(os.environ.get("JUPYTER_HOME") + '/data/toutiao_cat_data.txt', sep='_!_', names=['id', 'code', 'category', 'title', 'keywords'])


Lines of text before filtering:  382688
Lines of text after filtering:  382688


In [None]:
# 以下代码生成的文件可以在百度网盘提取，信息如下：
# 链接: https://pan.baidu.com/s/1Cl0eFNLOkQqquf9ls0trEw 提取码: jvr4

In [2]:
from openai import OpenAI
import os

client = OpenAI(api_key=os.environ['OPENAI_API_KEY'])

EMBEDDING_MODEL = "text-embedding-ada-002"

def get_embedding(text, model=EMBEDDING_MODEL):
   text = text.replace("\n", " ")
   return client.embeddings.create(input = [text], model=model).data[0].embedding

In [None]:
# 请不要执行如下代码

# df_1k = df.sample(1000, random_state=42)

# df_1k["embedding"] = df_1k.combined.apply(lambda x : get_embedding(x, model=embedding_model))
# df_1k.to_csv("data/toutiao_cat_data_10k_with_embeddings.csv", index=False)

In [None]:
# 请不需要执行如下代码
# @backoff.on_exception(backoff.expo, openai.RateLimitError)
# def get_embedding_with_backoff(**kwargs):
#    return get_embedding(**kwargs)
#
# df_10k = df.sample(10000, random_state=42)
#
# df_10k["embedding"] = df_10k.combined.apply(lambda x : get_embedding_with_backoff(text=x, model=embedding_model))
# df_10k.to_csv("data/toutiao_cat_data_10k_with_embeddings.csv", index=False)

In [None]:
# 如果你不想重新计算一遍embedding，请不要运行如下代码

batch_size = 2000

def get_embeddings(list_of_text, model):
    return client.embeddings.create(input = list_of_text, model=model).data

@backoff.on_exception(backoff.expo, openai.RateLimitError)
def get_embeddings_with_backoff(prompts, model):
    embeddings = []
    for i in range(0, len(prompts), batch_size):
        batch = prompts[i:i+batch_size]
        batch_embeddings = get_embeddings(list_of_text=batch, model=model)
        embeddings += batch_embeddings
        print(f"Batch {i} Number of embeddings: {len(embeddings)}")
    return embeddings

# randomly sample 10k rows
df_all = df
# group prompts into batches of 100
prompts = df_all.combined.tolist()
prompt_batches = [prompts[i:i+batch_size] for i in range(0, len(prompts), batch_size)]

embeddings = []
for batch in prompt_batches:
    batch_embeddings = get_embeddings_with_backoff(prompts=batch, model=embedding_model)
    embeddings += batch_embeddings

df_all["embedding"] = embeddings
df_all.to_parquet(os.environ.get("JUPYTER_HOME") + "/data/toutiao_cat_data_all_with_embeddings.parquet", index=False)

## 训练模型，看看效果怎么样

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

training_data = pd.read_parquet(os.environ.get("JUPYTER_HOME") + "/data/toutiao_cat_data_all_with_embeddings.parquet")

df =  training_data.sample(50000, random_state=42)

X_train, X_test, y_train, y_test = train_test_split(
    list(df.embedding.values), df.category, test_size=0.2, random_state=42
)

clf = RandomForestClassifier(n_estimators=300)
clf.fit(X_train, y_train)
preds = clf.predict(X_test)
probas = clf.predict_proba(X_test)

report = classification_report(y_test, preds)
print(report)

: 

In [None]:
from sklearn.linear_model import LogisticRegression

df =  training_data

X_train, X_test, y_train, y_test = train_test_split(
    list(df.embedding.values), df.category, test_size=0.2, random_state=42
)

clf = LogisticRegression()
clf.fit(X_train, y_train)
preds = clf.predict(X_test)
probas = clf.predict_proba(X_test)

report = classification_report(y_test, preds)
print(report)