## 利用Embedding，训练机器学习模型

In [4]:
!wget https://github.com/aceimnorstuvwxz/toutiao-text-classfication-dataset/raw/master/toutiao_cat_data.txt.zip

--2023-03-20 11:13:19--  https://github.com/aceimnorstuvwxz/toutiao-text-classfication-dataset/raw/master/toutiao_cat_data.txt.zip
Resolving github.com (github.com)... 192.30.255.112
Connecting to github.com (github.com)|192.30.255.112|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/aceimnorstuvwxz/toutiao-text-classfication-dataset/master/toutiao_cat_data.txt.zip [following]
--2023-03-20 11:13:20--  https://raw.githubusercontent.com/aceimnorstuvwxz/toutiao-text-classfication-dataset/master/toutiao_cat_data.txt.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 26912069 (26M) [application/zip]
Saving to: ‘toutiao_cat_data.txt.zip’


2023-03-20 11:13:31 (2.51 MB/s) - ‘toutiao_cat_data.txt.zip’ 

In [12]:
!unzip ./toutiao_cat_data.txt.zip
!mv ./toutiao_cat_data.txt data/

Archive:  ./toutiao_cat_data.txt.zip
  inflating: toutiao_cat_data.txt    


In [3]:
import pandas as pd
import tiktoken
import openai
import os
import backoff

openai.api_key = os.environ.get("OPENAI_API_KEY")

# embedding model parameters
embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base"  # this the encoding for text-embedding-ada-002
max_tokens = 8000  # the maximum for text-embedding-ada-002 is 8191


# import data/toutiao_cat_data.txt as a pandas dataframe
df = pd.read_csv('data/toutiao_cat_data.txt', sep='_!_', names=['id', 'code', 'category', 'title', 'keywords'])
df = df.fillna("")
df["combined"] = (
    "标题: " + df.title.str.strip() + "; 关键字: " + df.keywords.str.strip()
)

print("Lines of text before filtering: ", len(df))

encoding = tiktoken.get_encoding(embedding_encoding)
# omit reviews that are too long to embed
df["n_tokens"] = df.combined.apply(lambda x: len(encoding.encode(x)))
df = df[df.n_tokens <= max_tokens]

print("Lines of text after filtering: ", len(df))


  df = pd.read_csv('data/toutiao_cat_data.txt', sep='_!_', names=['id', 'code', 'category', 'title', 'keywords'])


Lines of text before filtering:  382688
Lines of text after filtering:  382688


In [1]:
# 以下代码生成的文件可以在百度网盘提取，信息如下：
# 链接: https://pan.baidu.com/s/1Cl0eFNLOkQqquf9ls0trEw 提取码: jvr4

In [15]:
# 请不要执行如下代码
# from openai.embeddings_utils import get_embedding

# df_1k = df.sample(1000, random_state=42)

# df_1k["embedding"] = df_1k.combined.apply(lambda x : get_embedding(x, engine=embedding_model))
# df_1k.to_csv("data/toutiao_cat_data_10k_with_embeddings.csv", index=False)

In [16]:
# 请不需要执行如下代码
# @backoff.on_exception(backoff.expo, openai.error.RateLimitError)
# def get_embedding_with_backoff(**kwargs):
#     return get_embedding(**kwargs)

# df_10k = df.sample(10000, random_state=42)

# df_10k["embedding"] = df_10k.combined.apply(lambda x : get_embedding_with_backoff(text=x, engine=embedding_model))
# df_10k.to_csv("data/toutiao_cat_data_10k_with_embeddings.csv", index=False)

In [1]:
# 如果你不想重新计算一遍embedding，请不要运行如下代码
# from openai.embeddings_utils import get_embeddings

# batch_size = 2000

# @backoff.on_exception(backoff.expo, openai.error.RateLimitError)
# def get_embeddings_with_backoff(prompts, engine):
#     embeddings = []
#     for i in range(0, len(prompts), batch_size):
#         batch = prompts[i:i+batch_size]
#         embeddings += get_embeddings(list_of_text=batch, engine=engine)
#     return embeddings

# # randomly sample 10k rows
# df_all = df
# # group prompts into batches of 100
# prompts = df_all.combined.tolist()
# prompt_batches = [prompts[i:i+batch_size] for i in range(0, len(prompts), batch_size)]

# embeddings = []
# for batch in prompt_batches:
#     batch_embeddings = get_embeddings_with_backoff(prompts=batch, engine=embedding_model)
#     embeddings += batch_embeddings

# df_all["embedding"] = embeddings
# df_all.to_parquet("data/toutiao_cat_data_all_with_embeddings.parquet", index=False)

## 训练模型，看看效果怎么样

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

training_data = pd.read_parquet("data/toutiao_cat_data_all_with_embeddings.parquet")

df =  training_data.sample(50000, random_state=42)

X_train, X_test, y_train, y_test = train_test_split(
    list(df.embedding.values), df.category, test_size=0.2, random_state=42
)

clf = RandomForestClassifier(n_estimators=300)
clf.fit(X_train, y_train)
preds = clf.predict(X_test)
probas = clf.predict_proba(X_test)

report = classification_report(y_test, preds)
print(report)

                    precision    recall  f1-score   support

  news_agriculture       0.83      0.85      0.84       495
          news_car       0.88      0.94      0.91       895
      news_culture       0.86      0.77      0.81       741
          news_edu       0.86      0.89      0.87       708
news_entertainment       0.71      0.92      0.80      1051
      news_finance       0.80      0.76      0.78       735
         news_game       0.90      0.81      0.86       742
        news_house       0.91      0.87      0.89       450
     news_military       0.88      0.82      0.85       688
       news_sports       0.90      0.92      0.91       968
        news_story       0.94      0.47      0.62       197
         news_tech       0.81      0.85      0.83      1052
       news_travel       0.80      0.75      0.77       599
        news_world       0.82      0.72      0.77       671
             stock       0.00      0.00      0.00         8

          accuracy                    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [5]:
from sklearn.linear_model import LogisticRegression

df =  training_data

X_train, X_test, y_train, y_test = train_test_split(
    list(df.embedding.values), df.category, test_size=0.2, random_state=42
)

clf = LogisticRegression()
clf.fit(X_train, y_train)
preds = clf.predict(X_test)
probas = clf.predict_proba(X_test)

report = classification_report(y_test, preds)
print(report)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                    precision    recall  f1-score   support

  news_agriculture       0.85      0.88      0.87      3908
          news_car       0.92      0.92      0.92      7101
      news_culture       0.82      0.84      0.83      5719
          news_edu       0.88      0.89      0.89      5376
news_entertainment       0.85      0.88      0.86      7908
      news_finance       0.82      0.78      0.80      5409
         news_game       0.91      0.87      0.89      5899
        news_house       0.90      0.91      0.91      3463
     news_military       0.86      0.82      0.84      4976
       news_sports       0.93      0.93      0.93      7611
        news_story       0.83      0.81      0.82      1308
         news_tech       0.84      0.85      0.85      8168
       news_travel       0.80      0.79      0.79      4252
        news_world       0.79      0.80      0.80      5370
             stock       0.00      0.00      0.00        70

          accuracy                    

  _warn_prf(average, modifier, msg_start, len(result))
