In [1]:
import requests
import json
from typing import List, Any

In [40]:
def callGpt(
        userText: str,
        messages: List[Any],
        temperature: float = 0.7,
        max_tokens: int = 500
    ) -> str:
    url: str = f"https://model.hsueh.tw/callapi/chatGPT?temperature={temperature}&max_tokens={max_tokens}&top_p=0.95&frequency_penalty=0&presence_penalty=0&stop=""&past_messages=3&purpose=none"
    payload = json.dumps(messages)
    headers = {
        'Content-Type': 'application/json',
    }

    response = requests.request("POST", url, headers=headers, data=payload)
    try:
        return str(response.json()['choices'][0]['message']['content'])
    except:
        return "ERROR:"+str(response.text)

In [45]:
def callGPT_keywords_extract(userText: str) -> str:
        return callGpt(userText, [
                {
                    "role": "system",
                    "content": """
                    你是一名台灣國小自然科學領域的老師，你發起了國小自然科學探究活動的題目，你現在必須提取出和探究題目有關以及學生在探究過程中需要聚焦的10個重要的關鍵字，請以台灣國小自然科學課綱會出現的詞彙為主，除關鍵字之外請不要回覆其他訊息。注意，請以'、'區隔，不要出現其他格式的訊息。
                    """
                },
                {"role": "user", "content": userText}
        ],0.3)

In [46]:
activity_topic = """太陽光為什麼可以發電？"""
keywords_topic = callGPT_keywords_extract(activity_topic)
print(keywords_topic)

太陽能、光電效應、太陽能板、能量轉換、環保、可再生能源、發電原理、電能儲存、太陽能利用、綠色能源。


In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# 两组关键字数据
keywords1 = "太陽能、光電效應、太陽能板、能量轉換、環保、可再生能源、發電原理、電能儲存、太陽能利用、綠色能源"
keywords2 = "宇宙、太空梭、衛星、黑洞、地心引力、蟲洞、暗物質、太陽"

# 合并两组关键字数据为文档列表
documents = [keywords1, keywords2]

# 创建TF-IDF向量化器
tfidf_vectorizer = TfidfVectorizer()

# 转换文档为TF-IDF向量
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

# 计算余弦相似度
similarities = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])

# 打印相似度分数
print("相似度分数：", similarities[0][0])


相似度分数： 0.0


In [62]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# 两组关键字数据
keywords1 = "太陽能、光電效應、太陽能板、能量轉換、環保"
keywords2 = "太陽能、太陽能板、發電、光能、轉換效率"

# 将关键字数据分割成单词列表
keywords1_list = keywords1.split('、')
keywords2_list = keywords2.split('、')

# 创建TF-IDF向量化器
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform([keywords1, keywords2])

# 计算余弦相似度矩阵
cosine_sim_matrix = cosine_similarity(tfidf_matrix)

# 提取keywords1和keywords2之间的相似度
similarity_matrix = cosine_sim_matrix[0, 1:]

# 创建一个相似度矩阵
similarity_matrix = np.reshape(similarity_matrix, (1, -1))

# 创建一个矩阵，其中包含keyword1中的每个词与keyword2中的每个词之间的相似度
similarity_matrix = np.outer(tfidf_matrix.toarray()[0], tfidf_matrix.toarray()[1])

# 打印相似度矩阵
print("相似度矩阵：")
print(similarity_matrix)


相似度矩阵：
[[0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.        ]
 [0.08464773 0.08464773 0.08464773 0.0846

In [75]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics.pairwise import cosine_similarity
# import numpy as np

# # 两组关键字数据
# keywords1 = "太陽能、光電效應、太陽能板、能量轉換、環保"
# keywords2 = "太陽能、太陽能板、發電、光能、轉換效率"

# # 将关键字数据分割成单词列表
# keywords1_list = keywords1.split('、')
# keywords2_list = keywords2.split('、')

# # 创建一个5x5的矩阵来存储相似度值
# similarity_matrix = np.zeros((5, 5))

# # 创建TF-IDF向量化器
# tfidf_vectorizer = TfidfVectorizer()

# # 计算每个词在keywords2与keywords1之间的相似度
# for i in range(5):
#     for j in range(5):
#         # 转换为TF-IDF向量
#         tfidf_matrix = tfidf_vectorizer.fit_transform([keywords1_list[j], keywords2_list[i]])
#         # 提取keywords1与keywords2之间的相似度
#         similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
#         # 存储相似度值到矩阵
#         similarity_matrix[i, j] = similarity

# # 打印相似度矩阵
# print("相似度矩阵：")
# print(similarity_matrix)


相似度矩阵：
[[1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]


In [8]:
!pip install -U sentence_transformers

Collecting sentence_transformers
  Using cached sentence_transformers-2.2.2-py3-none-any.whl
Collecting torchvision
  Using cached torchvision-0.16.0-cp310-cp310-win_amd64.whl (1.3 MB)
Installing collected packages: torchvision, sentence_transformers
Successfully installed sentence_transformers-2.2.2 torchvision-0.16.0




In [12]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util

# 多个句子（关键词）
keywords1 = "太陽能、光電效應、太陽能板、能量轉換、環保、可再生能源、發電原理、電能儲存、太陽能利用、綠色能源"
keywords2 = "太陽能發電、光能、地球暖化、再生能源、熱、能量、太陽能板、能量轉換、太陽能電池、光電效應"

# 将关键词分割成单词列表
keywords1_list = keywords1.split('、')
keywords2_list = keywords2.split('、')

# 加载预训练模型
model = SentenceTransformer('sentence-transformers/distiluse-base-multilingual-cased-v2')

# 对关键词进行编码以获取嵌入向量
embeddings1 = model.encode(keywords1_list, convert_to_tensor=True)
embeddings2 = model.encode(keywords2_list, convert_to_tensor=True)

# 计算关键词之间的余弦相似度
cosine_scores = util.pytorch_cos_sim(embeddings1, embeddings2)

# 创建DataFrame并添加相似度矩阵
similarity_df = pd.DataFrame(cosine_scores.cpu().numpy())

# 设置相似度阈值
threshold = [0.8, 0.5]

# 创建标志列
flagged_series = similarity_df.apply(lambda x: 1 if (x > threshold[0]).any() else (2 if (x > threshold[1]).any else 0))
# flagged_series = similarity_df.apply(lambda x: 2 if (x > threshold[1]).any() else 0)

# 将标志列转换为DataFrame
flagged_df = pd.DataFrame(flagged_series, columns=['相似詞'])

# 打印相似度矩阵
print("相似度矩陣：")
display(similarity_df)

# 打印标志列
print("標記：")
display(flagged_df)

相似度矩陣：


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.90234,0.642136,0.544626,0.575003,0.523333,0.693996,0.916888,0.550701,0.819602,0.509209
1,0.505099,0.742526,0.298119,0.452399,0.320377,0.520592,0.438604,0.466027,0.448781,1.0
2,0.861513,0.530733,0.487893,0.473159,0.415271,0.524306,1.0,0.451985,0.816749,0.438604
3,0.567656,0.431394,0.354742,0.755039,0.363675,0.782317,0.451985,1.0,0.450108,0.466027
4,0.382775,0.454946,0.686765,0.487783,0.446872,0.517162,0.45271,0.383645,0.340275,0.350143
5,0.606513,0.398919,0.429958,0.986808,0.326952,0.707921,0.471695,0.743631,0.501992,0.44793
6,0.552164,0.380781,0.3822,0.62974,0.280557,0.647264,0.459117,0.6283,0.505992,0.573276
7,0.486852,0.408772,0.274722,0.587539,0.266605,0.584874,0.402244,0.558849,0.495902,0.543653
8,0.872036,0.458861,0.444902,0.458165,0.321829,0.441404,0.842132,0.408893,0.784894,0.396389
9,0.540855,0.39792,0.453467,0.682765,0.365234,0.665331,0.479975,0.614175,0.475896,0.418602


標記：


Unnamed: 0,相似詞
0,1
1,2
2,2
3,1
4,2
5,2
6,1
7,1
8,1
9,1


In [None]:
# 计算标记为1和2的数量
count_0 = (flagged_df['相似詞'] == 0).sum()
count_1 = (flagged_df['相似詞'] == 1).sum()

offtopic = 0
notbroad = 0

# 检查如果1的数量少于一半，输出"不够深入"
if count_0 > len(flagged_series) / 2:
    offtopic = 1
elif count_1 < len(flagged_series) / 2:
    notbroad = 1

In [6]:
# from transformers import AutoTokenizer, AutoModel
# import torch
# from scipy.spatial.distance import cosine

# # 句子
# sentences = ['太陽能', '光']

# # 加载分词器和模型
# tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-mpnet-base-v2')
# model = AutoModel.from_pretrained('sentence-transformers/all-mpnet-base-v2')

# # Tokenize sentences
# encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

# # Compute token embeddings
# with torch.no_grad():
#     model_output = model(**encoded_input)

# # Perform pooling
# sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

# # Normalize embeddings
# sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

# print("Sentence embeddings:")
# print(sentence_embeddings)


相似度： 0.3808121979236603
