In [1]:
import pandas as pd
import numpy as np

In [2]:
# load the ability-index dict
import json

# Maps each ability name to an integer index; filled in by traverse_ability_tree.
ability_index_dict = {}
ability_json_file = "src/ability_mindmap.json"
# JSON text is UTF-8 by specification. Passing the encoding explicitly keeps the
# load from depending on the platform's default locale encoding.
with open(ability_json_file, 'r', encoding="utf-8") as f:
    ability_tree = json.load(f)

# traverse the ability tree and assign index to each ability
index = 0
def traverse_ability_tree(node, index_dict=None):
    """Assign consecutive integer indices to ability names in pre-order.

    Args:
        node: Tree node: a dict with a "name" key and, optionally, a
            "children" list holding nodes of the same form.
        index_dict: Mapping from ability name to index, filled in place.
            Defaults to the notebook-level ``ability_index_dict``.

    Returns:
        None. The result is written into ``index_dict``.
    """
    global index
    if index_dict is None:
        index_dict = ability_index_dict

    # The next index is the current size of the mapping, so indices always form
    # the contiguous range 0..len-1 and can address a len x len matrix. A name
    # that occurs more than once in the tree keeps the index of its first visit.
    name = node["name"]
    if name not in index_dict:
        index_dict[name] = len(index_dict)

    for child in node.get("children", []):
        traverse_ability_tree(child, index_dict)

    # `index` is only mirrored here for code that still reads it; the numbering
    # itself no longer depends on it, so reassigning `index` elsewhere is harmless.
    index = len(index_dict)

# Fill ability_index_dict by walking the loaded tree from its top-level node.
traverse_ability_tree(ability_tree)

In [3]:
# Load the tagged 600-instance CSV into `df` and report its row count.
instance_dataset_path = "src/instances_Inception_600_with_tag.csv"
df = pd.read_csv(filepath_or_buffer=instance_dataset_path)
instance_cnt = df.shape[0]
print(instance_cnt)

600


In [6]:
import os
import pickle

# init ability data
# ability_pair_ref[i][j] = freq(ability i) * freq(ability j), where freq is the
# number of tags on an ability and on the nodes below it in the tree, divided by
# the number of rows in df. The matrix is cached on disk and only computed when
# the cache file is missing; df is expected to be the 600-instance frame then.
ability_pair_ref_path = "statistics/ability_pair_ref_600.pickle"
if os.path.exists(ability_pair_ref_path):
    # NOTE(review): pickle.load can execute arbitrary code; only load a cache
    # file that this notebook wrote itself.
    with open(ability_pair_ref_path, 'rb') as f:
        ability_pair_ref = pickle.load(f)
else:
    # first calculate the tagged ability appearances in raw df
    ability_freq = {}
    # Rows without an annotation are skipped (they still count toward `total`).
    for abilities in df["annotation"].dropna():
        for ability in abilities.split(","):
            # Tolerate "a, b" style lists and stray/trailing commas.
            ability = ability.strip()
            if ability:
                ability_freq[ability] = ability_freq.get(ability, 0) + 1

    def add_freq_to_parent(node):
        """Add every child's accumulated count to its parent, bottom-up.

        Every visited node ends up with an entry in ability_freq, including
        untagged nodes and nodes whose "children" list is empty.
        """
        name = node["name"]
        ability_freq.setdefault(name, 0)
        for child in node.get("children", []):
            add_freq_to_parent(child)
            ability_freq[name] += ability_freq.get(child["name"], 0)

    add_freq_to_parent(ability_tree)
    # normalize the ability freq
    total = len(df)
    for ability in ability_freq:
        ability_freq[ability] /= total

    # then calculate the ability pair frequency
    # Lay the frequencies out by index and take the outer product instead of
    # filling the matrix cell by cell in Python.
    freq_by_index = np.zeros(len(ability_index_dict), dtype=np.float64)
    for ability, ability_idx in ability_index_dict.items():
        freq_by_index[ability_idx] = ability_freq[ability]
    ability_pair_ref = np.outer(freq_by_index, freq_by_index).astype(np.float32)

    # Make sure the cache directory exists before writing into it.
    os.makedirs(os.path.dirname(ability_pair_ref_path), exist_ok=True)
    with open(ability_pair_ref_path, 'wb') as f:
        pickle.dump(ability_pair_ref, f)

In [7]:
# Load the tagged 150-instance CSV into `df` and report its row count.
instance_dataset_path = "src/instances_Inception_150_with_tag.csv"
df = pd.read_csv(filepath_or_buffer=instance_dataset_path)
instance_cnt = df.shape[0]
print(instance_cnt)

150


In [8]:
import os
import pickle

# init ability data
# ability_pair_ref[i][j] = freq(ability i) * freq(ability j), where freq is the
# number of tags on an ability and on the nodes below it in the tree, divided by
# the number of rows in df. The matrix is cached on disk and only computed when
# the cache file is missing; df is expected to be the 150-instance frame then.
ability_pair_ref_path = "statistics/ability_pair_ref_150.pickle"
if os.path.exists(ability_pair_ref_path):
    # NOTE(review): pickle.load can execute arbitrary code; only load a cache
    # file that this notebook wrote itself.
    with open(ability_pair_ref_path, 'rb') as f:
        ability_pair_ref = pickle.load(f)
else:
    # first calculate the tagged ability appearances in raw df
    ability_freq = {}
    # Rows without an annotation are skipped (they still count toward `total`).
    for abilities in df["annotation"].dropna():
        for ability in abilities.split(","):
            # Tolerate "a, b" style lists and stray/trailing commas.
            ability = ability.strip()
            if ability:
                ability_freq[ability] = ability_freq.get(ability, 0) + 1

    def add_freq_to_parent(node):
        """Add every child's accumulated count to its parent, bottom-up.

        Every visited node ends up with an entry in ability_freq, including
        untagged nodes and nodes whose "children" list is empty.
        """
        name = node["name"]
        ability_freq.setdefault(name, 0)
        for child in node.get("children", []):
            add_freq_to_parent(child)
            ability_freq[name] += ability_freq.get(child["name"], 0)

    add_freq_to_parent(ability_tree)
    # normalize the ability freq
    total = len(df)
    for ability in ability_freq:
        ability_freq[ability] /= total

    # then calculate the ability pair frequency
    # Lay the frequencies out by index and take the outer product instead of
    # filling the matrix cell by cell in Python.
    freq_by_index = np.zeros(len(ability_index_dict), dtype=np.float64)
    for ability, ability_idx in ability_index_dict.items():
        freq_by_index[ability_idx] = ability_freq[ability]
    ability_pair_ref = np.outer(freq_by_index, freq_by_index).astype(np.float32)

    # Make sure the cache directory exists before writing into it.
    os.makedirs(os.path.dirname(ability_pair_ref_path), exist_ok=True)
    with open(ability_pair_ref_path, 'wb') as f:
        pickle.dump(ability_pair_ref, f)

In [38]:
# generate a per-instance probability table for the data

import os
import pickle

import pandas as pd
from sklearn.cluster import KMeans

instances_with_embedding_path = "src/instances_Inception_600_with_similarity.csv"

df_embedding = pd.read_csv(instances_with_embedding_path)

# The embedding_pca column stores each vector as text like "[1.5 2.8 ... 3.97]":
# drop the surrounding brackets and split on whitespace. Unlike np.fromstring,
# this raises on a malformed value instead of silently truncating the row.
pca_array = np.array([np.array(embedding[1:-1].split(), dtype=float) for embedding in df_embedding.embedding_pca])
if pca_array.ndim != 2:
    raise ValueError("embedding_pca must parse to a non-empty 2-D array of equal-length rows")

print(pca_array.shape)

# fit k-means with 4 clusters; random_state fixes the centroid initialization
kmeans = KMeans(n_clusters=4, random_state=0).fit(pca_array)

# distance from each embedding to the closest of the 4 cluster centers,
# computed for all (embedding, center) pairs at once via broadcasting
center_offsets = pca_array[:, np.newaxis, :] - kmeans.cluster_centers_[np.newaxis, :, :]
distance = np.linalg.norm(center_offsets, axis=2).min(axis=1)

# calculate the probability of each example using p(i) = exp(-d(i)^2 / (2 * sigma^2))
sigma = 0.5
probability = np.exp(-distance**2/(2*sigma**2))

# normalize the probability so it sums to 1
probability = probability / probability.sum()

# output the ten highest-probability instances (in ascending order of probability)
top_ten = np.argsort(probability)[-10:]
print(df_embedding.iloc[top_ten].expression, probability[top_ten])


# output the probability into pickle
os.makedirs("statistics", exist_ok=True)
with open("statistics/instance_prob_600.pkl", "wb") as f:
    pickle.dump(probability, f)




(600, 32)
333    你是股份回购领域的专家，请你用通俗、简洁的语言向一位初学者介绍事件验证法的概念与要点
132                                     请在15分钟后叫我
155                           请帮我做一个个人简历，要能体现我的牛逼
23                               帮我在室友群问一下他们晚上的安排
211                              请帮我整理与大语言模型相关的文献
186                             帮我根据已有信息整理出一篇实验报告
183                                 让我一个小时之内不要碰手机
78                         这是一组数据，请问你观察到了什么有价值的信息
128                        【在材料页面】阅读我写好的辩诉材料，帮我修改
345                  【在线上参会】我去上厕所，帮我记一下，有人叫我就震动手环
Name: expression, dtype: object [0.00181436 0.00181437 0.0018146  0.00181936 0.00182418 0.00182695
 0.00182806 0.00183602 0.00185339 0.00186943]


In [39]:
# generate a per-instance probability table for the data

import os
import pickle

import pandas as pd
from sklearn.cluster import KMeans

instances_with_embedding_path = "src/instances_Inception_150_with_similarity.csv"

df_embedding = pd.read_csv(instances_with_embedding_path)

# The embedding_pca column stores each vector as text like "[1.5 2.8 ... 3.97]":
# drop the surrounding brackets and split on whitespace. Unlike np.fromstring,
# this raises on a malformed value instead of silently truncating the row.
pca_array = np.array([np.array(embedding[1:-1].split(), dtype=float) for embedding in df_embedding.embedding_pca])
if pca_array.ndim != 2:
    raise ValueError("embedding_pca must parse to a non-empty 2-D array of equal-length rows")

print(pca_array.shape)

# fit k-means with 4 clusters; random_state fixes the centroid initialization
kmeans = KMeans(n_clusters=4, random_state=0).fit(pca_array)

# distance from each embedding to the closest of the 4 cluster centers,
# computed for all (embedding, center) pairs at once via broadcasting
center_offsets = pca_array[:, np.newaxis, :] - kmeans.cluster_centers_[np.newaxis, :, :]
distance = np.linalg.norm(center_offsets, axis=2).min(axis=1)

# calculate the probability of each example using p(i) = exp(-d(i)^2 / (2 * sigma^2))
sigma = 0.5
probability = np.exp(-distance**2/(2*sigma**2))

# normalize the probability so it sums to 1
probability = probability / probability.sum()

# output the ten highest-probability instances (in ascending order of probability)
top_ten = np.argsort(probability)[-10:]
print(df_embedding.iloc[top_ten].expression, probability[top_ten])


# output the probability into pickle
os.makedirs("statistics", exist_ok=True)
with open("statistics/instance_prob_150.pkl", "wb") as f:
    pickle.dump(probability, f)


(150, 32)
70                    请帮我记录一个明天早上7点的提醒事项
73                   帮我写篇拓扑几何前沿研究主题的文献综述
119    请在接下来2小时内帮我自动回复导师的信息，帮我掩饰我在外面玩的事情
60                           规划一下明天的行程安排
110                   请帮我写一篇能发表在cvpr上的论文
109           如果我需要做一份关于社会实践的报告，请给出报告的大纲
133                     提醒我明天七点到清华大学（导航）
126                        提前10分钟提醒我参加组会
118                          根据我的关键词查找文献
1                        请帮我找出与这篇文章相似的例子
Name: expression, dtype: object [0.00724914 0.00724965 0.00725133 0.00731024 0.00735349 0.00741306
 0.00742145 0.00745267 0.00747263 0.00753554]
