In [1]:
# imports
import pandas as pd
import os
import tiktoken
import openai
from dotenv import load_dotenv

from openai.embeddings_utils import get_embedding

In [2]:
# Pull settings from a local .env file into the process environment, then
# hand the OpenAI key found there (None if it is absent) to the client library.
load_dotenv()
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [15]:
# Read the question/answer pairs from the project-relative CSV file.
qa_df = pd.read_csv(filepath_or_buffer="data/crland.csv")

In [16]:
# Display the loaded frame; the notebook renders the cell's last expression.
qa_df

Unnamed: 0,prompt,completion
0,华润城整体规划单位及其知名作品,RTKL国际有限公司（RTKL International）。是世界上最大的建筑规划设计公司...
1,公建区规划及设计单位,公建区整体规划及展示中心设计单位：福斯特 (Foster)。参与的设计公司：RTKL、Fos...
2,建筑施工单位与建筑设计单位,建筑施工单位：中建三局。建筑设计单位：CCDI。\n创立于1994年，是在世界城市建设和开发...
3,万象天地园林设计单位及其曾设计过的知名作品,James Corner Field Operations（JCFO）事务所\n简介：该事务...
4,精装修设计单位,西塔：美国WILSON公司、东塔Kokai。简介：美国WILSON室内设计公司成立于1971...
...,...,...
91,停车收费标准？临时停放的车辆如何收费？,待定
92,物业管理公司的有偿和无偿服务项目？每日服务时间？,待定
93,物业服务内容是什么？,待定
94,入伙时须交哪些费用及金额计算？,物业管理收费应按月收取，经双方协商约定可以预收，但预收期限不得超过_____个月。\n本体维...


In [6]:
# Settings shared by the embedding steps in this notebook
max_tokens = 8000  # kept below the 8191-token input limit of text-embedding-ada-002
embedding_encoding = "cl100k_base"  # this is the tokenizer encoding for text-embedding-ada-002
embedding_model = "text-embedding-ada-002"

In [17]:
# Discard every row that has a missing value in any column.
qa_df = qa_df.dropna(axis=0, how="any")

In [18]:
# Check which columns remain after dropping incomplete rows.
qa_df.columns

Index(['prompt', 'completion'], dtype='object')

In [19]:
# Tokenizer matching the embedding model, used only to measure prompt length.
encoding = tiktoken.get_encoding(embedding_encoding)

# Count tokens per prompt. `.assign` builds a new frame instead of writing a
# column into the dropna() result, which avoids pandas' SettingWithCopyWarning.
qa_df = qa_df.assign(n_tokens=qa_df.prompt.apply(lambda text: len(encoding.encode(text))))

# Omit prompts that are too long to embed, then keep only the last 20 rows.
# NOTE(review): the 20-row cap is presumably there to limit API calls - confirm.
# `.copy()` makes the subset an independent frame so that later cells can add
# columns to it without chained-assignment warnings.
qa_df = qa_df[qa_df.n_tokens <= max_tokens].tail(20).copy()
len(qa_df)

20

In [20]:
# This may take a few minutes: one embedding request is made per prompt.
# `.assign` returns a new frame with the extra column; setting the column
# directly on the filtered/tail slice would trigger SettingWithCopyWarning.
qa_df = qa_df.assign(
    embedding=qa_df.prompt.apply(lambda text: get_embedding(text, engine=embedding_model))
)


NameError: name 'df' is not defined

In [21]:
# Persist the frame (including the embedding column) so the embeddings can be
# reused without calling the API again. UTF-8 is stated explicitly because the
# prompts and completions are Chinese text.
qa_df.to_csv("data/crland_embeddings.csv", encoding="utf-8")

In [22]:
# Preview the first rows, now carrying the n_tokens and embedding columns.
qa_df.head()

Unnamed: 0,prompt,completion,n_tokens,embedding
76,地下大堂标准,地库入库大堂天棚材料不锈钢，墙面材料石材，地面材料石材。,8,"[-0.0018823315622285008, 0.002783428877592087,..."
77,居民文体活动中心及物业用房标准（待定）,物业服务用房材质待定,22,"[0.006069754250347614, -0.009398995898663998, ..."
78,公摊部分有哪些,包括但不限于大堂、设备用房、电梯厅、公共走道、公共管井、楼梯间、前室、电梯井道等。,9,"[0.005246931686997414, -0.010830115526914597, ..."
79,是否有凸窗,无凸窗,6,"[-0.018448153510689735, -0.012338797561824322,..."
80,有无设置残疾人通道？哪些位置？,设计满足无障碍，设置无障碍电梯,17,"[0.0023053945042192936, -0.006690924055874348,..."


In [23]:
# Embed the user's question with the same model constant used for the stored
# prompts, so query and document vectors always come from one model.
response = openai.Embedding.create(
    input="停车场有几个出入口？",
    model=embedding_model,
)
# A single input string was sent, so the vector is taken from the first entry.
queryembedding = response['data'][0]['embedding']

In [24]:
# Inspect the embedding vector obtained for the query.
# NOTE(review): this prints the entire list; a slice such as queryembedding[:5]
# would keep the cell output short.
queryembedding

[0.012612882070243359,
 -0.0042774975299835205,
 0.015641547739505768,
 -0.008790994994342327,
 -0.04389597848057747,
 0.012337548658251762,
 -0.017293546348810196,
 -0.016493769362568855,
 -0.016572436317801476,
 -4.5914472138974816e-05,
 0.013052104040980339,
 0.013366770930588245,
 -0.03618664667010307,
 0.02055821195244789,
 -0.0023976932279765606,
 -0.003556387033313513,
 0.003589164698496461,
 -0.002125637838616967,
 0.017503324896097183,
 -0.006568663287907839,
 -0.00889588426798582,
 0.01574643701314926,
 -0.03327598422765732,
 -0.01330777071416378,
 -0.012291660532355309,
 0.04536442086100578,
 0.0221315436065197,
 0.0028811651282012463,
 0.002828720724210143,
 -0.032567981630563736,
 0.013674881309270859,
 -0.0025369985960423946,
 -0.017083769664168358,
 -0.013274992816150188,
 0.0013234020443633199,
 -0.0016470825066789985,
 -0.004896997474133968,
 0.008882773108780384,
 0.019784655421972275,
 -0.0031089705880731344,
 0.014094437472522259,
 0.013176660053431988,
 0.005070719

In [25]:
from openai.embeddings_utils import cosine_similarity

In [27]:
# Score every stored prompt against the query by cosine similarity.
# `.assign` returns a new frame rather than writing into the sliced qa_df,
# which avoids pandas' SettingWithCopyWarning.
qa_df = qa_df.assign(
    similarity=qa_df.embedding.apply(lambda vec: cosine_similarity(vec, queryembedding))
)

In [30]:
# Rank the rows from most to least similar to the query.
qa_df_similarity = qa_df.sort_values("similarity", ascending=False)

In [32]:
# Display the ranked matches; the best match for the query is the first row.
qa_df_similarity

Unnamed: 0,prompt,completion,n_tokens,embedding,similarity
91,停车收费标准？临时停放的车辆如何收费？,待定,22,"[0.015386095270514488, -0.005792957730591297, ...",0.873181
83,垃圾中转站的位置？垃圾如何处理？,未设置,20,"[0.004107650835067034, -0.019975794479250908, ...",0.81818
82,设备用房的位置？有什么设备？,网络机房设置于LG层，在内布置有网络机柜、配线架、网络交换机等；\n消防及安防控制室设置在一...,15,"[0.008880389854311943, -0.01147504337131977, -...",0.809877
80,有无设置残疾人通道？哪些位置？,设计满足无障碍，设置无障碍电梯,17,"[0.0023053945042192936, -0.006690924055874348,...",0.808328
87,景观阳台与生活阳台是否有地漏？位置在哪？,阳台有设置地漏，无生活阳台,21,"[0.019203854724764824, -0.021063167601823807, ...",0.805864
86,是否有洗衣机预留位？位置在哪？,室内预留,17,"[0.009565784595906734, -0.030221756547689438, ...",0.803758
94,入伙时须交哪些费用及金额计算？,物业管理收费应按月收取，经双方协商约定可以预收，但预收期限不得超过_____个月。\n本体维...,17,"[0.03006836213171482, -0.021734196692705154, 0...",0.797397
85,厨房是否设置专用烟道？如何排油烟？往楼顶排还是往地下排？是否防倒灌？,有烟道，屋顶排放,45,"[0.012193873524665833, 0.006573679391294718, -...",0.796922
78,公摊部分有哪些,包括但不限于大堂、设备用房、电梯厅、公共走道、公共管井、楼梯间、前室、电梯井道等。,9,"[0.005246931686997414, -0.010830115526914597, ...",0.792907
90,物业管理费是多少？是否要预付管理费？由谁拟定批准？,物业管理服务方案正在进行方案评审，物业服务费价格待公司审批。\n物业服务费按规定预收____...,26,"[0.029821686446666718, -0.013722201809287071, ...",0.791936
