In [None]:
!pip install openai
!pip install spacy
!pip install tensorflow
!pip install --upgrade tensorflow_hub
!pip install scipy
!pip install tqdm

In [1]:
# 라이브러리 import
import json
import os
import openai
import random
import spacy
import tensorflow_hub as hub


from scipy import spatial
from scipy.spatial import distance
from tqdm import tqdm


# OpenAI API: https://openai.com/api/
# Create a personal API key from your OpenAI account ("View API keys") and
# expose it through an environment variable before starting the kernel, e.g.:
#   export OPENAI_API_KEY="<your api key>"
# The key is deliberately not stored in the notebook: anything written here is
# readable by everyone who can see the file. A key that has been committed to a
# notebook must be treated as compromised and revoked.
openai.api_key = os.environ.get("OPENAI_API_KEY")
if not openai.api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it before starting the kernel."
    )

In [4]:
# Try it out:
# put the abstract of any paper into `abstract` and run the cell to get a title.

abstract = "Many researchers have sought ways of model compression to reduce the size of a deep neural network (DNN) with minimal performance degradation in order to use DNNs in embedded systems. Among the model compression methods, a method called knowledge transfer is to train a student network with a stronger teacher network. In this paper, we propose a novel knowledge transfer method which uses convolutional operations to paraphrase teacher’s knowledge and to translate it for the student. This is done by two convolutional modules, which are called a paraphraser and a translator. The paraphraser is trained in an unsupervised manner to extract the teacher factors which are defined as paraphrased information of the teacher network. The translator located at the student network extracts the student factors and helps to translate the teacher factors by mimicking them. We observed that our student network trained with the proposed factor transfer method outperforms the ones trained with conventional knowledge transfer methods."

# Ask the fine-tuned model for a title, using the same prompt prefix that the
# fine-tuning data is built with.
prompt_text = f"Generate title with this : {abstract}"
response = openai.Completion.create(
    model="davinci:ft-personal-2022-12-11-02-49-21",
    prompt=prompt_text,
    temperature=0.1,
    max_tokens=1024,
)

# Keep only the text that precedes the first "END" marker in the completion.
res = response["choices"][0]["text"].partition("END")[0]

print(res)


 The Paraphraser and the Translator: Two Convolutional Modules for Knowledge Transfer 


In [None]:
# Try it out on the dataset:
# collect the title and abstract of every entry whose categories contain
# "cs.AI", so an abstract can later be picked by index and its generated title
# compared with the real one.

t = []  # titles, parallel to `a`
a = []  # abstracts, parallel to `t`

# Build the dataset used for the demo below.
# Each line of the snapshot is parsed as its own JSON document.
# The encoding is pinned to UTF-8 because `open()` otherwise falls back to the
# platform's locale encoding, which makes the read machine-dependent.
with open("arxiv_data/arxiv-metadata-oai-snapshot.json", 'r', encoding="utf-8") as f:
    for entry in tqdm(f):
        data = json.loads(entry)
        if "cs.AI" in data["categories"]:
            t.append(data["title"])
            a.append(data["abstract"])

2174796it [00:52, 41697.45it/s]


In [None]:
# Choose which dataset entry to try.
idx = 2345

# Prompt the fine-tuned model with the abstract stored at that position.
prompt_text = f"Generate title with this : {a[idx]}"
response = openai.Completion.create(
    model="davinci:ft-personal-2022-12-11-02-49-21",
    prompt=prompt_text,
    temperature=0,
    max_tokens=1024,
)

# Keep only the text that precedes the first "END" marker in the completion.
res = response["choices"][0]["text"].partition("END")[0]

# Show the real title next to the generated one.
print(f"Original: {t[idx]}, \nGenerated:{res}")

Original: Large-Scale Automatic Labeling of Video Events with Verbs Based on
  Event-Participant Interaction, 
Generated:  Event Recognition with Spatiotemporal Verbs 


# Fine Tuning

In [None]:
fine_tune = []  # prompt/completion records, one per cs.AI entry
test = []       # abstracts of the cs.AI entries, in file order
gt = []         # ground-truth titles, parallel to `test`

# Build the fine-tuning records and the test data in a single pass over the
# snapshot: both are derived from the same cs.AI entries, so reading the file a
# second time is unnecessary.
# The encoding is pinned to UTF-8 because `open()` otherwise falls back to the
# platform's locale encoding, which makes the read machine-dependent.
with open("arxiv_data/arxiv-metadata-oai-snapshot.json", 'r', encoding="utf-8") as f:
    for entry in f:
        data = json.loads(entry)
        if "cs.AI" not in data["categories"]:  # keep only cs.AI entries
            continue

        # Local names chosen so that neither the builtin `abs` nor the
        # notebook-level `abstract` variable is overwritten.
        abstract_text = data["abstract"]
        title_text = data["title"]

        # The prompt carries the abstract and the completion carries the title.
        fine_tune.append({
            "prompt": f"Generate title with this : {abstract_text}",
            "completion": title_text,
        })

        test.append(abstract_text)
        gt.append(title_text)

# Fine-tuning on the whole dataset takes too long, so only the first 500
# records are used for training.
fine_tune_train = fine_tune[:500]

# Fine-tuning expects JSON Lines (.jsonl), i.e. one JSON record per line,
# rather than a single JSON document.
with open("arxiv_data/finetune_traindata.jsonl", 'w', encoding="utf-8") as f:
    for record in fine_tune_train:
        f.write(json.dumps(record) + "\n")

# Test

In [None]:
# Draw the evaluation sample: distinct positions into the held-out data.
# The evaluation cells index `test[500:]` / `gt[500:]` (the first 500 entries
# are the fine-tuning data), so the population is sized from that slice instead
# of a hard-coded count, which can drift from the dataset and produce
# out-of-range indices.
# A dedicated, seeded generator makes the sample repeatable across kernel
# restarts without altering the state of the global `random` generator.
N_FINETUNE = 500   # entries reserved for fine-tuning, skipped by the evaluation
N_SAMPLES = 100    # number of abstracts to evaluate
SAMPLE_SEED = 42

sample = random.Random(SAMPLE_SEED).sample(range(len(test) - N_FINETUNE), N_SAMPLES)

### 기존 Model

In [None]:
# Average performance of the stock model, measured on data that was not used
# for fine-tuning.
test_set = test[500:]
gt_set = gt[500:]
# NOTE: despite its name, this accumulates cosine *similarity*
# (1 - cosine distance) between the real and the generated title.
default_distance = 0

embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

for i, sample_idx in enumerate(tqdm(sample)):
    # Dedicated names are used here so the notebook-level `t` (list of titles)
    # and `idx` (index chosen in the demo cell) are not overwritten, which
    # would break re-running those earlier cells.
    abstract_text = test_set[sample_idx]

    # Title generated by the stock model.
    response = openai.Completion.create(
        model="text-davinci-003",
        prompt=f"Generate title with this : \n\n{abstract_text}",
        temperature=0,
        max_tokens=1024)

    res = response["choices"][0]["text"]

    # Embed the ground-truth title and the generated title together.
    titles = [gt_set[sample_idx], res]
    embeddings = embed(titles)

    # Similarity of the two Universal Sentence Encoder embeddings.
    default_distance += (1 - distance.cosine(embeddings[0], embeddings[1]))
    # Running mean over the samples processed so far.
    print(f"Current Distance = {default_distance / (i+1)}")

# Divide the accumulated similarity by the sample count to get the mean.
default_distance /= len(sample)

print(default_distance)

In [11]:
# Report the mean similarity computed by the stock-model evaluation cell.
default_model_report = f"Default Model Similarity using Universal Sentence Encoding: {default_distance}"
print(default_model_report)

Default Model Similarity using Universal Sentene Encoding: 0.5018037787824869


### Fine Tuning 한 모델

In [None]:
# Average performance of the fine-tuned model, measured on data that was not
# used for fine-tuning.
test_set = test[500:]
gt_set = gt[500:]
# NOTE: despite its name, this accumulates cosine *similarity*
# (1 - cosine distance) between the real and the generated title. Running this
# cell replaces the value left by the stock-model evaluation.
default_distance = 0

embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

for i, sample_idx in enumerate(tqdm(sample)):
    # Dedicated names are used here so the notebook-level `t` (list of titles)
    # and `idx` (index chosen in the demo cell) are not overwritten, which
    # would break re-running those earlier cells.
    abstract_text = test_set[sample_idx]

    # Title generated by the fine-tuned model.
    response = openai.Completion.create(
        model="davinci:ft-personal-2022-12-11-02-49-21",
        prompt=f"Generate title with this : {abstract_text}",
        temperature=0,
        max_tokens=1024)

    # Keep only the text that precedes the first "END" marker.
    res = response["choices"][0]["text"].split("END")[0]

    # Embed the ground-truth title and the generated title together.
    titles = [gt_set[sample_idx], res]
    embeddings = embed(titles)

    # Similarity of the two Universal Sentence Encoder embeddings.
    default_distance += (1 - distance.cosine(embeddings[0], embeddings[1]))
    # Running mean over the samples processed so far.
    print(f"Current Distance = {default_distance / (i+1)}")

# Divide the accumulated similarity by the sample count to get the mean.
default_distance /= len(sample)

print(default_distance)

In [20]:
# Report the mean similarity computed by the fine-tuned-model evaluation cell.
fine_tuned_model_report = f"Fine Tuned Model Similarity using Universal Sentence Encoding: {default_distance}"
print(fine_tuned_model_report)

Fine Tuned Model Similarity using Universal Sentene Encoding: 0.527911923378706
