# Code Gen
- java 날것의 데이터 이용

## DataFrame 만들기

In [None]:
# Remove any already-installed pyarrow, then install the Hugging Face `datasets` package.
# NOTE(review): presumably done so `datasets` pulls in a compatible pyarrow build — confirm.
!pip uninstall -y pyarrow
!pip install datasets

In [None]:
# Install the Hugging Face Hub client, transformers, sentence-transformers and LangChain packages.
!pip install huggingface_hub transformers sentence-transformers langchain_community langchain

In [None]:
# Vector DB client (Qdrant)
!pip install qdrant-client

In [None]:
from huggingface_hub import notebook_login

# Prompt for a Hugging Face token inside the notebook.
# NOTE(review): presumably needed for the `push_to_hub` call at the end — confirm.
notebook_login()

In [None]:
# Mount Google Drive (skip this cell when it is not needed)
from google.colab import drive
drive.mount('/content/drive')

In [4]:
# 소스를 읽고 txt로 변환하는 함수
def convert_files_to_txt(src_dir, dst_dir):
    """Mirror ``src_dir`` into ``dst_dir``, saving each file as a UTF-8 ``.txt``.

    Every file under ``src_dir`` except those ending in ``.jpg`` is read as
    text and written to the same relative location under ``dst_dir`` with
    ``.txt`` appended to its name (``hi.java`` -> ``hi.java.txt``).

    Args:
        src_dir: Root directory of the source files to convert.
        dst_dir: Root directory for the converted files; created if missing.
    """
    os.makedirs(dst_dir, exist_ok=True)
    for root, _dirs, files in os.walk(src_dir):
        for file in files:
            # Skip images entirely. Previously only the path assignment was
            # guarded, so a .jpg reused the previous file's path (or crashed
            # with UnboundLocalError when it was the first file visited).
            if file.endswith('.jpg'):
                continue
            file_path = os.path.join(root, file)
            # Relative path preserves the directory structure in dst_dir.
            rel_path = os.path.relpath(file_path, src_dir)
            new_root = os.path.join(dst_dir, os.path.dirname(rel_path))
            os.makedirs(new_root, exist_ok=True)
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    data = f.read()
            except UnicodeDecodeError:
                # Fall back to latin-1 so non-UTF-8 sources are still
                # converted instead of being dropped. latin-1 maps every
                # byte to a character, so this read cannot raise
                # UnicodeDecodeError and needs no further fallback.
                with open(file_path, 'r', encoding='latin-1') as f:
                    data = f.read()
            # Create a new file path with .txt extension
            new_file_path = os.path.join(new_root, file + '.txt')
            with open(new_file_path, 'w', encoding='utf-8') as f:
                f.write(data)
            print(f"Converted {file_path} to {new_file_path}")
#
# `os` is imported after the function definition; that works because the
# function only looks the name up when it is called below.
import os
# Call the function with the source and destination directory paths.
# Alternative when the samples live on a mounted Google Drive:
# convert_files_to_txt('/content/drive/MyDrive/data_samples', '/content/converted_codebase')
convert_files_to_txt('./data_samples', './converted_codebase')
#convert_files_to_txt('/content/sample', '/content/des_folder')


Converted ./data_samples/hi.java to ./converted_codebase/hi.java.txt


In [None]:
from langchain_community.document_loaders import DirectoryLoader,TextLoader
from langchain_community.vectorstores import Qdrant
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

In [None]:
# Load the UTF-8 ".txt" copies written to ./converted_codebase rather than the
# raw sources. The previous value "./data_sample" matched neither the raw
# directory ("./data_samples") nor the converted output, so nothing was loaded.
src_dir = "./converted_codebase"
loader = DirectoryLoader(src_dir, show_progress=True, loader_cls=TextLoader)
repo_files = loader.load()
print(f"Number of files loaded: {len(repo_files)}")

# Split every loaded file into chunks (chunk_size=1500, chunk_overlap=150)
# so each piece can be embedded separately.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=150)
documents = text_splitter.split_documents(documents=repo_files)
print(f"Number of documents : {len(documents)}")


In [None]:
# Point each chunk's "source" back at the original file name by dropping the
# ".txt" suffix added during conversion. Only a trailing ".txt" is removed;
# str.replace would also delete ".txt" appearing elsewhere in the path
# (e.g. "notes.txt.bak.txt" or a directory named "a.txt").
txt_suffix = ".txt"
for doc in documents:
    source_path = doc.metadata["source"]
    if source_path.endswith(txt_suffix):
        doc.metadata["source"] = source_path[: -len(txt_suffix)]

# Embedding model configuration: BGE small English model, run on CPU.
model_name = "BAAI/bge-small-en-v1.5"
model_kwargs = {"device": "cpu"}
# Passed through as encode-time options; asks for normalized embedding vectors.
encode_kwargs = {"normalize_embeddings":True}
embeddings = HuggingFaceBgeEmbeddings(model_name=model_name,
                                      model_kwargs=model_kwargs,
                                      encode_kwargs=encode_kwargs,
                                      )


# Embed all chunks and index them in the "my_documents" collection.
# NOTE(review): `path` presumably selects qdrant-client's local on-disk mode;
# "/content/..." is a Colab-style location — confirm when running elsewhere.
qdrant = Qdrant.from_documents(
    documents,
    embeddings,
    path="/content/local_qdrant",
    collection_name="my_documents",
)

In [None]:
def pretty_print_docs(documents):
    """Print each document's metadata, a dashed divider, then its page content."""
    divider = " - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - "
    for entry in documents:
        # One print call per document; the newline separator yields the same
        # three output lines as three separate prints.
        print(entry.metadata, divider, entry.page_content, sep="\n")

# Alternative example query:
#query = "what is the syntax to import text_splitter using langchain"
query = "print of BinarySearch"
# Retrieve the stored chunks most similar to the query text.
found_docs = qdrant.similarity_search(query)
pretty_print_docs(found_docs)

# Also print each result object as-is (its default string form) for inspection.
for doc in found_docs:
    print(doc)

## DataFrame 생성
- 결과물은 Hugging Face Hub의 `wonik-hi/code_data2` 데이터셋으로 업로드된다.
- 데이터프레임을 학습용으로 쓰려면 instruction, description 컬럼을 포함하는 것이 좋다. (현재는 filename, code만 존재)

In [None]:
import os 
import pandas as pd

# Source tree to read, and the directory that receives the ".txt" copies.
path = "/content/drive/MyDrive/data_samples"
# Typo fix: the previous value spelled the mount point "drvie/MyDrvie", which
# silently created a new tree outside Google Drive instead of writing to it.
dst_dir = "/content/drive/MyDrive/data_samples_res"

# One {"filename": ..., "code": ...} record per converted file; these become
# the rows of the DataFrame built in the following cells.
arr_code = []

os.makedirs(dst_dir, exist_ok=True)

for root, dirs, files in os.walk(path):
    for file in files:
        # Skip images entirely. Previously only the path assignment was
        # guarded, so a .jpg re-processed the previous file's path (or raised
        # NameError when it was the first file visited).
        if file.endswith('.jpg'):
            continue
        file_path = os.path.join(root, file)

        # Relative path doubles as the record's file name and keeps the
        # directory layout under dst_dir.
        rel_path = os.path.relpath(file_path, path)
        new_root = os.path.join(dst_dir, os.path.dirname(rel_path))
        # Create the mirrored sub-directory; without this, writing files that
        # live in nested folders fails with FileNotFoundError.
        os.makedirs(new_root, exist_ok=True)
        new_file_path = os.path.join(new_root, file + '.txt')

        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = f.read()
        except UnicodeDecodeError:
            # latin-1 maps every byte to a character, so this fallback cannot
            # raise UnicodeDecodeError and needs no further handling.
            with open(file_path, 'r', encoding='latin-1') as f:
                data = f.read()

        with open(new_file_path, 'w', encoding='utf-8') as f:
            f.write(data)

        # A fresh dict per file, so records never share state.
        arr_code.append({"filename": rel_path, "code": data})
        print(rel_path)

arr_code

In [None]:
# Number of file records collected above.
len(arr_code)

In [None]:
# Build a DataFrame from the list of record dicts ("filename" and "code" keys).
df = pd.DataFrame(arr_code)

In [None]:
from datasets import Dataset

In [None]:
# Convert the pandas DataFrame into a Hugging Face `Dataset`.
vds = Dataset.from_pandas(df)

In [None]:
# Display the dataset object for a quick visual check.
vds

In [None]:
# Upload the dataset to the Hugging Face Hub repository "wonik-hi/code_data2".
# NOTE(review): presumably relies on the earlier `notebook_login()` for credentials — confirm.
vds.push_to_hub('wonik-hi/code_data2')