In [None]:
!pip install datasets
from datasets import Dataset # Import Dataset from the correct library
from IPython import get_ipython
from IPython.display import display
# %%
!pip install huggingface_hub beautifulsoup4 PyPDF2 markdown

import os
import shutil # Import shutil for deleting the directory if necessary
from pathlib import Path
from bs4 import BeautifulSoup
import PyPDF2
import markdown
from huggingface_hub import HfApi, DatasetCard # Keep other imports from huggingface_hub

# --- Configuration and workspace setup -------------------------------------
# Source repository that is cloned below with `git clone`.
GITHUB_REPO_URL = "https://github.com/zinderud/HuginRisale.git"
# Directory that is walked for input files. It is resolved relative to the
# current working directory, not relative to REPO_DIR.
# NOTE(review): presumably the files live inside the clone; confirm whether
# this should be "HuginRisale/content".
DATA_DIR = "content"  # Or update to "HuginRisale/content" if necessary
REPO_DIR = "HuginRisale"  # Define a variable for the repo directory


# Hugging Face settings
HF_USERNAME = "zinderud"
HF_DATASET_NAME = "risale"
# NOTE(review): the token is hard-coded here and this assignment overwrites
# any HF_TOKEN that is already set in the environment. 'nn' looks like a
# placeholder -- replace it with a real token before running, and avoid
# committing the real value.
os.environ['HF_TOKEN'] = 'nn' 

# Delete a previous checkout (if any) before cloning again.
# NOTE(review): assumes `git clone` creates a directory named REPO_DIR.
if os.path.exists(REPO_DIR):
    shutil.rmtree(REPO_DIR) 


# IPython shell magic: {GITHUB_REPO_URL} is interpolated from the Python
# variable above. This line only works inside IPython/Jupyter.
!git clone {GITHUB_REPO_URL}

# Accumulator filled by the directory scan: one dict per successfully
# processed file with the keys 'file_path', 'content' and 'file_type'.
processed_data = []

# 2. File-processing function
def process_file(file_path):
    """Return the extracted content of one file, or None if it cannot be read.

    The handler is chosen from the file extension, matched
    case-insensitively:

    * ``.html`` - parsed with BeautifulSoup; the markup-free text is returned.
    * ``.pdf``  - the text of every page (PyPDF2), joined with newlines.
    * ``.md``   - the file content passed through ``markdown.markdown``.
    * ``.txt``  - the file content unchanged.

    HTML, Markdown and text files are decoded as UTF-8.

    Args:
        file_path: Path of the file, as a ``str`` or any ``os.PathLike``
            object (for example ``pathlib.Path``).

    Returns:
        The extracted string, or ``None`` when the extension is not
        supported or when reading/parsing fails. Failures are printed,
        never raised, so one bad file does not abort a whole scan.
    """
    try:
        # os.fspath() accepts both str and pathlib.Path; lower-casing makes
        # "NOTES.TXT" or "Book.PDF" select the same handler as lower-case
        # names. Kept inside the try so an invalid argument is reported like
        # any other failure instead of raising.
        lowered = os.fspath(file_path).lower()

        if lowered.endswith('.html'):
            with open(file_path, 'r', encoding='utf-8') as f:
                return BeautifulSoup(f, 'html.parser').get_text()

        if lowered.endswith('.pdf'):
            with open(file_path, 'rb') as f:
                reader = PyPDF2.PdfReader(f)
                # A page without extractable text must not break the join
                # (and thereby discard the whole document).
                return '\n'.join(
                    page.extract_text() or '' for page in reader.pages
                )

        if lowered.endswith('.md'):
            with open(file_path, 'r', encoding='utf-8') as f:
                # NOTE(review): markdown.markdown() returns rendered HTML
                # markup, not plain text, unlike the .html branch above --
                # confirm that this is what the dataset should contain.
                return markdown.markdown(f.read())

        if lowered.endswith('.txt'):
            with open(file_path, 'r', encoding='utf-8') as f:
                return f.read()

        # Unsupported extension: nothing to extract.
        return None

    except Exception as e:
        # Deliberate best-effort: report the problem and let the caller skip
        # this file.
        print(f"Hata: {file_path} - {str(e)}")
        return None

# Scan DATA_DIR recursively and collect the content of every supported file
# into processed_data.
# os.walk() silently yields nothing for a missing directory, so report that
# case explicitly instead of ending with an unexplained empty result.
if not os.path.isdir(DATA_DIR):
    print(f"Uyarı: veri dizini bulunamadı: {DATA_DIR}")

for root, dirs, files in os.walk(DATA_DIR):
    # Sorting `dirs` in place makes os.walk descend in a stable order; with
    # the sorted file names below, rows are collected in the same order on
    # every run regardless of the filesystem.
    dirs.sort()
    for file_name in sorted(files):
        # Match the extension case-insensitively (e.g. "NOTES.TXT").
        if not file_name.lower().endswith(('.html', '.pdf', '.md', '.txt')):
            continue
        full_path = os.path.join(root, file_name)
        print(full_path)  # Progress output: one line per candidate file
        text = process_file(full_path)
        # Skip files that could not be processed (None) or that are empty.
        if text:
            processed_data.append({
                'file_path': full_path,
                'content': text,
                # Extension without the leading dot, normalised to lower case.
                'file_type': Path(full_path).suffix[1:].lower(),
            })

# Upload to the Hugging Face Hub.
# Nothing collected -> report it; otherwise build one column per field and
# push the resulting dataset to "<HF_USERNAME>/<HF_DATASET_NAME>".
if not processed_data:
    print("❌ Yüklenecek veri bulunamadı!")
else:
    text_column = []
    for record in processed_data:
        # Each text is prefixed with its upper-cased file type, e.g. "[PDF]".
        tag = record['file_type'].upper()
        text_column.append(f"[{tag}]\n{record['content']}")
    path_column = [record['file_path'] for record in processed_data]
    type_column = [record['file_type'] for record in processed_data]

    dataset = Dataset.from_dict({
        'text': text_column,
        'file_path': path_column,
        'file_type': type_column,
    })

    target_repo = f"{HF_USERNAME}/{HF_DATASET_NAME}"
    hub_token = os.environ['HF_TOKEN']  # Read from the environment variable
    dataset.push_to_hub(repo_id=target_repo, token=hub_token)
    print(f"✅ {len(processed_data)} dosya başarıyla yüklendi!")