<a href="https://colab.research.google.com/github/ykitaguchi77/github-text-extractor/blob/main/Colab_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **📂 GitHub-Text-Extractor**

## 🚀 Overview
`GitRepoTreeExplorer` is a tool that visualizes the file hierarchy of a GitHub repository and exports it in text format. It allows you to easily review the contents of a repository and clearly understand its structure.

## 📌 Key Features
- Clone the contents of a GitHub repository to local storage
- Visually display the file hierarchy
- Export the repository's full source code as a text file with an XML structure

## 💡 Future Application
- This tool enables analysis and modification of GitHub repositories with large language models (LLMs) such as Claude 3.






In [1]:
!pip install gitpython treelib

Collecting gitpython
  Downloading GitPython-3.1.43-py3-none-any.whl.metadata (13 kB)
Collecting treelib
  Downloading treelib-1.7.0-py3-none-any.whl.metadata (1.3 kB)
Collecting gitdb<5,>=4.0.1 (from gitpython)
  Downloading gitdb-4.0.11-py3-none-any.whl.metadata (1.2 kB)
Collecting smmap<6,>=3.0.1 (from gitdb<5,>=4.0.1->gitpython)
  Downloading smmap-5.0.1-py3-none-any.whl.metadata (4.3 kB)
Downloading GitPython-3.1.43-py3-none-any.whl (207 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading treelib-1.7.0-py3-none-any.whl (18 kB)
Downloading gitdb-4.0.11-py3-none-any.whl (62 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading smmap-5.0.1-py3-none-any.whl (24 kB)
Installing collected packages: treelib, smmap, gitdb, gitpython
Successfully installed gitdb-4.0.11 gitpython-3.1.43 smmap-5.0.1 treelib-1.7.0


In [3]:
import os
from git import Repo
from treelib import Node, Tree

# GitHub repository URL
repository_url = "https://github.com/niuchuangnn/SPICE.git" # enter the URL of the repository to extract

# Path of the directory to clone into (created if it does not exist yet)
clone_dir = "/content/repo"
os.makedirs(clone_dir, exist_ok=True)

# Switch the notebook's working directory to clone_dir so that the clone
# below is created inside it.
%cd $clone_dir
!git clone $repository_url

# Directory names to leave out of the listings (Git's own metadata folder)
exclude_dirs = [".git"]

/content/repo
Cloning into 'SPICE'...
remote: Enumerating objects: 388, done.[K
remote: Counting objects: 100% (114/114), done.[K
remote: Compressing objects: 100% (56/56), done.[K
remote: Total 388 (delta 64), reused 92 (delta 53), pack-reused 274 (from 1)[K
Receiving objects: 100% (388/388), 178.87 MiB | 27.79 MiB/s, done.
Resolving deltas: 100% (167/167), done.
Updating files: 100% (138/138), done.


In [7]:
from treelib import Tree
import os

# Function to display the file hierarchy
def display_file_hierarchy(directory, exclude_dirs=None):
    """Build a tree view of ``directory``, save it to a text file and print it.

    Args:
        directory: Path of the directory to scan. It becomes the root node of
            the tree.
        exclude_dirs: Optional list of directory names that are neither listed
            nor descended into. Defaults to excluding nothing.

    Side effects:
        Writes ``tree_structure.txt`` inside ``directory`` (overwriting any
        previous copy) and prints the saved path followed by the tree.
    """
    if exclude_dirs is None:
        exclude_dirs = []

    # Define a fixed file name for the output
    tree_file = os.path.join(directory, "tree_structure.txt")

    tree = Tree()
    tree.create_node(directory, directory)  # Root node; node ids are paths

    # Traverse the directory structure
    for root, dirs, files in os.walk(directory):
        # Prune in place so os.walk does not descend into excluded directories
        dirs[:] = [d for d in dirs if d not in exclude_dirs]
        for dir_name in dirs:
            tree.create_node(dir_name, os.path.join(root, dir_name), parent=root)
        for file_name in files:
            file_path = os.path.join(root, file_name)
            if file_path == tree_file:
                # The report lives inside the scanned directory; without this
                # check a second run would list its own previous output.
                continue
            tree.create_node(file_name, file_path, parent=root)

    # Create or truncate the output file first so the saved tree never ends up
    # after the content of an earlier run (save2file appends in treelib 1.7.0).
    with open(tree_file, 'w'):
        pass

    # Save the tree structure to the text file
    tree.save2file(tree_file)
    print(f"Directory structure saved to {tree_file}")

    # Read the report back and display it. UTF-8 is requested explicitly
    # because the tree is drawn with non-ASCII box characters and the platform
    # default encoding may not be UTF-8.
    with open(tree_file, 'r', encoding='utf-8') as report:
        print(report.read())

# Example usage: print the file hierarchy of the cloned repository, leaving out
# the .git directory. clone_dir is the clone target defined in the setup cell.
display_file_hierarchy(clone_dir, exclude_dirs=[".git"])

Directory structure saved to /content/repo/tree_structure.txt
/content/repo
├── SPICE
│   ├── LICENSE
│   ├── README.md
│   ├── configs
│   │   ├── cifar10
│   │   │   ├── embedding.py
│   │   │   ├── eval.py
│   │   │   └── spice_self.py
│   │   ├── cifar100
│   │   │   ├── embedding.py
│   │   │   ├── eval.py
│   │   │   └── spice_self.py
│   │   ├── imagenet10
│   │   │   ├── embedding.py
│   │   │   ├── eval.py
│   │   │   └── spice_self.py
│   │   ├── imagenet_dog
│   │   │   ├── embedding.py
│   │   │   ├── eval.py
│   │   │   └── spice_self.py
│   │   ├── stl10
│   │   │   ├── embedding.py
│   │   │   ├── eval.py
│   │   │   └── spice_self.py
│   │   └── tiny_imagenet
│   │       └── eval.py
│   ├── dataset.md
│   ├── evaluation.md
│   ├── figures
│   │   ├── framework.png
│   │   └── proto-local.png
│   ├── fixmatch
│   │   ├── .gitignore
│   │   ├── LICENSE
│   │   ├── README.md
│   │   ├── assets
│   │   │   ├── eval_metrics.png
│   │   │   ├── fixmatch.png
│   │   │   └── tr

In [9]:
import os
import xml.etree.ElementTree as ET

def read_file_content(file_path):
    """Return the UTF-8 text of ``file_path``.

    This is a best-effort reader: if the file cannot be opened or decoded,
    the exception's message is returned as the result instead of being raised.
    """
    try:
        with open(file_path, mode='r', encoding='utf-8') as source:
            text = source.read()
    except Exception as error:
        # Deliberately broad: any failure is reported through the return value.
        text = str(error)
    return text

def directory_to_xml(directory, exclude_dirs=None, include_extensions=None):
    """Build an XML element tree mirroring ``directory``.

    Args:
        directory: Path of the directory to convert.
        exclude_dirs: Optional list of directory names to skip. Defaults to
            an empty list (nothing skipped).
        include_extensions: Optional list of file extensions to keep.
            Defaults to an empty list, which is passed on unchanged.

    Returns:
        The root ``<directory>`` element, whose ``name`` attribute is the last
        path component of ``directory``; its children are added by
        ``append_files_and_dirs``.
    """
    if exclude_dirs is None:
        exclude_dirs = []
    if include_extensions is None:
        include_extensions = []

    # Normalise before taking the basename: for a path with a trailing
    # separator ("repo/") os.path.basename alone returns an empty string.
    root_name = os.path.basename(os.path.normpath(directory))
    root_element = ET.Element("directory", name=root_name)
    append_files_and_dirs(root_element, directory, exclude_dirs, include_extensions)
    return root_element

def append_files_and_dirs(parent_element, path, exclude_dirs, include_extensions):
    """Recursively add the contents of ``path`` as children of ``parent_element``.

    Entries are visited in sorted name order. Each sub-directory whose name is
    not in ``exclude_dirs`` becomes a ``<directory>`` element and is descended
    into. Each regular file becomes a ``<file>`` element with a ``<content>``
    child holding the result of ``read_file_content``. A file is kept only when
    ``include_extensions`` is empty or contains its lower-cased extension.
    Anything else (excluded directories, non-regular entries) is skipped.
    """
    entry_names = os.listdir(path)
    entry_names.sort()

    for entry_name in entry_names:
        entry_path = os.path.join(path, entry_name)

        # Directories: recurse unless the name is excluded.
        if os.path.isdir(entry_path) and entry_name not in exclude_dirs:
            sub_directory = ET.SubElement(parent_element, "directory", name=entry_name)
            append_files_and_dirs(sub_directory, entry_path, exclude_dirs, include_extensions)
            continue

        # Excluded directories and special entries fall through to here.
        if not os.path.isfile(entry_path):
            continue

        _, raw_extension = os.path.splitext(entry_name)
        extension = raw_extension.lower()
        if include_extensions and extension not in include_extensions:
            continue

        file_node = ET.SubElement(parent_element, "file", name=entry_name)
        file_text = read_file_content(entry_path)
        ET.SubElement(file_node, "content").text = file_text

def generate_tree_structure(element, prefix=""):
    """Render the children of ``element`` as lines of an ASCII-art style tree.

    Args:
        element: XML element whose children are listed by their ``name``
            attribute; children tagged ``directory`` are expanded recursively.
        prefix: Text placed before every line produced at this level; it
            carries the vertical guides of the enclosing levels.

    Returns:
        A list of strings, one per listed entry, in document order.
    """
    lines = []
    last_index = len(element) - 1

    for index, node in enumerate(element):
        # The final sibling closes the branch and leaves no vertical guide.
        if index == last_index:
            branch, continuation = '└── ', '    '
        else:
            branch, continuation = '├── ', '│   '

        lines.append(f"{prefix}{branch}{node.get('name')}")
        if node.tag == "directory":
            lines += generate_tree_structure(node, prefix + continuation)

    return lines

def write_xml_to_text_file(xml_element, text_file_name):
    """Write a two-part report for ``xml_element`` to ``text_file_name``.

    The file (UTF-8, overwritten if present) contains a "Directory Structure:"
    section with the tree produced by ``generate_tree_structure``, followed by
    a "Detailed Content:" section with the element serialised as XML.
    """
    outline = "\n".join(generate_tree_structure(xml_element))

    with open(text_file_name, 'w', encoding='utf-8') as report:
        report.write("Directory Structure:\n" + outline + "\n\nDetailed Content:\n")
        report.write(ET.tostring(xml_element, encoding='unicode', method='xml'))

# Example usage
clone_dir = "/content/repo"
repository_url = "https://github.com/niuchuangnn/SPICE.git"

# Derive the checkout folder name from the URL. The ".git" suffix is removed
# explicitly: str.rstrip('.git') strips any trailing '.', 'g', 'i' or 't'
# characters rather than the suffix, so e.g. "digit.git" would become "d".
repo_name = os.path.basename(repository_url.rstrip('/'))
if repo_name.endswith('.git'):
    repo_name = repo_name[:-len('.git')]
root_dir = f"{clone_dir}/{repo_name}"

exclude_dirs = ['.git']
include_extensions = ['.py', '.txt', '.yaml']  # Add more extensions as needed

root_element = directory_to_xml(root_dir, exclude_dirs, include_extensions)
# Relative path: the report is written to the current working directory.
text_file_name = "directory_structure_with_content.txt"
write_xml_to_text_file(root_element, text_file_name)

print(f"Directory structure and content saved to {text_file_name}")

Directory structure and content saved to directory_structure_with_content.txt
