-
Notifications
You must be signed in to change notification settings - Fork 2.2k
refactor: File model #3050
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
refactor: File model #3050
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
# Generated by Django 5.2 on 2025-05-07 03:40 | ||
|
||
from django.db import migrations, models | ||
|
||
|
||
class Migration(migrations.Migration): | ||
|
||
dependencies = [ | ||
('knowledge', '0004_knowledge_file_size_limit_alter_document_status_and_more'), | ||
] | ||
|
||
operations = [ | ||
migrations.RemoveField( | ||
model_name='file', | ||
name='workspace_id', | ||
), | ||
migrations.AddField( | ||
model_name='file', | ||
name='file_size', | ||
field=models.IntegerField(default=0, verbose_name='文件大小'), | ||
), | ||
migrations.AddField( | ||
model_name='file', | ||
name='sha256_hash', | ||
field=models.CharField(default='', verbose_name='文件sha256_hash标识'), | ||
), | ||
migrations.AddField( | ||
model_name='file', | ||
name='source_id', | ||
field=models.CharField(default='TEMPORARY_100_MINUTE', verbose_name='资源id'), | ||
), | ||
migrations.AddField( | ||
model_name='file', | ||
name='source_type', | ||
field=models.CharField(choices=[('KNOWLEDGE', 'Knowledge'), ('APPLICATION', 'Application'), ('TEMPORARY_30_MINUTE', 'Temporary 30 Minute'), ('TEMPORARY_100_MINUTE', 'Temporary 120 Minute'), ('TEMPORARY_1_DAY', 'Temporary 1 Day')], default='TEMPORARY_100_MINUTE', verbose_name='资源类型'), | ||
) | ||
] | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The provided migration script looks generally correct and should not contain any immediate issues. However, there are a few optimizations you can consider:
By implementing these optimizations, you can improve query performance and maintainability of your database schema. |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,13 +3,15 @@ | |
import uuid_utils.compat as uuid | ||
from django.contrib.postgres.search import SearchVectorField | ||
from django.db import models | ||
from django.db.models import QuerySet | ||
from django.db.models.signals import pre_delete | ||
from django.dispatch import receiver | ||
from mptt.fields import TreeForeignKey | ||
from mptt.models import MPTTModel | ||
|
||
from common.db.sql_execute import select_one | ||
from common.mixins.app_model_mixin import AppModelMixin | ||
from common.utils.common import get_sha256_hash | ||
from models_provider.models import Model | ||
from users.models import User | ||
|
||
|
@@ -221,6 +223,19 @@ class SearchMode(models.TextChoices): | |
blend = 'blend' | ||
|
||
|
||
class FileSourceType(models.TextChoices): | ||
# 知识库 跟随知识库被删除而被删除 source_id 为知识库id | ||
KNOWLEDGE = "KNOWLEDGE" | ||
# 应用 跟随应用被删除而被删除 source_id 为应用id | ||
APPLICATION = "APPLICATION" | ||
# 临时30分钟 数据30分钟后被清理 source_id 为TEMPORARY_30_MINUTE | ||
TEMPORARY_30_MINUTE = "TEMPORARY_30_MINUTE" | ||
# 临时120分钟 数据120分钟后被清理 source_id为TEMPORARY_100_MINUTE | ||
TEMPORARY_120_MINUTE = "TEMPORARY_100_MINUTE" | ||
# 临时1天 数据1天后被清理 source_id为TEMPORARY_1_DAY | ||
TEMPORARY_1_DAY = "TEMPORARY_1_DAY" | ||
|
||
|
||
class VectorField(models.Field): | ||
def db_type(self, connection): | ||
return 'vector' | ||
|
@@ -246,16 +261,25 @@ class Meta: | |
class File(AppModelMixin): | ||
id = models.UUIDField(primary_key=True, max_length=128, default=uuid.uuid7, editable=False, verbose_name="主键id") | ||
file_name = models.CharField(max_length=256, verbose_name="文件名称", default="") | ||
workspace_id = models.CharField(max_length=64, verbose_name="工作空间id", default="default", db_index=True) | ||
file_size = models.IntegerField(verbose_name="文件大小", default=0) | ||
sha256_hash = models.CharField(verbose_name="文件sha256_hash标识", default="") | ||
source_type = models.CharField(verbose_name="资源类型", choices=FileSourceType, | ||
default=FileSourceType.TEMPORARY_120_MINUTE.value) | ||
source_id = models.CharField(verbose_name="资源id", default=FileSourceType.TEMPORARY_120_MINUTE.value) | ||
loid = models.IntegerField(verbose_name="loid") | ||
meta = models.JSONField(verbose_name="文件关联数据", default=dict) | ||
|
||
class Meta: | ||
db_table = "file" | ||
|
||
def save(self, bytea=None, force_insert=False, force_update=False, using=None, update_fields=None): | ||
result = select_one("SELECT lo_from_bytea(%s, %s::bytea) as loid", [0, bytea]) | ||
self.loid = result['loid'] | ||
sha256_hash = get_sha256_hash(bytea) | ||
f = QuerySet(File).filter(sha256_hash=sha256_hash).first() | ||
if f is not None: | ||
self.loid = f.loid | ||
else: | ||
result = select_one("SELECT lo_from_bytea(%s, %s::bytea) as loid", [0, bytea]) | ||
self.loid = result['loid'] | ||
super().save() | ||
|
||
def get_bytes(self): | ||
|
@@ -265,4 +289,6 @@ def get_bytes(self): | |
|
||
@receiver(pre_delete, sender=File) | ||
def on_delete_file(sender, instance, **kwargs): | ||
select_one(f'SELECT lo_unlink({instance.loid})', []) | ||
exist = QuerySet(File).filter(loid=instance.loid).exclude(id=instance.id).exists() | ||
if not exist: | ||
select_one(f'SELECT lo_unlink({instance.loid})', []) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The code looks mostly clean and well-structured. However, there are a few areas where improvements can be made: Improvements
Suggested Corrections
# Separating get_bytes logic
def fetch_data(file_size, bytea):
if file_size > 0:
result = select_one("SELECT lo_from_bytea(%s, %s::bytea) as loid", [file_size, bytea])
return result['loid']
return 0
# Updated save method to call helper function
def save(self, bytea=None, force_insert=False, force_update=False, using=None, update_fields=None):
file_size = len(bytea)
self.loid = fetch_data(file_size, bytea)
@receiver(pre_delete, sender=File)
def on_delete_file(sender, instance, **kwargs):
exist = QuerySet(File).filter(loid=instance.loid).exclude(id=instance.id).exists()
if exist: # Only delete if another File with the same loid exists
select_one(f'SELECT lo_unlink({instance.loid})', [])
These changes aim to reduce redundancy, enhance readability, and improve maintainability of the codebase while ensuring security practices are followed. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The provided code looks mostly clean and well-structured. However, there are a few areas that can be improved:
Docstring: Adding docstrings to functions can help clarify their purpose.
Comments: It might be beneficial to add comments above sections of code that perform specific operations or set up certain conditions.
Code Consistency: The sub_array function should handle cases where item_num > len(array) gracefully.
Performance Optimization: If bulk_create_in_batches is called frequently, consider optimizing it further or integrating with an asynchronous database client like Django Channels or Celery.
Exception Handling:
Here's revised version with some of these improvements:
These changes enhance readability, maintainability, and error-handling capabilities of the code.