In [3]:
import os
import sys
import pandas as pd
import argparse
from datetime import datetime

# Setup Django environment: point Django at the project's settings module
# (setdefault leaves an already-set DJANGO_SETTINGS_MODULE untouched), then
# initialise Django.
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'website_configs.settings')
import django
django.setup()
# Important: set this environment variable so that synchronous operations
# are allowed inside Jupyter's asynchronous environment.
os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"

# Now that django.setup() has run, we can import Django models
from app_user_keyword_association.models import NewsData

In [None]:
 # Read CSV file
csv_file_path = 'app_user_keyword/dataset/cna_news_200_preprocessed.csv'
df = pd.read_csv(csv_file_path, sep='|')

# Columns whose CSV value is copied onto the model unchanged, listed in the
# order they are written into the ``defaults`` mapping.
passthrough_columns = (
    'category',
    'title',
    'content',
    'sentiment',
    'summary',
    'top_key_freq',
    'tokens',
    'tokens_v2',
    'entities',
    'token_pos',
    'link',
)

# Upsert one NewsData record per CSV row, matching existing records on item_id.
for _, row in df.iterrows():
    # The 'date' column is parsed as YYYY-MM-DD text; keep only the date part.
    date_obj = datetime.strptime(row['date'], '%Y-%m-%d').date()
    item_id = row['item_id']

    # Field values applied to the record whether it is created or updated.
    defaults = {'date': date_obj}
    defaults.update((column, row[column]) for column in passthrough_columns)

    # Store None instead of an empty string or a pandas NaN for photo_link.
    photo_link = row['photo_link']
    if photo_link == "" or pd.isna(photo_link):
        photo_link = None
    defaults['photo_link'] = photo_link

    news_data, created = NewsData.objects.update_or_create(
        item_id=item_id,
        defaults=defaults,
    )

# The photo_link column may hold any of the following values:
#   - an actual URL string
#   - an empty string ("")
#   - a pandas NaN (when the field is empty in the CSV file)
#   - None