In [1]:
import os
import sys
import pandas as pd
import argparse
from datetime import datetime, timedelta
import pathlib

# Added: put the parent of the current working directory on sys.path
# (presumably the Django project root that holds `website_configs` — confirm).
parent_path = pathlib.Path().absolute().parent
sys.path.insert(0, str(parent_path))
# Setup Django environment
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'website_configs.settings')
import django
django.setup()
# Important: set this environment variable so synchronous ORM calls are allowed
# inside Jupyter's asynchronous environment.
os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"

# Now we can import Django models (must come after django.setup())
from app_user_keyword_db.models import NewsData
from app_top_person_db.models import TopPerson

In [2]:
from django.db.models import Q, Max, F
from collections import Counter

In [3]:
def filter_database_fullText(user_keywords, cond, cate, weeks):
    """Search the ``content`` column of ``NewsData`` for keywords using database queries.

    The search window ends at the newest ``date`` stored in ``NewsData`` (not
    today's date) and starts ``weeks`` weeks before it; both ends are inclusive.

    Args:
        user_keywords: Iterable of keyword strings looked up in ``content``.
        cond: ``'and'`` keeps rows containing every keyword, ``'or'`` keeps rows
            containing at least one. The value is compared case-insensitively
            and surrounding whitespace is ignored. Any other value applies no
            keyword filter at all.
        cate: Category to keep, or ``"全部"`` to keep every category.
        weeks: Number of weeks to look back from the latest date.

    Returns:
        A lazy ``QuerySet`` of ``NewsData``. When the table holds no dated rows
        an empty ``QuerySet`` is returned.
    """
    # Get the latest date in the database
    latest_date = NewsData.objects.aggregate(max_date=Max('date'))['max_date']
    if latest_date is None:
        # Max() over an empty table yields None; subtracting a timedelta from
        # None would raise TypeError, so report "no results" instead.
        return NewsData.objects.none()

    # Base query - filter by date range
    start_date = latest_date - timedelta(weeks=weeks)
    queryset = NewsData.objects.filter(date__gte=start_date, date__lte=latest_date)

    # Filter by category unless every category was requested
    if cate != "全部":
        queryset = queryset.filter(category=cate)

    # Accept 'AND' / ' or ' etc.; non-string values are left as they are
    mode = cond.strip().lower() if isinstance(cond, str) else cond

    if mode == 'and':
        # Each extra filter() narrows the result, so all keywords must be present
        for kw in user_keywords:
            queryset = queryset.filter(content__contains=kw)
    elif mode == 'or':
        # OR the per-keyword conditions together; an empty Q() filters nothing
        q_objects = Q()
        for kw in user_keywords:
            q_objects |= Q(content__contains=kw)
        queryset = queryset.filter(q_objects)

    return queryset

In [4]:

# Example search parameters
user_keywords = ['烏克蘭', '俄羅斯']  # keywords looked up in the article content
cond = 'and'   # 'and' = every keyword must appear, 'or' = any keyword may appear
cate = '全部'   # a category name, or "全部" for all categories
weeks = 4      # look-back window, in weeks
queryset = filter_database_fullText(
    user_keywords=user_keywords,
    cond=cond,
    cate=cate,
    weeks=weeks,
)

In [5]:
# Print the title of every matching article. Only the title column is
# requested, so the large text columns are not loaded just to show headlines.
for title in queryset.values_list('title', flat=True):
    print(title)

俄侵烏克蘭全球憤慨 臉書放寬規定讓人抒發怒火
Apple新品一次看 iPhone SE售1萬3900元起iPad Air有5色[影]
北京冬季帕運閉幕 帕委會主席談和平與希望
俄烏談判露曙光 油價下滑5%
華碩停止對俄羅斯出貨 捐款3000萬賑濟烏克蘭
油價若續漲 朱澤民：今年CPI有可能超過2%
澳洲擴大制裁俄羅斯 歐盟要凍結切爾西老闆資產
俄羅斯提核協議新要求 伊朗外長將赴莫斯科討論
2022酷寒演習展開 3萬北約兵力集結挪威
俄國遭制裁降價求售石油和商品 印度考慮採購
烏克蘭戰事中國疫情添不安 亞股多數收黑
路透社：美中高層已在羅馬會晤
借鑑烏克蘭核電廠遭攻 日研議核廠設專屬警備隊
烏俄進行第4輪談判 烏克蘭代表稱雙方溝通困難
國際博物館協會發聲拒絕戰爭 吳思瑤籲故宮跟進
戰爭時文物如何疏散  故宮3個月內擬對策7月推演
香港恆指暴跌千點 失守2萬點創6年新低


In [6]:
# Keep only the fields needed for display and cap the result at the first 3 rows
display_fields = ('category', 'title', 'link', 'photo_link')
news_items = queryset.values(*display_fields)[:3]

In [7]:
# Bare expression: the notebook displays the repr of the sliced QuerySet
news_items

<QuerySet [{'category': '科技', 'title': '俄侵烏克蘭全球憤慨 臉書放寬規定讓人抒發怒火', 'link': 'https://www.cna.com.tw/news/ait/202203110088.aspx', 'photo_link': 'https://imgcdn.cna.com.tw/www/WebPhotos/200/20220311/2000x1391_0522240424184.jpg'}, {'category': '科技', 'title': 'Apple新品一次看 iPhone SE售1萬3900元起iPad Air有5色[影]', 'link': 'https://www.cna.com.tw/news/ait/202203090006.aspx', 'photo_link': 'https://imgcdn.cna.com.tw/www/webphotos/WebCover/420/20220309/800x600_644221551345.jpg'}, {'category': '運動', 'title': '北京冬季帕運閉幕 帕委會主席談和平與希望', 'link': 'https://www.cna.com.tw/news/aspt/202203130215.aspx', 'photo_link': None}]>

In [8]:
# Collect the `date` value of every matching article into a plain list
dates = [article_date for article_date in queryset.values_list('date', flat=True)]

In [9]:
# Newest article date currently stored in NewsData
date_stats = NewsData.objects.aggregate(max_date=Max('date'))
latest_date = date_stats['max_date']

# The window opens 4 weeks before the newest date
start_date = latest_date - timedelta(weeks=4)

In [21]:
category = '政治'  # example category used to inspect the raw `entities` column
entities_in_window = NewsData.objects.filter(
    category=category,
    date__gte=start_date,
    date__lte=latest_date,
)
entities_list = list(entities_in_window.values_list('entities', flat=True))

In [22]:
# Preview only the first couple of raw entity strings; displaying the whole
# list floods the notebook with one long string per article.
entities_list[:2]

["[NerToken(word='烏克蘭', ner='GPE', idx=(4, 7)), NerToken(word='外交部', ner='ORG', idx=(16, 19)), NerToken(word='傍晚', ner='TIME', idx=(24, 26)), NerToken(word='4000', ner='CARDINAL', idx=(32, 36)), NerToken(word='外交部長', ner='ORG', idx=(38, 42)), NerToken(word='吳釗燮', ner='PERSON', idx=(42, 45)), NerToken(word='中午', ner='TIME', idx=(45, 47)), NerToken(word='外交部', ner='ORG', idx=(76, 79)), NerToken(word='晚間', ner='TIME', idx=(79, 81)), NerToken(word='外交部', ner='ORG', idx=(89, 92)), NerToken(word='7日', ner='DATE', idx=(93, 95)), NerToken(word='烏克蘭', ner='NORP', idx=(104, 107)), NerToken(word='外交部', ner='ORG', idx=(122, 125)), NerToken(word='1730', ner='CARDINAL', idx=(133, 137)), NerToken(word='4000', ner='CARDINAL', idx=(152, 156)), NerToken(word='18日', ner='DATE', idx=(195, 198)), NerToken(word='外交部', ner='ORG', idx=(199, 202)), NerToken(word='外交部', ner='ORG', idx=(228, 231)), NerToken(word='吳釗燮', ner='PERSON', idx=(234, 237)), NerToken(word='今天', ner='DATE', idx=(237, 239)), NerToken(word=

In [11]:
# Category labels used as NewsData.category filter values
news_categories = [
    '政治', '科技', '運動', '證卷', '產經', '娛樂',
    '生活', '國際', '社會', '文化', '兩岸',
]

In [13]:

# Named-entity labels that are counted; only person names are kept
allowedNE = ['PERSON']
# Category labels processed one by one (used as NewsData.category filter values)
news_categories = [
    '政治',
    '科技',
    '運動',
    '證卷',
    '產經',
    '娛樂',
    '生活',
    '國際',
    '社會',
    '文化',
    '兩岸',
]
def ne_word_frequency(a_news_ne, top_n=20):
    """Return the most frequent named-entity words of the allowed types.

    Args:
        a_news_ne: Iterable of ``(ner, word)`` pairs.
        top_n: Number of most common words to return (default 20, the
            previously hard-coded value).

    Returns:
        A list of ``(word, count)`` tuples, most frequent first. Only words
        with at least 2 characters whose ``ner`` label appears in the
        module-level ``allowedNE`` list are counted.
    """
    counter = Counter(
        word
        for ner, word in a_news_ne
        # logical `and` (short-circuit) rather than bitwise `&` on booleans
        if len(word) >= 2 and ner in allowedNE
    )
    return counter.most_common(top_n)

def NerToken(word, ner, idx):
    """Stand-in called when stored entity text is evaluated.

    The stored text looks like ``NerToken(word='烏克蘭', ner='GPE', idx=(4, 7))``,
    i.e. a call with the three parameters ``word``, ``ner`` and ``idx``.
    The ``idx`` value is accepted but not used.

    Returns:
        The pair ``(ner, word)``.
    """
    pair = (ner, word)
    return pair

In [28]:
    # Get the latest date in the database
latest_date = NewsData.objects.aggregate(max_date=Max('date')).get('max_date')

# Start of the window: 4 weeks (28 days) before the latest date
start_date = latest_date - timedelta(days=4 * 7)


In [29]:

top_cate_ner_words = {}
words_all = []

# NOTE(security): each `entities` value is Python-expression text such as
# "[NerToken(word=..., ner=..., idx=...)]" and is parsed with eval(), which
# executes whatever the text contains. This is only safe if the column content
# is fully trusted — TODO confirm. The namespace below exposes nothing but
# NerToken and no builtins, which narrows (but does not remove) the exposure.
ner_eval_namespace = {'__builtins__': {}, 'NerToken': NerToken}

for category in news_categories:
    # One query per category, restricted to the date window
    entities_list = list(
        NewsData.objects
        .filter(category=category, date__gte=start_date, date__lte=latest_date)
        .values_list('entities', flat=True)
    )

    # Flatten the (ner, word) pairs of every article in this category
    words_group = []
    for entities in entities_list:
        if entities:  # skip empty / NULL entity strings
            words_group += eval(entities, ner_eval_namespace)

    # Keep a running pool across categories for the overall ranking
    words_all += words_group

    # Get top words by calling ne_word_frequency() function
    topwords = ne_word_frequency(words_group)
    top_cate_ner_words[category] = topwords


In [30]:

# Rank words over all categories combined and store the result under the '全部' key
top_cate_ner_words['全部'] = topwords_all = ne_word_frequency(words_all)


In [31]:
# Bare expression: the notebook displays the category -> [(word, count), ...] dict
top_cate_ner_words

{'政治': [('黃重凱', 24),
  ('柳惠千', 13),
  ('喬建中', 12),
  ('徐國勇', 11),
  ('季欽', 8),
  ('許舒博', 7),
  ('游錫堃', 6),
  ('侯富議', 6),
  ('蘇貞昌', 5),
  ('吳釗燮', 4),
  ('許佑格', 4),
  ('范惠君', 4),
  ('歐布萊恩', 4),
  ('陳其邁', 3),
  ('柯文哲', 3),
  ('陳智菡', 3),
  ('黃鼎翔', 3),
  ('王同義', 3),
  ('王美惠', 3),
  ('許邁德', 2)],
 '科技': [('林柏亨', 6),
  ('吳偉仁', 5),
  ('包淳偉', 4),
  ('黃晨瑋', 4),
  ('廖俊智', 4),
  ('波博', 3),
  ('黃文山', 3),
  ('艾偉', 3),
  ('王金燦', 2),
  ('庫克', 2),
  ('曾信超', 2),
  ('波皮斯庫', 2),
  ('王兆璋', 1),
  ('希塔拉姆', 1),
  ('吳宗信', 1),
  ('葉均蔚', 1),
  ('楊勇', 1),
  ('陳信安', 1),
  ('Tim哥', 1),
  ('夏哈', 1)],
 '運動': [('林昀儒', 24),
  ('鄭怡靜', 24),
  ('莊智淵', 17),
  ('陳將双', 12),
  ('潘武雄', 10),
  ('陳建安', 8),
  ('布雷迪', 7),
  ('布魯娜', 6),
  ('賈奈特', 6),
  ('艾倫', 6),
  ('塞爾蒂克', 5),
  ('李恩芯', 5),
  ('王志庭', 4),
  ('施冠宇', 4),
  ('加奇尼', 4),
  ('樂天', 3),
  ('安宰賢', 3),
  ('巴特瑞', 3),
  ('梁家榮', 3),
  ('黃子鵬', 3)],
 '證卷': [('林耕億', 3), ('葉獻文', 2), ('李昌鴻', 2), ('黃水可', 1)],
 '產經': [('陳耀祥', 17),
  ('朱澤民', 9),
  ('曾博昇', 6),
  ('杜書全', 5),
  ('蘇松輝', 4),

## Save or update top person table

In [32]:

# Save to the database: one TopPerson row per category, inserted when missing
# and overwritten otherwise.
for category, top_ners in top_cate_ner_words.items():
    # The list of (word, count) tuples is stored as its string representation
    top_keys_str = str(top_ners)

    # update_or_create looks the row up by `category`, then either updates
    # `top_keys` on the existing row or creates a new row, replacing the manual
    # get() / except DoesNotExist / create() sequence.
    obj, created = TopPerson.objects.update_or_create(
        category=category,
        defaults={'top_keys': top_keys_str},
    )