# 國立陽明交通大學課程爬蟲 - 完整綱要版

此 Notebook 用於爬取 NYCU 課程的**完整資料**，包括：

### 基本資訊
- 課程代碼、課程名稱
- 授課教師、學分、時數
- 上課時間、教室

### 詳細綱要
- 先修科目
- 課程概述與目標
- 教科書與參考資料
- 評分方式
- 每週進度表
- 單元分配時數表

**注意**：此版本需要較長時間（約 40-60 分鐘），請耐心等待

## 步驟 1: 參數設定

修改下方的學年度和學期參數：

In [None]:
# ============= Parameters =============
YEAR = 114          # academic year
SEMESTER = 1        # semester (1 = first semester, 2 = second semester)
# ======================================

## 步驟 2: 安裝必要套件

In [None]:
!pip install requests -q

## 步驟 3: 完整爬蟲程式碼

執行此儲存格載入完整版爬蟲：

In [None]:
import json
import re
import requests
import time
import warnings
from datetime import datetime, timedelta
from IPython.display import clear_output

# The crawler sends every request with verify=False, which makes urllib3 emit an
# "Unverified HTTPS request ..." warning per call. Silence only that message
# (the pattern is matched against the start of the warning text) instead of
# hiding every warning raised anywhere in the notebook.
warnings.filterwarnings('ignore', message='Unverified HTTPS request')

class NYCUCrawlerWithOutline:
    """Crawl the NYCU timetable site for one semester, including course outlines.

    The crawl runs in two phases (see ``crawl``): first the course list of
    every department is collected into ``course_data``, then the four outline
    endpoints are queried for every collected course.

    Attributes:
        year, semester: The values passed to the constructor.
        acysem: ``str(year) + str(semester)``; sent as ``acysem``/``acysemend``.
        flang: Language code sent to the listing endpoints.
        headers, ajax_headers: HTTP headers for listing / outline requests.
        dep_list: Department ids already crawled, in first-seen order.
        course_data: Maps each course key of the list response to a dict of
            course fields. ``crawl`` adds an ``"outline"`` entry only when the
            outline fetch for that course succeeded.
        stats: Counters ``total_courses``, ``outline_success``,
            ``outline_fail`` and the ``start_time`` of the outline phase.
    """

    BASE_URL = "https://timetable.nycu.edu.tw/"
    USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"

    # Seconds to wait for a listing request. Without a timeout `requests`
    # waits indefinitely, which would silently stall the whole crawl.
    LIST_TIMEOUT = 30
    # Seconds to wait for a single outline request.
    OUTLINE_TIMEOUT = 10
    # Pause between the outline requests of one course, in seconds.
    OUTLINE_PAUSE = 0.2
    # Pause before retrying a failed outline fetch, in seconds.
    RETRY_PAUSE = 1

    # One time token: a letter from MTWRFSU followed by one or more period
    # codes (presumably weekday letter + class periods -- confirm with the site).
    _TIME_TOKEN = re.compile(r'[MTWRFSU][1-9yznabcd]+')

    # (result key, endpoint action, {output field: response field}, payload is a list)
    _OUTLINE_SECTIONS = (
        ('base', 'getCrsOutlineBase', {
            'cos_name': 'cos_name',
            'cos_eng_name': 'cos_eng_name',
            'sel_type_name': 'sel_type_name',
            'dep_name': 'dep_name',
            'cos_code': 'cos_code',
        }, False),
        ('description', 'getCrsOutlineDescription', {
            'prerequisite': 'crs_prerequisite',
            'outline': 'crs_outline',
            'textbook': 'crs_textbook',
            'exam_score': 'crs_exam_score',
            'teach_method': 'crs_teach_method',
        }, False),
        ('weekly_schedule', 'getCrsOutlineSyllabuses', {
            'week_id': 'week_id',
            'class_date': 'class_date',
            'class_data': 'class_data',
        }, True),
        ('unit_hours', 'getCrsOutlineOptional', {
            'title': 'opt_title',
            'content': 'opt_content',
            'hour_teaching': 'opt_hour_teaching',
        }, True),
    )

    def __init__(self, year, semester):
        """Store the target semester and initialise empty crawl state.

        Args:
            year: Academic year; converted with ``str`` when sent to the site.
            semester: Semester number; converted with ``str`` when sent.
        """
        self.year = year
        self.semester = semester
        self.acysem = str(year) + str(semester)
        self.flang = "zh-tw"
        self.headers = {"user-agent": self.USER_AGENT}
        self.ajax_headers = {
            "user-agent": self.USER_AGENT,
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "X-Requested-With": "XMLHttpRequest",
        }
        self.dep_list = []
        self.course_data = {}
        self.stats = {
            'total_courses': 0,
            'outline_success': 0,
            'outline_fail': 0,
            'start_time': None,
        }

    # ------------------------------------------------------------------
    # Small helpers
    # ------------------------------------------------------------------
    def _endpoint(self, action):
        """Return the URL of the ``main/<action>`` route of the timetable site."""
        return f"{self.BASE_URL}?r=main/{action}"

    @staticmethod
    def _keys(payload):
        """Return the keys of a JSON-object payload, or ``[]`` for anything else.

        The original code guarded ``len(...)`` before calling ``.keys()`` on
        college and department payloads, which suggests empty responses are
        not always objects (presumably an empty JSON array -- confirm).
        """
        return list(payload) if isinstance(payload, dict) else []

    @staticmethod
    def _pick(record, field_map):
        """Copy the mapped fields out of ``record``, defaulting to ``''``."""
        return {out_key: record.get(src_key, '')
                for out_key, src_key in field_map.items()}

    def _success_rate(self):
        """Return outline successes as a percentage of all courses (0.0 if none)."""
        total = self.stats['total_courses']
        if not total:
            return 0.0
        return self.stats['outline_success'] / total * 100

    # ------------------------------------------------------------------
    # Parsing of the combined "time-classroom" string
    # ------------------------------------------------------------------
    def parse_time(self, tc):
        """Expand the time part of a ``cos_time`` string into single slots.

        ``tc`` is split on ``,``; in each item only the text before the first
        ``-`` is scanned. Every token such as ``M56`` becomes one entry per
        trailing character (``['M5', 'M6']``).

        Args:
            tc: The raw ``cos_time`` string.

        Returns:
            A list of two-character strings in order of appearance.
        """
        slots = []
        for item in tc.split(','):
            time_part = item.split('-')[0]
            for token in self._TIME_TOKEN.findall(time_part):
                prefix = token[0]
                slots.extend(prefix + code for code in token[1:])
        return slots

    def parse_classroom(self, tc):
        """Extract the classroom part of each comma-separated ``cos_time`` item.

        Args:
            tc: The raw ``cos_time`` string.

        Returns:
            One entry per item: the text between the first and second ``-``,
            or ``''`` when the item contains no ``-``.
        """
        rooms = []
        for item in tc.split(','):
            parts = item.split('-')
            rooms.append(parts[1] if len(parts) > 1 else '')
        return rooms

    # ------------------------------------------------------------------
    # Outline endpoints
    # ------------------------------------------------------------------
    def _fetch_outline_once(self, request_data):
        """Query the four outline endpoints once and return the collected sections.

        A section whose response status is not 200 is left out of the result.
        Network errors and malformed payloads raise; the caller decides
        whether to retry.
        """
        outline_data = {}
        for index, (key, action, field_map, is_list) in enumerate(self._OUTLINE_SECTIONS):
            if index:
                # Be gentle with the server between consecutive requests.
                time.sleep(self.OUTLINE_PAUSE)
            response = requests.post(self._endpoint(action), data=request_data,
                                     headers=self.ajax_headers, verify=False,
                                     timeout=self.OUTLINE_TIMEOUT)
            if response.status_code != 200:
                continue
            payload = response.json()
            if is_list:
                outline_data[key] = [self._pick(row, field_map) for row in payload]
            else:
                outline_data[key] = self._pick(payload, field_map)
        return outline_data

    def get_course_outline(self, cos_id, max_retries=3):
        """Fetch the outline of one course, retrying on failure.

        Every call increments exactly one of ``stats['outline_success']`` or
        ``stats['outline_fail']``, so the progress display stays consistent
        with the number of courses processed.

        Args:
            cos_id: Course id sent as ``cos_id``.
            max_retries: Maximum number of attempts.

        Returns:
            A dict with any of the keys ``base``, ``description``,
            ``weekly_schedule`` and ``unit_hours``, or ``None`` when every
            attempt failed or produced no section at all.
        """
        request_data = {
            "acy": str(self.year),
            "sem": str(self.semester),
            "cos_id": str(cos_id),
            "user": "",
            "_token": "",
        }

        for attempt in range(max_retries):
            try:
                outline_data = self._fetch_outline_once(request_data)
            except Exception:
                # Deliberately broad: one bad course (network error, invalid
                # JSON, unexpected payload shape) must not abort a crawl that
                # takes the better part of an hour.
                outline_data = None

            # An empty result means no endpoint answered with 200; treat it
            # like any other failed attempt instead of counting it as success.
            if outline_data:
                self.stats['outline_success'] += 1
                return outline_data

            if attempt < max_retries - 1:
                time.sleep(self.RETRY_PAUSE)

        self.stats['outline_fail'] += 1
        return None

    # ------------------------------------------------------------------
    # Listing endpoints
    # ------------------------------------------------------------------
    def get_type(self):
        """Return the decoded JSON response of the ``get_type`` endpoint."""
        res = requests.get(self._endpoint('get_type'), headers=self.headers,
                           verify=False, timeout=self.LIST_TIMEOUT)
        return res.json()

    def get_category(self, ftype):
        """Return the decoded JSON response of ``get_category`` for ``ftype``."""
        res = requests.post(self._endpoint('get_category'),
                            data={'ftype': ftype, 'flang': self.flang,
                                  'acysem': self.acysem, 'acysemend': self.acysem},
                            headers=self.headers, verify=False,
                            timeout=self.LIST_TIMEOUT)
        return res.json()

    def get_college(self, fcategory, ftype):
        """Return the decoded JSON response of ``get_college``."""
        res = requests.post(self._endpoint('get_college'),
                            data={'fcategory': fcategory, 'ftype': ftype,
                                  'flang': self.flang, 'acysem': self.acysem,
                                  'acysemend': self.acysem},
                            headers=self.headers, verify=False,
                            timeout=self.LIST_TIMEOUT)
        return res.json()

    def get_dep(self, fcollege, fcategory, ftype):
        """Return the decoded JSON response of ``get_dep``."""
        res = requests.post(self._endpoint('get_dep'),
                            data={'fcollege': fcollege, 'fcategory': fcategory,
                                  'ftype': ftype, 'flang': self.flang,
                                  'acysem': self.acysem, 'acysemend': self.acysem},
                            headers=self.headers, verify=False,
                            timeout=self.LIST_TIMEOUT)
        return res.json()

    def get_cos(self, dep):
        """Download the course list of department ``dep`` into ``course_data``.

        Courses whose key is already present are skipped, so a course listed
        under several departments is stored (and counted) once. Returns
        ``None``; a non-OK HTTP status is ignored silently.
        """
        data = {
            "m_acy": self.year, "m_sem": self.semester,
            "m_acyend": self.year, "m_semend": self.semester,
            "m_dep_uid": dep, "m_group": "**", "m_grade": "**",
            "m_class": "**", "m_option": "**", "m_crsname": "**",
            "m_teaname": "**", "m_cos_id": "**", "m_cos_code": "**",
            "m_crstime": "**", "m_crsoutline": "**", "m_costype": "**",
            "m_selcampus": "**",
        }

        r = requests.post(self._endpoint('get_cos_list'), headers=self.headers,
                          verify=False, data=data, timeout=self.LIST_TIMEOUT)
        if r.status_code != requests.codes.ok:
            return

        self._ingest_cos_payload(r.json())

    def _ingest_cos_payload(self, raw_data):
        """Merge one decoded ``get_cos_list`` response into ``course_data``.

        Only entries whose key consists solely of the digits 1 and 2 are read
        as course sections; ``language`` and ``brief`` are looked up per
        course. Payloads that are not JSON objects contribute nothing.
        """
        if not isinstance(raw_data, dict):
            return

        for dep_block in raw_data.values():
            language = dep_block["language"]
            for section_key, section in dep_block.items():
                if re.fullmatch(r"[12]+", section_key) is None:
                    continue
                for cos_id in section:
                    if cos_id in self.course_data:
                        continue

                    raw_cos_data = section[cos_id]
                    cos_time = raw_cos_data["cos_time"]
                    # Only the first brief entry of the course is used.
                    first_brief = next(iter(dep_block["brief"][cos_id].values()))
                    name = (raw_cos_data["cos_cname"]
                            .replace("(英文授課)", '')
                            .replace("(英文班)", ''))

                    self.course_data[cos_id] = {
                        "id": raw_cos_data["cos_id"],
                        "num_limit": raw_cos_data["num_limit"],
                        "reg_num": raw_cos_data["reg_num"],
                        "name": name,
                        "credit": raw_cos_data["cos_credit"],
                        "hours": raw_cos_data["cos_hours"],
                        "teacher": raw_cos_data["teacher"],
                        "time": self.parse_time(cos_time),
                        "classroom": self.parse_classroom(cos_time),
                        "time-classroom": cos_time,
                        "english": language[cos_id]["授課語言代碼"] == "en-us",
                        "brief": first_brief['brief'].split(','),
                        "type": raw_cos_data["cos_type"],
                    }

                    self.stats['total_courses'] += 1

    # ------------------------------------------------------------------
    # Orchestration
    # ------------------------------------------------------------------
    def print_progress(self):
        """Redraw the outline-phase progress (count, successes/failures, ETA).

        Prints nothing until the outline phase has started and at least one
        course has been processed.
        """
        total = self.stats['total_courses']
        success = self.stats['outline_success']
        fail = self.stats['outline_fail']
        processed = success + fail

        if total <= 0 or not self.stats['start_time'] or processed <= 0:
            return

        progress_pct = (processed / total) * 100
        elapsed = (datetime.now() - self.stats['start_time']).total_seconds()
        # Estimate the remaining time from the average time per course so far.
        remaining = (total - processed) * (elapsed / processed)
        eta = timedelta(seconds=int(remaining))

        clear_output(wait=True)
        print(f"進度: {processed}/{total} ({progress_pct:.1f}%)")
        print(f"成功: {success} | 失敗: {fail}")
        print(f"預估剩餘: {eta}")

    def _crawl_dep_once(self, dep):
        """Fetch the course list of ``dep`` unless it was already crawled."""
        if dep in self.dep_list:
            return
        self.dep_list.append(dep)
        self.get_cos(dep)

    def _crawl_course_lists(self):
        """Phase 1: walk type -> category -> college -> department and list courses."""
        for course_type in self.get_type():
            ftype = course_type["uid"]
            print(f"  處理: {course_type['cname']}")
            categories = self._keys(self.get_category(ftype))

            if course_type["cname"] == "其他課程":
                # For this type the category ids are passed to get_cos directly.
                for fcategory in categories:
                    self._crawl_dep_once(fcategory)
                continue

            for fcategory in categories:
                colleges = self._keys(self.get_college(fcategory, ftype))
                # Categories without colleges are queried with an empty college id.
                for fcollege in colleges or [""]:
                    for fdep in self._keys(self.get_dep(fcollege, fcategory, ftype)):
                        self._crawl_dep_once(fdep)

    def crawl(self):
        """Run the full crawl and return ``course_data``.

        Phase 1 collects the course list of every department; phase 2 fetches
        the outline of every collected course and stores it under
        ``"outline"`` when available. Progress and a final summary are
        printed along the way.
        """
        print("=" * 70)
        print(f"NYCU 課程爬蟲 - {self.year} 學年度第 {self.semester} 學期 (含完整綱要)")
        print("=" * 70)

        start_time = datetime.now()

        # Phase 1: basic course data
        print("\n階段 1/2: 取得課程基本資料...")
        self._crawl_course_lists()
        print(f"\n已取得 {len(self.course_data)} 門課程的基本資料")

        # Phase 2: course outlines
        print("\n階段 2/2: 取得課程綱要...")
        print("這將需要較長時間，請耐心等待\n")

        self.stats['start_time'] = datetime.now()

        for course in self.course_data.values():
            outline = self.get_course_outline(course['id'])
            if outline:
                course['outline'] = outline
            self.print_progress()

        elapsed = datetime.now() - start_time

        print("\n" + "=" * 70)
        print("爬取完成！")
        print(f"總課程數: {self.stats['total_courses']}")
        print(f"綱要成功: {self.stats['outline_success']}")
        print(f"綱要失敗: {self.stats['outline_fail']}")
        # _success_rate() avoids a ZeroDivisionError when no course was found.
        print(f"成功率: {self._success_rate():.1f}%")
        print(f"總花費時間: {elapsed}")
        print("=" * 70)

        return self.course_data

# Confirmation shown once this cell has finished defining the crawler class.
print("✓ 完整爬蟲類別載入完成")

## 步驟 4: 執行爬蟲

⚠️ **注意**：此步驟將耗時約 40-60 分鐘，請確保網路穩定

In [None]:
# Create a crawler instance for the configured semester and run it.
# crawl() returns the crawler's course_data dict, which the cells below read.
crawler = NYCUCrawlerWithOutline(YEAR, SEMESTER)
course_data = crawler.crawl()

## 步驟 5: 查看結果統計

In [None]:
import os

total_count = len(course_data)
# Denominator for the percentages below; falls back to 1 so an empty crawl
# result prints 0.0% instead of raising ZeroDivisionError.
pct_base = total_count or 1

# Courses for which an outline was stored
courses_with_outline = sum(1 for c in course_data.values() if 'outline' in c)
print(f"總課程數: {total_count}")
print(f"有綱要的課程: {courses_with_outline} ({courses_with_outline/pct_base*100:.1f}%)")

# Courses whose outline has a non-empty description text
with_outline_text = sum(
    1 for c in course_data.values()
    if 'outline' in c
    and 'description' in c['outline']
    and c['outline']['description'].get('outline', '')
)
print(f"有課程概述: {with_outline_text} ({with_outline_text/pct_base*100:.1f}%)")

# Number of weekly-schedule entries per course
weekly_counts = [
    len(c['outline']['weekly_schedule'])
    for c in course_data.values()
    if 'outline' in c and 'weekly_schedule' in c['outline']
]

if weekly_counts:
    print(f"\n平均週數: {sum(weekly_counts)/len(weekly_counts):.1f}")
    print(f"週數範圍: {min(weekly_counts)} - {max(weekly_counts)} 週")

# Show one example: the first course that actually has an outline, rather than
# the first course overall (which may have none and would print nothing).
print("\n範例課程綱要:")
course = next((c for c in course_data.values() if 'outline' in c), None)
if course is None:
    print("(沒有任何課程取得綱要)")
else:
    print(f"\n課程: {course['name']}")
    print(f"教師: {course['teacher']}")
    outline = course['outline']
    if 'base' in outline:
        print(f"英文名稱: {outline['base'].get('cos_eng_name', 'N/A')}")
    if 'description' in outline:
        desc = outline['description']
        print("\n課程概述(前200字):")
        # The field can be null in the JSON response; `or` also covers None,
        # which cannot be sliced.
        print((desc.get('outline') or 'N/A')[:200])
    if 'weekly_schedule' in outline:
        print(f"\n共 {len(outline['weekly_schedule'])} 週進度")

## 步驟 6: 下載資料

In [None]:
# Imported here as well so this cell does not depend on the optional
# statistics cell having been run first.
import os

# Save the crawl result as a JSON file
filename = f"{YEAR}-{SEMESTER}_data_with_outline.json"
with open(filename, 'w', encoding='utf-8') as f:
    json.dump(course_data, f, ensure_ascii=False, indent=2)

file_size_mb = os.path.getsize(filename) / 1024 / 1024
print(f"✓ 資料已儲存至: {filename}")
print(f"檔案大小: {file_size_mb:.2f} MB")

# Trigger a browser download when running on Google Colab. Elsewhere the
# google.colab package is missing; the file is already saved, so just report
# where it is instead of failing with ImportError.
try:
    from google.colab import files
except ImportError:
    print(f"\n非 Colab 環境，略過自動下載；檔案位於: {os.path.abspath(filename)}")
else:
    files.download(filename)
    print("\n✓ 檔案下載已開始")

## 資料結構說明

完整綱要版本在基本資訊的基礎上，額外包含 `outline` 欄位：

```json
{
  "outline": {
    "base": {
      "cos_name": "課程中文名稱",
      "cos_eng_name": "課程英文名稱",
      "sel_type_name": "必修/選修",
      "dep_name": "開課系所",
      "cos_code": "永久課號"
    },
    "description": {
      "prerequisite": "先修科目",
      "outline": "課程概述與目標",
      "textbook": "教科書",
      "exam_score": "評分方式",
      "teach_method": "教學方法"
    },
    "weekly_schedule": [
      {
        "week_id": "週次",
        "class_date": "上課日期",
        "class_data": "課程進度內容"
      }
    ],
    "unit_hours": [
      {
        "title": "單元主題",
        "content": "內容綱要",
        "hour_teaching": "講授時數"
      }
    ]
  }
}
```

## 注意事項

1. **執行時間**：完整爬取約需 40-60 分鐘
2. **網路穩定性**：請確保 Colab 連線穩定
3. **資料完整性**：部分課程可能未填寫完整綱要；綱要抓取失敗的課程不會有 `outline` 欄位，使用資料前請先檢查該欄位是否存在
4. **成功率**：正常情況下成功率 > 98%