In [4]:
# !pip install scholarly
# !pip install pandas
# !pip install openpyxl


Collecting openpyxl
  Using cached openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Using cached openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
Downloading et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-2.0.0 openpyxl-3.1.5


In [4]:
import os
import requests
import time
import random
import sqlite3
import csv
from scholarly import scholarly
import pandas as pd


In [5]:
def create_database(db_name="scholar_data.db"):
    """
    建立 SQLite 資料庫及表格
    """
    conn = sqlite3.connect(db_name)
    cursor = conn.cursor()
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS papers (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            title TEXT,
            authors TEXT,
            year TEXT,
            abstract TEXT,
            venue TEXT,
            pdf_link TEXT,
            citations_link TEXT,
            num_citations INTEGER,
            publication_url TEXT
        )
    ''')
    conn.commit()
    return conn

def insert_paper(conn, paper):
    """
    插入單筆資料到資料庫
    """
    cursor = conn.cursor()
    cursor.execute('''
        INSERT INTO papers (title, authors, year, abstract, venue, pdf_link, citations_link, num_citations, publication_url)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
    ''', (
        paper['title'],
        paper['authors'],
        paper['year'],
        paper['abstract'],
        paper['venue'],
        paper['pdf_link'],
        paper['citations_link'],
        paper['num_citations'],
        paper['publication_url']
    ))

def save_to_excel(results, file_name="scholar_results.xlsx"):
    #將結果儲存為 Excel 檔案，並將指定欄位設定為超連結
    df = pd.DataFrame(results)

    # 設定 PDF 連結和文章連結為 Hyperlink
    for col in ['pdf_link', 'publication_url']:
        if col in df.columns:
            df[col] = df[col].apply(lambda x: f'=HYPERLINK("{x}", "{x}")' if pd.notna(x) else x)

    # 將資料寫入 Excel
    with pd.ExcelWriter(file_name, engine='openpyxl') as writer:
        df.to_excel(writer, index=False)



def save_to_csv(results, file_name="scholar_results.csv"):
    """
    將結果存為 CSV 檔案
    """
    keys = ['title', 'authors', 'year', 'abstract', 'venue', 'pdf_link', 'citations_link', 'num_citations', 'publication_url']
    with open(file_name, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=keys)
        writer.writeheader()
        writer.writerows(results)
        
# def setup_proxies():
#     """
#     設定代理和重試避免被 Google Scholar 封鎖
#     """
#     proxies = {
#         'http': 'http://your_proxy_address:port',
#         'https': 'http://your_proxy_address:port'
#     }
#     session = requests.Session()
#     retries = Retry(total=5, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
#     session.proxies = proxies
#     session.mount('http://', HTTPAdapter(max_retries=retries))
#     session.mount('https://', HTTPAdapter(max_retries=retries))
#     return session

def search_scholar(keyword, limit=10, conn=None, commit_interval=10, year_low=None, year_high=None, sort_by='relevance'):
    from scholarly import ProxyGenerator

    print(1121)
    # 使用付費代理
    # pg = ProxyGenerator()
    # pg.ScraperAPI('your_scraper_api_key')  # 替換為你的 API 金鑰
    # scholarly.use_proxy(pg)
    
    results = []

    print(1122)
    for _ in range(3):  # 最多重試 3 次
        try:
            search_query = scholarly.search_pubs(
                query=keyword,
                year_low=year_low,
                year_high=year_high,
                sort_by=sort_by
            )

            for i in range(limit):
                print("Catch paper #_"+ str(i))
                # print(search_query)
                time.sleep(random.uniform(5, 10))  # 增加延遲，模擬人類行為
                    
                pub = next(search_query)

                #***************************************************  似乎scholarly.search_pubs都還是沒辦法限制 year_high
                ## 提取資料
                # bib = pub.get('bib', {})
                # pub_year = int(bib.get('pub_year', '0')) if bib.get('pub_year') else None

                # # 手動篩選年份
                # if year_low and pub_year and pub_year < year_low:
                #     continue
                # if year_high and pub_year and pub_year > year_high:
                #     continue
                # #***************************************************

                
                # 提取資料
                bib = pub.get('bib', {})
                result_dict = {
                    'title': bib.get('title', ''),
                    'authors': ", ".join(bib.get('author', [])),
                    'year': bib.get('pub_year', ''),
                    'abstract': bib.get('abstract', ''),
                    'venue': bib.get('venue', ''),
                    'pdf_link': pub.get('eprint_url', None),
                    'citations_link': pub.get('citedby_url', None),
                    'num_citations': pub.get('num_citations', None),
                    'publication_url': pub.get('pub_url', None)
                }

                results.append(result_dict)

                # 插入資料到資料庫
                if conn:
                    insert_paper(conn, result_dict)

                # 每 10 筆提交一次
                if conn and (i + 1) % commit_interval == 0:
                    conn.commit()

            break  # 成功後跳出重試迴圈

        except StopIteration:
            print("搜尋結果已無更多資料。")
            break
        except Exception as e:
            print(f"發生錯誤，重試中... 錯誤訊息: {e}")
            time.sleep(30)  # 等待 30 秒後重試

    # 最後提交資料
    if conn:
        conn.commit()

    return results



def search_scholar_org(keyword, limit=10, conn=None, commit_interval=10, year_low=None, year_high=None, sort_by='relevance'):
    """
    使用 scholar 搜尋指定關鍵字，回傳前 limit 筆結果的資訊，並存入資料庫。
    """
    from scholarly import ProxyGenerator
    
    print(112)
    # Set up a ProxyGenerator object to use free proxies
    # This needs to be done only once per session
    pg = ProxyGenerator()
    pg.FreeProxies()
    scholarly.use_proxy(pg)
    print(113)

    
    search_query = scholarly.search_pubs(
        query=keyword,
        year_low=year_low,
        year_high=year_high,
        sort_by=sort_by
    )
    print(114)

    results = []

    for i in range(limit):
        print(i, limit)
        try:
            # 模擬人類行為：隨機延遲
            time.sleep(random.uniform(3, 10))

            # 取得下一篇文章資訊
            pub = next(search_query)
            
            #print(pub) #方便查看結構

            # 從 `pub` 的結構提取所需資訊
            bib = pub.get('bib', {})
            title = bib.get('title', '')
            authors = ", ".join(bib.get('author', []))  # 將作者列表轉為逗號分隔的字串
            year = bib.get('pub_year', '')
            abstract = bib.get('abstract', '')
            venue = bib.get('venue', '')

            # 提取其他資訊
            url_pdf = pub.get('eprint_url', None)  # PDF 連結
            url_citations = pub.get('citedby_url', None)  # 引用其他文章的連結
            num_citations = pub.get('num_citations', None)  # 引用次數
            pub_url = pub.get('pub_url', None)  # 文章的公開 URL

            result_dict = {
                'title': title,
                'authors': authors,
                'year': year,
                'abstract': abstract,
                'venue': venue,
                'pdf_link': url_pdf,
                'citations_link': url_citations,
                'num_citations': num_citations,
                'publication_url': pub_url
            }
            results.append(result_dict)

            # 插入資料到資料庫
            if conn:
                insert_paper(conn, result_dict)

            # 每 10 筆資料進行一次 commit
            if conn and (i + 1) % commit_interval == 0:
                conn.commit()

        except StopIteration:
            print("搜尋結果已無更多資料。")
            break  # 搜尋結果已無更多資料
        except Exception as e:
            print(f"發生錯誤: {e}")

    # 確保最後的資料被提交
    if conn:
        conn.commit()

    return results

def download_pdf(pdf_url, save_folder='pdf_files_Device', file_prefix='paper'):
    """
    根據 pdf_url 下載 PDF，如果有 pdf_url 而且確實能下載。
    """
    if pdf_url is None:
        return None

    # 建立儲存 PDF 的資料夾
    if not os.path.exists(save_folder):
        os.makedirs(save_folder, exist_ok=True)

    try:
        response = requests.get(pdf_url, timeout=10)
        if response.status_code == 200 and 'application/pdf' in response.headers.get('Content-Type', ''):
            file_name = f"{file_prefix}_{os.path.basename(pdf_url).split('?')[0]}"
            if not file_name.endswith(".pdf"):
                file_name += ".pdf"
            file_path = os.path.join(save_folder, file_name)
            with open(file_path, 'wb') as f:
                f.write(response.content)
            return file_path
        else:
            return None
    except Exception as e:
        print(f"下載失敗: {e}")
        return None



In [6]:
def main():
    #搜尋條件輸入
    keyword = "machine learning, semiconductor device, deep learning"  # 這裡可以換成你要的關鍵字
    limit = 1500 # 搜尋前 50 筆結果
    year_low = 2015  # 篩選年份下限
    year_high = 2023  # 篩選年份上限
    sort_by = 'date'  # 試著以日期排序

    # 建立資料庫
    conn = create_database()
    print(111)
    # 第一步：搜尋並儲存文獻資訊到資料庫
    results = search_scholar(
        keyword=keyword,
        limit=limit,
        conn=conn,
        commit_interval=10, #每十筆儲存一次，避免中斷爬的都沒存到
        year_low=year_low,
        year_high=year_high,
        sort_by=sort_by
    )
    print(222)

    # 顯示結果
    print(f"共有 {len(results)} 筆文獻資料。")

    # 第二步：儲存結果為 CSV 檔案
    # save_to_csv(results, file_name="scholar_results.csv")
    save_to_excel(results, file_name="scholar_results.xlsx")

    # 第三步：輸出結果至csv or excel並且嚐試下載pdf檔案
    for idx, paper in enumerate(results):
        print(f"\n=== 第 {idx + 1} 筆結果 ===")
        print("標題:", paper['title'])
        print("作者:", paper['authors'])
        print("年份:", paper['year'])
        print("期刊/會議:", paper['venue'])
        print("摘要:", paper['abstract'])
        print("引用次數:", paper['num_citations'])
        print("文章連結:", paper['publication_url'])
        print("PDF 連結:", paper['pdf_link'])
        print("引用文獻連結:", paper['citations_link'])

        # 若存在可用的 PDF 連結，則嘗試下載
        if paper['pdf_link']:
            # pdf_path = download_pdf(paper['pdf_link'], file_prefix=f"paper_{idx+1}")
            pdf_path = download_pdf(paper['pdf_link'], file_prefix=f"{paper['year']}_{paper['title'].replace(' ', '_')}.pdf")

            if pdf_path:
                print(f"PDF 已下載至: {pdf_path}")
            else:
                print("PDF 連結存在，但無法成功下載。")
        else:
            print("查無 PDF 連結或無法下載。")

    # 關閉資料庫連線
    conn.close()

if __name__ == "__main__":
    main()


111
1121
1122
Catch paper #_0
Catch paper #_1
Catch paper #_2
Catch paper #_3
Catch paper #_4
Catch paper #_5
Catch paper #_6
Catch paper #_7
Catch paper #_8
Catch paper #_9
Catch paper #_10
Catch paper #_11
Catch paper #_12
Catch paper #_13
Catch paper #_14
Catch paper #_15
Catch paper #_16
Catch paper #_17
Catch paper #_18
Catch paper #_19
Catch paper #_20
Catch paper #_21
Catch paper #_22
Catch paper #_23
Catch paper #_24
Catch paper #_25
Catch paper #_26
Catch paper #_27
Catch paper #_28
Catch paper #_29
Catch paper #_30
Catch paper #_31
Catch paper #_32
Catch paper #_33
Catch paper #_34
Catch paper #_35
Catch paper #_36
Catch paper #_37
Catch paper #_38
Catch paper #_39
Catch paper #_40
Catch paper #_41
Catch paper #_42
Catch paper #_43
Catch paper #_44
Catch paper #_45
Catch paper #_46
Catch paper #_47
Catch paper #_48
Catch paper #_49
Catch paper #_50
Catch paper #_51
Catch paper #_52
Catch paper #_53
Catch paper #_54
Catch paper #_55
Catch paper #_56
Catch paper #_57
Catch pape