In [6]:
import requests
import json
import re
import numpy as np
import pandas as pd
import datetime
from bs4 import BeautifulSoup as soup

In [7]:
def getHtml(url, timeout=30):
    """Download ``url`` with an HTTP GET and return the body decoded as UTF-8.

    Parameters
    ----------
    url : str
        Address to fetch.
    timeout : float or tuple, optional
        Seconds to wait for the server, passed straight to ``requests.get``.
        Defaults to 30.

    Returns
    -------
    str
        The response body as text.

    Raises
    ------
    requests.exceptions.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.exceptions.RequestException
        On connection failures or when ``timeout`` expires.
    """
    # Browser-like User-Agent header sent with every request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:103.0) Gecko/20100101 Firefox/103.0"
    }
    # requests never times out by default, so one stalled connection would
    # otherwise hang the whole crawl.
    response = requests.get(url=url, headers=headers, timeout=timeout)
    # Fail here on an error status rather than handing an error page to the
    # HTML/JSON parsing that follows in the callers.
    response.raise_for_status()
    # Decode as UTF-8 regardless of what requests guessed from the headers.
    response.encoding = "utf-8"
    return response.text

In [8]:
def getAllSubHtml(year):
    """Return the URLs of the QS subject-ranking pages listed for ``year``.

    Fetches the subject-rankings index page for ``year`` and keeps every
    ``<a>`` whose ``href`` starts with
    ``/university-rankings/university-subject-rankings/{year}``.

    Parameters
    ----------
    year : int or str
        Ranking year, interpolated into both the index URL and the href
        pattern.

    Returns
    -------
    list of str
        One absolute URL per matching link, in the order ``find_all`` yields
        them.
    """
    index_url = f'https://www.topuniversities.com/subject-rankings/{year}'
    index_page = soup(getHtml(index_url), 'html.parser')
    subject_href = re.compile(f'^/university-rankings/university-subject-rankings/{year}')

    subject_urls = []
    for anchor in index_page.find_all('a', href=subject_href):
        # Matching hrefs already begin with '/', so the joined URL has '//'
        # after the host. The prefix is kept exactly as is because the full
        # string is what gets returned to callers.
        subject_urls.append('https://www.topuniversities.com/' + anchor.get('href'))
    return subject_urls
 
    

In [9]:
def getSubDataHtml(url):
    """Fetch one subject-ranking page and return its ranking table.

    The page embeds a ``<script type="application/json">`` settings block
    whose ``qs_rankings_datatables.rank_indicators_url`` entry points at the
    JSON data file. That file's ``data`` list is flattened into a DataFrame,
    reduced to a fixed set of columns, cleaned of HTML wrappers and renamed.

    Parameters
    ----------
    url : str
        Address of a subject-ranking page; its last path segment is used as
        the subject name.

    Returns
    -------
    pandas.DataFrame
        One row per entry in the data file. The first column is ``Subject``,
        followed by the renamed ``meta`` columns. Columns missing from the
        data file are present but empty (NaN).

    Raises
    ------
    ValueError
        If the page contains no ``<script type="application/json">`` block.
    KeyError
        If the settings block or the data file lacks the expected keys.
    """
    rankHtml = soup(getHtml(url), 'html.parser')
    settings_tag = rankHtml.find('script', type='application/json')
    if settings_tag is None:
        # Without this guard the failure is an opaque AttributeError on None.
        raise ValueError(f'no <script type="application/json"> block found in {url}')
    settings = json.loads(settings_tag.text)
    resultLink = settings['qs_rankings_datatables']['rank_indicators_url']
    result = json.loads(getHtml(resultLink))

    # Columns to keep. The list is hard-coded: per the original author's note,
    # result['columns'] could in principle drive this, but some fields (e.g.
    # nid) are not described there.
    meta = ['nid',
            'uni',
            'region',
            'location',
            'city',
            'overall_rank', 'overall',
            'rank_76', 'ind_76',
            'rank_77', 'ind_77',
            'rank_69', 'ind_69',
            'rank_70', 'ind_70',
            'stars']

    def cleanDiv(x):
        """Return the text of the first <div> in ``x``, or ``x`` itself."""
        try:
            wrapper = soup(x, 'html.parser').find('div')
            return wrapper.text if wrapper else x
        except TypeError:
            # Per the original author's note, non-string cells (such as the
            # NaN filling a column absent from the data) make the parser
            # raise TypeError; such cells are passed through untouched.
            return x

    # Source column name -> output column name. The spellings are kept exactly
    # as they were so the output column names stay unchanged.
    renamer = {'uni': 'university',
               'overall': 'overall_score',
               'rank_76': 'acadamic_reputaion_rank',
               'ind_76': 'acadamic_reputaion_score',
               'rank_77': 'employer_reputaion_rank',
               'ind_77': 'employer_reputaion_score',
               'rank_69': 'Hindex_rank',
               'ind_69': 'Hindex_score',
               'rank_70': 'citation_rank',
               'ind_70': 'citaion_score',
               'stars': 'QSstars'
               }

    df = pd.json_normalize(result['data'])
    # Keep only the wanted columns, in order; absent ones are created as NaN.
    refined = df.reindex(columns=meta)

    # DataFrame.applymap is deprecated since pandas 2.1 in favour of
    # DataFrame.map; use whichever this pandas version provides.
    elementwise = refined.map if hasattr(pd.DataFrame, 'map') else refined.applymap
    res = elementwise(cleanDiv).rename(columns=renamer)

    # The subject name is the last path segment of the page URL. Taking it by
    # segment avoids relying on a fixed character offset into the URL.
    subject = url.rstrip('/').rsplit('/', 1)[-1]
    res.insert(0, 'Subject', subject)

    return res

    

In [13]:
def main(year=2022):
    """Crawl every subject ranking for ``year`` into ``QS{year}subject_all.csv``.

    Each subject's table is written as soon as it has been fetched, so a
    failure part-way through still leaves the subjects already crawled on
    disk. Progress is printed as ``<subject>:<timestamp>`` lines.

    Parameters
    ----------
    year : int, optional
        Ranking year to crawl (the original author suggests 2017-2022).
        Defaults to 2022.
    """
    out_path = f'QS{year}subject_all.csv'
    # List of subject-page URLs for the requested year.
    allSub = getAllSubHtml(year)
    for position, sub in enumerate(allSub):
        # The subject name is the last path segment of the page URL.
        subject = sub.rstrip('/').rsplit('/', 1)[-1]
        print(subject + ':' + str(datetime.datetime.now()))
        curr = getSubDataHtml(sub)
        # The first subject starts a fresh file and writes the header row;
        # the rest append data rows only. Appending the header every time
        # would scatter header rows through the CSV, and appending onto a
        # previous run's file would duplicate every row.
        is_first = position == 0
        curr.to_csv(out_path,
                    mode='w' if is_first else 'a',
                    header=is_first,
                    index=False,
                    encoding='utf_8_sig')
    print('finish' + ':' + str(datetime.datetime.now()))
         
# Start the crawl when this code is executed as the main program.
if __name__ == '__main__':
    main()

        
        
    

arts-humanities:2022-08-10 17:48:45.552705
linguistics:2022-08-10 17:48:48.438023
theology-divinity-religious-studies:2022-08-10 17:48:51.396135
archaeology:2022-08-10 17:48:53.406019
architecture-built-environment:2022-08-10 17:48:55.383646
art-design:2022-08-10 17:48:57.640178
classics-ancient-history:2022-08-10 17:48:59.858098
english-language-literature:2022-08-10 17:49:02.059902
history:2022-08-10 17:49:04.458331
modern-languages:2022-08-10 17:49:06.444922
performing-arts:2022-08-10 17:49:08.799211
philosophy:2022-08-10 17:49:10.724934
engineering-technology:2022-08-10 17:49:13.055284
chemical-engineering:2022-08-10 17:49:17.125262
civil-structural-engineering:2022-08-10 17:49:19.947285
computer-science-information-systems:2022-08-10 17:49:21.872254
electrical-electronic-engineering:2022-08-10 17:49:25.022214
engineering-petroleum:2022-08-10 17:49:27.659255
mechanical-aeronautical-manufacturing-engineering:2022-08-10 17:49:29.905872
mineral-mining-engineering:2022-08-10 17:49:33.7

In [None]:
# 筆記區
# 各分領域排名的網頁範例如下
# <a href="/university-rankings/university-subject-rankings/2022/classics-ancient-history">