# 載入所需套件

In [1]:
import requests
from bs4 import BeautifulSoup
import json
from urllib.request import urlretrieve
import re
from tqdm import tqdm
import numpy as np
import pandas as pd
import time
from datetime import datetime
import os

# 定義爬取IG帳號基本資訊函數

In [2]:
def ig_summary_crawler(id_name):
    header={'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36'}
    url='https://www.instagram.com/{}/'.format(id_name)
    response=requests.get(url,headers=header)
    soup=BeautifulSoup(response.text,'lxml')
    json_part=soup.find_all('script',type='text/javascript')[3].string
    json_part=json_part[json_part.find('=')+2:-1]
    data=json.loads(json_part)
    user_data=data['entry_data']['ProfilePage'][0]['graphql']['user']

    user_id=user_data['id']
    full_name=user_data['full_name']
    biography=user_data['biography']
    followed_num=user_data['edge_followed_by']['count']
    follow_num=user_data['edge_follow']['count']
    entry_num=user_data['edge_owner_to_timeline_media']['count']

    summary_df=pd.DataFrame({
        '編號':user_id,
        '名稱':full_name,
        '簡介':biography,
        '被追蹤數':followed_num,
        '追蹤數':follow_num,
        '貼文數':entry_num
    },index=[0])
    
    directory='IG圖片爬蟲/{}'.format(id_name)
    if not os.path.isdir(directory):
        os.makedirs(directory)
    
    summary_df.to_csv('{}/{}個人簡介.csv'.format(directory,id_name),encoding='utf_8_sig',index=False)
    
    return summary_df,entry_num,user_id

# 定義爬取IG帳號貼文、圖片函數

In [3]:
def user_data_crawler(url):
    header={'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36'}
    response=requests.get(url,headers=header)
    
    try:
        soup=BeautifulSoup(response.text,'lxml')
        json_part=soup.find_all('script',type='text/javascript')[3].string
        json_part=json_part[json_part.find('=')+2:-1]
        data=json.loads(json_part)
        user_data=data['entry_data']['ProfilePage'][0]['graphql']['user']
    except:
        #處理第2頁以後user_data的json格式不同
        user_data=json.loads(response.text)['data']['user']
    
    return user_data

In [4]:
def ig_img_page_crawler(user_data):
    one_page_info=user_data['edge_owner_to_timeline_media']['edges']

    texts=[]
    tags=[]
    ats=[]
    img_links=[]
    post_links=[]
    tag_nums=[]
    at_nums=[]
    img_nums=[]
    message_nums=[]
    heart_nums=[]
    ids=[]
    post_times=[]
    locations=[]

    #每個one_page_info首頁最多有12篇，第2頁以後每頁最多有50篇
    for i in range(len(one_page_info)):
        img_link=[]
        try:
            text=one_page_info[i]['node']['edge_media_to_caption']['edges'][0]['node']['text']
            texts.append(text)
            #抓取ig貼文#
            if '#' in text:
                tag_list=re.findall(r'#([^ |\n]+)',text)
                tag_nums.append(len(tag_list))
                tags.append('\n'.join(tag_list))
            else:
                tag_nums.append(0)
                tags.append('')
            #抓取ig貼文@
            if '@' in text:
                at_list=re.findall(r'@([^ |\n]+)',text)
                at_nums.append(len(at_list))
                ats.append('\n'.join(at_list))
            else:
                at_nums.append(0)
                ats.append('')
        except:
            #處理ig貼文沒文字敘述情況
            texts.append('')
            tag_nums.append(0)
            tags.append('')
            at_nums.append(0)
            ats.append('')
        try:
            #處理ig貼文可能不只1張圖片情況
            img_info=one_page_info[i]['node']['edge_sidecar_to_children']['edges']
            img_num=len(img_info)
            img_nums.append(img_num)
            
            for j in range(img_num):
                img_link.append(img_info[j]['node']['display_url'])
            img_links.append(img_link)
        except:
            img_nums.append(1)
            img_links.append(one_page_info[i]['node']['display_url'])
        try:
            locations.append(one_page_info[i]['node']['location']['name'])
        except:
            #處理ig貼文沒打卡地點情況
            locations.append('')
        
        message_nums.append(one_page_info[i]['node']['edge_media_to_comment']['count'])
        heart_nums.append(one_page_info[i]['node']['edge_media_preview_like']['count'])
        ids.append(one_page_info[i]['node']['id'])
        int_time=one_page_info[i]['node']['taken_at_timestamp']
        post_times.append(datetime.fromtimestamp(int_time).strftime('%Y-%m-%d %H:%M:%S'))
        post_code=one_page_info[i]['node']['shortcode']
        post_links.append('https://www.instagram.com/p/{}/'.format(post_code))
        
    img_df=pd.DataFrame({
        '編號':ids,
        '時間':post_times,
        '貼文':texts,
        '標籤':tags,
        '標註':ats,
        '打卡地點':locations,
        '留言數':message_nums,
        '愛心數':heart_nums,
        '標籤數':tag_nums,
        '標註數':at_nums,
        '圖片數':img_nums,
        '貼文網址':post_links,
        '圖片網址':img_links 
    })        

    return img_df

In [5]:
def ig_img_total_crawler(entry_num,user_id,id_name):
    img_dfs=[]
    url='https://www.instagram.com/{}/'.format(id_name)
    user_data=user_data_crawler(url)
    img_df=ig_img_page_crawler(user_data)
    img_dfs.append(img_df)
    next_code=user_data['edge_owner_to_timeline_media']['page_info']['end_cursor']
    
    #首頁最多有12篇，第2頁以後每頁最多50篇
    if (entry_num-12)%50==0:
        iter_times=int((entry_num-12)/50)
    else:
        iter_times=int((entry_num-12)/50)+1
    
    for i in tqdm(range(iter_times)):
        url='https://www.instagram.com/graphql/query/?query_hash=f2405b236d85e8296cf30347c9f08c2a&variables=%7B%22id%22%3A%22{}%22%2C%22first%22%3A50%2C%22after%22%3A%22{}%3D%3D"%7D'.format(user_id,next_code.replace('=',''))
        time.sleep(0.5)
        user_data=user_data_crawler(url)
        img_df=ig_img_page_crawler(user_data)
        img_dfs.append(img_df)
        next_code=user_data['edge_owner_to_timeline_media']['page_info']['end_cursor']
    
    total_img_df=pd.concat(img_dfs,ignore_index=True)
    
    directory='IG圖片爬蟲/{}'.format(id_name)
    if not os.path.isdir(directory):
        os.makedirs(directory)
    
    total_img_df.to_csv('{}/{}貼文資訊.csv'.format(directory,id_name),encoding='utf_8_sig',index=False)
    
    return total_img_df

# 定義下載儲存IG帳號圖片、貼文檔案函數

In [6]:
def download_img_text(id_name,total_img_df):
    for i in tqdm(range(len(total_img_df))):
        directory='IG圖片爬蟲/{}/{}'.format(id_name,i)
        if not os.path.isdir(directory):
            os.makedirs(directory)
        
        text=total_img_df.loc[i,'貼文']
        tag=total_img_df.loc[i,'標籤']
        at=total_img_df.loc[i,'標註']
        img_links=total_img_df.loc[i,'圖片網址']
        
        with open('{}/貼文.txt'.format(directory),'w',encoding='utf-8') as f:
            f.write(text) 
        
        with open('{}/標籤.txt'.format(directory),'w',encoding='utf-8') as f:
            f.write(tag) 
            
        with open('{}/標註.txt'.format(directory),'w',encoding='utf-8') as f:
            f.write(at) 
        
        try:
            for j in range(len(img_links)):
                urlretrieve(img_links[j],'{}/{}.png'.format(directory,j))
        except:
            urlretrieve(img_links,'{}/{}.png'.format(directory,0))

# 特定IG公開帳號爬蟲

In [7]:
id_name=input('請輸入IG帳號 : ')
try:
    summary_df,entry_num,user_id=ig_summary_crawler(id_name)
    print('輸入的IG帳號為：{}，共有{}篇貼文'.format(id_name,entry_num))
except:
    print('此帳號輸入錯誤(或設定不公開)')
try:
    total_img_df=ig_img_total_crawler(entry_num,user_id,id_name)
    print('圖片貼文表格下載成功')
except:
    print('圖片貼文表格下載失敗')
try:
    download_img_text(id_name,total_img_df)
    print('圖片貼文檔案下載成功')
except:
    print('圖片貼文檔案下載失敗')

請輸入IG帳號 : ekey
輸入的IG帳號為：ekey，共有781篇貼文


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:40<00:00,  1.83s/it]


圖片貼文表格下載成功


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 781/781 [27:57<00:00,  2.23it/s]


圖片貼文檔案下載成功


In [8]:
total_img_df

Unnamed: 0,編號,時間,文章,標籤,標註,打卡地點,留言數,愛心數,標籤數,標註數,圖片數,貼文網址,圖片網址
0,2392287252653778449,2020-09-06 22:34:45,怎麼可以這麼好看💓\n我說衣服🤪\n #shy shy的 辣辣的\n\nThank you ...,shy\n安安邊緣子,aiplitsao\nhaley_duu\nyany0801\nsee_u_hong,,23,2024,2,4,3,https://www.instagram.com/p/CEzHJTXH6YR/,[https://instagram.ftpe8-3.fna.fbcdn.net/v/t51...
1,2390745088761265265,2020-09-04 19:30:45,怎麼樣 給載嗎？🤍🖤🤍\n·\n·\n·\n沒駕照你還敢坐！？🤪\n\n@2d.cafetw...,2dcafe\n2dcafe新莊\n2dcafe新北旗艦店\n聽說台中高雄店也開幕\n高雄店...,2d.cafetw,,28,2480,6,1,1,https://www.instagram.com/p/CEtof3UH_xx/,https://instagram.ftpe8-4.fna.fbcdn.net/v/t51....
2,2388721584679167177,2020-09-02 00:30:24,九月girl💓想要得到神力🦸‍♀️\n\n👚： @caco_tw @beast_kingdo...,WBinvited\nwonderwomen\n神力女超人1984\n百事可樂,caco_tw\nbeast_kingdom,,17,1853,4,2,4,https://www.instagram.com/p/CEmcaASHCTJ/,[https://instagram.ftpe8-1.fna.fbcdn.net/v/t51...
3,2387887545114340195,2020-08-31 20:53:19,泡泡仙子上線中🧚🏼‍♀️\n哈哈哈泡泡仙子聽起來沒什麼神力好廢\n臉上的泡泡最近試用到的好物...,看起來很累很正常\n黑眼圈已經病入膏肓\nbelta#palclair\npalclair海...,palclair.tw\nbeltatw,,18,1654,4,2,4,https://www.instagram.com/p/CEjexIaHqtj/,[https://instagram.ftpe8-3.fna.fbcdn.net/v/t51...
4,2387260346233282811,2020-08-31 00:07:11,謝謝華納兄弟🥰\n這咖神力女超人電影禮盒超棒的！\n希望穿上衣服 吃完零食 用枕頭睡覺後也可...,caco\n野獸國\n百事可樂\nJellyBelly\nTWWONDERWOMAN\n來用...,paclicensing,,13,1285,12,1,3,https://www.instagram.com/p/CEhQKL7nGD7/,[https://instagram.ftpe8-3.fna.fbcdn.net/v/t51...
5,2386344290115505392,2020-08-29 17:47:09,今晚⋯我想來點⋯⋯\nbaan的⋯「泰式雪糕」🤤\n哈哈哈 看我的口氣\n知道我每次都被誰洗...,baan\n泰式奶茶\n摩摩喳喳\nPChome24h購物\n家好選物\nhomegalle...,PChome24h購物,,40,1270,9,1,4,https://www.instagram.com/p/CEd_30MHczw/,[https://instagram.ftpe8-2.fna.fbcdn.net/v/t51...
6,2385764711466519415,2020-08-28 22:35:38,累積了好多照片\n不知要從何發起哈哈哈\n算了不想了⋯\nFriday Night 隨心所欲...,玩具總動員\ncafein\ntoystory\n玩具總動員快閃店,,,14,1329,4,0,2,https://www.instagram.com/p/CEb8F1gn8t3/,[https://instagram.ftpe8-2.fna.fbcdn.net/v/t51...
7,2384762614989988141,2020-08-27 13:24:38,齁齁齁！臭臉氣勢?照來了🤪\n一直都想要擁有 #GA 的底妝產品 \n這次有機會試用到超強新...,GA\n裸粉持妝乳\nGA\n婊姐\n水慕斯粉底\n9\nGA\n亞曼尼\n裸粉持妝乳\n雲...,,,29,1062,13,0,7,https://www.instagram.com/p/CEYYPacHCEt/,[https://instagram.ftpe8-2.fna.fbcdn.net/v/t51...
8,2383393334478427720,2020-08-25 16:04:07,情人節來讓大家看我跟療癒動物們約會的照片🤣💕\n第一張有沒有很像置身日本！\n水豚君在泡溫泉...,農遊券\n埔心牧場\n水豚\n草泥馬\n梅花鹿\n桃園景點\n牧場,,埔心牧場,20,1427,7,0,9,https://www.instagram.com/p/CETg5wlntZI/,[https://instagram.ftpe8-1.fna.fbcdn.net/v/t51...
9,2381834023935454277,2020-08-23 12:26:03,用三倍券到 #好市多 買了 #airpodspro \n真的划算 哈哈哈\n如果剛好有想入手...,好市多\nairpodspro\n犀牛盾Airpods保護套 \n我就是名牌 #rhinos...,rhinoshieldtw,,19,1613,6,1,7,https://www.instagram.com/p/CEN-WzenKxF/,[https://instagram.ftpe8-4.fna.fbcdn.net/v/t51...
