### Instaloader & Instagramy notebook

Use this notebook to utilise python scraping libraries (Instaloader and Instagramy) to scrape user details and user posts

In [29]:
! pip install instagramy instaloader



In [30]:
import numpy as np
import pandas as pd
from instagramy import InstagramPost, InstagramUser
import time
from tqdm import tqdm

from instaloader import Profile, structures
import instaloader

import pickle

In [31]:
influencers_df = pd.DataFrame()
# influencers_df = pd.read_csv('data/influencer_db_17112022.csv')

influencer_posts_df = pd.DataFrame()
# influencer_posts_df = pd.read_csv('data/influencer_post_17112022.csv')


### Scraper


Note: for instaloader you will need to follow this [guide](https://instaloader.github.io/troubleshooting.html). For simplicity, ignore the instaloader part and run instagramy

In [32]:
# L = instaloader.Instaloader()
# L.load_session_from_file('tehbing1234')


Loaded session from C:\Users\chuaz\AppData\Local\Instaloader\session-tehbing1234.


In [3]:
# instagrammy test
user = InstagramUser('huinileee')
user

InstagramUser('huinileee')

In [10]:
# instaloader test
# Profile.from_username(L.context, 'mongabong')

<Profile mongabong (35127351)>

In [33]:
def get_user_details(user):
    return {
        'username': user.username,
        'name': user.user_data['full_name'], 
        'biography': user.user_data['biography'],
        'num_followers': user.user_data['edge_followed_by']['count'],
        'business_category_name': user.user_data['business_category_name'],
        'overall_category_name': user.user_data['overall_category_name'],
        'category_enum':user.user_data['category_enum'],
        'category_name':user.user_data['category_name'],
        'transparency_product':user.user_data['transparency_product'],
        'is_verified': user.user_data['is_verified']
    }

In [34]:
def get_user_posts(user, username):
    user_posts = []
    # num_posts = len(user.posts)
    for post_type in ['edge_felix_video_timeline', 'edge_owner_to_timeline_media']:
        total_num = user.user_data[post_type]['count']
        posts = user.user_data[post_type]['edges']
        # num_posts = min(user.user_data[post_type]['count'],len(user.posts))
        num_posts = len(user.user_data[post_type]['edges'])
        print('total posts ', num_posts)
        for i in range(num_posts):
            # user_posts['post'].append(user.user_data['edge_owner_to_timeline_media']['edges'][i])
            post = {}
            post['username'] = username
            post['post_type'] =  user.user_data[post_type]['edges'][i]['node']['__typename']
            post['display_url'] =  user.user_data[post_type]['edges'][i]['node']['display_url']
            post['is_video'] =  user.user_data[post_type]['edges'][i]['node']['is_video']
            try:
                post['edge_media_to_caption'] = user.user_data[post_type]['edges'][i]['node']['edge_media_to_caption']['edges'][0]['node']['text']
            except:
                print(user.user_data[post_type]['edges'][i]['node']['edge_media_to_caption']['edges'])
                post['edge_media_to_caption'] = ""
            post['edge_media_to_comment'] = user.user_data[post_type]['edges'][i]['node']['edge_media_to_comment']['count']
            post['taken_at_timestamp'] = user.user_data[post_type]['edges'][i]['node']['taken_at_timestamp']
            post['edge_liked_by'] = user.user_data[post_type]['edges'][i]['node']['edge_liked_by']['count']
            tagged_users_list = []
            for tagged in user.user_data[post_type]['edges'][i]['node']['edge_media_to_tagged_user']['edges']:
                tagged_users_list.append(tagged['node']['user']['username'])
            post['edge_media_to_tagged_users'] = tagged_users_list
            post['video_view_count'] = 0
            if post['is_video']:
                post['video_view_count'] = user.user_data[post_type]['edges'][i]['node']['video_view_count']
            # print(i)
            # print(user.user_data[post_type]['edges'][i]['node'])
            try:
                if user.user_data[post_type]['edges'][i]['node']['edge_sidecar_to_children']:
                    if  user.user_data[post_type]['edges'][i]['node']['edge_sidecar_to_children']['edges']:
                        for j in range(len(user.user_data[post_type]['edges'][i]['node']['edge_sidecar_to_children']['edges'])):
                            if user.user_data[post_type]['edges'][i]['node']['edge_sidecar_to_children']['edges'][j]['node']['is_video']:
                                post['video_view_count'] = user.user_data[post_type]['edges'][i]['node']['edge_sidecar_to_children']['edges'][j]['node']['video_view_count']
            except:
                pass

            user_posts.append(post)

        # account for tagged users in children images
    return user_posts


In [35]:
def get_related_profiles(user, num_related):
    edge_related_profiles = user.user_data['edge_related_profiles']
    n = len(user.user_data['edge_related_profiles']['edges'])
    edge_related_usernames = []
    if n>num_related:
        for i in range(num_related):
            edge_related_usernames.append(user.user_data['edge_related_profiles']['edges'][i]['node']['username'])
    else: 
        for i in range(n):
            edge_related_usernames.append(user.user_data['edge_related_profiles']['edges'][i]['node']['username'])
    return edge_related_usernames

In [36]:
# input influencer names to add
usernames = [
    'huinileee'  # for example
]

# find posts for the following usernames
# usernames = set(influencers_df['username']) - set(influencer_posts_df['username'])
usernames


['huinileee']

### Instagrammy

In [37]:
from collections import deque
num_related = 25
MIN_FOLLOWERS = 5000
MAX_FOLLOWERS = 200000
target_number_influencers = 5


queue = deque(usernames)
added_influencers = set()

#### Mining main loop

Mine user details and post information. Find related influencers (based on instagram's recommendation output) and add them to the queue. This is how we mine influencers unknown to us (scalable)

In [38]:
# MAIN LOOP

# for username in usernames:
while len(added_influencers) < target_number_influencers and queue:
    username = queue.popleft()
    user = InstagramUser(username)
    
    # check follower as condition
    if int(user.number_of_followers) < MIN_FOLLOWERS or int(user.number_of_followers) > MAX_FOLLOWERS:
        continue
    print(f"Mining {username} {user.number_of_followers}, total influencers mined: {len(added_influencers)}")
    
    user_details = get_user_details(user)
    user_posts = get_user_posts(user, username)
    
    added_influencers.add(username)  
    
    for post in user_posts:
        influencer_posts_df = influencer_posts_df.append(post, ignore_index=True)
    related_users = get_related_profiles(user, num_related)
    user_details['related_influencers'] = related_users
    
    
    influencers_df = influencers_df.append(user_details, ignore_index=True)
    
    # add related influencers into the queue (this is how we mine influencers we do not know)
    for related_user in related_users: 
        if related_user not in added_influencers and related_user not in queue:
            queue.append(related_user)

    influencers_df.to_csv('influencer_db.csv', index=False)
    influencer_posts_df.to_csv('influencer_post_db.csv', index=False)
    time.sleep(10)


Mining huinileee 100423, total influencers mined: 0
total posts  1
total posts  12
Mining chankimberly 86409, total influencers mined: 1
total posts  2
total posts  12
Mining xianwenpoops 142279, total influencers mined: 2
total posts  12
total posts  12
Mining elizachong 36478, total influencers mined: 3
total posts  0
total posts  12
Mining stopitrach 46329, total influencers mined: 4
total posts  12
total posts  12


In [39]:
influencer_posts_df

Unnamed: 0,username,post_type,display_url,is_video,edge_media_to_caption,edge_media_to_comment,taken_at_timestamp,edge_liked_by,edge_media_to_tagged_users,video_view_count
0,huinileee,GraphVideo,https://instagram.fsin14-1.fna.fbcdn.net/v/t51...,1.0,"It’s day one, or one day. 🗓 \n\nYou’re always ...",30.0,1.640772e+09,3052.0,[huawei.sg],31914.0
1,huinileee,GraphSidecar,https://instagram.fsin14-2.fna.fbcdn.net/v/t51...,0.0,Friends are the flowers in the garden of life!...,3.0,1.669350e+09,593.0,[amazon.sg],0.0
2,huinileee,GraphVideo,https://instagram.fsin14-2.fna.fbcdn.net/v/t51...,1.0,How did it get so late so soon? 📆 We are almos...,2.0,1.669264e+09,1739.0,[amazon.sg],9595.0
3,huinileee,GraphSidecar,https://instagram.fsin14-2.fna.fbcdn.net/v/t51...,0.0,"Eat your meals, train hard, be fuelled. It’s s...",35.0,1.669201e+09,4862.0,[],0.0
4,huinileee,GraphSidecar,https://instagram.fsin14-2.fna.fbcdn.net/v/t51...,0.0,Feeling peachy today! 🍑 \n\nOne of the major r...,13.0,1.668766e+09,1881.0,[sum37.sg],0.0
...,...,...,...,...,...,...,...,...,...,...
82,stopitrach,GraphImage,https://instagram.fsin14-1.fna.fbcdn.net/v/t51...,0.0,On energy saving mode today 😂😴,14.0,1.668156e+09,1441.0,"[toryburch, circleslifesg, sheinsingapore]",0.0
83,stopitrach,GraphSidecar,https://instagram.fsin14-2.fna.fbcdn.net/v/t51...,0.0,Drinks and sunsets 🍹🌅\nCiara Pearl Ring x Heid...,15.0,1.667973e+09,781.0,"[byinviteonlystore, sheinsingapore]",0.0
84,stopitrach,GraphVideo,https://instagram.fsin14-2.fna.fbcdn.net/v/t51...,1.0,"Whenever I anticipate a busy week/month ahead,...",6.0,1.667811e+09,92.0,[yolofoodsg],2895.0
85,stopitrach,GraphVideo,https://instagram.fsin14-2.fna.fbcdn.net/v/t51...,1.0,Brunch outfits at 15% off when you use <rachel...,12.0,1.667556e+09,637.0,[sheinsingapore],17040.0


In [40]:
# de-duplication using unique identifiers

influencer_posts_df = influencer_posts_df.drop_duplicates('display_url')
influencer_posts_df

Unnamed: 0,username,post_type,display_url,is_video,edge_media_to_caption,edge_media_to_comment,taken_at_timestamp,edge_liked_by,edge_media_to_tagged_users,video_view_count
0,huinileee,GraphVideo,https://instagram.fsin14-1.fna.fbcdn.net/v/t51...,1.0,"It’s day one, or one day. 🗓 \n\nYou’re always ...",30.0,1.640772e+09,3052.0,[huawei.sg],31914.0
1,huinileee,GraphSidecar,https://instagram.fsin14-2.fna.fbcdn.net/v/t51...,0.0,Friends are the flowers in the garden of life!...,3.0,1.669350e+09,593.0,[amazon.sg],0.0
2,huinileee,GraphVideo,https://instagram.fsin14-2.fna.fbcdn.net/v/t51...,1.0,How did it get so late so soon? 📆 We are almos...,2.0,1.669264e+09,1739.0,[amazon.sg],9595.0
3,huinileee,GraphSidecar,https://instagram.fsin14-2.fna.fbcdn.net/v/t51...,0.0,"Eat your meals, train hard, be fuelled. It’s s...",35.0,1.669201e+09,4862.0,[],0.0
4,huinileee,GraphSidecar,https://instagram.fsin14-2.fna.fbcdn.net/v/t51...,0.0,Feeling peachy today! 🍑 \n\nOne of the major r...,13.0,1.668766e+09,1881.0,[sum37.sg],0.0
...,...,...,...,...,...,...,...,...,...,...
82,stopitrach,GraphImage,https://instagram.fsin14-1.fna.fbcdn.net/v/t51...,0.0,On energy saving mode today 😂😴,14.0,1.668156e+09,1441.0,"[toryburch, circleslifesg, sheinsingapore]",0.0
83,stopitrach,GraphSidecar,https://instagram.fsin14-2.fna.fbcdn.net/v/t51...,0.0,Drinks and sunsets 🍹🌅\nCiara Pearl Ring x Heid...,15.0,1.667973e+09,781.0,"[byinviteonlystore, sheinsingapore]",0.0
84,stopitrach,GraphVideo,https://instagram.fsin14-2.fna.fbcdn.net/v/t51...,1.0,"Whenever I anticipate a busy week/month ahead,...",6.0,1.667811e+09,92.0,[yolofoodsg],2895.0
85,stopitrach,GraphVideo,https://instagram.fsin14-2.fna.fbcdn.net/v/t51...,1.0,Brunch outfits at 15% off when you use <rachel...,12.0,1.667556e+09,637.0,[sheinsingapore],17040.0


In [41]:
influencers_df = influencers_df.drop_duplicates('username')
influencers_df

Unnamed: 0,username,name,biography,num_followers,business_category_name,overall_category_name,category_enum,category_name,transparency_product,is_verified,related_influencers
0,huinileee,Hayley,📍 Based in Singapore\n🐣 hayleyhnlee@gmail.com\...,100423.0,,,,Fitness Model,STATE_CONTROLLED_MEDIA,0.0,"[chankimberly, xianwenpoops, elizachong, stopi..."
1,chankimberly,Kimberly Chan,📍 Singapore\n🐥 chankimberly@hotmail.sg,86409.0,,,,Artist,STATE_CONTROLLED_MEDIA,0.0,"[xianwenpoops, elizachong, itzeugena, stopitra..."
2,xianwenpoops,Hailey Teo 献文,digital creator + presenter-host in sunny SG🌤️...,142279.0,,,,Blogger,STATE_CONTROLLED_MEDIA,0.0,"[chankimberly, patrinechoo, f0xypony, xianexyy..."
3,elizachong,H U I H U I,🇸🇬| Beach Volleyball \n🐈‍⬛| PUMASG,36478.0,,,,Athlete,STATE_CONTROLLED_MEDIA,0.0,"[xianwenpoops, chankimberly, yitinggoyt, ban.n..."
4,stopitrach,Rachel Wong (previously @rachelwongggg),🪬 On a journey to live intentionally and mindf...,46329.0,,,,Digital creator,STATE_CONTROLLED_MEDIA,0.0,"[chankimberly, xianwenpoops, elizachong, chxny..."


In [42]:
queue  # view all the related_influencers that were added to the queue


deque(['sgagsg',
       'f0xypony',
       'patrinechoo',
       'xianexyy',
       'symoneoei',
       'itzeugena',
       'chxnyixin',
       'kaseyavariellelow',
       'nicaboo.t',
       'janaechua',
       'yingxuantaan',
       'nedface',
       'justadd0ne',
       'bellywellyjelly',
       'tehhan',
       'ban.nalee',
       'claudine.ng',
       'luckymizili',
       'omgxiaoqian',
       'straits_times',
       'yitinggoyt',
       'aglimpseofrach',
       'gengenygen',
       'anna_en',
       'oosolleb_',
       'hereisyingying',
       'chloecloudy',
       'judithlcl',
       'y0ngtingting',
       'bimbiboon',
       'peiween',
       'graceglazee',
       'jady.toh',
       'celestljh',
       'marilynmonru',
       'triciaxlee',
       'awansauce',
       'rachelongll',
       '_shinekoh',
       'fangrong'])