# loading

In [1]:
import pandas as pd
import numpy as np
from numpy import random
from tqdm import tqdm
from time import sleep
from datetime import datetime
import re
import json
import os.path
import glob

from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options

import undetected_chromedriver.v2 as uc

# Load the apps to scrape; the "appstore_link" column holds each app's
# App Store URL, from which the numeric app ID is extracted below.
apps = pd.read_csv("apps_of_interest.csv")
links = apps["appstore_link"]

def extract_twitter_id(link):
    """Return the ID embedded in an app link, as a string.

    The function is applied to the "appstore_link" column, so despite its
    name it extracts an App Store ID. It looks at the seventh "/"-separated
    piece of the link, takes what follows the "id" marker and cuts off any
    "?query" suffix.

    Raises IndexError when the link has fewer than seven pieces or the
    seventh piece does not contain "id".
    """
    segment = link.split("/")[6]
    after_marker = segment.split("id")[1]
    digits, _, _ = after_marker.partition("?")
    return digits

# Extract the app ID (as a string) from every link and keep it as a column.
# NOTE(review): `id` shadows the Python builtin at module level — consider renaming.
id = links.apply(extract_twitter_id)

apps["id"] = id

# Build the work list: IDs found in apps_of_interest.csv that do not appear in
# the "apple_id" column of sample_data.csv (presumably the output of earlier
# scraping runs — confirm).
saved_csv = pd.read_csv("sample_data.csv")
saved_ids = saved_csv["apple_id"]
unique_all_ids = apps["id"].unique().astype(int)
unique_saved_ids = saved_ids.unique().astype(int)
# Both arrays come from .unique(), which is what makes assume_unique=True valid.
ids = np.setdiff1d(unique_all_ids, unique_saved_ids, assume_unique = True).tolist()

def save_cookie(driver, path):
    """Write the cookies reported by `driver.get_cookies()` to `path` as JSON.

    An existing file at `path` is overwritten.
    """
    with open(path, 'w') as sink:
        sink.write(json.dumps(driver.get_cookies()))

def load_cookie(driver, path):
    """Read the JSON cookie list stored at `path` and add each cookie to `driver`.

    A cookie whose 'sameSite' value is the string 'None' is changed to
    'Strict' before it is passed to `driver.add_cookie`.
    """
    with open(path, 'r') as source:
        stored = json.load(source)

    for entry in stored:
        needs_rewrite = 'sameSite' in entry and entry['sameSite'] == 'None'
        if needs_rewrite:
            entry['sameSite'] = 'Strict'
        driver.add_cookie(entry)

def get_date(dt):
    """Return the parenthesised part of a version heading, or "" if there is none.

    The text after the first "(" (up to the next "(", if any) is taken and its
    final character — normally the closing ")" — is dropped.
    """
    pieces = dt.split("(")
    if len(pieces) < 2:
        return ""
    return pieces[1][:-1]

# scrape

In [None]:
############ CONFIGURATION ###############

# `start` and `end` are indices into the list of app IDs that still need to be
# scraped; the scraper walks that index range in batches of 3 apps.
# If the code crashes, you can restart from the batch it was working on: the last
# "starting batch N" line printed below this code cell gives N, so replace the
# "start" value with that N and run the cell again.

# To see an example, run this code cell with start, end = 6000, 6003. The output is
# written as saved_6000to6003.csv using a relative path (presumably this ends up in
# the folder "to-run" — confirm); please check there are three app IDs in the csv
# output. A new Chrome window is opened for each batch of apps.

start, end = 6000, 8000

# App Annie login details, typed into the login form by the scraper

APPANNIE_EMAIL = ""
APPANNIE_PASSWORD = ""

# Lower and upper bound, in seconds, of the random wait between two apps of the
# same batch. If the webpage shows Error 500, increase both values until App Annie
# no longer blocks you; reduce them again once you are off the hook, otherwise the
# scraping takes longer than necessary.

minwait = 60     # e.g. 70
maxwait = 90     # e.g. 100

############ SCRAPER ###############
#
# Walks the not-yet-scraped app IDs in batches. For each batch a fresh Chrome
# instance is started, the "What's New" version history of every app in the
# batch is read from App Annie, and the result is written to
# saved_<batch>to<batch + BATCH_SIZE>.csv with the columns
# apple_id, version, date and description.

# numpy's randint excludes the upper bound, so this always evaluates to 3
BATCH_SIZE = random.randint(3,4)

# work list: every app ID that is not already present in the saved data
unique_saved_ids = saved_ids.unique().astype(int)
unique_all_ids = apps["id"].unique().astype(int)
ids_full = np.setdiff1d(unique_all_ids, unique_saved_ids, assume_unique = True).tolist()

numAll = len(unique_all_ids)
# counted from ids_full: `ids` is overwritten with the current batch inside the loop
# below, so after a restart it no longer describes the whole work list
numToScrape = len(ids_full)
print("number of apps to scrape in total:", numAll)
print("number of apps left to scrape:", numToScrape)

# settings that do not change between batches
base_url = "https://www.appannie.com/apps/ios/app/"
add_url = "/details?date=!(%272021-10-10%27,%272022-01-01%27)&granularity=weekly&country_code=WW"
cookies_path = "cookies.json"

for batch in range(start, end, BATCH_SIZE):

    # IDs of the apps handled in this batch
    ids = ids_full[batch:batch + BATCH_SIZE]
    if not ids:
        # past the end of the work list: every later batch would be empty too, so
        # stop instead of launching a browser and writing an empty csv
        print("no apps left to scrape from index", batch)
        break

    print("starting batch", str(batch))

    # prepare for scraping
    options = uc.ChromeOptions()
    driver = uc.Chrome(options = options)
    all_ids, all_versions, all_dates, all_descriptions, missing_apps = [],[],[],[],[]

    # a stale cookie file would make the first app of the batch skip the login
    if os.path.exists(cookies_path):
        os.remove(cookies_path)

    removed, not_removed = [], []

    # loop through each app via its ID
    for idx, i in enumerate(tqdm(ids)):

        # access App Annie page
        url = "".join([base_url, str(i), add_url])
        driver.get(url)

        # if cookies containing login info don't exist, log in and save cookie
        if not os.path.exists(cookies_path):
            sleep(random.uniform(0,1))
            username = driver.find_element(By.XPATH, "/html/body/div[2]/div/div/div[1]/div/div[3]/form/div/div[1]/input")
            password = driver.find_element(By.XPATH, "/html/body/div[2]/div/div/div[1]/div/div[3]/form/div/div[2]/input")
            username.send_keys(APPANNIE_EMAIL)
            sleep(random.uniform(0,1))
            password.send_keys(APPANNIE_PASSWORD)
            sleep(random.uniform(3,5))
            driver.find_element(By.XPATH, "/html/body/div[2]/div/div/div[1]/div/div[3]/form/div/button").click() # log in button
            save_cookie(driver, cookies_path)
        else:
            sleep(random.uniform(50,70))
            load_cookie(driver, cookies_path)

        # get content in What's New, first check if it's still present in the app store
        try:
            xp = "/html/body/div[2]/div/div[2]/div[1]/div[3]/div[2]/div[2]/div[3]/div[2]/div[1]/div[3]/dl"
            table = WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.XPATH, xp)))
            not_removed.append(i)
        except TimeoutException:
            # if it's not present in the app store, use a different xpath
            try:
                xp = "/html/body/div[2]/div/div[2]/div[1]/div[3]/div[2]/div[2]/div[3]/div[3]/div[1]/div[3]/dl"
                table = WebDriverWait(driver, 2).until(EC.presence_of_element_located((By.XPATH, xp)))
                removed.append(i)
            except TimeoutException:
                # neither layout matched: record the app as missing so it shows up in
                # the batch report, then start over with a fresh browser and a fresh
                # login for the next app
                missing_apps.append(i)
                sleep(random.uniform(50,70))
                # quit() (not close()) also ends the chromedriver process
                driver.quit()
                options = uc.ChromeOptions()
                driver = uc.Chrome(options = options)
                if os.path.exists(cookies_path):
                    os.remove(cookies_path)
                continue

        # save content: each <dt> is a version heading, each <dd> its description
        vers = table.find_elements(By.CSS_SELECTOR, "dt")
        vers_text = [x.text for x in vers]
        # the version is the second space-separated word of the heading; a heading
        # without a space yields "" (mirroring get_date) instead of aborting the batch
        versions_only = [x.split(" ")[1] if " " in x else "" for x in vers_text]
        dates_only = [get_date(x) for x in vers_text]

        desc = table.find_elements(By.CSS_SELECTOR, "dd")
        desc_text = [x.text for x in desc]

        # add content to lists of all_
        all_ids += [i] * len(versions_only)
        all_versions += versions_only
        all_dates += dates_only
        all_descriptions += desc_text

        # crawl-delay between each app (shorter after the last app of the batch)
        if idx < len(ids) - 1:
            sleep(random.uniform(minwait, maxwait))
        else:
            sleep(random.uniform(15, 20))

    # end the browser session; close() would leave one chromedriver process per batch
    driver.quit()
    if os.path.exists(cookies_path):
        os.remove(cookies_path)

    # save batch job
    df = pd.DataFrame({"apple_id": all_ids, "version": all_versions, "date": all_dates, "description": all_descriptions})
    df.to_csv("".join(["saved_", str(batch), "to", str(batch + BATCH_SIZE), ".csv"]), index=False)
    print("rows:", len(df))
    print("apps removed from app store:", removed)
    print("unable to get data for apps:", missing_apps)

    # crawl-delay between each batch
    sleep(random.uniform(70, 120))