In [None]:
import requests
import json
import pandas as pd
import time
from datetime import datetime
import pytz

In [None]:
# Capture the moment the crawl starts, expressed in the Asia/Jakarta timezone.
now = datetime.now(pytz.timezone("Asia/Jakarta"))
current_day, current_time = now.strftime("%Y-%m-%d"), now.strftime("%H:%M:%S")
print("Today:", current_day)
print("Start time:", current_time)

Today: 2021-11-29
Start time: 23:38:24


In [None]:
# GraphQL endpoint that every request in this notebook is POSTed to.
url = "https://gql.tokopedia.com/"

# Search phrases to crawl, one per entry.
# For an interactive run, swap the list for a single prompted keyword,
# e.g. a one-element list built from input("Please input keywords:").
keywords = [
    "Neocate LCP",
    "Morinaga Chil Kid Soya",
    "Nutricia Bebelove FL",
    "SGM Eksplor Soya Rasa Madu",
    "Nutrilon Royal Soya",
    "Nutramigen LGG",
    "Pregestimil",
    "Morinaga Chil Kid P-HP MoriCare",
    "Isomil Plus Advance Soya",
    "Puramino",
    "Prenagen Mommy",
    "SGM Bunda",
    "Lactamil Pregnasis",
    "Lovamil",
    "Anmum Materna",
    "Friso Gold Mum",
    "SUN Ibu",
    "Frisian Flag Mama Tahap 0 Suprima",
    "Enfamama A+",
    "Vidoran Ibunda Milk",
]

# Search-parameter template. The first "{}" receives the keyword and the
# second "{}" receives the pagination offset; rows=200 is the page size.
params = (
    "device=desktop&ob=23&q={}&related=true&start={}&rows=200"
    "&safe_search=false&scheme=https&shipping=&source=search&st=product"
    "&unique_id=13911d1fa1da088fe579955741524855&user_id=&variants="
)
# GraphQL document for the product search request.
# A double-quoted literal cannot span physical lines, so the document is
# written as adjacent literals that Python concatenates into one single-line
# string; each fragment carries the leading spaces that separate it from the
# previous token.
query = (
    "query SearchProductQuery($params: String) {"
    "  searchProduct(params: $params) {"
    "    source"
    "    totalData: count"
    "    totalDataText: count_text"
    "    additionalParams: additional_params"
    "    redirection {"
    "      redirectionURL: redirect_url"
    "      departmentID: department_id"
    "      __typename"
    "    }"
    "    responseCode: response_code"
    "    keywordProcess: keyword_process"
    "    suggestion {"
    "      suggestion"
    "      suggestionCount"
    "      currentKeyword"
    "      instead"
    "      insteadCount"
    "      suggestionText: text"
    "      suggestionTextQuery: query"
    "      __typename"
    "    }"
    "    related {"
    "      relatedKeyword: related_keyword"
    "      otherRelated: other_related {"
    "        keyword"
    "        url"
    "        __typename"
    "      }"
    "      __typename"
    "    }"
    "    isQuerySafe"
    "    ticker {"
    "      text"
    "      query"
    "      typeID: type_id"
    "      __typename"
    "    }"
    "    products {"
    "      id"
    "      name"
    "      childs"
    "      url"
    "      imageURL: image_url"
    "      imageURL300: image_url_300"
    "      imageURL500: image_url_500"
    "      imageURL700: image_url_700"
    "      price"
    "      priceRange: price_range"
    "      category: department_id"
    "      categoryID: category_id"
    "      categoryName: category_name"
    "      categoryBreadcrumb: category_breadcrumb"
    "      discountPercentage: discount_percentage"
    "      originalPrice: original_price"
    "      shop {"
    "        id"
    "        name"
    "        url"
    "        isPowerBadge: is_power_badge"
    "        isOfficial: is_official"
    "        location"
    "        city"
    "        reputation"
    "        clover"
    "        __typename"
    "      }"
    "      wholesalePrice: whole_sale_price {"
    "        quantityMin: quantity_min"
    "        quantityMax: quantity_max"
    "        price"
    "        __typename"
    "      }"
    "      courierCount: courier_count"
    "      condition"
    "      labels {"
    "        title"
    "        color"
    "        __typename"
    "      }"
    "      labelGroups: label_groups {"
    "        position"
    "        type"
    "        title"
    "        __typename"
    "      }"
    "      badges {"
    "        title"
    "        imageURL: image_url"
    "        show"
    "        __typename"
    "      }"
    "      isFeatured: is_featured"
    "      rating"
    "      countReview: count_review"
    "      stock"
    "      GAKey: ga_key"
    "      preorder: is_preorder"
    "      wishlist"
    "      shop {"
    "        id"
    "        name"
    "        url"
    "        goldmerchant: is_power_badge"
    "        location"
    "        city"
    "        reputation"
    "        clover"
    "        official: is_official"
    "        __typename"
    "      }"
    "      __typename"
    "    }"
    "    __typename"
    "  }"
    "}"
)

In [None]:
# Column order of the DataFrame returned by product_search.
_SEARCH_COLUMNS = [
    "product_id",
    "product_name",
    "product_price",
    "product_original_price",
    "product_discount_percentage",
    "seller_name",
    "seller_city",
    "seller_type",
    "product_url",
    "seller_url",
]


def _rupiah_to_int(price_text):
    """Turn a price string such as "Rp12.500" into the integer 12500."""
    return int(price_text.replace("Rp", "").replace(".", ""))


def _seller_type(shop):
    """Map the shop's official / power-badge flags to a seller-type label."""
    if shop["isOfficial"]:
        return "Official Store"
    if shop["isPowerBadge"]:
        return "Power Merchant"
    return "Regular Merchant"


def _fetch_search_products(payload, max_attempts=5, retry_delay=1.0):
    """POST one search page and return its product list, or None if every attempt fails."""
    for attempt in range(1, max_attempts + 1):
        try:
            # NOTE(review): verify=False switches off TLS certificate
            # verification; kept as in the original request.
            response = requests.post(url, json=payload, verify=False, timeout=30)
            # Parse the *new* response on every attempt; a retry that keeps
            # inspecting the first (bad) payload can never succeed.
            return json.loads(response.text)["data"]["searchProduct"]["products"]
        except (requests.exceptions.RequestException, ValueError, KeyError, TypeError) as error:
            if attempt == max_attempts:
                print("Giving up after", max_attempts, "attempts:", error)
                return None
            print("Retrying!")
            time.sleep(retry_delay * attempt)
    return None


def product_search(keyword, params, query):
    """Collect all search-result pages for one keyword.

    Pages are requested from the module-level ``url`` until a page comes back
    empty. A page that still fails after a bounded number of retries stops the
    pagination, and whatever was collected so far is returned.

    Parameters
    ----------
    keyword : str
        Search phrase, substituted into the first ``{}`` of ``params``.
    params : str
        Template with two ``{}`` placeholders: keyword, then result offset.
    query : str
        GraphQL document sent as the ``query`` field of the request body.

    Returns
    -------
    tuple
        ``(product_search_df, product_key)``: a DataFrame with the columns in
        ``_SEARCH_COLUMNS`` (one row per listed product) and a list, aligned
        row-for-row with it, holding the last ``/``-separated segment of each
        product's ``GAKey``.
    """
    requests.packages.urllib3.disable_warnings()

    # Offset step between pages; assumes the params template asks for
    # 200 rows per request (rows=200).
    page_size = 200
    start = 0
    records = []
    product_key = []

    while True:
        # NOTE(review): keyword is interpolated unescaped; a keyword containing
        # "&", "=" or "+" may be read differently by the server. Confirm
        # whether it should be URL-encoded first.
        payload = {
            "operationName": "SearchProductQuery",
            "variables": {"params": params.format(keyword, start)},
            "query": query,
        }
        print("Total Data:", start)

        products = _fetch_search_products(payload)
        if not products:
            # Empty page means the listing is exhausted; None means the page
            # could not be fetched.
            break

        for product in products:
            shop = product["shop"]
            price = _rupiah_to_int(product["price"])
            original_price = product["originalPrice"]
            records.append({
                "product_id": product["id"],
                "product_name": product["name"],
                "product_price": price,
                # Without a listed original price, fall back to the current price.
                "product_original_price": (
                    _rupiah_to_int(original_price) if original_price else price
                ),
                "product_discount_percentage": product["discountPercentage"],
                "seller_name": shop["name"],
                "seller_city": shop["city"],
                "seller_type": _seller_type(shop),
                "product_url": product["url"],
                "seller_url": shop["url"],
            })
            product_key.append(product["GAKey"].split("/")[-1])

        start += page_size
        time.sleep(0.2)

    # Passing columns explicitly keeps the schema even when nothing was found.
    product_search_df = pd.DataFrame(records, columns=_SEARCH_COLUMNS)

    return product_search_df, product_key

In [None]:
# GraphQL document for the product-detail request, built once instead of
# being re-created for every product.
_PDP_INFO_QUERY = """\
query PDPInfoQuery($shopDomain: String, $productKey: String) {
  getPDPInfo(productID: 0, shopDomain: $shopDomain, productKey: $productKey) {
    basic {
      id
      shopID
      name
      alias
      price
      priceCurrency
      lastUpdatePrice
      description
      minOrder
      maxOrder
      status
      weight
      weightUnit
      condition
      url
      sku
      gtin
      isKreasiLokal
      isMustInsurance
      isEligibleCOD
      isLeasing
      catalogID
      needPrescription
      __typename
    }
    category {
      id
      name
      title
      breadcrumbURL
      isAdult
      detail {
        id
        name
        breadcrumbURL
        __typename
      }
      __typename
    }
    pictures {
      picID
      fileName
      filePath
      description
      isFromIG
      width
      height
      urlOriginal
      urlThumbnail
      url300
      status
      __typename
    }
    preorder {
      isActive
      duration
      timeUnit
      __typename
    }
    wholesale {
      minQty
      price
      __typename
    }
    videos {
      source
      url
      __typename
    }
    campaign {
      campaignID
      campaignType
      campaignTypeName
      originalPrice
      discountedPrice
      isAppsOnly
      isActive
      percentageAmount
      stock
      originalStock
      startDate
      endDate
      endDateUnix
      appLinks
      hideGimmick
      __typename
    }
    stats {
      countView
      countReview
      countTalk
      rating
      __typename
    }
    txStats {
      txSuccess
      txReject
      itemSold
      __typename
    }
    cashback {
      percentage
      __typename
    }
    variant {
      parentID
      isVariant
      __typename
    }
    stock {
      useStock
      value
      stockWording
      __typename
    }
    menu {
      name
      __typename
    }
    __typename
  }
}
"""

# Column order of the DataFrame returned by product_detail.
_DETAIL_COLUMNS = [
    "product_id",
    "product_weight",
    "product_condition",
    "product_insurance",
    "product_category",
    "product_sub_category1",
    "product_sub_category2",
    "product_view_count",
    "product_rating",
    "product_review_count",
    "transaction_success",
    "transaction_rejected",
    "product_sold_count",
    "product_stock",
    "seller_id",
]


def _category_name(levels, depth):
    """Return the name of the category at ``depth``, or None when it is absent."""
    try:
        return levels[depth]["name"]
    except (IndexError, KeyError, TypeError):
        return None


def _insurance_label(flag):
    """Translate the isMustInsurance flag into its label; None when the flag is missing."""
    if flag is None:
        return None
    return "Wajib" if flag else "Opsional"


def product_detail(seller_url, product_key):
    """Fetch detail data for each (shop, product key) pair.

    One request per pair is POSTed to the module-level ``url``. A pair whose
    request fails, whose response is not valid JSON, or whose response carries
    no product data is skipped, so the result may have fewer rows than inputs.

    Parameters
    ----------
    seller_url : list of str
        Shop URLs; the last ``/``-separated segment is sent as the shop domain.
    product_key : list of str
        Product keys, paired positionally with ``seller_url``.

    Returns
    -------
    pandas.DataFrame
        One row per successfully fetched product, with the columns listed in
        ``_DETAIL_COLUMNS``.
    """
    shop_domain = [s.split("/")[-1] for s in seller_url]
    records = []

    for s, p in zip(shop_domain, product_key):
        json_request = {
            "operationName": "PDPInfoQuery",
            "variables": {"shopDomain": str(s), "productKey": str(p)},
            "query": _PDP_INFO_QUERY,
        }

        try:
            # NOTE(review): verify=False switches off TLS certificate
            # verification; kept as in the original request.
            response = requests.post(url, json=json_request, verify=False, timeout=30)
        except requests.exceptions.RequestException as error:
            print("This product failed!")
            print(s, p)
            print(error)
            continue

        try:
            json_data = json.loads(response.text)
        except ValueError:
            # Report only the failing pair, then skip it; otherwise the code
            # below would reuse the previous product's payload.
            print("This product failed!")
            print(s, p)
            print(response.text)
            continue

        data = json_data.get("data") if isinstance(json_data, dict) else None
        product_data = (data or {}).get("getPDPInfo")
        basic_data = (product_data or {}).get("basic")
        if not basic_data:
            continue

        category = (product_data.get("category") or {}).get("detail")
        stats = product_data.get("stats") or {}
        transaction_stats = product_data.get("txStats") or {}
        stock = product_data.get("stock") or {}

        # Every product contributes exactly one value per column, so the
        # columns can never end up with different lengths.
        records.append({
            "product_id": basic_data.get("id"),
            "product_weight": "{} {}".format(
                basic_data.get("weight"), basic_data.get("weightUnit")
            ),
            "product_condition": basic_data.get("condition"),
            "product_insurance": _insurance_label(basic_data.get("isMustInsurance")),
            "product_category": _category_name(category, 0),
            "product_sub_category1": _category_name(category, 1),
            "product_sub_category2": _category_name(category, 2),
            "product_view_count": stats.get("countView"),
            "product_rating": stats.get("rating"),
            "product_review_count": stats.get("countReview"),
            "transaction_success": transaction_stats.get("txSuccess"),
            "transaction_rejected": transaction_stats.get("txReject"),
            "product_sold_count": transaction_stats.get("itemSold"),
            "product_stock": stock.get("value"),
            "seller_id": basic_data.get("shopID"),
        })

        if len(records) % 200 == 0:
            print("Total Data:", len(records))

    # Passing columns explicitly keeps the schema even when nothing was fetched.
    product_df = pd.DataFrame(records, columns=_DETAIL_COLUMNS)

    return product_df

In [None]:
# Crawl the search listing for every keyword, then enrich each hit with its
# product-detail data and assemble the final table.
search_frames = []  # one DataFrame per keyword, concatenated once after the loop
product_key_list = []
for position, keyword in enumerate(keywords):
    print(keyword)
    search_df, product_key = product_search(keyword, params, query)
    search_df["keyword"] = keyword
    search_frames.append(search_df)
    product_key_list.extend(product_key)

    # Pause between keywords; there is nothing to wait for after the last one.
    if position < len(keywords) - 1:
        print("Please wait 10 seconds for next keyword!!")
        time.sleep(10)

# DataFrame.append no longer exists in pandas 2.x, and growing a frame inside
# a loop copies it on every pass; a single concat avoids both problems.
product_search_df = (
    pd.concat(search_frames, ignore_index=True) if search_frames else pd.DataFrame()
)

print("Total Data Seluruh Keyword:", len(product_key_list))
product_df = product_detail(product_search_df["seller_url"].tolist(), product_key_list)

# A product matched by several keywords is fetched once per match. Keep one
# detail row per product_id so the merge does not multiply those rows.
product_df = product_df.drop_duplicates(subset="product_id")
product_final_df = product_search_df.merge(product_df, on="product_id")

# Move "keyword" to the end so it sits next to the crawl date.
product_final_df["keyword"] = product_final_df.pop("keyword")
product_final_df["crawl_date"] = current_day

Neocate LCP
Total Data: 0
Total Data: 200
Total Data: 400
Total Data: 600
Total Data: 800
Please wait 10 seconds for next keyword!!
Morinaga Chil Kid Soya
Total Data: 0
Total Data: 200
Total Data: 400
Total Data: 600
Total Data: 800
Total Data: 1000
Total Data: 1200
Please wait 10 seconds for next keyword!!
Nutricia Bebelove FL
Total Data: 0
Please wait 10 seconds for next keyword!!
SGM Eksplor Soya Rasa Madu
Total Data: 0
Total Data: 200
Please wait 10 seconds for next keyword!!
Nutrilon Royal Soya
Total Data: 0
Total Data: 200
Total Data: 400
Please wait 10 seconds for next keyword!!
Nutramigen LGG
Total Data: 0
Total Data: 200
Total Data: 400
Total Data: 600
Total Data: 800
Please wait 10 seconds for next keyword!!
Pregestimil
Total Data: 0
Total Data: 200
Total Data: 400
Total Data: 600
Total Data: 800
Total Data: 1000
Total Data: 1200
Please wait 10 seconds for next keyword!!
Morinaga Chil Kid P-HP MoriCare
Total Data: 0
Total Data: 200
Please wait 10 seconds for next keyword!!
Is

In [None]:
# Write the merged table to an Excel workbook in the working directory.
# The file name has no "{}" placeholder, so no str.format call is needed.
product_final_df.to_excel("susu_product_detail_tokopedia.xlsx", index=False)

In [None]:
# Take a fresh timestamp: formatting the `now` captured when the crawl
# started would print the start time again instead of the finish time.
finished_at = datetime.now(tz=pytz.timezone("Asia/Jakarta"))
end_time = finished_at.strftime("%H:%M:%S")
end_day = finished_at.strftime("%Y-%m-%d")
print("Today:", end_day)
print("End time:", end_time)

Today: 2021-11-29
End time: 23:38:24
