In [2]:
# web scraping
from bs4 import BeautifulSoup

# date parser
from datetime import datetime
from dateutil.parser import parse

# web driver
import undetected_chromedriver as uc 
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# random wait
import time
import random

# mongodb connection
from pymongo import MongoClient

# progress bar
from tqdm import tqdm

# saving to excel
import pandas as pd

import requests
from io import BytesIO

import json

In [3]:
# date parser function
def parse_date(date_str):
    """Convert a ``DD-Mon-YYYY`` date string to ISO ``YYYY-MM-DD`` form.

    Args:
        date_str: Date text such as ``"05-Jul-2023"`` (day, abbreviated
            month name, four-digit year, separated by hyphens).

    Returns:
        The same date formatted as ``"YYYY-MM-DD"``.

    Raises:
        ValueError: If ``date_str`` does not match the ``%d-%b-%Y`` format.
    """
    return datetime.strptime(date_str, '%d-%b-%Y').strftime("%Y-%m-%d")

In [4]:
# helper: check whether a string can be interpreted as a date
def is_date(string, fuzzy=False):
    """Return True when ``string`` can be parsed as a date by dateutil.

    Args:
        string: The text to test.
        fuzzy: Forwarded to ``dateutil.parser.parse``; when True, the parser
            is allowed to ignore unknown tokens in the string.

    Returns:
        True if parsing succeeds, False otherwise.
    """
    try:
        parse(string, fuzzy=fuzzy)
    except (ValueError, OverflowError, TypeError):
        # ValueError (incl. dateutil's ParserError): not a recognisable date.
        # OverflowError: numeric component too large for the platform.
        # TypeError: input is not a string / character stream (e.g. None).
        return False
    return True

In [5]:
# reading all BL number from google sheet
# `takingBL` is a project-local helper (module `acquiringbl`); it is called
# here with the liner code "CNC" and its result is used below as an iterable
# of BL-number strings. NOTE(review): the Google Sheet source is stated by the
# original comment only — confirm against `acquiringbl`.
from acquiringbl import takingBL
bl_list = takingBL("CNC")

In [6]:
# view bl list
# Displays how many BL numbers are currently held in `bl_list`.
len(bl_list)

22

In [19]:
# NOTE(review): this replaces the `bl_list` loaded earlier with two hard-coded
# BL numbers, so the scraping cell below only processes these two. Looks like
# a debugging override — confirm and remove before a full run.
bl_list = ['ARM0301769', 'ARM0302585']

<h3><strong>Web Scraping Flow</strong></h3>
<ol>
 <li>Acquire every BL number to be tracked and store them in a list</li>
 <li>Iterate over the list and search for each BL number on the liner's website</li>
 <li>Collect the container numbers found for each BL into a list</li>
 <li>Use BS4 to scrape the web data and parse it</li>
 <li>Rename the milestone keys and store the list of dictionaries in MongoDB</li>

</ol>

In [22]:
# ---------------------------------------------------------------------------
# Scrape CNC container tracking data for every BL number in `bl_list`.
#
# For each BL: search it on the tracking page, toggle every container's
# "more" switch, then read the container data from the JSON embedded in the
# page's last <script> tag and build one record per container.
#
# Results (read by later cells):
#   list_of_dict       - one dict per container, milestone keys as scraped
#   list_of_dict_fix2  - same records with keys renamed through `key_mapping`
#   gagal              - BL numbers whose scrape raised an exception
# ---------------------------------------------------------------------------
hasil_akhir = []  # not populated in this cell; kept so existing references keep working
list_of_dict_fix2 = []
list_of_dict = []
gagal = []

# Absolute XPaths into the tracking page (brittle: they break if the page layout changes).
SEARCH_BOX_XPATH = '/html/body/div[2]/main/section[1]/div/div/form[3]/fieldset/div/div[1]/span[1]/input[2]'
SEARCH_BUTTON_XPATH = '/html/body/div[2]/main/section[1]/div/div/form[3]/fieldset/div/div[2]/button'
MORE_BUTTON_XPATH = '/html/body/div[2]/main/section[2]/div/div/ul/li[{}]/article/section[2]/div[1]/div/label'

# Upper bound on consecutive failed click attempts for one container switch,
# so a page that never becomes clickable cannot hang the run forever.
MAX_CLICK_ATTEMPTS = 10

# web driving
# Use undetected_chromedriver's own options class and actually hand it to the
# driver; previously the options object was built but never passed, so the
# window-size argument had no effect.
options = uc.ChromeOptions()
options.add_argument("--window-size=1920,1280")
driver = uc.Chrome(options=options)

try:
    driver.get("https://www.cnc-line.com/ebusiness/tracking")

    for bls in tqdm(bl_list):
        try:
            time.sleep(random.randrange(2,6))

            # inputing new bl
            search_box2 = driver.find_element(By.XPATH, SEARCH_BOX_XPATH)
            search_box2.clear()
            search_box2.send_keys(bls)
            time.sleep(1.1)
            search_button = driver.find_element(By.XPATH, SEARCH_BUTTON_XPATH)
            time.sleep(1)
            search_button.click()
            time.sleep(random.randrange(3,7))

            # taking data from web
            soup = BeautifulSoup(driver.page_source, 'lxml')

            # finding containers in bl: keep every <span> text that is exactly
            # 11 characters long (the length used here to recognise a container number)
            containers_in_bl = soup.find_all('dl', {'class': 'container-ref'})
            list_of_containers = [
                span.text
                for tag in containers_in_bl
                for span in tag.find_all('span')
                if len(span.text) == 11
            ]

            # checking how many container consisting in 1 bl
            print(bls, 'consist of ', len(list_of_containers),' containers')
            num_of_ctr_idx = len(list_of_containers)

            # click every container's "more" switch, from the last list item to the first
            failed_attempts = 0
            while num_of_ctr_idx > 0:
                try:
                    more_button = WebDriverWait(driver, 30).until(
                        EC.element_to_be_clickable((By.XPATH, MORE_BUTTON_XPATH.format(num_of_ctr_idx)))
                    )
                    more_button.click()
                    print(num_of_ctr_idx, " CLICKED")
                    time.sleep(random.randrange(2,4))
                    more_button.click()
                    num_of_ctr_idx = num_of_ctr_idx-1
                    failed_attempts = 0
                except Exception as e:
                    print(e)
                    # nudge the page up a little so the element is no longer
                    # covered by whatever intercepted the click, then retry
                    driver.execute_script('window.scrollBy(0, -20);')
                    print("{} GAGAL CLICK".format(bls))
                    failed_attempts += 1
                    if failed_attempts >= MAX_CLICK_ATTEMPTS:
                        raise RuntimeError(
                            "could not click container switch {} for {} after {} attempts".format(
                                num_of_ctr_idx, bls, MAX_CLICK_ATTEMPTS)
                        ) from e

            time.sleep(5)
            soup1 = BeautifulSoup(driver.page_source, 'lxml')

            # The tracking data lives in the last <script> inside the
            # "l-zone__main" div(s). Reset per BL so a page without that script
            # is reported as a failure instead of silently reusing the
            # previous BL's data.
            script = None
            for tag in soup1.find_all("div", {"class":"l-zone__main"}):
                for tag2 in tag.find_all('script'):
                    script = tag2
            if script is None:
                raise ValueError("no tracking script found on page for {}".format(bls))

            # The script text is cut at "var ", the "model =" prefix is dropped,
            # and the remaining text is split into one JSON object per container
            # at each '{"IsFavoriteContainer":' marker.
            # NOTE(review): the [1:-6] trim depends on the exact script layout — fragile.
            index_separator = '{"IsFavoriteContainer":'
            var = script.text
            list_sementara = var.split("var ")
            str_milestone = list_sementara[1].replace("model =","").replace("\n","")[1:-6]
            list_str = str_milestone.split(index_separator)
            list_str.pop(0)  # text before the first container object

            # Build this BL's records in a local list first, so a failure
            # half-way through does not leave partial data for a BL that is
            # then reported in `gagal`.
            bl_records = []
            for j_container in list_str:
                json_text_string = index_separator + j_container

                # drop the comma that separated this object from the next one
                if json_text_string.endswith(","):
                    json_text_string = json_text_string[:-1]

                json_data = json.loads(json_text_string)
                move_details = json_data['ContainerMoveDetails']
                routing = move_details['routingInformation']

                ctr_number = json_data['ContainerReference']

                # the last 5 characters of the port name are stripped
                # (presumably a " (XX)" country suffix — TODO confirm)
                origin = routing['portOfLoading']['name'][:-5]
                destination = routing['portOfDischarge']['name'][:-5]
                milestones = move_details['pastMoves'] + move_details['currentMoves'] + move_details['futureMoves']
                current_dict = {
                    "Liners" : "CNC",
                    "BL Number" : bls,
                    "Container Number" : ctr_number,
                    "From" : origin,
                    "To" : destination,
                    }
                # keep only milestones located at the origin or destination
                # port; the value is the first 10 characters of the status date
                for milestone in milestones:
                    location_name = milestone['location']['name']
                    if location_name == origin:
                        suffix = " ORIGIN"
                    elif location_name == destination:
                        suffix = " DESTINATION"
                    else:
                        continue
                    current_dict[milestone['containerStatus'] + suffix] = milestone['containerStatusDate'][:10]

                bl_records.append(current_dict)

            list_of_dict.extend(bl_records)

        # for failed scraping and further analysis for error
        except Exception as e:
            print(e)
            print("{} GAGAL!!".format(bls))
            gagal.append(bls)
finally:
    # always release the browser, even if the run is interrupted
    driver.quit()

# changing dict key to db key
key_mapping = {
    'ActualVesselDeparture ORIGIN': 'ATD',
    'ActualVesselArrival DESTINATION': 'ATA',
    'ContainerToConsignee DESTINATION': 'Container Release',
    'EmptyInDepotMEA DESTINATION': 'Container Return'
}

# iterate dictionaries and change the keys to match DB's key;
# keys without a mapping are kept unchanged
for item in list_of_dict:
    transformed_item = {key_mapping.get(key, key): value for key, value in item.items()}
    list_of_dict_fix2.append(transformed_item)

  0%|          | 0/2 [00:00<?, ?it/s]

ARM0301769 consist of  5  containers
5  CLICKED
4  CLICKED
3  CLICKED
2  CLICKED
Message: element click intercepted: Element <label for="card-details-switch-0">...</label> is not clickable at point (1141, 17). Other element would receive the click: <a class="nav-link" aria-current="page" href="/intermodal-solutions">...</a>
  (Session info: chrome=118.0.5993.89)
Stacktrace:
	GetHandleVerifier [0x00007FF79D308EF2+54786]
	(No symbol) [0x00007FF79D275612]
	(No symbol) [0x00007FF79D12A64B]
	(No symbol) [0x00007FF79D171A6B]
	(No symbol) [0x00007FF79D16FE39]
	(No symbol) [0x00007FF79D16DC08]
	(No symbol) [0x00007FF79D16CCC3]
	(No symbol) [0x00007FF79D1629CF]
	(No symbol) [0x00007FF79D18BE6A]
	(No symbol) [0x00007FF79D1622E6]
	(No symbol) [0x00007FF79D18C080]
	(No symbol) [0x00007FF79D1A4D02]
	(No symbol) [0x00007FF79D18BC43]
	(No symbol) [0x00007FF79D160941]
	(No symbol) [0x00007FF79D161B84]
	GetHandleVerifier [0x00007FF79D657F52+3524194]
	GetHandleVerifier [0x00007FF79D6AD800+3874576]
	GetH

 50%|█████     | 1/2 [00:36<00:36, 36.71s/it]

ARM0302585 consist of  5  containers
5  CLICKED
4  CLICKED
3  CLICKED
2  CLICKED
Message: element click intercepted: Element <label for="card-details-switch-0">...</label> is not clickable at point (1141, 37). Other element would receive the click: <a class="nav-link" aria-current="page" href="/intermodal-solutions">...</a>
  (Session info: chrome=118.0.5993.89)
Stacktrace:
	GetHandleVerifier [0x00007FF79D308EF2+54786]
	(No symbol) [0x00007FF79D275612]
	(No symbol) [0x00007FF79D12A64B]
	(No symbol) [0x00007FF79D171A6B]
	(No symbol) [0x00007FF79D16FE39]
	(No symbol) [0x00007FF79D16DC08]
	(No symbol) [0x00007FF79D16CCC3]
	(No symbol) [0x00007FF79D1629CF]
	(No symbol) [0x00007FF79D18BE6A]
	(No symbol) [0x00007FF79D1622E6]
	(No symbol) [0x00007FF79D18C080]
	(No symbol) [0x00007FF79D1A4D02]
	(No symbol) [0x00007FF79D18BC43]
	(No symbol) [0x00007FF79D160941]
	(No symbol) [0x00007FF79D161B84]
	GetHandleVerifier [0x00007FF79D657F52+3524194]
	GetHandleVerifier [0x00007FF79D6AD800+3874576]
	GetH

100%|██████████| 2/2 [01:10<00:00, 35.28s/it]


In [26]:
# Inspect the scraped records after their milestone keys were renamed
# (e.g. 'ATD', 'ATA').
# NOTE(review): this displays the entire list; consider showing a slice for large runs.
list_of_dict_fix2

[{'Liners': 'CNC',
  'BL Number': 'ARM0301769',
  'Container Number': 'TEMU6511220',
  'From': 'JAKARTA',
  'To': 'BATANGAS',
  'EmptyDeliveredToShipper ORIGIN': '2023-07-05',
  'Readytobeloaded ORIGIN': '2023-07-07',
  'LoadedonboardXOF ORIGIN': '2023-07-09',
  'ATD': '2023-07-09',
  'ATA': '2023-08-04',
  'Discharged DESTINATION': '2023-08-04',
  '_id': ObjectId('6538d8aa24ddb632a8cd8301')},
 {'Liners': 'CNC',
  'BL Number': 'ARM0301769',
  'Container Number': 'CMAU3343538',
  'From': 'JAKARTA',
  'To': 'BATANGAS',
  'EmptyDeliveredToShipper ORIGIN': '2023-07-05',
  'Readytobeloaded ORIGIN': '2023-07-06',
  'LoadedonboardXOF ORIGIN': '2023-07-09',
  'ATD': '2023-07-09',
  'ATA': '2023-08-04',
  'Discharged DESTINATION': '2023-08-04',
  '_id': ObjectId('6538d8aa24ddb632a8cd8302')},
 {'Liners': 'CNC',
  'BL Number': 'ARM0301769',
  'Container Number': 'CMAU7072835',
  'From': 'JAKARTA',
  'To': 'BATANGAS',
  'EmptyDeliveredToShipper ORIGIN': '2023-07-05',
  'Readytobeloaded ORIGIN': '2

In [24]:
# connect to mongodb
from mongoinit import mongo_table_initiation, insert_many_mongo

# Both calls are project-local helpers from `mongoinit`. Judging by the
# recorded output, the first announces/creates today's collection and the
# second bulk-inserts the scraped records — TODO confirm against `mongoinit`.
mongo_table_initiation()
insert_many_mongo(list_of_dict_fix2)

Today's Collection Name ===>  all_tracking_Oct-25-2023
Today's Collection Has Been Made
Inserting Many Complete!!


In [27]:
# checking failed bl
# `gagal` collects the BL numbers for which the scraping loop raised an
# exception; an empty list means no BL was recorded as failed.
gagal

[]

In [33]:
# Insert the scraped records into a separately named "revision" collection
# (today's date plus the 'REVISI 3' suffix) in the `bl_tracking` database.
import os
# initiating mongo
from pymongo import MongoClient
from datetime import date
# connect to mongodb
from mongoinit import mongo_table_initiation, insert_many_mongo

# project-local helper; presumably prepares today's default collection — TODO confirm
mongo_table_initiation()

# today's date
today = date.today()
date_today = today.strftime("%b-%d-%Y")
collection_name = "all_tracking_" + date_today + 'REVISI 3'

# The connection string contains the database username and password, so it
# must not be written into the notebook. Read it from the environment instead.
mongodb_uri = os.environ.get("MONGODB_URI")
if not mongodb_uri:
    raise RuntimeError(
        "Set the MONGODB_URI environment variable to the MongoDB connection "
        "string before running this cell."
    )

cluster = MongoClient(mongodb_uri)
db = cluster["bl_tracking"]
collection = db[collection_name]
# NOTE(review): insert_many adds an '_id' to each dict in place, so running
# this cell twice against the same collection raises a duplicate-key error.
collection.insert_many(list_of_dict_fix2)

Today's Collection Name ===>  all_tracking_Oct-09-2023
Today's Collection Has Been Made


<pymongo.results.InsertManyResult at 0x1cdbf1cc5e0>