In [3]:
# web scraping
from bs4 import BeautifulSoup

# date parser
from datetime import datetime
from dateutil.parser import parse

# web driver
import undetected_chromedriver as uc 
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# random wait
import time
import random

# mongodb connection
from pymongo import MongoClient

# progress bar
from tqdm import tqdm

# saving to excel
import pandas as pd

import requests
from io import BytesIO

import json

In [4]:
# date parser function
def parse_date(date_str):
    """Convert a ``DD-Mon-YYYY`` date string to ISO ``YYYY-MM-DD`` text.

    Parameters
    ----------
    date_str : str
        Date written as day, abbreviated month name and four-digit year
        separated by dashes, e.g. ``"13-Aug-2023"``.

    Returns
    -------
    str
        The same date formatted as ``"%Y-%m-%d"``, e.g. ``"2023-08-13"``.

    Raises
    ------
    ValueError
        If ``date_str`` does not match the ``'%d-%b-%Y'`` format.
    """
    return datetime.strptime(date_str, '%d-%b-%Y').strftime("%Y-%m-%d")

In [5]:
# helper: check whether a string can be interpreted as a date
def is_date(string, fuzzy=False):
    """Return True if ``string`` can be parsed as a date, otherwise False.

    Parameters
    ----------
    string : str
        Text to test.
    fuzzy : bool, default False
        Forwarded unchanged to ``dateutil.parser.parse`` as its ``fuzzy``
        argument.

    Returns
    -------
    bool
        True when ``dateutil.parser.parse`` accepts the input, False when it
        rejects it.
    """
    try:
        parse(string, fuzzy=fuzzy)
    except (ValueError, OverflowError, TypeError):
        # dateutil documents three failure modes: ParserError (a ValueError
        # subclass) for unknown formats, OverflowError for out-of-range
        # numbers, and TypeError for non-string input. For a yes/no predicate
        # all three mean "not a date".
        return False
    return True

In [6]:
# Read every BL number to be tracked from the Google Sheet.
# `takingBL` is a project-local helper from the `acquiringbl` module; the
# "CNC" argument presumably selects the BLs of the CNC liner — TODO confirm
# against acquiringbl.
from acquiringbl import takingBL
bl_list = takingBL("CNC")

In [7]:
# show how many BL numbers are in the list
len(bl_list)

29

In [17]:
# Retry run: overwrite the BL list loaded above with a hard-coded list of BL
# numbers that failed earlier, so only those are scraped again.
# NOTE(review): these values match the `gagal` output recorded at the end of
# the notebook; skip this cell to track the full list instead.
bl_list = ['ARM0305513', 'ARM0300221', 'ARM0300326', 'ARM0301801', 'ARM0301769']

<h3><strong>Web Scraping Flow</strong></h3>
<ol>
 <li>Acquire every BL number to be tracked and store them in a list</li>
 <li>Iterate over the list and search for each BL number on the liner's website</li>
 <li>Collect the container numbers of each BL and store them in a list</li>
 <li>Use BeautifulSoup (BS4) to scrape the page data and parse it</li>
 <li>Rename the milestone keys to the database key names and store the list of dictionaries in MongoDB</li>

</ol>

In [18]:
# Result accumulators. They are (re)created on every run of this cell so a
# re-run never mixes in rows or failures left over from an earlier run.
hasil_akhir = []
list_of_dict_fix2 = []
list_of_dict = []   # one dict per scraped container
gagal = []          # BL numbers whose scraping raised an exception

# Page address and element locators of the CNC tracking site. The XPaths are
# absolute, so they have to be updated whenever the page layout changes.
TRACKING_URL = "https://www.cnc-line.com/ebusiness/tracking/search"
SEARCH_BOX_XPATH = '/html/body/div[2]/main/section[1]/div/div/form[3]/fieldset/div/div[1]/span[1]/input[2]'
SEARCH_BUTTON_XPATH = '/html/body/div[2]/main/section[1]/div/div/form[3]/fieldset/div/div[2]/button'
MORE_BUTTON_XPATH = '/html/body/div[2]/main/section[2]/div/div/ul/li[{}]/article/section[2]/div[1]/div/label'

# Text that starts every container object inside the page's embedded model;
# used to cut the model into one JSON document per container.
index_separator = '{"IsFavoriteContainer":'

# web driving
# The options object must be handed to uc.Chrome explicitly, otherwise the
# window size is silently ignored. uc.ChromeOptions is the options class
# undetected_chromedriver documents for this purpose.
options = uc.ChromeOptions()
options.add_argument("--window-size=1920,1280")
driver = uc.Chrome(options=options)

try:
    driver.get(TRACKING_URL)

    for bls in tqdm(bl_list):
        try:
            # random pause between searches
            time.sleep(random.randrange(2,6))

            # inputing new bl
            search_box2 = driver.find_element(By.XPATH, SEARCH_BOX_XPATH)
            search_box2.clear()
            search_box2.send_keys(bls)
            time.sleep(1.1)
            search_button = driver.find_element(By.XPATH, SEARCH_BUTTON_XPATH)
            time.sleep(1)
            search_button.click()
            time.sleep(random.randrange(3,7))

            # taking data from web
            soup = BeautifulSoup(driver.page_source, 'lxml')

            # finding containers in bl: every 11-character <span> text inside a
            # <dl class="container-ref"> is taken as a container number
            list_of_containers = [
                span.text
                for tag in soup.find_all('dl', {'class': 'container-ref'})
                for span in tag.find_all('span')
                if len(span.text) == 11
            ]

            # checking how many container consisting in 1 bl
            # NOTE(review): a count of 0 does not abort the BL; the embedded
            # model below is still parsed — presumably some result pages are
            # rendered without the container list. TODO confirm.
            print(bls, 'consist of ', len(list_of_containers),' containers')

            # click every container's "more" label, from the last list item to
            # the first; each label is clicked twice with a pause in between
            for ctr_idx in range(len(list_of_containers), 0, -1):
                more_button = WebDriverWait(driver, 30).until(
                    EC.element_to_be_clickable((By.XPATH, MORE_BUTTON_XPATH.format(ctr_idx)))
                )
                more_button.click()
                time.sleep(random.randrange(2,4))
                more_button.click()

            time.sleep(5)
            soup1 = BeautifulSoup(driver.page_source, 'lxml')

            # The tracking data lives in the last <script> of the last
            # "l-zone__main" div. Reset per BL so a page without that script is
            # reported as failed instead of silently reusing the previous BL's
            # script under the current BL number.
            script = None
            for tag in soup1.find_all("div", {"class":"l-zone__main"}):
                for tag2 in tag.find_all('script'):
                    script = tag2
            if script is None:
                raise ValueError("no tracking <script> found inside div.l-zone__main")

            # Cut the text after the first "var " down to the model's content.
            # NOTE(review): the [1:-6] slice assumes a fixed amount of wrapping
            # text around the model — confirm if the site changes its markup.
            var = script.text
            list_sementara = var.split("var ")
            str_milestone = list_sementara[1].replace("model =","").replace("\n","")[1:-6]
            # the piece before the first separator is not a container object
            list_str = str_milestone.split(index_separator)[1:]

            for j_container in list_str:
                json_text_string = index_separator + j_container

                # drop the comma that separated this object from the next one
                if json_text_string.endswith(","):
                    json_text_string = json_text_string[:-1]

                json_data = json.loads(json_text_string)

                ctr_number = json_data['ContainerReference']
                move_details = json_data['ContainerMoveDetails']
                routing = move_details['routingInformation']

                # the last 5 characters of the port name are cut off —
                # presumably a country-code suffix; TODO confirm
                origin = routing['portOfLoading']['name'][:-5]
                destination = routing['portOfDischarge']['name'][:-5]
                milestones = move_details['pastMoves'] + move_details['currentMoves'] + move_details['futureMoves']

                current_dict = {
                    "Liners" : "CNC",
                    "BL Number" : bls,
                    "Container Number" : ctr_number,
                    "From" : origin,
                    "To" : destination,
                }

                # keep only milestones located at the origin or destination
                # port; the value is the first 10 characters of the status date
                for milestone in milestones:
                    location_name = milestone['location']['name']
                    if location_name == origin:
                        suffix = " ORIGIN"
                    elif location_name == destination:
                        suffix = " DESTINATION"
                    else:
                        continue
                    current_dict[milestone['containerStatus'] + suffix] = milestone['containerStatusDate'][:10]

                list_of_dict.append(current_dict)

        # for failed scraping and further analysis for error
        except Exception as e:
            print(e)
            print("{} GAGAL!!".format(bls))
            gagal.append(bls)
finally:
    # always release the browser process, even if the run is interrupted
    driver.quit()

# Scraped milestone labels -> key names used in the database.
key_mapping = {
    'ActualVesselDeparture ORIGIN': 'ATD',
    'ActualVesselArrival DESTINATION': 'ATA',
    'ContainerToConsignee DESTINATION': 'Container Release',
    'EmptyInDepotMEA DESTINATION': 'Container Return'
}

# Copy every record, renaming the keys listed in key_mapping and passing all
# other keys through unchanged.
for item in list_of_dict:
    transformed_item = dict(
        (key_mapping.get(key, key), value) for key, value in item.items()
    )
    list_of_dict_fix2.append(transformed_item)

  0%|          | 0/5 [00:00<?, ?it/s]

ARM0305513 consist of  3  containers


 20%|██        | 1/5 [00:27<01:49, 27.33s/it]

ARM0300221 consist of  0  containers


 40%|████      | 2/5 [00:45<01:06, 22.03s/it]

ARM0300326 consist of  2  containers


 60%|██████    | 3/5 [01:09<00:45, 22.84s/it]

ARM0301801 consist of  0  containers


 80%|████████  | 4/5 [01:29<00:21, 21.65s/it]

ARM0301769 consist of  5  containers


100%|██████████| 5/5 [02:04<00:00, 24.98s/it]


In [19]:
# inspect the scraped records (one dict per container, keys not yet renamed)
list_of_dict

[{'Liners': 'CNC',
  'BL Number': 'ARM0305513',
  'Container Number': 'SEKU5696322',
  'From': 'JAKARTA',
  'To': 'BATANGAS',
  'EmptyDeliveredToShipper ORIGIN': '2023-08-07',
  'Readytobeloaded ORIGIN': '2023-08-10',
  'LoadedonboardXOF ORIGIN': '2023-08-12',
  'ActualVesselDeparture ORIGIN': '2023-08-13',
  'ActualVesselArrival DESTINATION': '2023-09-02',
  'Discharged DESTINATION': '2023-09-02'},
 {'Liners': 'CNC',
  'BL Number': 'ARM0305513',
  'Container Number': 'CMAU8718766',
  'From': 'JAKARTA',
  'To': 'BATANGAS',
  'EmptyDeliveredToShipper ORIGIN': '2023-08-09',
  'Readytobeloaded ORIGIN': '2023-08-11',
  'LoadedonboardXOF ORIGIN': '2023-08-12',
  'ActualVesselDeparture ORIGIN': '2023-08-13',
  'ActualVesselArrival DESTINATION': '2023-09-02',
  'Discharged DESTINATION': '2023-09-02'},
 {'Liners': 'CNC',
  'BL Number': 'ARM0305513',
  'Container Number': 'TCKU6354975',
  'From': 'JAKARTA',
  'To': 'BATANGAS',
  'EmptyDeliveredToShipper ORIGIN': '2023-08-07',
  'Readytobeloaded

In [20]:
# NOTE(review): same inspection as the previous cell — candidate for removal
list_of_dict

[{'Liners': 'CNC',
  'BL Number': 'ARM0305513',
  'Container Number': 'SEKU5696322',
  'From': 'JAKARTA',
  'To': 'BATANGAS',
  'EmptyDeliveredToShipper ORIGIN': '2023-08-07',
  'Readytobeloaded ORIGIN': '2023-08-10',
  'LoadedonboardXOF ORIGIN': '2023-08-12',
  'ActualVesselDeparture ORIGIN': '2023-08-13',
  'ActualVesselArrival DESTINATION': '2023-09-02',
  'Discharged DESTINATION': '2023-09-02'},
 {'Liners': 'CNC',
  'BL Number': 'ARM0305513',
  'Container Number': 'CMAU8718766',
  'From': 'JAKARTA',
  'To': 'BATANGAS',
  'EmptyDeliveredToShipper ORIGIN': '2023-08-09',
  'Readytobeloaded ORIGIN': '2023-08-11',
  'LoadedonboardXOF ORIGIN': '2023-08-12',
  'ActualVesselDeparture ORIGIN': '2023-08-13',
  'ActualVesselArrival DESTINATION': '2023-09-02',
  'Discharged DESTINATION': '2023-09-02'},
 {'Liners': 'CNC',
  'BL Number': 'ARM0305513',
  'Container Number': 'TCKU6354975',
  'From': 'JAKARTA',
  'To': 'BATANGAS',
  'EmptyDeliveredToShipper ORIGIN': '2023-08-07',
  'Readytobeloaded

In [21]:
# Store the scraped records in MongoDB.
# Both helpers are project-local (module `mongoinit`); their behaviour is not
# visible here. Judging by the recorded output, mongo_table_initiation()
# prepares a collection named after today's date and insert_many_mongo()
# inserts the given documents — TODO confirm against mongoinit.
from mongoinit import mongo_table_initiation, insert_many_mongo

mongo_table_initiation()
# list_of_dict_fix2 holds the records with milestone keys renamed to DB keys
insert_many_mongo(list_of_dict_fix2)

Today's Collection Name ===>  all_tracking_Oct-09-2023
Today's Collection Has Been Made
Inserting Many Complete!!


In [14]:
# BL numbers that failed during scraping (appended by the except-handler of
# the scraping cell).
# NOTE(review): the recorded output comes from an earlier execution (In [14])
# than the scraping run shown above (In [18]); re-run this cell for the
# current failures.
gagal

['ARM0305513', 'ARM0300221', 'ARM0300326', 'ARM0301801', 'ARM0301769']