In [2]:
# web scraping
from bs4 import BeautifulSoup

# date parser
from datetime import datetime
from dateutil.parser import parse

# web driver
import undetected_chromedriver as uc 
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# random wait
import time
import random

# mongodb connection
from pymongo import MongoClient

# progress bar
from tqdm import tqdm

# saving to excel
import pandas as pd

import requests
from io import BytesIO

import json

In [3]:
# date parser function
def parse_date(date_str):
    """Convert a ``DD-Mon-YYYY`` date string into ``YYYY-MM-DD``.

    Example: ``"02-Oct-2023"`` becomes ``"2023-10-02"``.

    :param date_str: date text in the ``%d-%b-%Y`` format.
    :return: the same date formatted with ``%Y-%m-%d``.
    :raises ValueError: propagated from ``datetime.strptime`` when the text
        does not match the expected format.
    """
    return datetime.strptime(date_str, '%d-%b-%Y').strftime("%Y-%m-%d")

In [4]:
# helper to check whether a string can be parsed as a date
def is_date(string, fuzzy=False):
    """Return True when ``string`` can be parsed as a date, False otherwise.

    :param string: text to test.
    :param fuzzy: forwarded to dateutil's ``parse``.
    :return: ``True`` if ``parse`` succeeds, ``False`` if it rejects the text.
    """
    try:
        parse(string, fuzzy=fuzzy)
    except (ValueError, OverflowError):
        # dateutil raises OverflowError (not ValueError) when the text holds a
        # number too large to become a date component; that is still "not a date".
        return False
    return True

In [96]:
# Read every BL number to track from the Google Sheet.
# takingBL is a project-local helper (acquiringbl module); the "CNC" argument
# presumably selects the BLs that belong to the CNC liner - TODO confirm.
from acquiringbl import takingBL
bl_list = takingBL("CNC")

In [97]:
# Show how many BL numbers were loaded (sanity check before scraping).
len(bl_list)

30

In [98]:
# BL numbers whose scrape raised an error ("gagal" = "failed"); the scraping
# loop appends to this list. It is only reset by re-running this cell, so
# re-running the scraping cell alone keeps accumulating failures.
gagal = []

<h3><strong>Web Scraping Flow</strong></h3>
<ol>
 <li>Acquire every BL number to track and store them in a list</li>
 <li>Iterate over the list and search for each BL on the liner's website</li>
 <li>Collect the container numbers listed under the BL and store them in a list</li>
 <li>If the BL has more than one container, iterate through all containers, store each container's milestone info in a dictionary, and append it to a list of dictionaries</li>
 <li>If the BL has exactly one container, store that container's info in a dictionary and append it to the list of dictionaries</li>
 <li>Rename the dictionary keys to match the database's column template</li>
 <li>Insert the list of dictionaries into MongoDB or export it to an Excel file</li>
</ol>

In [102]:
# Scrape CNC container-tracking milestones for every BL in bl_list.
hasil_akhir = []        # not used by this cell; kept so the name stays defined
list_of_dict_fix2 = []  # records after renaming keys to the database column names
list_of_dict = []       # raw records, one dictionary per container

TRACKING_URL = "https://www.cnc-line.com/ebusiness/tracking/search"
SEARCH_BOX_XPATH = '/html/body/div[2]/main/section[1]/div/div/form[3]/fieldset/div/div[1]/span[1]/input[2]'
SEARCH_BUTTON_XPATH = '/html/body/div[2]/main/section[1]/div/div/form[3]/fieldset/div/div[2]/button'
# "{}" is the 1-based position of the container entry in the result list.
MORE_BUTTON_XPATH = '/html/body/div[2]/main/section[2]/div/div/ul/li[{}]/article/section[2]/div[1]/div/label'

# Every container object inside the page's "model" script starts with this text,
# so it doubles as the delimiter used to cut the script into per-container JSON.
index_separator = '{"IsFavoriteContainer":'


def _container_numbers(soup):
    """Return the container numbers listed on a BL result page.

    Keeps the text of every <span> inside a <dl class="container-ref"> whose
    length is exactly 11 characters.
    """
    # attrs must be a dict; the previous set literal {'class', 'container-ref'}
    # only matched by accident (it was treated as a list of class names).
    return [
        span.text
        for ref in soup.find_all('dl', {'class': 'container-ref'})
        for span in ref.find_all('span')
        if len(span.text) == 11
    ]


def _model_script(soup):
    """Return the last <script> found inside any <div class="l-zone__main">, or None."""
    found = None
    for zone in soup.find_all("div", {"class": "l-zone__main"}):
        for tag in zone.find_all('script'):
            found = tag
    return found


def _container_payloads(script_text):
    """Cut the text of the "model" script into one parsed JSON object per container."""
    declarations = script_text.split("var ")
    # The chunk after the first "var " holds the model assignment. The [1:-6]
    # slice trims the characters wrapping the container objects (presumably a
    # leading space and the closing tail of the statement - TODO confirm).
    body = declarations[1].replace("model =", "").replace("\n", "")[1:-6]
    payloads = []
    # The first piece is whatever precedes the first container object; skip it.
    for chunk in body.split(index_separator)[1:]:
        text = index_separator + chunk
        if text.endswith(","):
            # drop the comma that separated this object from the next one
            text = text[:-1]
        payloads.append(json.loads(text))
    return payloads


def _tracking_record(bl_number, payload):
    """Build the tracking dictionary for one container from its JSON payload."""
    details = payload['ContainerMoveDetails']
    routing = details['routingInformation']
    # The last five characters of the port names are cut off, as before.
    origin = routing['portOfLoading']['name'][:-5]
    destination = routing['portOfDischarge']['name'][:-5]
    record = {
        "Liners": "CNC",
        "BL Number": bl_number,
        "Container Number": payload['ContainerReference'],
        "From": origin,
        "To": destination,
    }
    for move in details['pastMoves'] + details['currentMoves'] + details['futureMoves']:
        place = move['location']['name']
        if place == origin:
            suffix = " ORIGIN"
        elif place == destination:
            suffix = " DESTINATION"
        else:
            # moves at intermediate locations are not recorded
            continue
        # keep only the first 10 characters of the status date
        record[move['containerStatus'] + suffix] = move['containerStatusDate'][:10]
    return record


# The options object must be handed to the driver, otherwise the window-size
# argument is silently ignored.
options = uc.ChromeOptions()
options.add_argument("--window-size=1920,1280")
driver = uc.Chrome(options=options)
try:
    driver.get(TRACKING_URL)

    for bls in tqdm(bl_list):
        try:
            time.sleep(random.randrange(2, 6))
            # type the next BL number into the search box
            search_box = driver.find_element(By.XPATH, SEARCH_BOX_XPATH)
            search_box.clear()
            search_box.send_keys(bls)
            time.sleep(1.1)
            search_button = driver.find_element(By.XPATH, SEARCH_BUTTON_XPATH)
            time.sleep(1)
            search_button.click()

            time.sleep(random.randrange(3, 7))
            result_soup = BeautifulSoup(driver.page_source, 'lxml')
            list_of_containers = _container_numbers(result_soup)
            print(bls, 'consist of ', len(list_of_containers),' containers')

            # Click the toggle label of every container entry twice, from the last
            # entry up to the first (presumably this makes the page load each
            # container's move details - TODO confirm).
            for position in range(len(list_of_containers), 0, -1):
                more_button = WebDriverWait(driver, 30).until(
                    EC.element_to_be_clickable((By.XPATH, MORE_BUTTON_XPATH.format(position)))
                )
                more_button.click()
                time.sleep(random.randrange(2, 4))
                more_button.click()

            time.sleep(5)
            detail_soup = BeautifulSoup(driver.page_source, 'lxml')

            # Look the script up afresh for every BL: reusing the one left over
            # from the previous iteration would file the previous BL's
            # containers under this BL number.
            script = _model_script(detail_soup)
            if script is None:
                raise ValueError("tracking model script not found on the page")

            # Build all records first so a BL that fails midway contributes
            # nothing; it is then listed in gagal only.
            records = [_tracking_record(bls, payload) for payload in _container_payloads(script.text)]
            list_of_dict.extend(records)

        except Exception as e:
            # best effort: report the failed BL and carry on with the next one
            print(e)
            print("{} GAGAL!!".format(bls))
            gagal.append(bls)
finally:
    # always close the browser, even when the run is interrupted
    driver.quit()

# scraped milestone key -> database column name
key_mapping = {
    'EmptyDeliveredToShipper ORIGIN': 'ATD',
    'ActualVesselArrival DESTINATION': 'ATA',
    'ContainerToConsignee DESTINATION': 'Container Release',
    'EmptyInDepotMEA DESTINATION': 'Container Return'
}

# Rename mapped keys; every other key is kept unchanged.
for item in list_of_dict:
    list_of_dict_fix2.append({key_mapping.get(key, key): value for key, value in item.items()})

  0%|          | 0/30 [00:00<?, ?it/s]

ARM0306500 consist of  13  containers


  3%|▎         | 1/30 [00:55<26:59, 55.84s/it]

ARM0306330 consist of  2  containers


  7%|▋         | 2/30 [01:21<17:38, 37.80s/it]

ARM0307999 consist of  3  containers


 10%|█         | 3/30 [01:49<15:03, 33.46s/it]

ARM0301801 consist of  0  containers


 13%|█▎        | 4/30 [02:08<11:59, 27.66s/it]

ARM0301769 consist of  5  containers


 17%|█▋        | 5/30 [02:40<12:16, 29.46s/it]

ARM0303415 consist of  0  containers


 20%|██        | 6/30 [02:58<10:14, 25.60s/it]

ARM0300209 consist of  0  containers


 23%|██▎       | 7/30 [03:17<08:54, 23.22s/it]

ARM0308202 consist of  4  containers


 27%|██▋       | 8/30 [03:48<09:29, 25.88s/it]

ARM0300326 consist of  2  containers


 30%|███       | 9/30 [04:12<08:48, 25.15s/it]

ARM0306618 consist of  2  containers


 33%|███▎      | 10/30 [04:37<08:20, 25.03s/it]

ARM0300876 consist of  4  containers


 37%|███▋      | 11/30 [05:03<08:05, 25.55s/it]

ARM0302741 consist of  3  containers


 40%|████      | 12/30 [05:33<08:00, 26.72s/it]

ARM0308333 consist of  0  containers


 43%|████▎     | 13/30 [05:51<06:50, 24.13s/it]

ARM0305653 consist of  0  containers


 47%|████▋     | 14/30 [06:08<05:53, 22.08s/it]

ARM0302772 consist of  0  containers


 50%|█████     | 15/30 [06:27<05:17, 21.15s/it]

ARM0305513 consist of  3  containers


 53%|█████▎    | 16/30 [06:54<05:18, 22.76s/it]

ARM0302106 consist of  2  containers


 57%|█████▋    | 17/30 [07:18<05:00, 23.14s/it]

ARM0303661 consist of  2  containers


 60%|██████    | 18/30 [07:42<04:40, 23.35s/it]

ARM0308716 consist of  0  containers


 63%|██████▎   | 19/30 [08:01<04:03, 22.12s/it]

ARM0307246 consist of  2  containers


 67%|██████▋   | 20/30 [08:25<03:47, 22.75s/it]

ARM0302585 consist of  5  containers


 70%|███████   | 21/30 [08:57<03:50, 25.57s/it]

ARM0301958 consist of  0  containers


 73%|███████▎  | 22/30 [09:17<03:09, 23.75s/it]

ARM0303044 consist of  3  containers


 77%|███████▋  | 23/30 [09:43<02:51, 24.47s/it]

ARM0303785 consist of  2  containers


 80%|████████  | 24/30 [10:04<02:21, 23.54s/it]

ARM0303224 consist of  0  containers


 83%|████████▎ | 25/30 [10:24<01:52, 22.42s/it]

ARM0305556 consist of  2  containers


 87%|████████▋ | 26/30 [10:49<01:32, 23.17s/it]

ARM0301802 consist of  2  containers


 90%|█████████ | 27/30 [11:15<01:12, 24.03s/it]

ARM0300221 consist of  0  containers


 93%|█████████▎| 28/30 [11:33<00:44, 22.35s/it]

ARM0306454 consist of  5  containers


 97%|█████████▋| 29/30 [12:06<00:25, 25.43s/it]

ARM0302330 consist of  0  containers


100%|██████████| 30/30 [12:24<00:00, 24.82s/it]


In [105]:
# Connect to MongoDB and store the scraped records.
# mongo_table_initiation and insert_many_mongo are project-local helpers from
# the mongoinit module; judging by the printed output they prepare today's
# collection and bulk-insert the documents - TODO confirm against mongoinit.
from mongoinit import mongo_table_initiation, insert_many_mongo

mongo_table_initiation()
# list_of_dict_fix2 holds the key-renamed records built by the scraping cell
insert_many_mongo(list_of_dict_fix2)

Today's Collection Name ===>  all_tracking_Oct-02-2023
Today's Collection Has Been Made
Inserting Many Complete!!
