In [1]:
# web scraping
from bs4 import BeautifulSoup

# date parser
from datetime import datetime
from dateutil.parser import parse

# web driver
import undetected_chromedriver as uc 
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# random wait
import time
import random

# mongodb connection
from pymongo import MongoClient

# progress bar
from tqdm import tqdm

# saving to excel
import pandas as pd

import requests
from io import BytesIO

import json

In [2]:
# date parser function
def parse_date(date_str):
    date1 = datetime.strptime(date_str, '%d-%b-%Y')
    output_date_string = date1.strftime("%Y-%m-%d")
    return output_date_string

In [3]:
# class to determine ganjil genap
def is_date(string, fuzzy=False):
    try: 
        parse(string, fuzzy=fuzzy)
        return True
    except ValueError:
        return False

In [4]:
# reading all BL number from google sheet
from acquiringbl import takingBL
bl_list = takingBL("CNC")

In [5]:
# view bl list
bl_list

['ARM0293708', 'ARM0293980', 'ARM0296321']

In [6]:
# bl_list = ['ARM0301769', 'ARM0302585']

<h3><strong>Web Scraping Flow</strong></h3>
<ol>
 <li>Acquiring every BL Number that want to be track and store it into a list of BL</li>
 <li>Iterate all of the list and search it through the liners web</li>
 <li>Take list of container number and store it into a list</li>
 <li>Using BS4 to scrape web data and parse it</li>
 <li>Change the milestone key and store the list of dictionaries to MongoDB</li>

</ol>

In [7]:
hasil_akhir = []
list_of_dict_fix2 = []
list_of_dict = []
gagal=[]
# web driving
options = Options()
options.add_argument("--window-size=1920,1280")
driver = uc.Chrome()
driver.get("https://www.cnc-line.com/ebusiness/tracking")

for q, bls in enumerate(tqdm(bl_list)):
    try:
        time.sleep(random.randrange(2,6))
        
        # inputing new bl
        search_box2 = driver.find_element(By. XPATH, '/html/body/div[2]/main/section[1]/div/div/form[3]/fieldset/div/div[1]/span[1]/input[2]')
        search_box2.clear()
        search_box2.send_keys(bls)
        time.sleep(1.1)
        search_button = driver.find_element(By.XPATH, '/html/body/div[2]/main/section[1]/div/div/form[3]/fieldset/div/div[2]/button')
        time.sleep(1)
        search_button.click()
        time.sleep(random.randrange(3,7))

        # taking data from web 
        soup = BeautifulSoup(driver.page_source, 'lxml')

        # finding containers in bl
        containers_in_bl = soup.find_all('dl',{'class','container-ref'})
        list_of_containers = []
        for tag in containers_in_bl:
            for e, f  in enumerate(tag.find_all('span')):
                if len(f.text) == 11:
                    list_of_containers.append(f.text)

        # checking how many container consisting in 1 bl 
        print(bls, 'consist of ', len(list_of_containers),' containers')
        num_of_ctr_idx = len(list_of_containers) 
        
        # clicking more button
        current_dict = {}
        
        # click all
        for button in range(len(list_of_containers)):
            # logic if cant click scroll up 20px, and then click untill the button is viewable and clickable
            while True:
                if num_of_ctr_idx == 0:
                    break
                else:
                    try:
                        more_button= WebDriverWait(driver, 30).until(EC.element_to_be_clickable((By.XPATH, '/html/body/div[2]/main/section[2]/div/div/ul/li[{}]/article/section[2]/div[1]/div/label'.format(num_of_ctr_idx))))  
                        more_button.click()
                        print(num_of_ctr_idx, " CLICKED")
                        time.sleep(random.randrange(2,4))
                        more_button.click()
                        num_of_ctr_idx = num_of_ctr_idx-1
                    except Exception as e:
                        print(e)
                        driver.execute_script('window.scrollBy(0, -20);')
                        print("{} GAGAL CLICK".format(bls))
                       
        time.sleep(5)
        soup1 = BeautifulSoup(driver.page_source, 'lxml')
        
        div_of_script = soup1.find_all("div", {"class":"l-zone__main"})
        for tag in div_of_script:
            for t, tag2 in enumerate(tag.find_all('script')):
                script = tag2
        
        index_separator = '{"IsFavoriteContainer":'
        var = script.text
        list_sementara = var.split("var ")
        str_milestone = list_sementara[1].replace("model =","").replace("\n","")[1:-6]
        list_str = str_milestone.split(index_separator)
        list_str.pop(0)

        for j, j_container in enumerate(list_str):
            json_text_string = index_separator + j_container

            if json_text_string[-1] == ",":
                json_text_string = json_text_string[:-1]

            json_data = json.loads(json_text_string)
            
            ctr_number = json_data['ContainerReference']
        
            origin = json_data['ContainerMoveDetails']['routingInformation']['portOfLoading']['name'][:-5]
            destination = json_data['ContainerMoveDetails']['routingInformation']['portOfDischarge']['name'][:-5]
            milestones = json_data['ContainerMoveDetails']['pastMoves'] + json_data['ContainerMoveDetails']['currentMoves'] + json_data['ContainerMoveDetails']['futureMoves'] 
            current_dict = {
                "Liners" : "CNC",
                "BL Number" : bls,
                "Container Number" : ctr_number,
                "From" : origin,
                "To" : destination,
                }
            for d, milestone in enumerate(milestones):
                if milestone['location']['name'] == origin:
                    mils = milestone['containerStatus'] + " ORIGIN"
                    case = {mils : milestone['containerStatusDate'][:10]}
                    current_dict.update(case)
                elif milestone['location']['name'] == destination:
                    mils = milestone['containerStatus'] + " DESTINATION"
                    case = {mils : milestone['containerStatusDate'][:10]}
                    current_dict.update(case)
            
            list_of_dict.append(current_dict)            
    
    # for failed scraping and further analysis for error
    except Exception as e:
        print(e)
        print("{} GAGAL!!".format(bls))
        gagal.append(bls)

# changing dict key to db key
key_mapping = {
    'ActualVesselDeparture ORIGIN': 'ATD',
    'ActualVesselArrival DESTINATION': 'ATA',
    'ContainerToConsignee DESTINATION': 'Container Release',
    'EmptyInDepotMEA DESTINATION': 'Container Return'
}

# itterate dictionaries and change the keys to match DB's key
for item in list_of_dict:
    transformed_item = {key_mapping.get(key, key): value for key, value in item.items() if key in key_mapping or key not in key_mapping}
    list_of_dict_fix2.append(transformed_item)   

  0%|          | 0/3 [00:00<?, ?it/s]

ARM0293708 consist of  30  containers
30  CLICKED
29  CLICKED
28  CLICKED
27  CLICKED
Message: element click intercepted: Element <label for="card-details-switch-25">...</label> is not clickable at point (1140, 10). Other element would receive the click: <div class="container-fluid">...</div>
  (Session info: chrome=120.0.6099.200)
Stacktrace:
	GetHandleVerifier [0x00AC6EE3+174339]
	(No symbol) [0x009F0A51]
	(No symbol) [0x00706FF6]
	(No symbol) [0x0073E48E]
	(No symbol) [0x0073D09E]
	(No symbol) [0x0073B5F8]
	(No symbol) [0x0073AD7F]
	(No symbol) [0x00732B4E]
	(No symbol) [0x0075700C]
	(No symbol) [0x007325B0]
	(No symbol) [0x00757414]
	(No symbol) [0x0076A104]
	(No symbol) [0x00756DA6]
	(No symbol) [0x00731034]
	(No symbol) [0x00731F8D]
	GetHandleVerifier [0x00B64B1C+820540]
	sqlite3_dbdata_init [0x00C253EE+653550]
	sqlite3_dbdata_init [0x00C24E09+652041]
	sqlite3_dbdata_init [0x00C197CC+605388]
	sqlite3_dbdata_init [0x00C25D9B+656027]
	(No symbol) [0x009FFE6C]
	(No symbol) [0x009F83

 33%|███▎      | 1/3 [03:03<06:06, 183.31s/it]

ARM0293980 consist of  31  containers
31  CLICKED
30  CLICKED
29  CLICKED
28  CLICKED
27  CLICKED
Message: element click intercepted: Element <label for="card-details-switch-25">...</label> is not clickable at point (1140, 4). Other element would receive the click: <div class="container-fluid">...</div>
  (Session info: chrome=120.0.6099.200)
Stacktrace:
	GetHandleVerifier [0x00AC6EE3+174339]
	(No symbol) [0x009F0A51]
	(No symbol) [0x00706FF6]
	(No symbol) [0x0073E48E]
	(No symbol) [0x0073D09E]
	(No symbol) [0x0073B5F8]
	(No symbol) [0x0073AD7F]
	(No symbol) [0x00732B4E]
	(No symbol) [0x0075700C]
	(No symbol) [0x007325B0]
	(No symbol) [0x00757414]
	(No symbol) [0x0076A104]
	(No symbol) [0x00756DA6]
	(No symbol) [0x00731034]
	(No symbol) [0x00731F8D]
	GetHandleVerifier [0x00B64B1C+820540]
	sqlite3_dbdata_init [0x00C253EE+653550]
	sqlite3_dbdata_init [0x00C24E09+652041]
	sqlite3_dbdata_init [0x00C197CC+605388]
	sqlite3_dbdata_init [0x00C25D9B+656027]
	(No symbol) [0x009FFE6C]
	(No symbol

 67%|██████▋   | 2/3 [06:28<03:16, 196.04s/it]

ARM0296321 consist of  20  containers
20  CLICKED
19  CLICKED
18  CLICKED
17  CLICKED
Message: element click intercepted: Element <label for="card-details-switch-15">...</label> is not clickable at point (1140, 10). Other element would receive the click: <div class="container-fluid">...</div>
  (Session info: chrome=120.0.6099.200)
Stacktrace:
	GetHandleVerifier [0x00AC6EE3+174339]
	(No symbol) [0x009F0A51]
	(No symbol) [0x00706FF6]
	(No symbol) [0x0073E48E]
	(No symbol) [0x0073D09E]
	(No symbol) [0x0073B5F8]
	(No symbol) [0x0073AD7F]
	(No symbol) [0x00732B4E]
	(No symbol) [0x0075700C]
	(No symbol) [0x007325B0]
	(No symbol) [0x00757414]
	(No symbol) [0x0076A104]
	(No symbol) [0x00756DA6]
	(No symbol) [0x00731034]
	(No symbol) [0x00731F8D]
	GetHandleVerifier [0x00B64B1C+820540]
	sqlite3_dbdata_init [0x00C253EE+653550]
	sqlite3_dbdata_init [0x00C24E09+652041]
	sqlite3_dbdata_init [0x00C197CC+605388]
	sqlite3_dbdata_init [0x00C25D9B+656027]
	(No symbol) [0x009FFE6C]
	(No symbol) [0x009F83

100%|██████████| 3/3 [08:38<00:00, 173.00s/it]


In [11]:
list_of_dict_fix2

[{'Liners': 'CNC',
  'BL Number': 'ARM0293708',
  'Container Number': 'SEKU4348030',
  'From': 'JAKARTA',
  'To': 'YANGON',
  'EmptyDeliveredToShipper ORIGIN': '2023-04-12',
  'Readytobeloaded ORIGIN': '2023-04-13',
  'LoadedonboardXOF ORIGIN': '2023-04-21',
  'ATD': '2023-04-21',
  'Discharged DESTINATION': '2023-05-22',
  '_id': ObjectId('659e009730ae4566cb34b3ef')},
 {'Liners': 'CNC',
  'BL Number': 'ARM0293708',
  'Container Number': 'CMAU5400909',
  'From': 'JAKARTA',
  'To': 'YANGON',
  'EmptyDeliveredToShipper ORIGIN': '2023-04-12',
  'Readytobeloaded ORIGIN': '2023-04-13',
  'LoadedonboardXOF ORIGIN': '2023-04-21',
  'ATD': '2023-04-21',
  'Discharged DESTINATION': '2023-05-22',
  '_id': ObjectId('659e009730ae4566cb34b3f0')},
 {'Liners': 'CNC',
  'BL Number': 'ARM0293708',
  'Container Number': 'CMAU5590840',
  'From': 'JAKARTA',
  'To': 'YANGON',
  'EmptyDeliveredToShipper ORIGIN': '2023-04-12',
  'Readytobeloaded ORIGIN': '2023-04-13',
  'LoadedonboardXOF ORIGIN': '2023-04-21

In [13]:
# connect to mongodb
from mongoinit import mongo_table_initiation, insert_many_mongo

mongo_table_initiation()
insert_many_mongo(list_of_dict_fix2)

Today's Collection Name ===>  all_tracking_Jan-10-2024
Today's Collection Has Been Made


BulkWriteError: batch op errors occurred, full error: {'writeErrors': [{'index': 0, 'code': 11000, 'errmsg': "E11000 duplicate key error collection: bl_tracking.all_tracking_Jan-10-2024 index: _id_ dup key: { _id: ObjectId('659e009730ae4566cb34b3ef') }", 'keyPattern': {'_id': 1}, 'keyValue': {'_id': ObjectId('659e009730ae4566cb34b3ef')}, 'op': {'Liners': 'CNC', 'BL Number': 'ARM0293708', 'Container Number': 'SEKU4348030', 'From': 'JAKARTA', 'To': 'YANGON', 'EmptyDeliveredToShipper ORIGIN': '2023-04-12', 'Readytobeloaded ORIGIN': '2023-04-13', 'LoadedonboardXOF ORIGIN': '2023-04-21', 'ATD': '2023-04-21', 'Discharged DESTINATION': '2023-05-22', '_id': ObjectId('659e009730ae4566cb34b3ef')}}], 'writeConcernErrors': [], 'nInserted': 0, 'nUpserted': 0, 'nMatched': 0, 'nModified': 0, 'nRemoved': 0, 'upserted': []}

In [10]:
# checking failed bl
gagal

[]

In [33]:
# today's date
# initiating mongo
from pymongo import MongoClient
from datetime import date
# connect to mongodb
from mongoinit import mongo_table_initiation, insert_many_mongo

mongo_table_initiation()

today = date.today()
date_today = today.strftime("%b-%d-%Y")
collection_name = "all_tracking_" + date_today + 'REVISI 3'

cluster = MongoClient("mongodb+srv://tobiassion:tobiassion@cluster0.u2vzz3d.mongodb.net/?retryWrites=true&w=majority")
db = cluster["bl_tracking"]
collection = db[collection_name]
collection.insert_many(list_of_dict_fix2)

Today's Collection Name ===>  all_tracking_Oct-09-2023
Today's Collection Has Been Made


<pymongo.results.InsertManyResult at 0x1cdbf1cc5e0>