In [1]:
# standard library
import os
import random
import time
from datetime import datetime

# date parser
from dateutil.parser import parse

# web scraping
from bs4 import BeautifulSoup

# web driver
import undetected_chromedriver as uc
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

# mongodb connection
from pymongo import MongoClient

# progress bar
from tqdm import tqdm

# saving to excel
import pandas as pd

In [2]:
# date parser function
def parse_date(date_str):
    """Convert a ``DD-Mon-YYYY`` string (e.g. ``'17-MAY-2023'``) to ``YYYY-MM-DD``.

    Raises:
        ValueError: if ``date_str`` does not match the ``%d-%b-%Y`` format.
    """
    return datetime.strptime(date_str, '%d-%b-%Y').strftime("%Y-%m-%d")

In [3]:
# helper that tells whether a string can be parsed as a date
def is_date(string, fuzzy=False):
    """Return True when dateutil's ``parse`` accepts ``string``, else False.

    Args:
        string: text to test.
        fuzzy: forwarded to ``parse(..., fuzzy=fuzzy)``.

    Only ``ValueError`` is treated as "not a date"; any other exception
    raised by ``parse`` propagates to the caller.
    """
    try:
        parse(string, fuzzy=fuzzy)
    except ValueError:
        return False
    else:
        return True

In [4]:
# Load the BL numbers to track from the 'CNC' column of the workbook.
df = pd.read_excel('BL Numbers.xlsx')
parse_bl = df['CNC'].tolist()
# Stringify every cell, remove spaces, and drop the cells that stringify to
# 'nan' (i.e. cells pandas read as missing).
bl_list = [
    bl_number
    for bl_number in (str(raw_value).replace(' ', '') for raw_value in parse_bl)
    if bl_number != 'nan'
]

In [5]:
# view bl list (the cleaned BL number strings built in the previous cell)
bl_list

['ARM0296363',
 'ARM0302106',
 'ARM0300326',
 'ARM0300876',
 'ARM0300209',
 'ARM0300221',
 'ARM0301801',
 'ARM0303785',
 'ARM0303661',
 'ARM0301769',
 'ARM0301802',
 'ARM0301958',
 'ARM0302330',
 'ARM0302741',
 'ARM0302585',
 'ARM0302772',
 'ARM0303044',
 'ARM0303415',
 'ARM0303224',
 'ARM0305556',
 'ARM0305653',
 'ARM0305513',
 'ARM0306330',
 'ARM0306500']

In [6]:
# collecting failed BL to track
# ("gagal" is Indonesian for "failed"); later cells append the BL number of
# every shipment that could not be scraped or post-processed to this list.
gagal = []

<h3><strong>Web Scraping Flow</strong></h3>
<ol>
 <li>Acquire every BL number to track and store them in a list</li>
 <li>Iterate over the list and search for each BL number on the liner's website</li>
 <li>Collect the container numbers shown for the BL and store them in a list</li>
 <li>If the BL has more than one container, iterate through all containers, store each container's milestone info in a dictionary, and append it to a list of dictionaries</li>
 <li>If the BL has exactly one container, store its milestone info in a dictionary and append it to the list of dictionaries</li>
 <li>Modify the dictionaries to match the database's column template</li>
 <li>Insert the list of dictionaries into MongoDB or export it to an Excel file</li>
</ol>

In [7]:
# Scraped records: one dict per container ("hasil akhir" = "final result").
hasil_akhir = []

# Absolute XPaths of the page controls. The first pair is used for the very
# first search right after the page is loaded; the second pair (section[1])
# is used for every search issued from inside the loop.
FIRST_SEARCH_BOX_XPATH = '/html/body/div[2]/main/section/div/div/form[3]/fieldset/div/div[1]/span[1]/input[2]'
FIRST_SEARCH_BUTTON_XPATH = '/html/body/div[2]/main/section/div/div/form[3]/fieldset/div/div[2]/button'
LOOP_SEARCH_BOX_XPATH = '/html/body/div[2]/main/section[1]/div/div/form[3]/fieldset/div/div[1]/span[1]/input[2]'
LOOP_SEARCH_BUTTON_XPATH = '/html/body/div[2]/main/section[1]/div/div/form[3]/fieldset/div/div[2]/button'
MORE_BUTTON_XPATH = '/html/body/div[2]/main/section[2]/div/div/ul/li[1]/article/section[2]/div[1]/div/label'

# Exact `class` attribute strings of the <tr> rows to collect.
# 'done' / 'current' rows are stored as milestones, 'inactive' rows are stored
# with an "EXPECTED " prefix.
MILESTONE_ROW_CLASSES = (
    'k-master-row done',
    'k-alt k-master-row done',
    'k-alt k-master-row current',
    'k-master-row current',
)
# The multi-container branch used to search for 'k-master-row inactivek-alt',
# which differs from the 'k-alt k-master-row inactive' string used by the
# single-container branch, so alternate expected rows were never collected.
# Both branches now share this tuple.
EXPECTED_ROW_CLASSES = (
    'k-alt k-master-row inactive',
    'k-master-row inactive',
)


def _find_rows(soup, class_strings):
    """Return the <tr> tags matching each class string, grouped in the given order."""
    rows = []
    for class_string in class_strings:
        rows += soup.find_all('tr', {'class': class_string})
    return rows


def _parse_event_rows(rows):
    """Return three parallel lists (dates, movements, cities) read from ``rows``."""
    dates, movements, cities = [], [], []
    for row in rows:
        for cell in row.find_all('span', {'class': 'calendar'}):
            dates.append(cell.text[-11:])   # keep the trailing 'DD-MON-YYYY'
        for cell in row.find_all('span', {'class': 'capsule'}):
            movements.append(cell.text)
        for cell in row.find_all('div', {'class': 'location row js-bubble'}):
            cities.append(cell.text[:-16])  # drop the 16-character suffix after the city
    return dates, movements, cities


def _port_events(rows, headline, port_indices, prefix=''):
    """Build ``{prefix + movement + ' ' + city: date}`` for events at the ports.

    An event is kept only when its city equals ``headline[i].text[:-5]`` for
    one of the indices in ``port_indices`` (checked lazily, in that order).
    """
    dates, movements, cities = _parse_event_rows(rows)
    events = {}
    for idx in range(len(dates)):
        if any(cities[idx] == headline[h].text[:-5] for h in port_indices):
            events[prefix + movements[idx] + ' ' + cities[idx]] = dates[idx]
    return events


def _sort_by_date(events):
    """Return a new dict with the items of ``events`` ordered by their date value."""
    return dict(sorted(events.items(), key=lambda item: parse_date(item[1])))


# web scripting
options = Options()
options.add_argument("--window-size=1920,1280")
# NOTE(review): `options` is never passed to uc.Chrome(), so the window size
# above has no effect. Left unchanged on purpose: a different viewport may
# change the page layout the absolute XPaths rely on -- confirm before wiring it in.
driver = uc.Chrome()
try:
    driver.get("https://www.cnc-line.com/ebusiness/tracking/search")

    # First search with a randomly chosen BL; every later search goes through
    # the section[1] form addressed inside the loop.
    search_box = driver.find_element(By.XPATH, FIRST_SEARCH_BOX_XPATH)
    search_box.send_keys(bl_list[random.randrange(0, len(bl_list))])

    # click search
    time.sleep(1)
    search_button = driver.find_element(By.XPATH, FIRST_SEARCH_BUTTON_XPATH)
    time.sleep(1)
    search_button.click()

    for bls in tqdm(bl_list):
        try:
            time.sleep(random.randrange(2, 5))
            # enter the next BL number
            search_box2 = driver.find_element(By.XPATH, LOOP_SEARCH_BOX_XPATH)
            search_box2.clear()
            search_box2.send_keys(bls)
            time.sleep(1.1)
            search_button = driver.find_element(By.XPATH, LOOP_SEARCH_BUTTON_XPATH)
            time.sleep(1)
            search_button.click()

            time.sleep(3)
            # taking data from web
            soup = BeautifulSoup(driver.page_source, 'lxml')

            # finding containers in bl: every 11-character <span> text inside
            # a <dl class="container-ref"> is taken as a container number
            list_of_containers = [
                span.text
                for tag in soup.find_all('dl', {'class': 'container-ref'})
                for span in tag.find_all('span')
                if len(span.text) == 11
            ]

            # web scraping flow if bl have >1 container number
            if len(list_of_containers) > 1:
                print(bls, 'consist of ', len(list_of_containers),' containers' )
                # NOTE(review): only the first list item (li[1]) is expanded; the
                # milestones read from it are copied to every container of the BL.
                more_button = WebDriverWait(driver, 30).until(
                    EC.element_to_be_clickable((By.XPATH, MORE_BUTTON_XPATH))
                )
                more_button.click()

                time.sleep(3)
                soup1 = BeautifulSoup(driver.page_source, 'lxml')
                headline = soup1.find_all('strong')

                # milestones first, then expected events, then order by date
                current_dict = _port_events(
                    _find_rows(soup1, MILESTONE_ROW_CLASSES), headline, (1, 2)
                )
                current_dict.update(_port_events(
                    _find_rows(soup1, EXPECTED_ROW_CLASSES), headline, (2, 1), prefix="EXPECTED "
                ))
                current_dict = _sort_by_date(current_dict)

                # cities from/to: the first 3 characters of each timeline entry
                # become the key, the rest (minus a 5-character suffix) the value
                route = {}
                for tag in soup1.find_all('ul', {'class': 'timeline--items'}):
                    for city in tag.find_all('div', {'class': 'timeline--item-description'}):
                        label = city.text.replace("\n", "")
                        route[label[:3]] = label[3:-5]

                for container in list_of_containers:
                    appending_dict = current_dict.copy()
                    appending_dict.update(route)
                    appending_dict.update({"Container Number": container})
                    appending_dict.update({"BL Number": bls})
                    appending_dict.update({"Liners": "CNC"})

                    # appending to list of dict
                    hasil_akhir.append(appending_dict)
                    print(bls, container, " DONE")
                time.sleep(1)
            else:
                print(bls, 'consist of 1 containers' )
                time.sleep(1)
                soup2 = BeautifulSoup(driver.page_source, 'lxml')
                headline = soup2.find_all('strong')

                # this branch reads the ports from headline[2] / headline[3]
                current_dict = _port_events(
                    _find_rows(soup2, MILESTONE_ROW_CLASSES), headline, (2, 3)
                )
                current_dict.update(_port_events(
                    _find_rows(soup2, EXPECTED_ROW_CLASSES), headline, (3, 2), prefix="EXPECTED "
                ))

                # updating current dict with liners, BL number, Ctr Number, POL, POD
                current_dict_fix = _sort_by_date(current_dict)
                current_dict_fix.update({"Liners": "CNC"})
                current_dict_fix.update({"BL Number": bls})
                current_dict_fix.update({"Container Number": headline[0].text})
                current_dict_fix.update({"POL": headline[2].text[:-5]})
                current_dict_fix.update({"POD": headline[3].text[:-5]})

                # appending to list of dict
                hasil_akhir.append(current_dict_fix)
                print(bls, headline[0].text, " DONE")
                time.sleep(1)

        except Exception as e:
            # Best effort per BL: report the error, remember the BL as failed
            # ("gagal") and continue with the next one.
            print(e)
            print("{} GAGAL!!".format(bls))
            gagal.append(bls)
finally:
    # always end the browser session, even if the run is interrupted
    driver.quit()

  0%|          | 0/24 [00:00<?, ?it/s]

ARM0296363 consist of  5  containers
ARM0296363 CMAU8827659  DONE
ARM0296363 CMAU8535106  DONE
ARM0296363 TCKU6232468  DONE
ARM0296363 SEKU5921803  DONE
ARM0296363 TXGU7146680  DONE


  4%|▍         | 1/24 [00:17<06:46, 17.69s/it]

ARM0302106 consist of  2  containers
ARM0302106 TXGU5269428  DONE
ARM0302106 CMAU8582181  DONE


  8%|▊         | 2/24 [00:36<06:39, 18.17s/it]

ARM0300326 consist of  2  containers
ARM0300326 CAAU6073180  DONE
ARM0300326 CMAU8750710  DONE


 12%|█▎        | 3/24 [00:54<06:20, 18.11s/it]

ARM0300876 consist of  4  containers
ARM0300876 TLLU4797120  DONE
ARM0300876 FFAU2162030  DONE
ARM0300876 TRHU6499283  DONE
ARM0300876 TLLU4927461  DONE


 17%|█▋        | 4/24 [01:11<05:55, 17.79s/it]

ARM0300209 consist of 1 containers
ARM0300209 CMAU4347395  DONE


 21%|██        | 5/24 [01:24<05:06, 16.13s/it]

ARM0300221 consist of 1 containers
ARM0300221 CMAU6803365  DONE


 25%|██▌       | 6/24 [01:38<04:34, 15.27s/it]

ARM0301801 consist of 1 containers
ARM0301801 TGBU6979463  DONE


 29%|██▉       | 7/24 [01:53<04:20, 15.31s/it]

ARM0303785 consist of  2  containers
ARM0303785 TLLU7706696  DONE
ARM0303785 TRHU8958817  DONE


 33%|███▎      | 8/24 [02:09<04:07, 15.49s/it]

ARM0303661 consist of  2  containers
ARM0303661 ECMU9920352  DONE
ARM0303661 CMAU8551684  DONE


 38%|███▊      | 9/24 [02:26<04:00, 16.02s/it]

ARM0301769 consist of  5  containers
ARM0301769 TEMU6511220  DONE
ARM0301769 CMAU3343538  DONE
ARM0301769 CMAU7072835  DONE
ARM0301769 CMAU4806576  DONE
ARM0301769 TCNU5715229  DONE


 42%|████▏     | 10/24 [02:44<03:49, 16.41s/it]

ARM0301802 consist of  2  containers
ARM0301802 CMAU8936855  DONE
ARM0301802 GCXU5314769  DONE


 46%|████▌     | 11/24 [03:01<03:36, 16.66s/it]

ARM0301958 consist of 1 containers
ARM0301958 CMAU6906471  DONE


 50%|█████     | 12/24 [03:15<03:11, 15.92s/it]

ARM0302330 consist of 1 containers
ARM0302330 TRHU7995570  DONE


 54%|█████▍    | 13/24 [03:30<02:52, 15.64s/it]

ARM0302741 consist of  3  containers
ARM0302741 TRHU7913253  DONE
ARM0302741 TRHU8260370  DONE
ARM0302741 CMAU7148930  DONE


 58%|█████▊    | 14/24 [03:46<02:38, 15.87s/it]

ARM0302585 consist of  5  containers
ARM0302585 CMAU6609468  DONE
ARM0302585 TRHU5006700  DONE
ARM0302585 CMAU8852498  DONE
ARM0302585 SEGU6359023  DONE
ARM0302585 SEKU5761433  DONE


 62%|██████▎   | 15/24 [04:02<02:21, 15.72s/it]

ARM0302772 consist of 1 containers
ARM0302772 SEKU6076687  DONE


 67%|██████▋   | 16/24 [04:17<02:04, 15.57s/it]

ARM0303044 consist of  3  containers
ARM0303044 SEKU4531159  DONE
ARM0303044 CMAU6899794  DONE
ARM0303044 CMAU6177750  DONE


 71%|███████   | 17/24 [04:33<01:49, 15.63s/it]

ARM0303415 consist of 1 containers
ARM0303415 TRHU5686314  DONE


 75%|███████▌  | 18/24 [04:46<01:29, 14.85s/it]

ARM0303224 consist of 1 containers
ARM0303224 CMAU9191767  DONE


 79%|███████▉  | 19/24 [05:00<01:13, 14.66s/it]

ARM0305556 consist of  2  containers
ARM0305556 CAAU6291833  DONE
ARM0305556 CMAU6227064  DONE


 83%|████████▎ | 20/24 [05:15<00:59, 14.80s/it]

ARM0305653 consist of 1 containers
ARM0305653 SEKU6090427  DONE


 88%|████████▊ | 21/24 [05:30<00:44, 14.89s/it]

ARM0305513 consist of  3  containers
ARM0305513 SEKU5696322  DONE
ARM0305513 CMAU8718766  DONE
ARM0305513 TCKU6354975  DONE


 92%|█████████▏| 22/24 [05:45<00:29, 14.98s/it]

ARM0306330 consist of  2  containers
ARM0306330 TCLU1552105  DONE
ARM0306330 BEAU4671679  DONE


 96%|█████████▌| 23/24 [06:02<00:15, 15.42s/it]

ARM0306500 consist of  13  containers
ARM0306500 CMAU6338556  DONE
ARM0306500 CMAU7665152  DONE
ARM0306500 CMAU9253935  DONE
ARM0306500 CMAU6859390  DONE
ARM0306500 TLLU5034709  DONE
ARM0306500 SEKU5669902  DONE
ARM0306500 CMAU7526389  DONE
ARM0306500 TCNU4314664  DONE
ARM0306500 TCLU9821197  DONE
ARM0306500 CMAU4525033  DONE
ARM0306500 CMAU6879000  DONE
ARM0306500 CMAU7497779  DONE
ARM0306500 BHCU4961587  DONE


100%|██████████| 24/24 [06:20<00:00, 15.84s/it]


In [8]:
gagal

[]

In [11]:
# Alias, not a copy: both names refer to the same list of scraped records.
hasil_akhir2 = hasil_akhir

In [12]:
# display the scraped records (one dict per container)
hasil_akhir2

[{'Empty to shipper JAKARTA': '17-MAY-2023',
  'Ready to be loaded JAKARTA': '21-MAY-2023',
  'Loaded on board JAKARTA': '25-MAY-2023',
  'Vessel Departure JAKARTA': '26-MAY-2023',
  'Vessel Arrival MANILA': '02-JUN-2023',
  'Discharged MANILA': '03-JUN-2023',
  'Container to consignee MANILA': '08-SEP-2023',
  'POL': 'JAKARTA',
  'POD': 'MANILA',
  'Container Number': 'CMAU8827659',
  'BL Number': 'ARM0296363',
  'Liners': 'CNC'},
 {'Empty to shipper JAKARTA': '17-MAY-2023',
  'Ready to be loaded JAKARTA': '21-MAY-2023',
  'Loaded on board JAKARTA': '25-MAY-2023',
  'Vessel Departure JAKARTA': '26-MAY-2023',
  'Vessel Arrival MANILA': '02-JUN-2023',
  'Discharged MANILA': '03-JUN-2023',
  'Container to consignee MANILA': '08-SEP-2023',
  'POL': 'JAKARTA',
  'POD': 'MANILA',
  'Container Number': 'CMAU8535106',
  'BL Number': 'ARM0296363',
  'Liners': 'CNC'},
 {'Empty to shipper JAKARTA': '17-MAY-2023',
  'Ready to be loaded JAKARTA': '21-MAY-2023',
  'Loaded on board JAKARTA': '25-MAY

In [18]:
# Replace the port city names inside the milestone keys with the generic
# labels 'Origin' / 'Destination', using each record's own POL and POD.
list_of_dict_fix = []
for fd, filter_dict in enumerate(hasil_akhir2):
    try:
        print(filter_dict["Container Number"], fd)
        replacement_mapping = {
            filter_dict["POL"]: 'Origin',
            filter_dict["POD"]: 'Destination'
        }
    except KeyError as e:
        # A record without POL/POD is reported and remembered as failed. Its
        # keys are then copied unchanged; the mapping of the previous record
        # must not be reused, as it belongs to a different shipment.
        print(e)
        gagal.append(filter_dict["BL Number"])
        replacement_mapping = {}

    updated_dict = {}

    for key, value in filter_dict.items():
        for old_key, new_key in replacement_mapping.items():
            # skip empty port names: str.replace('', x) would insert x
            # between every character of the key
            if old_key:
                key = key.replace(old_key, new_key)
        updated_dict[key] = value

    list_of_dict_fix.append(updated_dict)

CMAU8827659 0
CMAU8535106 1
TCKU6232468 2
SEKU5921803 3
TXGU7146680 4
TXGU5269428 5
CMAU8582181 6
CAAU6073180 7
CMAU8750710 8
TLLU4797120 9
FFAU2162030 10
TRHU6499283 11
TLLU4927461 12
CMAU4347395 13
CMAU6803365 14
TGBU6979463 15
TLLU7706696 16
TRHU8958817 17
ECMU9920352 18
CMAU8551684 19
TEMU6511220 20
CMAU3343538 21
CMAU7072835 22
CMAU4806576 23
TCNU5715229 24
CMAU8936855 25
GCXU5314769 26
CMAU6906471 27
TRHU7995570 28
TRHU7913253 29
TRHU8260370 30
CMAU7148930 31
CMAU6609468 32
TRHU5006700 33
CMAU8852498 34
SEGU6359023 35
SEKU5761433 36
SEKU6076687 37
SEKU4531159 38
CMAU6899794 39
CMAU6177750 40
TRHU5686314 41
CMAU9191767 42
CAAU6291833 43
'POL'
CMAU6227064 44
'POL'
SEKU6090427 45
SEKU5696322 46
CMAU8718766 47
TCKU6354975 48
TCLU1552105 49
BEAU4671679 50
CMAU6338556 51
CMAU7665152 52
CMAU9253935 53
CMAU6859390 54
TLLU5034709 55
SEKU5669902 56
CMAU7526389 57
TCNU4314664 58
TCLU9821197 59
CMAU4525033 60
CMAU6879000 61
CMAU7497779 62
BHCU4961587 63


In [19]:
gagal

['ARM0305556', 'ARM0305556']

In [20]:
# Rename keys to fit the db format and convert date values to ISO format.
# The replacements are substring replacements applied to every key in the
# order listed here.
# NOTE(review): "EXPECTED Vessel Arrival Destination" is mapped to "ETD"; an
# expected arrival is usually labelled ETA -- confirm against the db schema
# before changing it.
replacement_mapping = {
    "POL" : "From",
    "POD" : "To",
    "Vessel Departure Origin": 'ATD',
    "Discharged Destination": 'ATA',
    "EXPECTED Vessel Arrival Destination" : "ETD",
    "Container to consignee Destination": 'Container Release',
    "Empty in depot Destination" : 'Container Return'
}

list_of_dict_fix2 = []
for filter_dict in list_of_dict_fix:
    updated_dict = {}

    for key, value in filter_dict.items():
        for old_key, new_key in replacement_mapping.items():
            key = key.replace(old_key, new_key)

        # Values in the scraped 'DD-MON-YYYY' form become 'YYYY-MM-DD'; every
        # other value (city names, BL / container numbers, non-strings) is
        # stored unchanged instead of raising.
        try:
            updated_dict[key] = datetime.strptime(value, "%d-%b-%Y").strftime("%Y-%m-%d")
        except (ValueError, TypeError):
            updated_dict[key] = value

    list_of_dict_fix2.append(updated_dict)

In [21]:
# spot-check the first converted record
list_of_dict_fix2[0]

{'Empty to shipper Origin': '2023-05-17',
 'Ready to be loaded Origin': '2023-05-21',
 'Loaded on board Origin': '2023-05-25',
 'ATD': '2023-05-26',
 'Vessel Arrival Destination': '2023-06-02',
 'ATA': '2023-06-03',
 'Container Release': '2023-09-08',
 'From': 'JAKARTA',
 'To': 'MANILA',
 'Container Number': 'CMAU8827659',
 'BL Number': 'ARM0296363',
 'Liners': 'CNC'}

In [23]:
# exporting to excel
df = pd.DataFrame(list_of_dict_fix2)
excel_file_path = 'export excel/CNC.xlsx'
# to_excel does not create missing folders, so make sure the target
# directory exists before writing
os.makedirs(os.path.dirname(excel_file_path), exist_ok=True)
df.to_excel(excel_file_path, index=False)

In [None]:
# inserting list of dict to mongo db
# The connection string embeds credentials, so it is read from the
# MONGODB_URI environment variable instead of being stored in the notebook.
mongo_uri = os.environ.get("MONGODB_URI")
if not mongo_uri:
    raise RuntimeError(
        "Set the MONGODB_URI environment variable to the MongoDB connection string"
    )

if list_of_dict_fix2:
    # the context manager closes the client when the block exits
    with MongoClient(mongo_uri) as cluster:
        db = cluster["bl_tracking"]
        collection = db["all_tracking"]
        # insert_many adds an '_id' field to each inserted dict in place
        collection.insert_many(list_of_dict_fix2)
    print("inserting many complete!!")
else:
    # insert_many rejects an empty document list, so skip the call
    print("nothing to insert")