# ShipChecker

In [24]:
## raw data path
rawDataPath = './portData/rawData.txt'

## google search api path
googleSearchApiPath = '../googleSearchApi.txt'
## google search custom search engine id
googleSearchCseIdPath = '../googleSearchCSE.txt'

#Processingdata path
processingDataPath = './processingData.csv'

# processed data path
processedDataPath = './processedData.csv'

#Output path
outputPath = './output.csv'

In [2]:
import pandas as pd
import warnings
import re
from bs4 import BeautifulSoup
import asyncio
import nest_asyncio

from pyppeteer import launch
from googleapiclient.discovery import build

from pandas.errors import SettingWithCopyWarning
warnings.simplefilter(action='ignore', category=(SettingWithCopyWarning))

In [3]:
# load raw data and show the first 5 rows
rawData = pd.read_csv(rawDataPath, sep='\t')
rawData.head(5)

Unnamed: 0,DATE & TIME,ARRIVAL / DEPARTURE,VESSEL,VESSEL TYPE,FROM,TO,IN PORT
0,Wed 20 Mar 20:00,ARRIVAL,KOTA LIMA,CONTAINER SHIP (FULLY CELLULAR),CHIWAN PT,12 BROTHERSON DOCK (BD12),NO
1,Wed 20 Mar 22:00,ARRIVAL,MSC KANU F,CONTAINER SHIP (FULLY CELLULAR),BELL BAY,9 BROTHERSON DOCK (BD9),NO
2,Thu 21 Mar 03:00,ARRIVAL,WIDE INDIA,CONTAINER SHIP (FULLY CELLULAR),AUCKLAND,7 BROTHERSON DOCK (BD7),NO
3,Thu 21 Mar 15:00,ARRIVAL,GASCHEM HOMER,LPG TANKER,BRISBANE,BULK LIQUID BERTH 1 (BLB1),NO
4,Thu 21 Mar 17:00,ARRIVAL,MARI COUVA,CHEMICAL/PRODUCTS TANKER,ONSAN/ULSAN,BULK LIQUID BERTH 1 (BLB1),NO


In [4]:
# load google search api key and custom search engine id
googleApiKey = open(googleSearchApiPath, 'r').read()
print(googleApiKey[:5])
googleSearchCseId = open(googleSearchCseIdPath, 'r').read()
print(googleSearchCseId)

AIzaS
e0f84b16165d14dc2


In [5]:
# google search function and data extraction
api_key = googleApiKey
cse_id = googleSearchCseId

def google_search(search_term, api_key, cse_id, **kwargs):
    service = build("customsearch", "v1", developerKey=api_key)
    res = service.cse().list(q=search_term, cx=cse_id, **kwargs).execute()
    return res

def get_link(results):
    link = results['items'][0]['link']
    return link
    
def get_snippet(results):
    snippet = results['items'][0]['snippet']
    return snippet

def get_IMO_number(snippet):
    imo_number_match = re.search(r'IMO number, (\d+)', snippet)
    if imo_number_match:
        imo_number = imo_number_match.group(1)
    else:
        imo_number = None  
    return imo_number

def get_vessel_details(vessel_name):
    search_query = f"what is imo number for {vessel_name}"
    results = google_search(search_query, api_key, cse_id)
    link = get_link(results)
    snippet = get_snippet(results)
    imo_number = get_IMO_number(snippet)
    
    return link, snippet, imo_number

In [6]:
processingData = rawData
processingData[['link','snippet','imo_number']] = processingData.apply(lambda row: get_vessel_details(row['VESSEL']), axis=1, result_type='expand')
processingData.to_csv(processingDataPath)
processingData.head(5)

In [14]:
# 爬取网页内容
async def init_browser():
    return await launch(headless=True, args=['--no-sandbox'])

async def scrape(browser, url):
    page = await browser.newPage()
    await page.setUserAgent('My User Agent')
    try:
        await page.goto(url, {'timeout': 600000})  # 增加超时时间
        await page.waitForSelector('#port-calls', {'timeout': 600000})
        element = await page.querySelector('#port-calls')
        content = await page.evaluate('(element) => element.innerHTML', element)
    except Exception as e:
        print(f"Error scraping {url}: {e}")
        content = "None"
    finally:
        await page.close()
    return content

async def main(processingData):
    browser = await init_browser()
    tasks = [scrape(browser, url) for url in processingData['link']]
    div_contents = await asyncio.gather(*tasks, return_exceptions=True)  # exceoption handling
    await browser.close()
    processingData['div_content'] = [content for content in div_contents]


Error scraping https://www.vesselfinder.com/vessels/details/9698252: Navigation Timeout Exceeded: 60000 ms exceeded.
Error scraping https://www.vesselfinder.com/vessels/details/9665683: Navigation Timeout Exceeded: 60000 ms exceeded.
Error scraping https://www.vesselfinder.com/vessels/details/9915258: Navigation Timeout Exceeded: 60000 ms exceeded.
Error scraping https://www.vesselfinder.com/vessels/details/9329552: Navigation Timeout Exceeded: 60000 ms exceeded.
Error scraping https://www.vesselfinder.com/vessels/details/9228576: Navigation Timeout Exceeded: 60000 ms exceeded.
Error scraping https://www.vesselfinder.com/vessels/details/9960784: Navigation Timeout Exceeded: 60000 ms exceeded.
Error scraping https://www.vesselfinder.com/vessels/details/9419668: Navigation Timeout Exceeded: 60000 ms exceeded.
Error scraping https://www.vesselfinder.com/vessels/details/9225782: Navigation Timeout Exceeded: 60000 ms exceeded.
Error scraping https://www.vesselfinder.com/vessels/details/9906

In [None]:

asyncio.run(main(processingData))
processingData.head(5)


In [21]:

for index, html_content in enumerate(processingData['div_content']):
    # If the html_content is None, set the location and arrival to timeout
    if html_content is None:
        processingData.at[index, 'location_1'] = 'timeout'
        processingData.at[index, 'Arrival_1'] = 'timeout'
        continue  # Skip the rest of the loop

    soup = BeautifulSoup(html_content, 'html.parser')
    divs = soup.find_all('div', class_='_1hgmG')

    if not divs:  # If there are no divs, set the location and arrival to timeout
        processingData.at[index, 'location_1'] = 'timeout'
        processingData.at[index, 'Arrival_1'] = 'timeout'
    else:
        for i, div in enumerate(divs, 1):
            location_anchor = div.find_previous_sibling('a')
            arrival_div = div.find('div', class_='_1GQkK')

            location = location_anchor.text.strip() if location_anchor else 'timeout'
            arrival = arrival_div.text.strip() if arrival_div else 'timeout'
            
            processingData.at[index, f'location_{i}'] = location
            processingData.at[index, f'Arrival_{i}'] = arrival



In [26]:
processedData = processingData  # assuming processingData is a DataFrame
# Now save the DataFrame to a CSV file
processedData.to_csv(processedDataPath)
processedData.head(5)


Unnamed: 0,DATE & TIME,ARRIVAL / DEPARTURE,VESSEL,VESSEL TYPE,FROM,TO,IN PORT,link,snippet,imo_number,...,location_1,Arrival_1,location_2,Arrival_2,location_3,Arrival_3,location_4,Arrival_4,location_5,Arrival_5
0,Wed 20 Mar 20:00,ARRIVAL,KOTA LIMA,CONTAINER SHIP (FULLY CELLULAR),CHIWAN PT,12 BROTHERSON DOCK (BD12),NO,https://www.vesselfinder.com/vessels/details/9...,"IMO number, 9267651. Vessel Name, KOTA LIMA. S...",9267651.0,...,"Botany Bay, Australia","Mar 20, 09:48","Shenzhen, China","Mar 6, 21:03","Nansha, China","Mar 5, 21:15","Shanghai, China","Mar 2, 18:57","Qingdao, China","Feb 29, 10:28"
1,Wed 20 Mar 22:00,ARRIVAL,MSC KANU F,CONTAINER SHIP (FULLY CELLULAR),BELL BAY,9 BROTHERSON DOCK (BD9),NO,https://www.vesselfinder.com/vessels/details/9...,"IMO number, 9236585. Vessel Name, MSC KANU F. ...",9236585.0,...,"Botany Bay, Australia","Mar 20, 12:10","Bell Bay, Australia","Mar 17, 18:43","Botany Bay, Australia","Feb 28, 02:00","Lautoka, Fiji","Feb 20, 13:40","Lautoka Anch., Fiji","Feb 16, 00:15"
2,Thu 21 Mar 03:00,ARRIVAL,WIDE INDIA,CONTAINER SHIP (FULLY CELLULAR),AUCKLAND,7 BROTHERSON DOCK (BD7),NO,https://www.vesselfinder.com/vessels/details/9...,"Vessel WIDE INDIA (IMO 9698252, MMSI 538005751...",9698252.0,...,timeout,timeout,,,,,,,,
3,Thu 21 Mar 15:00,ARRIVAL,GASCHEM HOMER,LPG TANKER,BRISBANE,BULK LIQUID BERTH 1 (BLB1),NO,https://www.vesselfinder.com/vessels/details/9...,"Vessel GASCHEM HOMER (IMO 9915258, MMSI 636093...",,...,timeout,timeout,,,,,,,,
4,Thu 21 Mar 17:00,ARRIVAL,MARI COUVA,CHEMICAL/PRODUCTS TANKER,ONSAN/ULSAN,BULK LIQUID BERTH 1 (BLB1),NO,https://www.vesselfinder.com/vessels/details/9...,"MARI COUVA ; Callsign, LAVL5 ; Flag, Norway ; ...",9848584.0,...,"Onsan, Korea","Mar 4, 05:42","Onsan, Korea","Feb 27, 22:14","Ulsan Anch., Korea","Feb 24, 14:42","Ulsan Anch., Korea","Feb 17, 06:20","Nantong Anch., China","Feb 10, 10:02"
5,Thu 21 Mar 20:30,ARRIVAL,CARL SCHULTE,CONTAINER SHIP (FULLY CELLULAR),MELBOURNE,HAYES DOCK 2 (HD2),NO,https://www.vesselfinder.com/vessels/details/9...,"CARL SCHULTE · Container Ship, IMO 9665683 · P...",,...,timeout,timeout,,,,,,,,
6,Fri 22 Mar 02:15,ARRIVAL,MANDALAY,CONTAINER SHIP (FULLY CELLULAR),BRISBANE,HAYES DOCK 1 (HD1),NO,https://www.vesselfinder.com/vessels/details/9...,"Vessel MANDALAY (IMO 9743502, MMSI 563024900) ...",9743502.0,...,"Brisbane, Australia","Mar 18, 16:28","Noumea, New Caledonia","Mar 13, 18:28","Papeete, French Polynesia","Mar 6, 16:29","Panama Canal Anch. Atlantic, Panama","Feb 23, 12:47","Kingston, Jamaica","Feb 21, 20:31"
7,Fri 22 Mar 04:15,ARRIVAL,NORDMAAS,CONTAINER SHIP (FULLY CELLULAR),PORT CHALMERS,8 BROTHERSON DOCK (BD8),NO,https://www.vesselfinder.com/vessels/details/9...,"Vessel NORDMAAS (IMO 9822736, MMSI 209695000) ...",9822736.0,...,"Nelson, New Zealand","Mar 8, 20:12","Auckland, New Zealand","Mar 4, 18:59","Botany Bay, Australia","Feb 24, 11:31","Tauranga, New Zealand","Feb 16, 07:14","Napier, New Zealand","Feb 14, 23:43"
8,Fri 22 Mar 13:00,ARRIVAL,STI STEADFAST,CRUDE/OIL PRODUCTS TANKER,ULSAN,KURNELL 1 (KUR1),NO,https://www.vesselfinder.com/vessels/details/9...,"Vessel STI STEADFAST (IMO 9719719, MMSI 538006...",9719719.0,...,"Ulsan, Korea","Mar 3, 22:07","Gwangyang, Korea","Feb 25, 07:20","Anegasaki, Japan","Feb 20, 05:07","Anegasaki, Japan","Feb 17, 05:09","Anegasaki Cosmo Sea Berth, Japan","Feb 14, 01:37"
9,Fri 22 Mar 15:30,ARRIVAL,CMA CGM TANCREDI,CONTAINER SHIP (FULLY CELLULAR),MELBOURNE,6 BROTHERSON DOCK (BD6),NO,https://www.vesselfinder.com/vessels/details/9...,"CMA CGM TANCREDI ; Callsign, 9HA2837 ; Flag, M...",9436355.0,...,"Melbourne, Australia","Mar 14, 14:00","Singapore, Singapore","Mar 2, 12:30","Tanjung Pelepas, Malaysia","Mar 1, 15:03","Rotterdam Maasvlakte, Netherlands","Feb 29, 06:21","Fremantle, Australia","Feb 20, 17:24"


In [27]:

outputData = processedData.drop(columns=['link', 'snippet', 'div_content'])
outputData.to_csv(outputPath)
outputData.head(5)

Unnamed: 0,DATE & TIME,ARRIVAL / DEPARTURE,VESSEL,VESSEL TYPE,FROM,TO,IN PORT,imo_number,location_1,Arrival_1,location_2,Arrival_2,location_3,Arrival_3,location_4,Arrival_4,location_5,Arrival_5
0,Wed 20 Mar 20:00,ARRIVAL,KOTA LIMA,CONTAINER SHIP (FULLY CELLULAR),CHIWAN PT,12 BROTHERSON DOCK (BD12),NO,9267651.0,"Botany Bay, Australia","Mar 20, 09:48","Shenzhen, China","Mar 6, 21:03","Nansha, China","Mar 5, 21:15","Shanghai, China","Mar 2, 18:57","Qingdao, China","Feb 29, 10:28"
1,Wed 20 Mar 22:00,ARRIVAL,MSC KANU F,CONTAINER SHIP (FULLY CELLULAR),BELL BAY,9 BROTHERSON DOCK (BD9),NO,9236585.0,"Botany Bay, Australia","Mar 20, 12:10","Bell Bay, Australia","Mar 17, 18:43","Botany Bay, Australia","Feb 28, 02:00","Lautoka, Fiji","Feb 20, 13:40","Lautoka Anch., Fiji","Feb 16, 00:15"
2,Thu 21 Mar 03:00,ARRIVAL,WIDE INDIA,CONTAINER SHIP (FULLY CELLULAR),AUCKLAND,7 BROTHERSON DOCK (BD7),NO,9698252.0,timeout,timeout,,,,,,,,
3,Thu 21 Mar 15:00,ARRIVAL,GASCHEM HOMER,LPG TANKER,BRISBANE,BULK LIQUID BERTH 1 (BLB1),NO,,timeout,timeout,,,,,,,,
4,Thu 21 Mar 17:00,ARRIVAL,MARI COUVA,CHEMICAL/PRODUCTS TANKER,ONSAN/ULSAN,BULK LIQUID BERTH 1 (BLB1),NO,9848584.0,"Onsan, Korea","Mar 4, 05:42","Onsan, Korea","Feb 27, 22:14","Ulsan Anch., Korea","Feb 24, 14:42","Ulsan Anch., Korea","Feb 17, 06:20","Nantong Anch., China","Feb 10, 10:02"
