In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time

In [3]:
# Base URL of the IATA CargoLink directory listing. It is passed to
# scrape_page, which adds the page number for every page after the first.
base_url = "https://www.iata.org/en/publications/directories/cargolink/directory/"

In [5]:
# Function to scrape a single page
def scrape_page(url, page_num, timeout=30):
    """Scrape one page of the directory listing into a DataFrame.

    Parameters
    ----------
    url : str
        Base URL of the directory listing, without a page parameter.
    page_num : int
        Page number to fetch. Page 1 is requested from ``url`` as-is; any
        later page is requested with a ``page`` query parameter.
    timeout : float, optional
        Seconds to wait for the server before giving up, so that a stalled
        connection cannot hang a long scraping run.

    Returns
    -------
    pandas.DataFrame
        Columns ``Name``, ``Company Type``, ``Country`` and ``City``, one row
        per table row that has at least five ``<td>`` cells. An empty
        DataFrame (no columns) is returned when the request raises, the
        status code is not 200, or no ``table.datatable`` element is found.
    """
    print(f"Scraping page {page_num}...")

    # Let requests build the query string so a URL that already carries a
    # query is extended correctly instead of getting a second "?".
    params = {'page': page_num} if page_num > 1 else None

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    # A network failure on one page is reported and treated like a non-200
    # response, so the caller can continue with the remaining pages.
    try:
        response = requests.get(url, headers=headers, params=params, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to retrieve page {page_num}. Error: {exc}")
        return pd.DataFrame()

    # Check if the request was successful
    if response.status_code != 200:
        print(f"Failed to retrieve page {page_num}. Status code: {response.status_code}")
        return pd.DataFrame()

    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the table with class "datatable"
    table = soup.find('table', class_='datatable')

    if table is None:
        print(f"No table found on page {page_num}")
        return pd.DataFrame()

    # Extract data from the table
    rows = []
    for row in table.find_all('tr')[1:]:  # Skip the header row
        cols = row.find_all('td')
        if len(cols) >= 5:  # Ensure we have at least 5 columns
            # Skip the first column and keep the text of the next 4 columns
            rows.append([cell.text.strip() for cell in cols[1:5]])

    # Create a DataFrame
    return pd.DataFrame(rows, columns=['Name', 'Company Type', 'Country', 'City'])

In [9]:
# Scrape every page of the directory listing (pages 1..LAST_PAGE).
LAST_PAGE = 581  # last page number requested; presumably the directory's page count when scraped - verify
REQUEST_DELAY_SECONDS = 2  # pause between consecutive requests

# Collect the per-page frames and concatenate once at the end; growing a
# DataFrame with pd.concat inside the loop re-copies all rows on every page.
page_frames = []
for page in range(1, LAST_PAGE + 1):
    page_data = scrape_page(base_url, page)
    if not page_data.empty:
        page_frames.append(page_data)

    # Add a delay to be respectful to the server
    if page < LAST_PAGE:  # Don't wait after the last page
        time.sleep(REQUEST_DELAY_SECONDS)

all_data = pd.concat(page_frames, ignore_index=True) if page_frames else pd.DataFrame()

# Save the data to a CSV file
if not all_data.empty:
    csv_filename = "iata_cargo_directory.csv"
    all_data.to_csv(csv_filename, index=False)
    print(f"Data saved to {csv_filename}")
    print(f"Total records: {len(all_data)}")
else:
    print("No data was scraped.")

Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page 10...
Scraping page 11...
Scraping page 12...
Scraping page 13...
Scraping page 14...
Scraping page 15...
Scraping page 16...
Scraping page 17...
Scraping page 18...
Scraping page 19...
Scraping page 20...
Scraping page 21...
Scraping page 22...
Scraping page 23...
Scraping page 24...
Scraping page 25...
Scraping page 26...
Scraping page 27...
Scraping page 28...
Scraping page 29...
Scraping page 30...
Scraping page 31...
Scraping page 32...
Scraping page 33...
Scraping page 34...
Scraping page 35...
Scraping page 36...
Scraping page 37...
Scraping page 38...
Scraping page 39...
Scraping page 40...
Scraping page 41...
Scraping page 42...
Scraping page 43...
Scraping page 44...
Scraping page 45...
Scraping page 46...
Scraping page 47...
Scraping page 48...
Scraping page 49...
Scraping page 50...
Scraping 

Scraping page 397...
Scraping page 398...
Scraping page 399...
Scraping page 400...
Scraping page 401...
Scraping page 402...
Scraping page 403...
Scraping page 404...
Scraping page 405...
Scraping page 406...
Scraping page 407...
Scraping page 408...
Scraping page 409...
Scraping page 410...
Scraping page 411...
Scraping page 412...
Scraping page 413...
Scraping page 414...
Scraping page 415...
Scraping page 416...
Scraping page 417...
Scraping page 418...
Scraping page 419...
Scraping page 420...
Scraping page 421...
Scraping page 422...
Scraping page 423...
Scraping page 424...
Scraping page 425...
Scraping page 426...
Scraping page 427...
Scraping page 428...
Scraping page 429...
Scraping page 430...
Scraping page 431...
Scraping page 432...
Scraping page 433...
Scraping page 434...
Scraping page 435...
Scraping page 436...
Scraping page 437...
Scraping page 438...
Scraping page 439...
Scraping page 440...
Scraping page 441...
Scraping page 442...
Scraping page 443...
Scraping page

In [10]:
# Show the combined scraped table as this cell's output.
all_data

Unnamed: 0,Name,Company Type,Country,City
0,1st Move International Limited,Freight Forwarder,United Kingdom,Bristol
1,AAT Training Hub Pte Ltd,Accredited Training School,Singapore,Singapore 211672
2,"ACT Multinational Transportation Logistics., Ltd",Freight Forwarder,Vietnam,Ha Noi
3,Aeroflot Aviation School,Accredited Training School,Russian Federation,Moscow
4,Africa Global Logistics Tanzania Limited,Freight Forwarder,"Tanzania, United Republic of",Dar es Salaam
...,...,...,...,...
225,10768090 Canada Inc.,Freight Forwarder,Canada,RICHMOND HILL
226,20 CUBE LOGISTICS,Freight Forwarder,Australia,FORTITUDE VALLEY Queensland 4006
227,3N FOR CARGO SERVICES AND CUSTOMS CLEARANCE,Freight Forwarder,Egypt,CAIRO EG
228,4D FINE ARTS SERVICES INC.,Freight Forwarder,United States,NEW HYDE PARK NY 11040
