In [1]:
import json

from bs4 import BeautifulSoup as bs
from urllib.request import urlopen

from selenium import webdriver
from selenium.webdriver.support.ui import Select
import pandas as pd
import html5lib


import pandas as pd 

In [2]:
def scrape(url, years=None, countries=None):
    '''
    Collect country- and commodity-wise trade data from the Indian
    Department of Commerce's TradeStat webpage.

    A Chrome session is driven through the query form at ``url`` once per
    (year, country) pair, and the table found on each result page is
    collected.

    Parameters
    ----------
    url : str
        Address of the query form to load and submit.
    years : dict, optional
        Maps each year option value on the form to its display text. The
        display text is also the name of the value column read from the
        result table. When omitted (or empty), every option of the year
        dropdown is used and the mapping is written to ``yearCodes.json``.
    countries : dict, optional
        Maps each country option value on the form to its display text.
        When omitted (or empty), every option of the country dropdown is
        used and the mapping is written to ``countryCodes.json``.

    Returns
    -------
    pandas.DataFrame
        Columns ``HSCode``, ``commodity``, ``value``, ``year`` and
        ``country``; the rows of every page that produced a table, stacked.
        Empty (with the same columns) when no page produced one.
    '''

    # names of the html elements on the webpage that control search params
    countriesElementName = 'select3'
    yearsElementName = 'select2'
    HSCodeLevelElementName = 'hslevel'
    numRecordsElementName = 'radioDAll'
    valueElementName = 'radiousd'
    submitButtonElementName = 'button1'

    outputColumns = ['HSCode', 'commodity', 'value', 'year', 'country']

    # initialize chrome driver
    driver = webdriver.Chrome()
    try:
        driver.implicitly_wait(0.05)

        # build country and year codes, if needed
        driver.get(url)
        if not countries:
            countries = {x.get_attribute('value'):x.get_attribute('text') for x in Select(driver.find_element_by_id(countriesElementName)).options}
            with open('countryCodes.json', 'w') as file:
                json.dump(countries, file)
        if not years:
            years = {x.get_attribute('value'):x.get_attribute('text') for x in Select(driver.find_element_by_id(yearsElementName)).options}
            with open('yearCodes.json', 'w') as file:
                json.dump(years, file)

        HSLevel = '2'
        collector = [] # stores data from each page
        for year in years:
            print(f'Collecting data for year: {years[year]}')
            for country in countries:
                print(f'\tCollecting data for country: {countries[country]}')

                # use the country code and year to fetch a new page of data
                driver.get(url)
                Select(driver.find_element_by_id(yearsElementName)).select_by_value(year)
                Select(driver.find_element_by_id(countriesElementName)).select_by_value(country)
                Select(driver.find_element_by_id(HSCodeLevelElementName)).select_by_value(HSLevel)
                driver.find_element_by_id(numRecordsElementName).click()
                driver.find_element_by_id(valueElementName).click()
                driver.find_element_by_id(submitButtonElementName).click()
                try: # if there are results
                    tbl = driver.find_element_by_tag_name('table').get_attribute('outerHTML')
                    tblDF = pd.read_html(tbl)[0]
                except Exception as e: # some countries have no trade data for a given year
                    print(e)
                    # Skip this page: there is nothing to add, and falling
                    # through would reuse the previous page's table under
                    # this page's year/country labels.
                    continue

                # build dataframe for the page and add to collector
                finalDF = tblDF[['HSCode', 'Commodity', years[year]]].copy()
                finalDF['Year'] = years[year]
                finalDF['Country'] = countries[country]
                finalDF.columns = outputColumns
                collector.append(finalDF)
            print('-'*100)
    finally:
        # always release the browser, even if a page failed part-way through
        driver.quit()

    # pd.concat raises on an empty list, so hand back an empty frame instead
    if not collector:
        return pd.DataFrame(columns=outputColumns)

    # combine data in collector into a single DataFrame
    df = pd.concat(collector, axis=0)
    return df

In [4]:
# TradeStat query-form addresses, one for exports and one for imports.
# The collect-and-save calls below are kept commented out; uncomment them
# to run the scrape and write the results to CSV.

exportsURL, importsURL = (
    'https://tradestat.commerce.gov.in/eidb/ecntcomq.asp',
    'https://tradestat.commerce.gov.in/eidb/icntcomq.asp',
)

# exportsData = scrape(exportsURL)
# exportsData.to_csv('Exports.csv')

# importsData = scrape(importsURL)
# importsData.to_csv('Imports.csv')
