## Web crawler for HSR tickets, adjustable to different spatio-temporal scales

In [84]:
import time
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import pandas
from datetime import date, timedelta, datetime
import pytz

# Timezone ('Asia/Shanghai') used for every "now"/"today" computation in this notebook.
beijing_timezone = pytz.timezone('Asia/Shanghai')

# AMap (restapi.amap.com) key handed to the request helpers in this notebook.
# NOTE(review): this credential is hard-coded in source — consider loading it
# from an environment variable or an untracked config file instead.
api_key = 'a3088a73b4bedfcf95e01d08933aa701'

## Define the function to get distance between two address names using AMAP API

In [37]:
import requests

# Function to get latitude and longitude of a city
def get_city_coordinates(city_name, api_key):
    """Geocode an address or city name with the AMap geocoding endpoint.

    Args:
        city_name: Address or place name to look up.
        api_key: AMap key sent as the ``key`` query parameter.

    Returns:
        ``[longitude, latitude]`` as a list of two strings (the API's
        ``location`` field split on the comma), or ``None`` when the
        response contains no geocoding match.
    """
    # Pass the query through `params` so requests URL-encodes it: place names
    # may contain characters ('&', '#', spaces) that would corrupt a
    # hand-built query string.  The timeout keeps a stalled connection from
    # hanging the notebook forever.
    response = requests.get(
        'https://restapi.amap.com/v3/geocode/geo',
        params={'address': city_name, 'key': api_key},
        timeout=10,
    )
    result = response.json()

    # .get() instead of [] so a payload without 'geocodes' (e.g. an error
    # response) yields None rather than a KeyError.
    geocodes = result.get('geocodes')
    if not geocodes:
        return None
    return geocodes[0]['location'].split(",")  # [longitude, latitude]

# Function to calculate the distance between two coordinates
def get_distance(origin_coords, dest_coords, api_key):
    """Query the AMap distance endpoint for the distance between two points.

    Args:
        origin_coords: ``[longitude, latitude]`` strings of the origin.
        dest_coords: ``[longitude, latitude]`` strings of the destination.
        api_key: AMap key sent as the ``key`` query parameter.

    Returns:
        The ``distance`` field of the first result exactly as the API
        returns it (meters, according to the original comment), or ``None``
        when the response holds no results.
    """
    response = requests.get(
        'https://restapi.amap.com/v3/distance',
        params={
            'origins': ','.join(origin_coords),
            'destination': ','.join(dest_coords),
            'type': 1,
            'key': api_key,
        },
        timeout=10,  # do not hang the notebook on a stalled connection
    )
    result = response.json()

    # .get() so a payload without 'results' gives None instead of a KeyError.
    results = result.get('results')
    if not results:
        return None
    return results[0]['distance']  # Distance in meters

# Replace with your actual API key
# NOTE(review): hard-coded credential repeated in several cells — consider
# defining it once and loading it from the environment.
api_key = 'a3088a73b4bedfcf95e01d08933aa701'

## Get all the cities within a province, or get the city name from the AMAP code

In [95]:
## Given a province name, return the names of all cities in that province
def get_cities_in_province(province_name, api_key):
    """List the names of the first-level sub-districts of a province.

    Args:
        province_name: Province name used as the ``keywords`` query value.
        api_key: AMap key sent as the ``key`` query parameter.

    Returns:
        A list with the ``name`` of every sub-district of every district
        matched by the query; an empty list when the response status is
        not ``'1'``.
    """
    # `params` lets requests URL-encode the (usually Chinese) province name;
    # the timeout keeps a stalled connection from blocking the notebook.
    response = requests.get(
        "https://restapi.amap.com/v3/config/district",
        params={'key': api_key, 'keywords': province_name, 'subdistrict': 1},
        timeout=10,
    )
    data = response.json()

    if data.get('status') != '1':
        return []
    return [
        city['name']
        for district in data.get('districts', [])
        for city in district.get('districts', [])
    ]

def get_cities_code_in_province(province_name, api_key):
    """List the ``citycode`` of the first-level sub-districts of a province.

    Args:
        province_name: Province name used as the ``keywords`` query value.
        api_key: AMap key sent as the ``key`` query parameter.

    Returns:
        A list with the ``citycode`` of every sub-district of every district
        matched by the query; an empty list when the response status is
        not ``'1'``.
    """
    # `params` lets requests URL-encode the (usually Chinese) province name;
    # the timeout keeps a stalled connection from blocking the notebook.
    response = requests.get(
        "https://restapi.amap.com/v3/config/district",
        params={'key': api_key, 'keywords': province_name, 'subdistrict': 1},
        timeout=10,
    )
    data = response.json()

    if data.get('status') != '1':
        return []
    return [
        city['citycode']
        for district in data.get('districts', [])
        for city in district.get('districts', [])
    ]

## Given a city_code, look up the city name through the AMap district API
def get_city_name(city_code, api_key):
    """Resolve a city code to the name of the first matching district.

    Args:
        city_code: Code used as the ``keywords`` query value.
        api_key: AMap key sent as the ``key`` query parameter.

    Returns:
        The ``name`` of the first district in the response, or ``None`` when
        the response contains no district (the original raised
        ``UnboundLocalError`` in that case).
    """
    url = "https://restapi.amap.com/v3/config/district"

    # Define query parameters
    params = {
        'key': api_key,
        'keywords': city_code,  # Use city code here
        'subdistrict': 0,
        'extensions': 'base',   # Only fetch base information
        'output': 'json'
    }

    # Send request to AMap API
    response = requests.get(url, params=params, timeout=10)
    result = response.json()

    # Extract the city name from the response
    districts = result.get('districts')
    if not districts:
        print("未找到城市")
        return None

    city_name = districts[0]['name']
    print("城市名称:", city_name)
    return city_name


## Get all the stations and positions of the inter-city traveling modes given POI

In [107]:
## Given a coordinate and a POI type code, return the stations of the
## inter-city travel modes together with their positions
def intercity_mode_location(city_coord, poi_code, api_key):
    """Search AMap POIs of a given type and keep the inter-city stations.

    Args:
        city_coord: Coordinate sent as the ``location`` query value, either a
            ``'lng,lat'`` string or a ``[lng, lat]`` sequence such as the one
            returned by ``get_city_coordinates``.
        poi_code: POI type code sent as the ``types`` query value.
        api_key: AMap key sent as the ``key`` query parameter.

    Returns:
        A list of ``{'name', 'address', 'location'}`` dicts, one per returned
        POI whose ``typecode`` is ``'150104'``, ``'150200'`` or ``'150400'``.
        The original built this list but never returned it.
    """
    url = "https://restapi.amap.com/v3/place/text"

    # A list value would be sent by requests as a repeated parameter
    # (location=lng&location=lat); join it into the single 'lng,lat' string.
    # NOTE(review): confirm in the AMap docs that the place/text endpoint
    # honours `location` — the endpoint itself is left unchanged here.
    if isinstance(city_coord, (list, tuple)):
        city_coord = ','.join(str(part) for part in city_coord)

    # Define query parameters
    params = {
        'key': api_key,
        'location': city_coord,
        'types': poi_code,
        'output': 'json'
    }

    # Send request to AMap API
    response = requests.get(url, params=params, timeout=10)
    result = response.json()
    print(result)

    accepted_typecodes = ('150104', '150200', '150400')
    poi_list = []

    # Extract details from the JSON response
    for place in result.get('pois', []):
        if place['typecode'] in accepted_typecodes:
            poi_list.append({
                'name': place['name'],
                'address': place.get('address', 'N/A'),
                'location': place.get('location', 'N/A')
            })
        else:
            print('查询POI类型错误，详见POI代码列表')

    # Print the resulting list
    print(poi_list)
    return poi_list

## Get distance from city-name, coordinate from address name, distance from coordinates

In [None]:
# Distance between two named places
def dist_twonames(add_1, add_2, api_key):
    """Return the distance in kilometers between two place names.

    Both names are geocoded with ``get_city_coordinates`` and the distance is
    obtained from ``get_distance``.

    Args:
        add_1: Name/address of the first place.
        add_2: Name/address of the second place.
        api_key: AMap key forwarded to both helpers.

    Returns:
        The distance in kilometers as a float, or ``None`` when either place
        cannot be geocoded or no distance is returned (the original raised
        ``UnboundLocalError`` / ``TypeError`` in those cases).
    """
    # Get coordinates
    origin_coords = get_city_coordinates(add_1, api_key)
    dest_coords = get_city_coordinates(add_2, api_key)

    if not (origin_coords and dest_coords):
        print("地名坐标获取出错.")
        return None

    # Get the distance; the helper returns None when the API has no result
    meters = get_distance(origin_coords, dest_coords, api_key)
    if meters is None:
        print("两地距离获取出错.")
        return None

    distance = float(meters)/1000
    print(f"{add_1} 和 {add_2} 两地相距: {str(distance)} KM")
    return distance


## Given a place name, fetch its coordinates from the AMap API -- used for the network metrics
# Function to get latitude and longitude of a city
def get_city_coordinates(city_name, api_key):
    """Geocode an address or city name with the AMap geocoding endpoint.

    Args:
        city_name: Address or place name to look up.
        api_key: AMap key sent as the ``key`` query parameter.

    Returns:
        ``[longitude, latitude]`` as a list of two strings (the API's
        ``location`` field split on the comma), or ``None`` when the
        response contains no geocoding match.
    """
    # Pass the query through `params` so requests URL-encodes it: place names
    # may contain characters ('&', '#', spaces) that would corrupt a
    # hand-built query string.  The timeout keeps a stalled connection from
    # hanging the notebook forever.
    response = requests.get(
        'https://restapi.amap.com/v3/geocode/geo',
        params={'address': city_name, 'key': api_key},
        timeout=10,
    )
    result = response.json()

    # .get() instead of [] so a payload without 'geocodes' (e.g. an error
    # response) yields None rather than a KeyError.
    geocodes = result.get('geocodes')
    if not geocodes:
        return None
    return geocodes[0]['location'].split(",")  # [longitude, latitude]


# Given two coordinates, get the distance between them
def get_distance(origin_coords, dest_coords, api_key):
    """Query the AMap distance endpoint for the distance between two points.

    Args:
        origin_coords: ``[longitude, latitude]`` strings of the origin.
        dest_coords: ``[longitude, latitude]`` strings of the destination.
        api_key: AMap key sent as the ``key`` query parameter.

    Returns:
        The ``distance`` field of the first result exactly as the API
        returns it (meters, according to the original comment), or ``None``
        when the response holds no results.
    """
    response = requests.get(
        'https://restapi.amap.com/v3/distance',
        params={
            'origins': ','.join(origin_coords),
            'destination': ','.join(dest_coords),
            'type': 1,
            'key': api_key,
        },
        timeout=10,  # do not hang the notebook on a stalled connection
    )
    result = response.json()

    # .get() so a payload without 'results' gives None instead of a KeyError.
    results = result.get('results')
    if not results:
        return None
    return results[0]['distance']  # Distance in meters

## Define formatting functions for O-D city pairs, time, and date

In [39]:
### Normalise O-D city names: drop the trailing "市" (city) / "县" (county)
def cityrename(city):
    """Return *city* without a trailing administrative suffix.

    The original computed the shortened name but returned the unmodified
    input, and left the local unbound when neither suffix was present.

    Args:
        city: City or county name, e.g. ``"上海市"``.

    Returns:
        The name with one trailing ``"市"`` or ``"县"`` removed; names
        without such a suffix (including the empty string) are returned
        unchanged.
    """
    if city.endswith(('县', '市')):
        return city[:-1]
    return city


In [40]:
### Crawl date: "today", "tomorrow", or an explicit date in YYYY-MM-DD format
def currentdate(date_input):
    """Resolve the crawl date to a date string.

    ``"today"`` and ``"tomorrow"`` are resolved against the current date in
    ``beijing_timezone``; any other value is returned untouched.
    """
    if date_input == "today":
        day_offset = 0
    elif date_input == "tomorrow":
        day_offset = 1
    else:
        # Anything else is taken to be an explicit date already.
        return date_input

    base_day = datetime.now(beijing_timezone).date()
    return str(base_day + timedelta(day_offset))

In [41]:
### Crawl time slot. Without a specific time the whole day is crawled.
### The result indexes t_list = [1,2,3,4,5], whose entries stand for
### whole day, 0-6, 6-12, 12-18 and 18-24.
def currenttime(time_input, date_input):
    """Map a clock time to one of four departure-time slots.

    When the crawl date is today (real-time crawling) the morning and
    afternoon slots end 15 minutes early (11:45 / 17:45) so that a session
    started just before a boundary is not too short to return data; for any
    other date the regular 12:00 / 18:00 boundaries apply.

    Fixes over the original: the 17:45 boundary was written as ``"17:45"``
    and failed to parse with the ``'%H-%M'`` format, and a one-element date
    list (which is what this notebook passes) never matched today's date.

    Args:
        time_input: Time of day as an ``'HH-MM'`` string.
        date_input: Crawl date as a ``'YYYY-MM-DD'`` string, or a list/tuple
            whose first element is that string.

    Returns:
        ``1`` for times up to 06:00, ``2`` up to the morning boundary,
        ``3`` up to the afternoon boundary, ``4`` otherwise.

    Raises:
        ValueError: If *time_input* does not match ``'%H-%M'``.
    """
    # The notebook calls this with date_list (a list); unwrap it so the
    # comparison with today's date string can succeed.
    if isinstance(date_input, (list, tuple)):
        date_input = date_input[0] if date_input else None

    today = str(datetime.now(beijing_timezone).date())
    if date_input == today:
        morning_end, afternoon_end = "11-45", "17-45"
    else:
        morning_end, afternoon_end = "12-00", "18-00"

    def parse(text):
        return datetime.strptime(text, '%H-%M')

    moment = parse(time_input)
    if moment <= parse("06-00"):
        return 1
    if moment <= parse(morning_end):
        return 2
    if moment <= parse(afternoon_end):
        return 3
    return 4


## Testing 1: using a given list of city pairs and store all the results

In [None]:
### Use the functions above to crawl one full day of ticket data for a whole city-pair list
## First define the run settings
date_input = [currentdate("2024-10-20")]
print(date_input)
t = 0 ### whole day
# time = currenttime("8:15") ### reference expression for targeting one moment in time instead
## The sample run reads an existing city-pair list from an Excel file; for an arbitrary
## pair, supply the two cities as a one-row table and pass it to the same function.

inputdir = r'D:\\微云同步助手\\332667113\\2023-交科院综合交通开放课题\\研究进度\\软著\\'
cities = pandas.read_excel(str(inputdir + "citypair_sample2.xlsx"),skiprows = 1, header = None)
print(cities.loc[0,:])

# NOTE(review): citypairticket is not defined in the visible part of this notebook.
ticket = citypairticket(cities, date_input, t)

pandas.DataFrame(ticket).to_excel(str(inputdir + date_input[0] +'_allday_citypair1'+'.xlsx'), index=False) # save the data


## Testing 2: given a specific city pair and retrieve the data

In [None]:
### Use the functions above to crawl ticket data for a single city pair
### (as written: 南京市 -> 上海市 on 2024-10-16 with t = 0)
date_input = [currentdate('2024-10-16')]
print(date_input)
t = 0 # no real-time clock value is needed for this run
ticket = []

cityO = "南京市"
cityD = "上海市"

# Create a DataFrame from the matrix
citypair = pandas.DataFrame([[cityO, cityD]])

print(str(citypair.loc[0,:][0]))

outputdir = r'D:\\微云同步助手\\332667113\\2023-交科院综合交通开放课题\\研究进度\\软著\\'
outputpath = str(outputdir + date_input[0] + '_allday_' + str(citypair.loc[0,:][0]) + "-" + str(citypair.loc[0,:][1]) + '.xlsx')

# NOTE(review): citypairticket is not defined in the visible part of this notebook.
ticket_test2 = citypairticket(citypair, date_input, t)

pandas.DataFrame(ticket_test2).to_excel(outputpath, index=False) # save the data

## Run the crawler without wrapping it in a function
### The crawler function still needs debugging. For now, run the cells below in order.

In [42]:
import time
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import pandas
from datetime import date, timedelta, datetime
import pytz
import requests
import itertools

# Timezone ('Asia/Shanghai') used for every "now"/"today" computation below.
beijing_timezone = pytz.timezone('Asia/Shanghai')

# AMAP API key
# NOTE(review): hard-coded credential — consider loading it from the environment.
api_key = 'a3088a73b4bedfcf95e01d08933aa701'


In [4]:
### Input city pairs - samples
inputdir = r'D:\\微云同步助手\\332667113\\2023-交科院综合交通开放课题\\研究进度\\软著\\' ### set your own input/output directory here
# Skip the first spreadsheet row and read the rest without a header row.
data = pandas.read_excel(str(inputdir + "citypair_sample2.xlsx"),skiprows = 1, header = None)

In [72]:
### Enter city pairs by hand
# cityO[k] -> cityD[k] forms the k-th origin-destination pair.
cityO = ["上海市",'宜兴市','苏州市']
cityD = ["宜兴市",'杭州市','合肥市']
data = pandas.DataFrame({'cityO':cityO, 'cityD':cityD})
print(data)

  cityO cityD
0   上海市   宜兴市
1   宜兴市   杭州市
2   苏州市   合肥市


In [None]:
### Build the city-pair table `data` from all cities of one province
#all_cities = pandas.read_excel(str(inputdir + 'AMap_adcode_citycode.xlsx'))
intercity_poi = pandas.read_excel(str(inputdir + 'AMAP_POI_intercity_Code.xlsx'))
region = '江苏省'
# NOTE(review): despite its name, city_code_list holds whatever
# get_cities_in_province returns (city names), not city codes.
city_code_list = get_cities_in_province('江苏省', api_key)
print(city_code_list)
# region and poi_code_list are assigned here but not used in this cell.
poi_code_list = intercity_poi['POI_ID']

cityO = city_code_list
cityD = city_code_list
data = pandas.DataFrame({'cityO':cityO, 'cityD':cityD})

# Generate all combinations of Column1 and Column2
# (the Cartesian product, so pairs with identical origin and destination are included)
combinations = list(itertools.product(data['cityO'], data['cityD']))
# Convert the combinations to a DataFrame
data = pandas.DataFrame(combinations, columns=['cityO', 'cityD'])
print(data)


In [83]:
### Crawl time settings
# Current Beijing time as 'HH-MM' and today's date as a one-element list.
time_list = datetime.now(beijing_timezone).strftime("%H-%M")
date_list = [currentdate("today")]

print(time_list)
print(date_list)

# NOTE(review): date_list is a one-element list, not a date string — confirm
# that currenttime handles that as intended.
time_i = currenttime(time_list, date_list)
#print(time_i)

# t is the departure-time interval option (1-5) on the 12306 site
t_list = [1,2,3,4,5] 
#time_i = 2 ### override the slot to crawl; indices 0-4 stand for whole day, 0-6, 6-12, 12-18, 18-24
t = t_list[time_i]


# default t = 1, allday
#t = 5

print(t)

04-18
['2024-10-19']
2


In [82]:
### d collects every ticket record: one dict per train found, or one
### placeholder dict (train number '无') per query that yielded nothing or failed
d = []

#### Launch Chrome
chrome_option = webdriver.ChromeOptions()
# p=r'C:\Users\lms\AppData\Local\Google\Chrome\User Data'
# chrome_option.add_argument('--user-data-dir='+p)
chrome_option.add_argument("--disable-blink-features=AutomationControlled")
chrome_option.add_experimental_option("excludeSwitches", ['enable-automation', 'enable-logging'])
chrome_option.add_argument('--ignore-certificate-errors')
chrome_option.add_argument('--ignore-ssl-errors')
browser = webdriver.Chrome(options=chrome_option)
# browser = webdriver.Chrome(executable_path='E:\\HSR-RTU\\chromedriver\chromedriver.exe')
#browser.maximize_window()
browser.minimize_window()
url = 'https://kyfw.12306.cn/otn/leftTicket/init?linktypeid=dc'
browser.get(url)

# Spin until the origin-station input exists on the page.
# NOTE(review): busy-wait with no sleep and no timeout — it never ends if the page fails to load.
while True:
    if len(browser.find_elements(by=By.ID, value='fromStationText')) != 0:
        break
ActionChains(browser).move_to_element(browser.find_element(by=By.ID, value='fromStationText')).perform()
try:
    # One iteration per row of `data` (origin city, destination city).
    for n in data.index[:]:
        p = 0
        city = data.loc[n, :].values
        print(city, n)
        # Drop a trailing '市' from each name before typing it into the form.
        city1 = city[0][:-1] + city[0][-1].replace('市', '')
        city2 = city[1][:-1] + city[1][-1].replace('市', '')
        # city1 = '太原'
        # city2 = '张家口'
        if city1 == city2:
            continue
        # NOTE(review): the loop variable `date` shadows the `date` class imported from datetime.
        for date in date_list:
            try:
                # --- origin station ---
                city1_input = browser.find_element(by=By.ID, value='fromStationText')
                city1_input.clear()
                city1_input.send_keys(city1)
                # Spin until the suggestion element 'citem_0' exists (no timeout).
                while True:
                    if len(browser.find_elements(by=By.ID, value='citem_0')) != 0:
                        break
                # p records whether a suggestion whose first word equals the city was clicked.
                p = 0
                for i in browser.find_elements(by=By.CLASS_NAME, value='cityline'):
                    try:
                        if city1 == i.text.split()[0]:
                            i.click()
                            p = 1
                            break
                    except:
                        pass
                # No exact suggestion clicked: press ENTER in the input instead.
                if p == 0:
                    city1_input.send_keys(Keys.ENTER)
                # --- destination station (same procedure) ---
                city2_input = browser.find_element(by=By.ID, value='toStationText')
                # ActionChains(browser).move_to_element(city2_input).perform()
                city2_input.clear()
                city2_input.send_keys(city2)
                while True:
                    if len(browser.find_elements(by=By.ID, value='citem_0')) != 0:
                        break
                p = 0
                for i in browser.find_elements(by=By.CLASS_NAME, value='cityline'):
                    try:
                        if city2 == i.text.split()[0]:
                            i.click()
                            p = 1
                            break
                    except:
                        pass
                if p == 0:
                    city2_input.send_keys(Keys.ENTER)
                # --- travel date, then trigger the query via a JavaScript click ---
                date_input = browser.find_element(by=By.ID, value='train_date')
                date_input.clear()
                date_input.send_keys(date)
                date_input.send_keys(Keys.ENTER)
                browser.execute_script('arguments[0].click();', browser.find_element(by=By.ID, value='query_ticket'))
                time.sleep(0.3)
                # Poll up to 50 times (without sleeping): p = 1 when element
                # 'no_filter_ticket_2' has an empty style attribute; stop early
                # as soon as the result table has rows.
                p = 0
                for i in range(50):
                    if len(browser.find_element(by=By.ID, value='no_filter_ticket_2').get_attribute('style')) == 0:
                        p = 1
                        break
                    if len(browser.find_elements(by=By.XPATH, value='//tbody[@id="queryLeftTable"]/tr')) != 0:
                        break
                # The next three checks each store a placeholder record
                # (train number '无') and move on to the next date.
                if p == 1:
                    item = {}
                    item['出发城市'] = city1
                    item['到达城市'] = city2
                    item['搜索日期'] = date
                    item['车次'] = '无'
                    item['出发地'] = city[0]
                    item['目的地'] = city[1]
                    #print(item)
                    d.append(item)
                    continue
                # Element 'cc_seat_type_O' is absent.
                if len(browser.find_elements(by=By.ID, value='cc_seat_type_O')) == 0:
                    item = {}
                    item['出发城市'] = city1
                    item['到达城市'] = city2
                    item['搜索日期'] = date
                    item['车次'] = '无'
                    item['出发地'] = city[0]
                    item['目的地'] = city[1]
                    #print(item)
                    d.append(item)
                    continue
                # The result table has no rows.
                if len(browser.find_elements(by=By.XPATH, value='//tbody[@id="queryLeftTable"]/tr')) == 0:
                    item = {}
                    item['出发城市'] = city1
                    item['到达城市'] = city2
                    item['搜索日期'] = date
                    item['车次'] = '无'
                    item['出发地'] = city[0]
                    item['目的地'] = city[1]
                    #print(item)
                    d.append(item)
                    continue
                # browser.execute_script('arguments[0].click();', browser.find_element(by=By.ID, value='cc_seat_type_O'))
                # browser.find_element(by=By.XPATH, value=f'//*[@id="cc_start_time"]/option[{t}]').click()
                # Pick the t-th option of the 'cc_start_time' dropdown (t is set in an earlier cell).
                browser.find_element(by=By.XPATH, value=f'//*[@id="cc_start_time"]/option[{t}]').click()
                time.sleep(0.3)
                tr_list = browser.find_elements(by=By.XPATH, value='//tbody[@id="queryLeftTable"]/tr')
                number_list = browser.find_elements(by=By.CLASS_NAME, value='number')
                s_list = browser.find_elements(by=By.CLASS_NAME, value='cdz')
                # NOTE(review): this rebinds the global t_list, which an earlier cell sets to [1,2,3,4,5].
                t_list = browser.find_elements(by=By.CLASS_NAME, value='cds')
                c = 0
                new_tr_list = []
                for i in tr_list:
                    try:
                        # Click the first cell (skipping cell 0) whose text is not '候补'
                        # — presumably this expands the row's price details; verify on the live page.
                        for j in i.find_elements(by=By.TAG_NAME, value='td')[1:]:
                            if j.text != '候补':
                                j.click()
                                break
                        c += 1
                    except:
                        pass
                    try:
                        # Keep only rows whose first cell has non-blank text.
                        if len(i.find_element(by=By.TAG_NAME, value='td').text.strip()) != 0:
                            new_tr_list.append(i)
                    except:
                        pass
                time.sleep(0.3)
                # For every table row containing a cell of class 'p-num',
                # collect its cells 1..11 as the price cells of one train.
                price_list = []
                for i in browser.find_elements(by=By.XPATH, value='//tbody[@id="queryLeftTable"]/tr'):
                    try:
                        for j in i.find_elements(by=By.TAG_NAME, value='td'):
                            if j.get_attribute('class') == 'p-num':
                                price_list.append([i.find_elements(by=By.TAG_NAME, value='td')[1],
                                                   i.find_elements(by=By.TAG_NAME, value='td')[2],
                                                   i.find_elements(by=By.TAG_NAME, value='td')[3],
                                                   i.find_elements(by=By.TAG_NAME, value='td')[4],
                                                   i.find_elements(by=By.TAG_NAME, value='td')[5],
                                                   i.find_elements(by=By.TAG_NAME, value='td')[6],
                                                   i.find_elements(by=By.TAG_NAME, value='td')[7],
                                                   i.find_elements(by=By.TAG_NAME, value='td')[8],
                                                   i.find_elements(by=By.TAG_NAME, value='td')[9],
                                                   i.find_elements(by=By.TAG_NAME, value='td')[10],
                                                   i.find_elements(by=By.TAG_NAME, value='td')[11]
                                                   ])
                                break
                    except:
                        pass
                # Build one record per train number. The lists number_list, s_list,
                # t_list, new_tr_list and price_list are read at the same index i;
                # an IndexError from a length mismatch lands in the except below.
                for i in range(len(number_list)):
                    item = {}
                    item['出发城市'] = city1
                    item['到达城市'] = city2
                    item['搜索日期'] = date
                    item['车次'] = number_list[i].text
                    item['出发地'] = s_list[i].find_elements(by=By.TAG_NAME, value='strong')[0].text
                    item['目的地'] = s_list[i].find_elements(by=By.TAG_NAME, value='strong')[1].text
                    item['出发时间'] = t_list[i].find_elements(by=By.TAG_NAME, value='strong')[0].text
                    item['到达时间'] = t_list[i].find_elements(by=By.TAG_NAME, value='strong')[1].text
                    # Cells 1..11 of the train row hold the availability text per seat
                    # class; price_list[i][k-1] is the matching price cell.
                    item['商务座'] = new_tr_list[i].find_elements(by=By.TAG_NAME, value='td')[1].text
                    item['商务座票价'] = price_list[i][0].text
                    item['优选一等座'] = new_tr_list[i].find_elements(by=By.TAG_NAME, value='td')[2].text
                    item['优选一等座票价'] = price_list[i][1].text
                    item['一等座'] = new_tr_list[i].find_elements(by=By.TAG_NAME, value='td')[3].text
                    item['一等座票价'] = price_list[i][2].text
                    item['二等座'] = new_tr_list[i].find_elements(by=By.TAG_NAME, value='td')[4].text
                    item['二等座票价'] = price_list[i][3].text
                    item['高级软卧'] = new_tr_list[i].find_elements(by=By.TAG_NAME, value='td')[5].text
                    item['高级软卧票价'] = price_list[i][4].text
                    item['软卧'] = new_tr_list[i].find_elements(by=By.TAG_NAME, value='td')[6].text
                    item['软卧票价'] = price_list[i][5].text
                    item['硬卧'] = new_tr_list[i].find_elements(by=By.TAG_NAME, value='td')[7].text
                    item['硬卧票价'] = price_list[i][6].text
                    item['软座'] = new_tr_list[i].find_elements(by=By.TAG_NAME, value='td')[8].text
                    item['软座票价'] = price_list[i][7].text
                    item['硬座'] = new_tr_list[i].find_elements(by=By.TAG_NAME, value='td')[9].text
                    item['硬座票价'] = price_list[i][8].text
                    item['无座'] = new_tr_list[i].find_elements(by=By.TAG_NAME, value='td')[10].text
                    item['无座票价'] = price_list[i][9].text
                    # print(item)
                    item['其他'] = new_tr_list[i].find_elements(by=By.TAG_NAME, value='td')[11].text
                    item['其他票价'] = price_list[i][10].text
                    #print(item)
                    #print(i, item['出发地'], item['目的地'])
                    d.append(item)
            except Exception as e:
                # Any failure for this pair/date is recorded as a placeholder row.
                # NOTE(review): the exception `e` is discarded, so failures are
                # indistinguishable from genuine "no train" results; records already
                # appended for this pair before the failure stay in d.
                item = {}
                item['出发城市'] = city1
                item['到达城市'] = city2
                item['搜索日期'] = date
                item['车次'] = '无'
                item['出发地'] = city[0]
                item['目的地'] = city[1]
                # print(item)
                d.append(item)
except Exception as e:
    print(e)
    print('出错')
print('citypair1 finished')

# NOTE(review): close() closes the current window; consider quit() if the driver process should end too.
browser.close()


['上海市' '宜兴市'] 0
['宜兴市' '杭州市'] 1
['苏州市' '合肥市'] 2
citypair1 finished


In [80]:
# Output file name: <date>-<HH-MM>-<first origin>-<first destination>.xlsx inside inputdir.
# Only the first entries of cityO / cityD appear in the name, whatever pairs were crawled.
outputpath = str(inputdir+date_list[0]+'-'+str(time_list)+'-'+cityO[0]+"-"+cityD[0]+'.xlsx')
pandas.DataFrame(d).to_excel(outputpath, index=False)

## Analyze the ticket information retrieved from the previous steps

In [48]:
## 针对给定城市对组合，计算复杂网络指标

# 获取城市间距离

#add_1 = '上海虹桥站'
#add_2 = '上海松江站'

def dist_twonames(add_1, add_2):
    """Return the distance in kilometers between two place names.

    Both names are geocoded with ``get_city_coordinates`` and the distance is
    obtained from ``get_distance``, using the module-level ``api_key``.

    Args:
        add_1: Name/address of the first place.
        add_2: Name/address of the second place.

    Returns:
        The distance in kilometers as a float, or ``None`` when either place
        cannot be geocoded or no distance is returned (the original raised
        ``UnboundLocalError`` / ``TypeError`` in those cases).
    """
    # Get coordinates
    origin_coords = get_city_coordinates(add_1, api_key)
    dest_coords = get_city_coordinates(add_2, api_key)

    if not (origin_coords and dest_coords):
        print("地名坐标获取出错.")
        return None

    # Get the distance; the helper returns None when the API has no result
    meters = get_distance(origin_coords, dest_coords, api_key)
    if meters is None:
        print("两地距离获取出错.")
        return None

    distance = float(meters)/1000
    print(f"{add_1} 和 {add_2} 两地相距: {str(distance)} KM")
    return distance

#dist_ = dist_twonames(add_1, add_2)


In [49]:
### Remaining seats for one seat class
### Count the tickets remaining. If any entry starts with '有', tickets are
### considered abundant; otherwise sum all the numbers (no number -> zero).
def remainingseats(ticketdb, seatclass):
    """Estimate the number of seats left in one seat class.

    Args:
        ticketdb: DataFrame of ticket records.
        seatclass: Name of the availability column to inspect.

    Returns:
        ``10000`` (a stand-in maximum) when any entry starts with ``'有'``;
        otherwise the sum of the entries that parse as numbers.
    """
    availability = ticketdb[seatclass]
    # Compare on a string view: after an Excel round-trip the column can hold
    # ints and NaN next to text, and a raw .str mask would then contain NaN
    # (or .str would not exist at all for a numeric column).
    plenty_left = availability.astype(str).str.startswith('有').any()
    if plenty_left:
        return 10000  # a max value
    return pandas.to_numeric(availability, errors='coerce').sum()

# mean fare of one seat class
def avgprice(ticketdb, seatclass):
    """Average the fares in column *seatclass*.

    The first character of every entry is dropped before parsing;
    entries that still do not parse as numbers are ignored.
    """
    without_prefix = ticketdb[seatclass].str.slice(start=1)
    fares = pandas.to_numeric(without_prefix, errors='coerce')
    return fares.mean()

# lowest fare of one seat class
def minprice(ticketdb, seatclass):
    """Return the smallest fare in column *seatclass*.

    The first character of every entry is dropped before parsing;
    entries that still do not parse as numbers are ignored.
    """
    without_prefix = ticketdb[seatclass].str.slice(start=1)
    fares = pandas.to_numeric(without_prefix, errors='coerce')
    return fares.min()

# Trains on which this seat class cannot be booked
def noseats(ticketdb, seatclass):
    """Count the records whose availability starts with ``'-'`` or ``'候补'``.

    Args:
        ticketdb: DataFrame of ticket records.
        seatclass: Name of the availability column to inspect.

    Returns:
        Number of matching rows as an ``int``.
    """
    # Work on a string view so NaN / numeric cells (common after reading the
    # data back from Excel) count as "no match" instead of producing a NaN
    # mask, which cannot be used for boolean indexing.
    labels = ticketdb[seatclass].astype(str)
    unavailable = labels.str.startswith('-') | labels.str.startswith('候补')
    return int(unavailable.sum())


In [50]:
### Load the crawled ticket data
# NOTE: d is rebound here from the list filled by the crawler to a DataFrame read back from disk.
d = pandas.read_excel(outputpath)

#### Distinct departure and arrival cities
origin = d['出发城市'].unique()
destination = d['到达城市'].unique()

print(origin)
print(destination)
#print(d)

## Distinct departure and arrival stations
originS = d['出发地'].unique()
destinationS = d['目的地'].unique()    
print(originS)
print(destinationS)    

['连云港' '南通' '南京' '苏州' '泰州' '镇江' '盐城' '淮安' '宿迁' '常州' '无锡' '扬州' '徐州']
['南通' '南京' '苏州' '泰州' '镇江' '盐城' '淮安' '宿迁' '常州' '无锡' '扬州' '徐州' '连云港']
['连云港' '连云港东' '连云港市' '南通' '南通西' '南通市' '南京南' '南京' '南京市' '仙林' '苏州' '苏州园区'
 '苏州新区' '苏州北' '苏州市' '泰州' '姜堰' '泰州市' '镇江' '大港南' '丹徒' '镇江南' '镇江市' '盐城'
 '盐城大丰' '盐城市' '淮安东' '淮安' '宿迁市' '宿迁' '洋河' '常州' '常州北' '常州市' '戚墅堰' '金坛' '武进'
 '无锡' '无锡新区' '无锡东' '无锡市' '江阴' '惠山' '扬州东' '江都' '扬州' '徐州' '徐州东' '徐州市']
['南通西' '南通' '南京南' '南京' '苏州' '苏州北' '苏州园区' '泰州' '姜堰' '大港南' '镇江' '丹徒' '盐城'
 '盐城大丰' '淮安东' '宿迁市' '常州市' '无锡市' '扬州东' '江都' '扬州' '徐州市' '连云港' '苏州市' '淮安'
 '宿迁' '洋河' '扬州市' '徐州东' '徐州' '南通市' '苏州新区' '镇江南' '淮安市' '无锡' '无锡新区' '江阴'
 '无锡东' '惠山' '连云港东' '南京市' '泰州市' '镇江市' '常州' '常州北' '戚墅堰' '盐城市' '连云港市' '仙林'
 '武进' '金坛']


In [None]:
## Network-feature analysis based on the ticket data
#d_df = pandas.DataFrame(d)
#print(networkmetric)
networkmetric = []

# Running sum of the lowest 2nd-class fare over all connected city pairs.
dij_effi_price = 0
for i in origin:
    for j in destination:
        pairs = d[(d['出发城市'] == i) & (d['到达城市'] == j)]
        # Skip pairs without records or with only placeholder ('无') rows.
        if len(pairs) == 0 or not (pairs['车次'] != '无').any():
            continue

        ## hsr connection includes G and D
        hsr_connection = pairs[pairs['车次'].str.startswith('G')].shape[0] + pairs[pairs['车次'].str.startswith('D')].shape[0]

        item = {}
        item['CityO'] = i
        item['CityD'] = j
        item['Distance'] = dist_twonames(i,j)
        item['Tot_connection'] = len(pairs)
        item['HSR_connection'] = hsr_connection

        ## if the 2nd-class availability contains '有' there are abundant seats;
        ## otherwise all the numbers are summed (see remainingseats)
        item['2ndClass_remaining'] = remainingseats(pairs, '二等座')
        item['2ndClass_price'] = avgprice(pairs, '二等座票价')
        item['2ndClass_sellout'] = noseats(pairs, '二等座')

        item['1stClass_remaining'] = remainingseats(pairs, '一等座')
        item['1stClass_price'] = avgprice(pairs, '一等座票价')
        item['1stClass_sellout'] = noseats(pairs, '一等座')

        item['BusinessClass_remaining'] = remainingseats(pairs, '商务座')
        item['BusinessClass_price'] = avgprice(pairs, '商务座票价')
        item['BusinessClass_sellout'] = noseats(pairs, '商务座')

        ## shortest path between i and j: here we define as the least price
        item['lowest_price'] = minprice(pairs, '二等座票价')

        ## calculate the number of remaining HSR that has seats available
        # NOTE(review): the business-class sell-out count is ADDED while the
        # other two are subtracted — confirm whether '+' is intended.
        item['SeatsAvailableLines'] = len(pairs)-item['2ndClass_sellout'] - item['1stClass_sellout'] + item['BusinessClass_sellout']

        ## calculate redundancy between O and D, taking the lowest price as the shortest path
        item['Redundancy'] = item['SeatsAvailableLines']/item['lowest_price']

        ## calculate the remaining seats of the line
        item['TotRemainingSeats'] = item['2ndClass_remaining'] + item['1stClass_remaining'] + item['BusinessClass_remaining']

        ## accumulate network efficiency regarding the price
        # Fixed: the original called minprice on the availability column
        # '二等座' instead of the fare column, which always produced NaN.
        dij_effi_price += item['lowest_price']

        networkmetric.append(item)

print(networkmetric)


In [68]:
## Save the per-pair metrics to the networkmetric file

pandas.DataFrame(networkmetric).to_excel(inputdir+'networkmetric.xlsx', index=False)

In [71]:
### Further network indicators
### total number of nodes in the network. We define the number of nodes as number of stations

# networkmetric is rebound from the in-memory list to the table read back from disk.
networkmetric = pandas.read_excel(inputdir+'networkmetric.xlsx')
# Distinct station names appearing as a departure or an arrival station.
tot_nodes = len(list(set(originS.tolist() + destinationS.tolist())))
print(tot_nodes)
### calculate network density
# number of ticket records divided by the number of ordered node pairs n*(n-1)
network_den = len(d)/(tot_nodes*(tot_nodes-1))
print(network_den)

### calculate network effective density
# print(networkmetric['SeatsAvailableLines'])
#print(pandas.to_numeric(networkmetric['SeatsAvailableLines']))
network_effective_den = pandas.to_numeric(networkmetric['SeatsAvailableLines']).sum()/(tot_nodes*(tot_nodes-1))
print(network_effective_den)

### calculate network efficiency using the lowest price of each HSR
# sum of the per-pair lowest prices divided by n*(n-1)
network_effi_price = pandas.to_numeric(networkmetric['lowest_price']).sum()/(tot_nodes*(tot_nodes-1))
print(network_effi_price)

### calculate network efficiency using the remaining seats of all the HSR
network_effi_seats = pandas.to_numeric(networkmetric['TotRemainingSeats']).sum()/(tot_nodes*(tot_nodes-1))
print(network_effi_seats)


51
1.9768627450980392
2.395294117647059
3.7537254901960786
1145.4627450980392


## Analyze inter-city traveling mode (excluding coach)

In [None]:
## need to use flightaware API

### Look up inter-city transport POIs for the first city of one province
#all_cities = pandas.read_excel(str(inputdir + 'AMap_adcode_citycode.xlsx'))
intercity_poi = pandas.read_excel(str(inputdir + 'AMAP_POI_intercity_Code.xlsx'))
region = '江苏省'

city_code_list = get_cities_code_in_province('江苏省', api_key)
city_list = get_cities_in_province('江苏省', api_key)
# print(city_code_list)
# region, city_code_list and poi_code_list are assigned but not used further in this cell.
poi_code_list = intercity_poi['POI_ID']
print(intercity_poi)

# Geocode the first city of the province.
city_coord = get_city_coordinates(city_list[0], api_key)

# Query POIs with type code '150104' (presumably airports, going by the variable name).
# NOTE(review): confirm what intercity_mode_location returns before relying on `airports`.
airports = intercity_mode_location(city_coord, '150104', api_key)
print(airports)

In [None]:
## create a new data for the ticket retrieval
city_list = get_cities_in_province('江苏省', api_key)
cityO = city_list
cityD = city_list
data = pandas.DataFrame({'cityO':cityO, 'cityD':cityD})

# Generate all combinations of Column1 and Column2
# (the Cartesian product, so pairs with identical origin and destination are included)
combinations = list(itertools.product(data['cityO'], data['cityD']))
# Convert the combinations to a DataFrame
data = pandas.DataFrame(combinations, columns=['cityO', 'cityD'])
# print(data)

### TODO: from a city name, get the city code, then use the POI code to fetch all station names and positions

