In [16]:
from py2neo import *
import pandas as pd 
from tqdm.notebook import tqdm

graph = Graph(host='192.168.123.87', auth=('neo4j', 'qwee'))

metro_station_fp = "./shenzhen_metrostation_2014.csv"
metro_station_df = pd.read_csv(metro_station_fp, header=0, encoding='gb2312')
metro_station_df.columns = ['city', 'city_code', 'line_name', 'line_code', 'station_name', 'station_code', 'lon', 'lat', 'poi_code', 'en']
metro_station_df.head()

Unnamed: 0,city,city_code,line_name,line_code,station_name,station_code,lon,lat,poi_code,en
0,深圳市,4403,地铁一号线,440300024063,罗湖,440300024063031,114.118666,22.532083,BV10249978,Luohu
1,深圳市,4403,地铁一号线,440300024063,国贸,440300024063030,114.118909,22.53968,BV10243546,Guomao
2,深圳市,4403,地铁一号线,440300024063,老街,440300024060022,114.116939,22.544232,BV10249964,Laojie
3,深圳市,4403,地铁一号线,440300024063,大剧院,440300024063028,114.107811,22.5418,BV10246013,Grand Theater
4,深圳市,4403,地铁一号线,440300024063,科学馆,440300024046036,114.095173,22.540618,BV10245558,Science Museum


#### Create

In [17]:
NODE_LINE = '地铁线路'
NODE_STATION = '地铁站'
NODE_USER = '用户'


RELATIONSHIP_PASS = '经过'
RELATIONSHIP_NEARBY = '相邻'
RELATIONSHIP_TAKEUP = '上车'
RELATIONSHIP_DROPOFF = '下车'

In [18]:
# 地铁线路
line = Node(NODE_LINE, name='地铁一号线',code='1')
graph.create(line)

line = Node(NODE_LINE, name='地铁二号线',code='3')
graph.create(line)

line = Node(NODE_LINE, name='地铁三号线',code='5')
graph.create(line)

line = Node(NODE_LINE, name='地铁四号线',code='7')
graph.create(line)

line = Node(NODE_LINE, name='地铁五号线',code='9')
graph.create(line)


In [19]:
# 地铁站
for line_name, line_df in metro_station_df.groupby(['line_name']):
    line = graph.nodes.match(name=line_name).first()
    # 地铁路线 经过 地铁站
    for row in line_df.itertuples():
        station_name = getattr(row, 'station_name') 
        station_code = getattr(row, 'station_code')   
        lon = getattr(row, 'lon')   
        lat = getattr(row, 'lat')  
        en = getattr(row, 'en')  
        # 判断是否已经存在，如果已经存在则不再创建节点
        _station = graph.nodes.match(NODE_STATION, name=station_name).first()
        if not _station:
            station = Node(NODE_STATION, name=station_name, code=station_code, lon=lon, lat=lat, en=en)
            graph.create(station)
        else:
            station = _station

        r = Relationship(line, RELATIONSHIP_PASS, station)
        graph.create(r)
            
    # 地铁站 相邻 地铁站
    station_names = line_df.station_name.to_list()
    for i in range(0, len(station_names)-1):
        _tuple = (station_names[i], station_names[i+1])
        a = graph.nodes.match(NODE_STATION, name=_tuple[0]).first()
        b = graph.nodes.match(NODE_STATION, name=_tuple[-1]).first()
        r1 = Relationship(b, RELATIONSHIP_NEARBY, a)
        graph.create(r1)
        r2 = Relationship(a, RELATIONSHIP_NEARBY, b)
        graph.create(r2)

In [20]:
# 上下车
transaction_fp = 'DT_SZT_20140613.csv'
transaction_df = pd.read_csv(transaction_fp, 
                             names=['usr_id', '_1', '_2', 'time', 'line', 'station','_3','tag'],
                            encoding='gb2312',
                            index_col=None, )
transaction_df['usr_id'] = transaction_df.usr_id.astype(str)
transaction_df.head()

Unnamed: 0,usr_id,_1,_2,time,line,station,_3,tag
0,294619939,3,2.85,2014-06-13 00:00:20,地铁五号线,黄贝岭,OGT-129,出站
1,320179727,3,2.85,2014-06-13 00:00:26,地铁五号线,黄贝岭,OGT-122,出站
2,880200292,0,0.0,2014-06-13 00:31:26,地铁三号线,大运,AGM-119,进站
3,880015260,0,0.0,2014-06-13 00:54:03,地铁二号线,安托山,AGT-104,进站
4,880025503,2,0.0,2014-06-13 00:03:16,地铁五号线,布心,OGT-117,出站


In [21]:
print(transaction_df.shape)
transaction_df = transaction_df.dropna()
print(transaction_df.shape)

(3425209, 8)
(3222410, 8)


In [22]:
# create user
all_usr_id_list = []
all_usr_id_list.extend(list(set(transaction_df.usr_id.to_list())))
for u in tqdm(all_usr_id_list):
    n = Node(NODE_USER, code=u)
    graph.create(n)

  0%|          | 0/1026897 [00:00<?, ?it/s]

In [None]:
# 乘车记录
def func(row):
    try:
        station = row.station
        if not station: # none or nan
            return 
        elif station == '深圳北':
            station = '深圳北站'
        elif station == '深圳大学':
            station = '深大'
        elif '站' in station:
            if station == '深圳北站':
                pass
            else:
                station = station[:-1] # 新秀站 => 新秀
    except:
        print('出错,站点名称：'+str(station))
    if row.tag == '出站':
        match_station = graph.nodes.match(name=station).first()
        match_usr = graph.nodes.match(code=row.usr_id).first()
        if match_station == None or match_usr == None:
            print(station)
            print(row.usr_id)
        else:
            r = Relationship(match_usr, RELATIONSHIP_DROPOFF, match_station, time=row.time, line=row.line)
            graph.create(r)
    elif row.tag == '进站':
        match_station = graph.nodes.match(name=station).first()
        match_usr = graph.nodes.match(code=row.usr_id).first()
        if match_station == None or match_usr == None:
            print(station)
            print(row.usr_id)
        else:
            r = Relationship(match_usr, RELATIONSHIP_TAKEUP, match_station, time=row.time, line=row.line)
            graph.create(r)
    else:
        return 

tqdm.pandas(desc="my bar!")
transaction_df.progress_apply(func, axis=1)

my bar!:   0%|          | 0/3222410 [00:00<?, ?it/s]

#### query

In [105]:
graph.nodes.match(name='地铁4号线').all()

[Node('地铁线路', code='7', direction='福田口岸-清湖', name='地铁4号线'),
 Node('地铁线路', code='8', direction='清湖-福田口岸', name='地铁4号线')]

In [94]:
_node = graph.nodes.match(name='深圳北站').first()
match_relations = graph.relationships.match(nodes=[_node, None], r_type=RELATIONSHIP_NEARBY).all()
match1_nodestr = [mr.nodes[-1]['name'] for mr in match_relations]
match1_nodestr

['民治', '长岭陂', '红山', '白石龙']

In [100]:
_node = graph.nodes.match(name='地铁4号线').first()
match_relations = graph.relationships.match(nodes=[_node, None], r_type=RELATIONSHIP_PASS).all()
match2_nodestr = [mr.nodes[-1]['name'] for mr in match_relations]
match2_nodestr

['清湖',
 '龙华',
 '龙胜',
 '上塘',
 '红山',
 '深圳北站',
 '白石龙',
 '民乐',
 '上梅林',
 '莲花北',
 '少年宫',
 '市民中心',
 '会展中心',
 '福民',
 '福田口岸']

In [101]:
set(match1_nodestr).intersection(set(match2_nodestr))

{'白石龙', '红山'}

#### delete all

In [15]:
graph.delete_all()