In [205]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import gmplot
import torch
from torch import nn
from tqdm import tqdm
from tqdm import trange
import h3
from collections import OrderedDict
from sklearn.preprocessing import  OneHotEncoder
import torch.nn.functional as F


In [206]:

def dataset(user, start, end, step, data_root='../Geolife Trajectories 1.3/Data/'):
    """Load a slice of one Geolife user's trajectory files and down-sample it.

    Args:
        user: User folder name under ``data_root`` (e.g. ``"006"``).
        start: Index of the first trajectory file to read (files are sorted
            by name before slicing).
        end: Index one past the last trajectory file to read.
        step: Keep every ``step``-th row of the concatenated table.
        data_root: Directory that contains the per-user folders. Defaults to
            the location the notebook has always used.

    Returns:
        A ``pandas.DataFrame`` with the columns ``lat``, ``lng``, ``alt``,
        ``date`` and ``time``, containing every ``step``-th point.

    Raises:
        ValueError: If ``filelist[start:end]`` selects no files.
    """
    traj_dir = os.path.join(data_root, user, 'Trajectory')
    # os.listdir() returns names in arbitrary order; sort so that the
    # start/end slice is reproducible.
    filelist = sorted(os.listdir(traj_dir))
    selected = filelist[start:end]
    if not selected:
        raise ValueError(
            'No trajectory files selected in {!r} for slice [{}:{}]'.format(
                traj_dir, start, end))

    names = ['lat', 'lng', 'zero', 'alt', 'days', 'date', 'time']
    # Per the Geolife user guide each .plt file starts with six metadata
    # lines. They are skipped explicitly: the previous ``header=6`` together
    # with ``names=`` treated the first *data* row as a header and silently
    # discarded it.
    df_list = [
        pd.read_csv(os.path.join(traj_dir, f), skiprows=6, header=None,
                    names=names, index_col=False)
        for f in selected
    ]
    df = pd.concat(df_list, ignore_index=True)
    # 'zero' and 'days' are not used downstream.
    df = df.drop(columns=['zero', 'days'])
    return df.iloc[::step, :]
def synthetic_data(df_min):
    """Turn the ``lat``/``lng`` columns of ``df_min`` into an (N, 2) tensor.

    Each column is converted to a float32 tensor created with
    ``requires_grad=True``, reshaped to a column vector, and the two are
    concatenated side by side, so column 0 is latitude and column 1 is
    longitude.
    """
    columns = []
    for col_name in ('lat', 'lng'):
        values = torch.tensor(df_min[col_name].tolist(),
                              dtype=torch.float, requires_grad=True)
        columns.append(values.reshape((-1, 1)))
    return torch.cat(columns, 1)
# synthetic_data returns points as (latitude, longitude) rows, e.g. shape torch.Size([368, 2])

# dataset() output keeps 5 columns per point: lat, lng, alt, date, time.
# Files 0-19 of user "006" are used for training, files 20-24 for testing;
# both are thinned to every 20th point.
train_dataset = dataset(user="006", start=0, end=20, step=20)
test_dataset = dataset(user="006", start=20, end=25, step=20)
# synthetic_data() output keeps 2 columns per point: lat, lng.
train_data = synthetic_data(df_min=train_dataset)
test_data = synthetic_data(df_min=test_dataset)
# Train and test points stacked row-wise (train rows first).
all_data = torch.cat((train_data, test_data), dim=0)





def draw(list,number,b,k):
    """Render a trajectory plus its predicted tail to ``user<number>.html``.

    Args:
        list: Sequence of (lat, lng) pairs; converted with ``torch.tensor``.
        number: Value substituted into the output file name.
        b: Third argument handed to ``gmplot.GoogleMapPlotter`` (the caller in
            this notebook passes 20 — presumably the zoom level; confirm
            against the gmplot documentation).
        k: Split position: points ``[:k-1]`` are printed as the "old" part,
            points ``[k-1:]`` are drawn in red on top of the full blue track.
    """
    new_list = torch.tensor(list)
    # One 0-dim tensor per point for each coordinate.
    lat = [point[0] for point in new_list]
    lng = [point[1] for point in new_list]

    split = k - 1
    old_lat = torch.stack(lat[:split])
    old_lng = torch.stack(lng[:split])
    print(old_lat)
    print(old_lng)

    # The red segment starts at point k-1 and runs to the end of the list.
    lat_predict = lat[split:]
    lng_predict = lng[split:]
    print(lat_predict)
    print(lng_predict)

    lat = torch.stack(lat)
    lng = torch.stack(lng)
    # Map is centred on the first point.
    gmap = gmplot.GoogleMapPlotter(lat[0], lng[0], b)
    gmap.plot(lat, lng, color='b', lw=20)  # full track
    gmap.plot(lat_predict, lng_predict, color='r', lw=20)  # highlighted tail
    gmap.draw("user{}.html".format(number))  # write the HTML map
    print("over")
def geo_t_h3_norepeat(data):
    """Map points to resolution-10 H3 cells, dropping repeated cells.

    Each item of ``data`` is indexed as ``item[0]``, ``item[1]`` and passed to
    ``h3.geo_to_h3`` with resolution 10. The result is an ``OrderedDict``
    whose keys are the distinct cells in order of first appearance; every
    value is ``None``.
    """
    cells = (h3.geo_to_h3(point[0], point[1], 10) for point in data)
    # fromkeys keeps the first occurrence of each cell and preserves order.
    return OrderedDict.fromkeys(cells)
def h3_t_geo(data):
    """Convert every H3 cell id in ``data`` with ``h3.h3_to_geo``.

    Returns a list with one converted entry per input cell, in input order.
    """
    return [h3.h3_to_geo(cell) for cell in data]
def generate_h3_list(data,label='repeat'):
    """Convert points to a NumPy array of resolution-10 H3 cell ids.

    Args:
        data: Iterable of points indexed as ``point[0]``, ``point[1]``.
        label: ``'repeat'`` keeps one cell per input point (duplicates
            included); ``'no-repeat'`` keeps each distinct cell once, in order
            of first appearance. Any other value yields an empty array.

    Returns:
        ``numpy.ndarray`` of H3 cell ids (empty for an unrecognised label).
    """
    if label == 'repeat':
        # One cell per point, duplicates preserved.
        return np.array([h3.geo_to_h3(point[0], point[1], 10) for point in data])
    if label == 'no-repeat':
        # De-duplicated cells, order of first appearance.
        unique_cells = geo_t_h3_norepeat(data)
        return np.array(list(unique_cells.keys()))
    return np.array([])
    
    
    
# TODO: build a version of the model that trains on mini-batches

In [207]:
# H3 cell sequences (duplicates kept) for the train and test trajectories.
Train_h3_list = generate_h3_list(train_data, label='repeat')
Test_h3_list = generate_h3_list(test_data, label='repeat')
# Vocabulary: every distinct H3 cell seen in train + test (also H3 ids).
vocab = generate_h3_list(all_data, label='no-repeat')

# scikit-learn renamed OneHotEncoder's `sparse` argument to `sparse_output`
# (added in 1.2, old name removed in 1.4). Try the new name first and fall
# back to the old one so the notebook runs on either side of the rename.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)
# Fit on the vocabulary as a single categorical column; transform() then
# returns dense one-hot rows.
encoder = encoder.fit(vocab.reshape(-1, 1))


# NOTE: this helper is not called by any of the visible notebook cells.
def encoding(data):
    """One-hot encode ``data`` with the module-level ``encoder``.

    ``data`` is reshaped to a single column before being handed to
    ``encoder.transform``; the transform's result is returned unchanged.
    """
    column = data.reshape(-1, 1)
    return encoder.transform(column)
def decoding(one_hot_data):
    """Invert a one-hot encoding using the module-level ``encoder``.

    Returns whatever ``encoder.inverse_transform`` produces for
    ``one_hot_data``.
    """
    decoded = encoder.inverse_transform(one_hot_data)
    return decoded


# Lookup from integer index to the H3 cell at that position in `vocab`.
index_h = {position: cell for position, cell in enumerate(vocab)}

# Reverse lookup from H3 cell to its integer index.
h_index = {cell: position for position, cell in enumerate(vocab)}


def label_encode(data):
    """Translate H3 cell ids into their integer indices via ``h_index``.

    Raises ``KeyError`` for a cell that is not a key of ``h_index``.
    """
    indices = [h_index[cell] for cell in data]
    return np.array(indices)
def label_decode(data):
    """Translate integer indices back into H3 cell ids via ``index_h``.

    Raises ``KeyError`` for an index that is not a key of ``index_h``.
    """
    cells = [index_h[position] for position in data]
    return np.array(cells)




In [208]:
# y is a sequence (here: H3 cell ids); k is the window length.
def dataloader(y,k):
    """Build sliding-window (input, target) pairs from ``y``.

    For every start offset ``s`` in ``range(len(y) - k + 1)`` the pair is
    ``(y[s:s+k], y[s+k:s+k+1])``: ``k`` consecutive items and the single item
    that follows them.

    Note: for the final offset the target slice starts at ``len(y)`` and is
    therefore empty. Callers in this notebook skip that pair by slicing the
    result with ``[:-1]``.
    """
    return [
        (y[start:start + k], y[start + k:start + k + 1])
        for start in range(len(y) - k + 1)
    ]

# def test_dataloader(y,k):
# Each sample looks back over a window of 10 steps to predict the next one.
train_dataloader = dataloader(y=Train_h3_list, k=10)
test_dataloader = dataloader(y=Test_h3_list, k=10)
# Mini-batch training is not implemented yet; unfinished sketch kept below.
# def data_loader(dataset,batchsize,drop_last=True):
#     for i in range(0,len(dataset)-batchsize+1,batchsize):
#         batch = dataset[i:i+batchsize]
#         batch

In [209]:

# Run on the GPU when PyTorch reports one, otherwise on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
class RNN(nn.Module):
    """Single-layer LSTM followed by a linear layer that scores the next token.

    The network consumes one sequence of one-hot rows at a time (batch size 1)
    and returns unnormalised scores over the vocabulary for the step that
    follows the sequence.

    Args:
        vocab: Vocabulary size; used as both the LSTM input width and the
            number of output scores.
        num_hiddens: LSTM hidden-state width (default 320, the value that was
            previously hard-coded).
    """

    def __init__(self, vocab, num_hiddens=320) -> None:
        super().__init__()
        self.vocab = vocab
        self.num_hiddens = num_hiddens
        self.model = nn.LSTM(
            input_size=vocab,
            hidden_size=self.num_hiddens,
            batch_first=True,
            bias=True
        )

        # Kept inside nn.Sequential so parameter names (output.0.*) stay the
        # same as before.
        self.output = nn.Sequential(
            nn.Linear(self.num_hiddens, vocab)
        )

    def forward(self, x, state=None):
        """Score the next token for one input sequence.

        Args:
            x: Tensor holding ``seq_len * vocab`` values; it is viewed as
                ``(1, seq_len, vocab)``. The sequence length is inferred, so
                it is no longer restricted to 10 steps.
            state: ``(h, c)`` tuple as produced by ``begin_state`` or by a
                previous call, or ``None`` to let ``nn.LSTM`` start from a
                zero state.

        Returns:
            ``(scores, state)`` where ``scores`` has shape ``(1, vocab)`` and
            is computed from the LSTM output at the last time step.
        """
        r_out, states = self.model(x.view(1, -1, self.vocab), state)
        outdata = self.output(r_out[:, -1, :])
        return outdata, states

    def begin_state(self, device, batch_size=1):
        """Return a zero ``(h, c)`` state tuple on ``device``.

        Each tensor has shape ``(num_layers, batch_size, num_hiddens)``.
        ``forward`` always views its input as a batch of one, so states made
        with another ``batch_size`` are not accepted by it.
        """
        shape = (self.model.num_layers, batch_size, self.num_hiddens)
        return (torch.zeros(shape, device=device),
                torch.zeros(shape, device=device))

# One input feature / output score per vocabulary entry.
net = RNN(vocab=len(vocab))
net = net.to(device)
# net=nn.DataParallel(net)  # leave disabled for now
optimizer = torch.optim.SGD(params=net.parameters(), lr=0.1, momentum=0.8)
# Halve the learning rate after every 20 scheduler steps.
optimizer_scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer=optimizer, step_size=20, gamma=0.5)
# CrossEntropyLoss takes raw scores (no softmax applied beforehand); it
# applies the softmax itself.
loss_function = nn.CrossEntropyLoss()

In [2]:
# Number of highest-probability candidates a prediction is sampled from.
top_k = 5

# Number of passes over the training windows.
epoches = 200

def compute_acc(out , top_k):
    """Sample one predicted class index from the ``top_k`` most likely classes.

    Despite its name this function does not compute an accuracy: it applies a
    softmax to ``out``, keeps the ``top_k`` largest probabilities, renormalises
    them and draws one class index at random according to those weights.

    Args:
        out: Score tensor of shape ``(1, num_classes)``.
        top_k: Number of candidates to sample from. Values larger than
            ``num_classes`` are clamped (previously ``topk`` raised).

    Returns:
        ``numpy.ndarray`` of shape ``(1,)`` holding the sampled class index.
    """
    probs = F.softmax(out, dim=1).squeeze(0)
    # Never ask topk for more candidates than there are classes.
    n_candidates = min(top_k, probs.numel())
    probs, indices = probs.topk(n_candidates, largest=True)
    indices = indices.cpu().numpy()
    probs = probs.cpu().numpy()
    # size=1 makes np.random.choice return an ndarray, so no further wrapping
    # of the result is needed.
    char_index = np.random.choice(indices, size=1, p=probs / probs.sum())
    return char_index

# Per-epoch training metrics; test accuracy is recorded every 5th epoch only.
train_acc = []
train_loss = []
test_acc = []


for epoch in trange(epoches):
    loss_ = 0.0  # running sum of per-window losses as plain floats
    how_many_instance = 0
    how_many_instance_right = 0
    state = None
    net.train()
    # The last window has an empty target (see dataloader), hence [:-1].
    for i , k in train_dataloader[:-1]:  # i and k are still H3 cell ids here

        if state is None:
            state = net.begin_state(batch_size=1, device=device)
        else:
            # Cut the graph so gradients do not flow into earlier windows.
            for s in state:
                s.detach_()
        optimizer.zero_grad()
        a = encoder.transform(i.reshape(-1,1))
        a = torch.tensor(a).to(torch.float32).to(device)
        k = label_encode(k)
        k = torch.tensor(k).to(torch.long).to(device)
        out , state = net(a , state)
        loss = loss_function(out,k)
        loss.backward()
        optimizer.step()
        # .item() detaches the value; summing the loss tensors themselves
        # would keep every window's autograd graph referenced.
        loss_ += loss.item()
        how_many_instance += 1
        predict = compute_acc(out.detach() , top_k)
        # Compare plain Python ints: `predict` is a NumPy array of class
        # indices while `k` is a (possibly GPU) tensor.
        if int(predict[0]) == k.item():
            how_many_instance_right += 1
    # Step the LR schedule once per epoch, after the optimizer updates. The
    # extra optimizer.step() that used to run at the top of each epoch
    # re-applied the previous epoch's last gradients and has been removed.
    optimizer_scheduler.step()
    train_loss.append(loss_ / max(how_many_instance, 1))
    train_acc.append(how_many_instance_right / max(how_many_instance, 1))
    print(f'--------------Epochs{epoches} | {epoch}---------------')
    print(f'average Train Loss : {train_loss[-1]} , train acc : {train_acc[-1]}')
    if(epoch%5==0):
        state = None
        val_ls = 0.0
        how_many_instance = 0
        how_many_instance_right = 0
        net.eval()
        # No detach needed here: nothing is tracked under no_grad.
        with torch.no_grad():
            for i , k in test_dataloader[:-1]:
                if state is None:
                    state = net.begin_state(batch_size=1, device=device)
                a = encoder.transform(i.reshape(-1,1))
                a = torch.tensor(a).to(torch.float32).to(device)
                k = label_encode(k)
                # k holds a single class index; out holds one score per
                # vocabulary entry.
                k = torch.tensor(k).to(torch.long).to(device)
                out , state = net(a ,state)
                loss = loss_function(out,k)
                val_ls += loss.item()
                # The prediction is sampled from the top_k most likely
                # classes, weighted by their probabilities.
                predict = compute_acc(out , top_k)
                if int(predict[0]) == k.item():
                    how_many_instance_right += 1
                how_many_instance += 1

        test_acc.append(how_many_instance_right / max(how_many_instance, 1))
        print(f'average test loss : {val_ls / max(how_many_instance, 1)}')
        print(f'test accuracy : { test_acc[-1]} ')


plt.plot(train_acc, label='Average Train Accuracy')
plt.plot(train_loss, label='Average Train Loss')
# test_acc gets one entry per 5 epochs (epoch 0, 5, 10, ...), so place each
# point at the epoch it was measured instead of at positions 0, 1, 2, ...
plt.plot([5 * idx for idx in range(len(test_acc))], test_acc, label='test_acc')
plt.title('Loss vs Epochs')
plt.xlabel('Epoch')
plt.legend()
plt.show()






NameError: name 'trange' is not defined

In [None]:
# For every test window: predict the next H3 cell and draw the window plus
# the prediction to its own HTML map (user1.html, user2.html, ...).
top_k = 5
name_id = 1
net.eval()
with torch.no_grad():

    # The last window has an empty target (see dataloader), hence [:-1].
    # The target itself is not needed for drawing.
    for i , _label in test_dataloader[:-1]:

        a = encoder.transform(i.reshape(-1,1))
        a = torch.tensor(a).to(torch.float32).to(device)
        # forward() needs a state argument and returns (scores, state);
        # None lets the LSTM start each window from a zero state.
        out , _state = net(a , None)
        probs = F.softmax(out, dim=1).squeeze()
        probs, indices = probs.topk(top_k)  # keep the top_k most likely classes
        indices = indices.cpu().numpy()
        probs = probs.cpu().numpy()
        # Sample one class index, weighted by the renormalised probabilities.
        char_index = np.random.choice(indices, p = probs / probs.sum())
        h3_value = label_decode([char_index])
        predict = h3_t_geo(h3_value.tolist())
        old = h3_t_geo(i.tolist())
        total_list = old + predict
        # print(total_list)
        draw(total_list,name_id,20,10)
        name_id += 1



