In [180]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression

In [181]:
data = pd.read_excel('Data/formatted_Small_V5_segments.xlsx')
data.columns = ['row_idx', 'id', 'time', 'x', 'y', 'z']
data = data[['id', 'time', 'x', 'y', 'z']]

In [182]:
cells = sorted(list(set(data['id'])))
times = sorted(list(set(data['time'])))

In [183]:
data

Unnamed: 0,id,time,x,y,z
0,372003,0,76.342396,31.838097,110.450932
1,372003,4,75.788278,31.129983,110.499970
2,372003,8,76.170474,31.348491,110.096623
3,372003,12,76.190203,31.403692,110.140794
4,372003,16,76.631888,32.835718,110.006854
...,...,...,...,...,...
114753,1262868,648,1483.362007,69.676151,1277.105082
114754,1262868,652,1483.975966,69.098518,1277.090154
114755,1262868,656,1483.401067,70.178201,1277.669400
114756,1262868,660,1483.371691,71.179779,1277.669400


In [49]:
format_data = np.zeros((len(cells), 3*len(times)))
format_data.shape

(1189, 630)

In [184]:
def format_cell(cell, num_back=10):
    """
    Creates a matrix of `num_back` length paths from `cell`s data.
    If a cell has fewer than `num_back` time points, it discards the data.
    If a cell has more than `num_back` time points, each `num_back` length
    path constitutes one row in the returned matrix
    
    cell: the cell id
    num_back: the path length
    
    returns: ds, the path matrix with 3*num_back columns
    """
    d = data[data['id'] == cell]
    ts = sorted(d['time'])
    ds = np.zeros((len(ts)//num_back + 1, num_back*3))
    for i, t in enumerate(ts):
        ds[i//num_back, i%num_back*3:i%num_back*3+3] = d[d['time'] == t][['x', 'y', 'z']].to_numpy()
    for i in range(ds.shape[0]):
        ofs = ds[i, 0:3]
        for j in range(num_back):
            ds[i, 3*j:3*j+3] = ds[i, 3*j:3*j+3] - ofs
    if ds[-1,-1] == 0:
        return ds[:-1, :]
    return ds
        
    
#format_cell(1262868), data[data['id'] == 1262868]

In [116]:
# creating a the matrix of paths to train models on

num_back = 10
cell_arrays = []
for c in cells:
    times = data[data['id'] == c]['time']
    r = max(times) - min(times)
    if r < 4*num_back:
        continue
    cell_arrays.append(format_cell(c, num_back))
    

In [176]:
fmt_data = np.vstack(cell_arrays)

In [187]:
# y is the last point in the path. X is the first num_back - 1 points
X, y = fmt_data[:, :-3], fmt_data[:, -3:]

In [186]:
y[:5]

array([[ 78.36070368,  29.63309086, 111.53348504],
       [ 79.21917651,  29.99745735, 110.20615538],
       [ 79.24018617,  28.46135638, 107.40512511],
       [609.89036978, 130.34372731, 468.9753078 ],
       [618.31433751, 145.14853122, 485.6484767 ]])

In [169]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=.25)

In [170]:
rgr = RandomForestRegressor()

In [171]:
rgr.fit(x_train, y_train)

RandomForestRegressor()

In [172]:
rgr.score(x_test, y_test)

0.9980422659027175

In [173]:
dtr = DecisionTreeRegressor()
dtr.fit(x_train, y_train)

DecisionTreeRegressor()

In [174]:
dtr.score(x_test,y_test)

0.9941215761238272

In [175]:
lr = LinearRegression()
lr.fit(x_train, y_train)
lr.score(x_test, y_test)

0.9989918265076719