In [1]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error as mse

In [2]:
data = pd.read_excel('Data/formatted_Small_V5_segments.xlsx')
data_n = pd.read_excel('Data/Small_V5_tracks.xlsx')
data.columns = ['row_idx', 'id', 'time', 'x', 'y', 'z']
data = data[['id', 'time', 'x', 'y', 'z']]

In [3]:
cells = sorted(list(set(data['id'])))
times = sorted(list(set(data['time'])))

In [80]:
types = [n.split('_')[1] for n in data_n['Name']]
t_list = set(types)
t_dict = {t: i for i, t in enumerate(t_list)}
type_ints = [t_dict[t] for t in types]
data_n['Type'] = type_ints

{'SYN', 'DEAD', 'UNINF', 'MONO'}


In [6]:
format_data = np.zeros((len(cells), 3*len(times)))
format_data.shape

(1189, 630)

In [81]:
def format_cell(cell, num_back=10):
    """
    Creates a matrix of `num_back` length paths from `cell`s data.
    If a cell has fewer than `num_back` time points, it discards the data.
    If a cell has more than `num_back` time points, each `num_back` length
    path constitutes one row in the returned matrix
    
    cell: the cell id
    num_back: the path length
    
    returns: ds, the path matrix with 3*num_back columns
    """
    d = data[data['id'] == cell]
    ts = sorted(d['time'])
    ds = np.zeros((len(ts)//num_back + 1, num_back*3 + 1))
    for i, t in enumerate(ts):
        ds[i//num_back, i%num_back*3:i%num_back*3+3] = d[d['time'] == t][['x', 'y', 'z']].to_numpy()
    for i in range(ds.shape[0]):
        ofs = ds[i, 0:3]
        for j in range(num_back):
            ds[i, 3*j:3*j+3] = ds[i, 3*j:3*j+3] #- ofs
    ds[:,-1] = data_n[(data_n['Id'] == cell)]['Type']
    if ds[-1,-2] == 0:
        return ds[:-1, :]
    return ds
        
    
#format_cell(1262868), data[data['id'] == 1262868]

In [102]:
# creating a the matrix of paths to train models on

num_back = 10
cell_arrays = []
for c in cells:
    times = data[data['id'] == c]['time']
    r = max(times) - min(times)
    if r < 4*num_back:
        continue
    cell_arrays.append(format_cell(c, num_back))
    

In [104]:
fmt_data = np.vstack(cell_arrays)
fmt_data.shape

(11027, 31)

In [106]:
mask = fmt_data[:,-1] != t_dict['DEAD']
fmt_data = fmt_data[mask, :]
fmt_data.shape

(9584, 31)

In [107]:
# y is the last point in the path. X is the first num_back - 1 points
# X,y = fmt_data[:, :-4], fmt_data[:, -3:-1]
X, y = fmt_data[:, :-1], fmt_data[:, -1]

In [108]:
y = y.astype(np.int32)

In [109]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=.25)

In [110]:
rgr = RandomForestClassifier()

In [111]:
rgr.fit(x_train, y_train)

RandomForestClassifier()

In [112]:
preds = rgr.predict(x_test)
rgr.score(x_test, y_test), mse(preds, y_test)

(0.929465776293823, 0.14816360601001669)

In [113]:
dtr = DecisionTreeClassifier()
dtr.fit(x_train, y_train)

DecisionTreeClassifier()

In [114]:
preds = dtr.predict(x_test)
dtr.score(x_test,y_test), mse(preds, y_test)

(0.9010851419031719, 0.2149415692821369)

In [115]:
dtr.get_depth()

32

In [116]:
lr = LogisticRegression()
lr.fit(x_train, y_train)
preds = lr.predict(x_test)
lr.score(x_test, y_test), mse(preds, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


(0.8868948247078464, 0.21076794657762937)

In [121]:
import torch
from torch import nn
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
import torch.optim as optim

In [122]:
BATCH_SIZE = 256
n_features = 30
n_classes=len(t_list)
training_dataset = TensorDataset(torch.from_numpy(x_train).float(), 
                                 torch.from_numpy(y_train).long())
train_loader = DataLoader(training_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)

testing_dataset = TensorDataset(torch.from_numpy(x_test).float(), 
                                torch.from_numpy(y_test).long())
test_loader = DataLoader(testing_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)

In [123]:
class Classifier(nn.Module):
    def __init__(self, n_features, n_classes, n_hidden=128, p_dropout=0.5):
        super(Classifier, self).__init__()
        self.fc1 = nn.Linear(n_features, n_hidden, bias=True)
        self.fc2 = nn.Linear(n_hidden, n_hidden, bias=True)
        self.fc3 = nn.Linear(n_hidden, n_hidden, bias=True)
        self.fc4 = nn.Linear(n_hidden, n_hidden, bias=True)
        self.fc5 = nn.Linear(n_hidden, n_hidden, bias=True)
        self.fc6 = nn.Linear(n_hidden, n_hidden, bias=True)
        self.fc7 = nn.Linear(n_hidden, n_hidden, bias=True)
        self.fc8 = nn.Linear(n_hidden, n_hidden, bias=True)
        self.fc9 = nn.Linear(n_hidden, n_hidden, bias=True)
        self.fc10 = nn.Linear(n_hidden, n_hidden, bias=True)
        self.fc11 = nn.Linear(n_hidden, n_hidden, bias=True)
        self.fc12 = nn.Linear(n_hidden, n_hidden, bias=True)
        self.fc13 = nn.Linear(n_hidden, n_hidden, bias=True)
        self.fc14 = nn.Linear(n_hidden, n_hidden, bias=True)
        self.fc15 = nn.Linear(n_hidden, n_hidden, bias=True)
        self.fc16 = nn.Linear(n_hidden, n_classes, bias=True)

    def forward(self, x):
        x = nn.ReLU()(self.fc1(x))
        x = nn.ReLU()(self.fc2(x))
        x = nn.ReLU()(self.fc3(x))
        x = nn.ReLU()(self.fc4(x))
        x = nn.ReLU()(self.fc5(x))
        x = nn.ReLU()(self.fc6(x))
        x = nn.ReLU()(self.fc7(x))
        #x = nn.ReLU()(self.fc8(x))
        #x = nn.ReLU()(self.fc9(x))
        #x = nn.ReLU()(self.fc10(x))
        #x = nn.ReLU()(self.fc11(x))
        #x = nn.ReLU()(self.fc12(x))
        #x = nn.ReLU()(self.fc13(x))
        #x = nn.ReLU()(self.fc14(x))
        #x = nn.ReLU()(self.fc15(x))
        x = self.fc16(x)
        x = torch.log_softmax(x, dim=1)
        return x

In [124]:
def loader_accuracy(model, test_loader, lf=nn.NLLLoss()):
    lossies = []
    accs = []
    #grab a batch from the test loader
    with torch.no_grad():
        for examples, labels in test_loader:
            torch.cuda.empty_cache()
            outputs = model.forward(examples)
            lossies.append(lf(torch.squeeze(outputs), torch.squeeze(labels)).item())

            #for each output in the batch, check if the label is correct
            preds = np.argmax(outputs.detach().numpy(), axis=1)
            labels = labels.detach().numpy()
            accuracy = (preds == labels).mean()
            accs.append(accuracy)

    loss = sum(lossies)/len(lossies)
    acc = sum(accs)/len(accs)

    return acc, loss

In [125]:
def train(epochs=20):
    # reset the model
    model = Classifier(n_features=n_features, n_classes=n_classes)
    criterion = nn.NLLLoss()
    optimizer = optim.Adam(model.parameters(),lr=0.001)

    for epoch in range(epochs):
        losses = []

        for x_batch_train, y_batch_train in train_loader:
            optimizer.zero_grad()
            
            outputs = model.forward(x_batch_train)
            loss = criterion(outputs, y_batch_train)
            loss.backward()
            optimizer.step()
            losses.append(loss.detach().numpy())
    
        if epoch % 10 == 0:
            print(np.mean(losses))
            print(loader_accuracy(model, test_loader))
        
    return model

In [126]:
m = train(200)

0.49391037
(0.8880208333333334, 0.46231256590949166)
0.41077343
(0.8880208333333334, 0.4008244110478295)
0.39584282
(0.8897569444444444, 0.404458483060201)
0.37488395
(0.8910590277777778, 0.37450043360392254)
0.3674578
(0.8919270833333334, 0.3675025635295444)
0.34624973
(0.88671875, 0.3611697455247243)
0.32526198
(0.8932291666666666, 0.35570699638790554)
0.31076398
(0.89453125, 0.3329940173361037)
0.31170517
(0.890625, 0.3628953860865699)
0.30369392
(0.8940972222222222, 0.33491947915818954)
0.2766269
(0.890625, 0.33552850948439705)
0.26931438
(0.8953993055555556, 0.3172774248652988)
0.26031822
(0.8958333333333334, 0.34970776240030926)
0.28884655
(0.8936631944444444, 0.3364178505208757)
0.2229385
(0.89453125, 0.37384254071447587)
0.21852048
(0.8980034722222222, 0.3791923721631368)
0.21190755
(0.89453125, 0.397639446788364)
0.19883701
(0.8953993055555556, 0.39260249336560565)
0.21242496
(0.9006076388888888, 0.41020987762345207)
0.20520644
(0.8897569444444444, 0.3940731982390086)


In [127]:
vals = m(torch.Tensor(x_test))

In [128]:
preds = torch.argmax(vals,dim=1).numpy()

In [129]:
sum(preds == y_test)/len(y_test)

0.8939899833055092

In [132]:
[sum(y_test == i) for i in range(4)]

[71, 0, 2129, 196]

In [134]:
2129/len(y_test)

0.8885642737896494