In [1]:
#%matplotlib widget
from time import time
import random
import pandas as pd
import cv2
import matplotlib.pyplot as plt
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR
from functools import lru_cache
import numpy as np
from scipy import ndimage, stats
from itertools import islice

In [2]:
# blur/focus based
def variance_of_laplacian(image):
    # compute the Laplacian of the image and then return the focus
    # measure, which is simply the variance of the Laplacian
    return cv2.Laplacian(image, cv2.CV_64F).var()

def tile(im, nrows=1, ncolumns=6, debugging=False):
    M = im.shape[0] // nrows
    N = im.shape[1] // ncolumns
    rows = []
    for x in range(0, M*nrows,M):
        row = []
        for y in range(0,N*ncolumns,N):
            row.append(im[x:x+M,y:y+N])
        rows.append(row)
    if debugging:
        width = 3.0
        height = width/im.shape[1]*im.shape[0]
        fig = plt.figure(figsize = (width,height))
        gs = gridspec.GridSpec(nrows, ncolumns, figure=fig)
        gs.update(wspace=0.0, hspace=0.0)
        for r in range(nrows):
            for c in range(ncolumns):
                ax = fig.add_subplot(gs[r, c])
                ax.imshow(rows[r][c], vmin=0, vmax=im.max())
                ax.get_xaxis().set_visible(False)
                ax.get_yaxis().set_visible(False)

        plt.show()
            
    return rows

flatten = lambda l: [item for sublist in l for item in sublist]

def via_variance_of_laplacian(f):
    image = f['_']
    tiles = tile(image, nrows=3, ncolumns=1)
    return [variance_of_laplacian(i) for i in flatten(tiles)]

In [3]:
# Optical Flow Based
def fix_perspective(im, debugging=False):
    h, w = im.shape
    assert (w, h) == (640, 160) # this is tuned for a very specific crop and dashcam position
    left = 60 # left-right adjustment
    top = 5
    bottom = 30
    if debugging:
        src_rect = np.array([
            [245, originy+top],   [370, originy+top],
            [0, 125],   [600, 100]],
            dtype = "float32")
        dst_rect = np.array([
            [80-left, 0],    [330-left, 0],
            [108-left, 840],  [320-left, 800]],
            dtype = "float32")
        M = cv2.getPerspectiveTransform(src_rect, dst_rect)
        print(repr(M))
    else:
        M = np.array(
           [[-5.79976346e+00, -2.25571424e+01,  1.92672659e+03],
            [-1.81898940e-14, -1.56260338e+02,  3.90650844e+03],
            [ 5.42171076e-05, -1.56819369e-01,  1.00000000e+00]])
    dst = cv2.warpPerspective(im,M,(300,840-bottom))
    if debugging:
        plt.rcParams['figure.figsize'] = [20, 12]
        plt.imshow(im)
        plt.show()
        plt.imshow(dst)
        plt.show()
    return dst

def mutating_base_calcs(df, n):
    df['dx'] = df.x2 - df.x1 + 0.00001
    df['dy'] = (df.y2 - df.y1)/n
    df['Vf_slope'] = df.dy/df.dx
    df['|Vf|'] = np.sqrt(df.dx**2 + df.dy**2)

    df['right_direction'] = (df.y2>df.y1) & (abs(df.Vf_slope) > 3) # down and steep
    df['good'] = df['right_direction']

def analyze_lk_optical_flow_dfs(dfs):
    xs = []
    def analyze_df(df):
        nonlocal xs
        # absurd
        #df['good'] = df['good'] & (df['|Vf|'] > 2.5)
        df['good'] = df['good'] & (df['|Vf|'] < (35/.45)) #25 is data set max, .45 coverts from Vf to velocity
        
        if sum(df['good']==True) == 0:
            xs += [np.nan, np.nan]
        else:
            with np.errstate(divide='ignore',invalid='ignore'):
                df.loc[df['good'],'z'] = stats.zscore(df.loc[df['good'],'|Vf|'])
            df.loc[df['good']==False,'z'] = 100.0
            df['good'] = df['good'] & (df['z'] < 1.7)

            if len(df) != 0:
                # filter out noisy "small" flow vectors
                Vf_max_good = df[df['good']==True]['|Vf|'].max()
                df['good'] = df['good'] & (df['|Vf|'] > Vf_max_good * 0.25)
            xs.append(df.loc[df['good'],'|Vf|'].mean())
            xs.append(df.loc[df['good'],'|Vf|'].std())
        xs.append(df['dy'].mean())
    overall = 0
    for n, df in enumerate(dfs):
        global N
        N = n + 1
        mutating_base_calcs(df,N)
        analyze_df(df)
        
        overall += df.loc[df['good'],'|Vf|'].mean()/(len(dfs)+1)
    
    stacked_df = pd.concat(dfs, ignore_index=True)
    analyze_df(stacked_df)
    overall += stacked_df.loc[stacked_df['good'],'|Vf|'].mean()/(len(dfs)+1)
    
    xs.append(overall)

    # 0 1 2 #n=1
    # 3 4 5 #n=2
    # 6 7 8 #n=3
    # 9 10 11 #stacked
    # 12 #overall
    return xs

def via_lk_optical_flow_multi(frame, count=3):
    frame['optical_flow'] = []
    dfs = []
    for i in range(1, count+1):
        dfs.append(optical_flow(frame['_'], frame[str(i)], frame))
    return analyze_lk_optical_flow_dfs(dfs)

def debug_optical(df,img):
    img = image_next.copy()
    # we lose old arrow debugging with the df approach, sorry
    #         if right_direction:
    #             color=255
    #             df.loc[i,'good'] = True
    #         else:
    #             color=130
    #             df.loc[i,'good'] = False

    #         if debugging:
    #             img = cv2.arrowedLine(img,(int(x1),int(y1)),(int(x2),int(y2)), color, tipLength=.3)
    #             img = cv2.circle(img,(int(x1),int(y1)),2,color, -1)
    if len(df) == 0:
        print("no useful points")
    else:
        display(df.sort_values(by='|Vf|'))       
        bins = list(range(0,101,10))
        plt.rcParams['figure.figsize'] = [20, 5]
        df['|Vf|'].hist(bins=bins)
        df[df['good']==True]['|Vf|'].hist(bins=bins)
        plt.show()

    plt.rcParams['figure.figsize'] = [20, 12]
    plt.imshow(img)
    plt.show()

# params for ShiTomasi corner detection
feature_params = dict(
    maxCorners = 100,
    qualityLevel = 0.007,
    minDistance = 20,
    blockSize = 9,
    #useHarrisDetector = True,
    )

# Parameters for lucas kanade optical flow
lk_params = dict(
    winSize  = (15,15),
    maxLevel = 1,
    criteria = (cv2.TERM_CRITERIA_EPS | cv2.TERM_CRITERIA_COUNT, 10, 0.03),
    )

def optical_flow(image, image_next, frame, debugging=False):
    p0 = cv2.goodFeaturesToTrack(image, mask=None, **feature_params)
    p1, st, err = cv2.calcOpticalFlowPyrLK(image, image_next, p0, None, **lk_params)
    
    points = []
    for new, old in zip(p1[st==1],p0[st==1]):
        x1, y1 = old.ravel()
        x2, y2 = new.ravel()
        points.append((x1,x2,y1,y2))
    
    df = pd.DataFrame(data=points, columns=('x1','x2','y1','y2'))
    if debugging:
        debug_optical(df,img)

    frame['optical_flow'].append(df)
    return df


In [4]:
# Diff based for stop detect
def via_diff(f):
    return [abs(cv2.subtract(f['_'],f['1']).sum())]


In [5]:
# Process frames
def frames(file='../data/train.mp4'):
    vidcap = cv2.VideoCapture(file)
    while True:
        success, image = vidcap.read()
        if success:
            yield {'orig': image, '_': image, 'xs':[]}
        else:
            return

originy=None
def crop(image, bottom=100, top=220):
    # take of top and bottom
    global originy
    originy = image.shape[0] / 2 - top
    return image[top:image.shape[0] - bottom,:]

def lookahead(frames, count=3):
    # add "lookahead" in keys '1', '2', ...
    # repeats at the end to keep length len
    fs = list()
    
    def _updated_f():
        f = fs.pop(0)
        f.update({str(n+1):f['_'] for n, f in enumerate(fs)})
        return f 
        
    for f in frames:
        fs.append(f)
        if len(fs) > count:
            yield _updated_f()

    for _ in range(count):
        fs.append(f)
        yield _updated_f()
        
def print_frame_keys(frames):
    for f in frames:
        print(repr(list(f.keys())))
        yield f

def view_frames(frames):
    for f in frames:
        for k in f.keys():
            try:
                cv2.imshow(k,f[k])
            except:
                pass
        try:
            cv2.waitKey(0)
        except KeyboardInterrupt:
            cv2.destroyAllWindows()
            print("Stopping early, KeyboardInterrupt")
            return
        yield f
    cv2.destroyAllWindows()

def persist_raw_optical_flow(frames):
    import os
    path = f'./data/{int(time())}'
    os.makedirs(path)
    for i, f in enumerate(frames):
        for j, df in enumerate(f['optical_flow']):
            df: pd.DataFrame
            df.to_pickle(os.path.join(path,f'{i}_{j}.pkl'))
        yield f

class FeatureExtractor():
    def __init__(self, frames_generator_maker):
        self._frames = frames_generator_maker
        self._steps = []
    def add_step(self, step):
        """step(frames_iterator) yields-> [frame,frame,...]; you can filter or gather frames"""
        if callable(step):
            self._steps.append(step)
    def add_processor(self, processor):
        """processor(img) returns-> img; frame['_'] is mutated"""
        def _step(frames):
            for f in frames:
                f['_'] = processor(f['_'])
                yield f
        self.add_step(_step)
    def add_analyzer(self, analyzer):
        """analyzer(frame) returns-> [x1,x2,...]; frame['_'] is forwarded untouched, features are collected"""
        def _step(frames):
            for f in frames:
                f['xs'] += analyzer(f)
                yield f
        self.add_step(_step)

    def __iter__(self):
        pipeline = self._frames()
        for s in self._steps:
            pipeline = s(pipeline)
        return pipeline
    def _pprogress(self, count, force=False):
        if force or time()-self._last>30:
            self._last = time()
            print(f"{count+1} processed in {(time()-self._start)/60:2.1f} minutes")
    def extract_features(self):
        self._start = time()
        self._last = self._start
        X = []
        i=0
        for i, f in enumerate(self):
            X.append(f['xs'])
            self._pprogress(i)
        self._pprogress(i,True)
        return X
import shutil

In [6]:
fe = FeatureExtractor(frames)

#fe.add_step(lambda g: islice(g, 17500, 20400, 1)) # limit frames (start, stop, step)
#fe.add_step(lambda g: islice(g, 400, 420, 1)) # limit frames (start, stop, step)

fe.add_processor(lambda img: crop(img, bottom=100, top=220))
fe.add_processor(lambda img: cv2.cvtColor(img,cv2.COLOR_BGR2GRAY))
fe.add_processor(fix_perspective)
fe.add_processor(lambda img: cv2.GaussianBlur(img,(7,7),0))

fe.add_step(lambda frames: lookahead(frames, count=3))

#fe.add_analyzer(via_lk_optical_flow)
fe.add_analyzer(via_lk_optical_flow_multi)
fe.add_analyzer(via_variance_of_laplacian)
fe.add_analyzer(via_diff)

#fe.add_step(persist_raw_optical_flow)
#fe.add_step(print_frame_keys)
#fe.add_step(view_frames)

xs = fe.extract_features()

629 processed in 0.5 minutes
1249 processed in 1.0 minutes
1857 processed in 1.5 minutes
2470 processed in 2.0 minutes
3084 processed in 2.5 minutes
3738 processed in 3.0 minutes
4387 processed in 3.5 minutes
5052 processed in 4.0 minutes
5714 processed in 4.5 minutes
6374 processed in 5.0 minutes
7034 processed in 5.5 minutes
7697 processed in 6.0 minutes
8356 processed in 6.5 minutes
8999 processed in 7.0 minutes
9653 processed in 7.5 minutes
10303 processed in 8.0 minutes
10937 processed in 8.5 minutes
11585 processed in 9.0 minutes
12227 processed in 9.5 minutes
12868 processed in 10.0 minutes
13510 processed in 10.5 minutes
14142 processed in 11.0 minutes
14778 processed in 11.5 minutes
15429 processed in 12.0 minutes
16091 processed in 12.5 minutes
16722 processed in 13.0 minutes
17375 processed in 13.5 minutes
18025 processed in 14.0 minutes
18648 processed in 14.5 minutes
19289 processed in 15.0 minutes
19930 processed in 15.5 minutes
20400 processed in 15.9 minutes


In [7]:
def shuffle_time_in_chunks(df, n):
    """Break df into n-lengths mini dfs"""
    assert len(df) >= n*10, "doesn't meet minimum number of chunks"
    assert (len(df) % n) == 0, "all chunks equal size"
    
    chunk_count = len(df) // n
    chunks = []
    for x in range(0, len(df), n):
        chunks.append(df[x:x + n])
    random.shuffle(chunks)
    print(f"Using {len(chunks):0d} chunks")
    return pd.concat(chunks, axis=0)

In [32]:
y = pd.read_csv('../data/train.txt', header=None)
X = pd.DataFrame(xs)
X.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
count,20301.0,20199.0,20400.0,20171.0,20038.0,20399.0,19960.0,19799.0,20399.0,20300.0,20277.0,20400.0,19907.0,20400.0,20400.0,20400.0,20400.0
mean,22.052734,5.525136,4.799234,21.724671,5.542001,2.844092,21.51841,5.687407,2.333646,21.417903,5.42417,3.357223,21.750685,2.429951,3.254773,3.384131,206080.6
std,16.133754,4.117217,9.66915,15.673859,4.248105,7.581492,15.271258,4.52961,6.865346,15.371821,3.901604,7.012366,15.284451,1.381638,2.112268,1.639592,154313.9
min,0.004153,0.002088,-116.187047,0.001527,0.000472,-126.458439,0.002043,0.000354,-99.169061,0.002558,0.001149,-114.883626,0.032415,0.348827,0.441444,0.421173,0.0
25%,8.168556,2.425897,-0.238537,8.194919,2.224346,-0.818561,8.350239,2.147118,-0.985925,8.177106,2.313584,-0.352002,8.562533,1.495517,1.848284,2.115873,128063.8
50%,17.904147,4.502413,3.863911,17.847354,4.385355,2.245137,17.948092,4.433294,1.793814,17.581195,4.558648,2.688383,18.125114,2.148932,2.84921,3.103574,173377.5
75%,35.40603,7.734051,9.793322,35.071106,8.178047,6.362307,34.640009,8.480785,5.344351,34.780746,8.106215,6.777629,35.137876,3.00337,4.121873,4.284229,237701.2
max,73.891515,38.534682,83.617086,77.726476,34.490408,52.578316,77.525239,39.584484,48.778914,73.523423,38.534682,49.815615,66.597995,14.013617,19.977022,23.558937,3287497.0


In [33]:
#X.fillna(value=0, inplace=True)
X.fillna(method='pad', inplace=True)
#X=X[[0,2,4,3,'024']] # 1,2,3 frame Vf and 2-frame std, and linear average of the three Vf
    # 0 1 2 #n=1
    # 3 4 5 #n=2
    # 6 7 8 #n=3
    # 9 10 11 #stacked
    # 12 #overall
X=X[[0,1,2,  3,4,5,  6,7,8  ,9]]
X=X[[3,4,5,9,12]]
X.describe()

KeyError: '[12] not in index'

In [34]:
Xy = X.copy()
Xy['y'] = y

chunksize = 2040 # ten chunks
chunksize = 204*2 # 50
Xy = shuffle_time_in_chunks(Xy, chunksize)
Xy

Using 50 chunks


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,y
2040,50.531488,11.349832,13.030548,48.133533,13.797435,5.190921,47.457343,18.171526,13.689880,47.896641,23.083131
2041,56.814117,8.824110,-10.789770,46.110387,15.595254,14.968960,43.055168,11.818676,1.571956,48.989580,23.079400
2042,40.936344,16.971879,-13.327786,59.355430,15.270372,-6.011798,45.324238,18.465573,1.260840,50.130520,23.097455
2043,54.848185,10.624068,8.024640,49.886347,16.514924,10.264162,38.177946,17.449448,4.096790,52.305665,23.070734
2044,57.265716,16.176728,5.337735,49.003952,20.111635,13.704181,38.618794,14.781997,-0.926672,50.562096,23.075198
...,...,...,...,...,...,...,...,...,...,...,...
8971,12.338974,3.846167,5.793846,10.587193,3.327510,3.191908,12.647903,3.495186,1.696768,11.425903,9.380622
8972,12.501878,3.337351,4.935507,10.712543,2.569409,5.314723,11.238910,3.117254,11.026762,11.282899,9.331728
8973,11.704848,2.571974,3.126681,10.013975,3.259235,4.206196,12.077608,2.000770,-0.167175,11.271269,9.309530
8974,11.817096,3.320771,3.294933,9.742487,2.836865,6.507989,12.888727,2.264141,2.999764,10.964815,9.311108


In [35]:
def find_testcount(test_df, fraction_testset=0.3):
    chunks = 1
    while True:
        chunks += 1
        testcount = chunksize * chunks
        if testcount/len(test_df)>fraction_testset:
            break
    print(f"using testcount = {testcount}")
    return testcount
    
testcount = find_testcount(Xy, .43)

Xy_train, Xy_test = train_test_split(Xy, test_size=testcount,shuffle=False)
Xy_train, Xy_test = Xy_train.copy(deep=True), Xy_test.copy(deep=True)
print("Percent test    =", testcount/len(Xy))

using testcount = 8976
Percent test    = 0.44


In [50]:
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from sklearn.linear_model import BayesianRidge, LinearRegression
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.neural_network import MLPRegressor

m_dum = DummyRegressor(strategy='mean')

m_ensemble = StackingRegressor([
   ('svr', SVR(C=.3)),
   ('rf', RandomForestRegressor(n_estimators=60)),
   #('mlp', MLPRegressor(shuffle=False, alpha=0.1)),
   ])

pipe = Pipeline([
    ('scale', StandardScaler()),
    #('poly', PolynomialFeatures(interaction_only=False, include_bias=False)),
    #'reduce_dims', PCA(n_components=4)),
    #('svr', SVR(C=2)),
    #('rf', RandomForestRegressor()),
    #'mlp', MLPRegressor(shuffle=False, alpha=0.1)),
    #('br', BayesianRidge()),
    ('stack', m_ensemble),
    #('lin', LinearRegression()),
    ])

# # Training classifiers
# reg1 = GradientBoostingRegressor(random_state=1, n_estimators=10)
# reg2 = RandomForestRegressor(random_state=1, n_estimators=10)
# reg3 = LinearRegression()
# ereg = VotingRegressor(estimators=[('gb', reg1), ('rf', reg2), ('lr', reg3)])
# BaysianRidge
# MLP NN

from sklearn.model_selection import GridSearchCV
parameters = {
    'stack__svr__C':[0.0001, 0.001,.01, .1, .3, .9, 2, 4, 10, 15, 20, 50, 100, 200],
    'stack__rf__n_estimators': [3, 5, 8, 10, 20, 50, 100, 150],
    #'stack__rf__max_depth': None,
    
    #'svr__C':[10, 20, 30, 40, 70, 120],
    #'mlp__alpha':[0.0001, 0.001, 0.01],
    #'svr__C':[.3],RandomForestRegressor
    #'svr__gamma':[0.006/4, 0.006/2,0.006,0.006*2],
    }
grid = GridSearchCV(pipe, parameters, verbose=10, n_jobs=8, cv=7)

m_real = grid


In [None]:
m_dum.fit(Xy_train[X.columns], Xy_train['y'])
m_real.fit(Xy_train[X.columns], Xy_train['y'])
try:
    print(m_real.best_estimator_)
except:
    print(m_real)

Fitting 7 folds for each of 112 candidates, totalling 784 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 tasks      | elapsed:   47.7s
[Parallel(n_jobs=8)]: Done   9 tasks      | elapsed:  1.5min
[Parallel(n_jobs=8)]: Done  16 tasks      | elapsed:  1.6min
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:  3.0min
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:  3.8min
[Parallel(n_jobs=8)]: Done  45 tasks      | elapsed:  4.7min
[Parallel(n_jobs=8)]: Done  56 tasks      | elapsed:  5.6min
[Parallel(n_jobs=8)]: Done  69 tasks      | elapsed:  7.8min
[Parallel(n_jobs=8)]: Done  82 tasks      | elapsed: 11.1min
[Parallel(n_jobs=8)]: Done  97 tasks      | elapsed: 17.8min
[Parallel(n_jobs=8)]: Done 112 tasks      | elapsed: 20.9min
[Parallel(n_jobs=8)]: Done 129 tasks      | elapsed: 22.4min
[Parallel(n_jobs=8)]: Done 146 tasks      | elapsed: 24.0min
[Parallel(n_jobs=8)]: Done 165 tasks      | elapsed: 26.6min
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed: 32.8min
[Parallel(

In [1]:
grid.

SyntaxError: invalid syntax (<ipython-input-1-16fd8e9bda6c>, line 1)

In [None]:
# recreate full original df
Xy_test.loc[:,'is_test'] = True
Xy_train.loc[:,'is_test'] = False
Xy2 = pd.concat([Xy_test,Xy_train]).sort_index()

# get predictions
Xy2['dum'] = m_dum.predict(Xy2[X.columns])
Xy2['pred'] = m_real.predict(Xy2[X.columns])

# for convience
Xy2_test_ix = Xy2['is_test'] == True
Xy2_train_ix = Xy2['is_test'] == False

# smooth based on distribution of acceleration in training set
idea use kalman filter based 

In [None]:
# tune a kalman filter
from pykalman import KalmanFilter
em_vars = [
     #'transition_covariance',
     'observation_covariance',
     'initial_state_mean', 'initial_state_covariance']

T = np.array([[.0009]]) # smaller is more resistance to acceleration

kf = KalmanFilter(initial_state_mean=0, n_dim_obs=1, transition_covariance=T)
kf = kf.em(Xy2.loc[Xy2_train_ix,'y'].values, n_iter=0, em_vars=em_vars)

In [None]:
# apply kalman
Xy2.loc[Xy2_test_ix,'pred_kf'] = kf.smooth(Xy2.loc[Xy2_test_ix,'pred'].values)[0]

In [None]:
# never below 0
Xy2.loc[Xy2['pred_kf'] < 0.0,'pred_kf'] = 0.0

In [None]:
target_mean = Xy2.loc[Xy2_train_ix,'y'].mean()
target_mean = Xy2['y'].mean() # for full comparison, overly optimistic though

NoMLCol = 9

_ = kf.smooth(Xy2[NoMLCol].values)[0]
Xy2['NoML_kf'] = _ * target_mean/_.mean()
Xy2['NoML_scaled'] = Xy2[NoMLCol] * target_mean/Xy2[NoMLCol].mean()


In [None]:
plt.rcParams['figure.figsize'] = [100, 12]
plt.gca().set_xlim((0,len(Xy2)))
def plot_Xy2(ix_mask, column, **kwargs):
    try:
        if ix_mask is None:
            df = Xy2.loc[:,column]
        else:
            df = Xy2.loc[ix_mask, column]
        plt.plot(df.index, df.values, **kwargs)
    except KeyError:
        print(f"Skipping {column}")
    
#plot_Xy2(Xy2_test_ix, 'pred', marker='o', linewidth=0.0, color='green', alpha=.1)
#plot_Xy2(Xy2_train_ix, 'pred', marker='o', linewidth=0.0, color='purple', alpha=.1)
plot_Xy2(None, 'NoML_scaled', marker='o', linewidth=0.0, color='yellow', alpha=.1)
plot_Xy2(None, 'NoML_kf', marker='', linewidth=1.5, color='orange')
plot_Xy2(None, '1_thresh', marker='', linewidth=0.5, color='blue')
plot_Xy2(None, 'y', marker='', linewidth=1.4, color='red')
plot_Xy2(Xy2_test_ix, 'pred_kf', marker='o', linewidth=0.0, color='green', alpha=.1)

In [None]:
def print_summary_line(ix_mask, column='dum'):
    if ix_mask is not None:
        Xy = Xy2[ix_mask]
    else:
        Xy = Xy2
    err = mean_squared_error(Xy['y'], Xy[column])
    print(f"{err:0.1f}", end='\t')

print(f"dummy\ttest\ttrain\ttest_kf\tNoML")
print_summary_line(Xy2_test_ix,'dum')
print_summary_line(Xy2_test_ix,'pred')
print_summary_line(Xy2_train_ix,'pred')
print_summary_line(Xy2_test_ix,'pred_kf')
print_summary_line(None,'NoML_kf') # 10.3 was best, gaussian made it 7.2

dummy	test	train	test_kf	NoML
69.6	16.6	5.6	7.7	13.5	

^^^^got this,,,,,why>???????? trying to lower absurity ceiling again 260->160
maybe column multiplication should be done _after_ forward fillign/padding Xdf

In [None]:
Xy2.describe()

In [22]:
df = Xy2.copy()
df = df.groupby(np.arange(len(df.index)) // 100).mean()
df['loss']=abs(df['y']-df['NoML_kf'])
#plt.plot(df.index, df.values)


KeyError: 'NoML'

In [None]:
df.sort_values(by='loss', ascending=False).head(10).index.values

# inference

In [None]:
# be sure pipeline is set up the same in both places
columns_used_to_train = X.columns

In [None]:
feo = FeatureExtractor(lambda: frames(file='../data/train.mp4'))

#feo.add_step(lambda g: islice(g, 17500, 20400, 1)) # limit frames (start, stop, step)
#feo.add_step(lambda g: islice(g, 400, 420, 1)) # limit frames (start, stop, step)

feo.add_processor(lambda img: crop(img, bottom=100, top=220))
feo.add_processor(lambda img: cv2.cvtColor(img,cv2.COLOR_BGR2GRAY))
feo.add_processor(fix_perspective)
feo.add_processor(lambda img: cv2.GaussianBlur(img,(7,7),0))

feo.add_step(lambda frames: lookahead(frames, count=3))

#feo.add_analyzer(via_lk_optical_flow)
feo.add_analyzer(via_lk_optical_flow_multi)
feo.add_analyzer(via_variance_of_laplacian)
feo.add_analyzer(via_diff)

#feo.add_step(print_frame_keys)
#feo.add_step(view_frames)

xso = fe.extract_features()

In [None]:
columns_used_to_train