In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import scipy
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
import clustering as cs
from glob import glob

# Forward modelling

In [None]:
# Load the pickled forward-model table and cluster it.
# NOTE(review): pd.read_pickle can execute arbitrary code -- only load trusted files.
# `ext` is assigned here but is not read in any of the visible cells.
filename, ext = 'mq_forward_probabilities.pickle', True
# fillna(np.nan) rewrites missing entries as NaN -- presumably a normalisation step; TODO confirm it is needed.
df = pd.read_pickle(filename).fillna(np.nan)
# Clustering is delegated to the project-local `clustering` module; the meaning of
# `mode` and `minimum` is defined there.
clusters = cs.calculate_clusters(df, mode='co', minimum=1, n_clusters=3)

In [None]:
# cs.plot_clustered_heatmap(df, clusters)

In [None]:
# cs.plot_sns(df, clusters)

# Backward modelling

In [None]:
# Load the pickled backward-model table and cluster it with the same settings
# as the forward model. This rebinds `filename`, `df` and `clusters`.
# NOTE(review): pd.read_pickle can execute arbitrary code -- only load trusted files.
filename, ext = 'mq_backwards_probabilities.pickle', True
# fillna(np.nan) rewrites missing entries as NaN -- presumably a normalisation step; TODO confirm it is needed.
df = pd.read_pickle(filename).fillna(np.nan)
clusters = cs.calculate_clusters(df, mode='co', minimum=1, n_clusters=3)

In [None]:
# cs.plot_clustered_heatmap(df, clusters)

In [None]:
# cs.plot_sns(df, clusters)

# Actual distribution

In [None]:
# Read the CSV: the first line of the file is skipped, the first column becomes
# the index, and every value is cast to float.
filename, ext = '../emac.ml.tm1.f32.little.5x90x160x320_3.raw.residual.bplanes.32.csv', True
df = pd.read_csv(filename, skiprows=1, index_col=0).astype(float)
# Scale row i (0-based) by n_rows / (i + 1), where n_rows is the number of rows of df.
ones = df.multiply(df.index.size).divide(np.arange(df.index.size)+1, axis=0)

In [None]:
# Boolean mask: True where the scaled value is greater than 0.5.
criteria = ones > .5
# `groups` is a second name for the same object as `criteria`, not a copy.
groups = criteria
groups.head()

In [None]:
# Number the runs of consecutive equal mask values in every column: the counter
# goes up by one each time a row differs from the row above it. The first row is
# compared against the missing value introduced by shift(1).
groupclusters = (groups.shift(1) != groups).astype(int).cumsum()
groupclusters.head()

In [None]:
# Rename columns: append 'b' to every run-id column name so they can sit next to
# the mask columns of the same name. add_suffix builds a new frame in one step,
# which avoids adding and dropping columns while iterating over them (that loop
# would overwrite data if one column name were another column's name plus 'b').
groupclusters = groupclusters.add_suffix('b')

In [None]:
# Put the boolean mask columns and the 'b'-suffixed run-id columns side by side.
ones_with_blocks = pd.concat([criteria, groupclusters], axis=1)
ones_with_blocks.head()

In [None]:
# Count the (mask value, run id) groups over all columns of `ones`.
# NOTE(review): selecting ['ix'] after reset_index() requires the index of
# `ones_with_blocks` to be named 'ix' -- confirm against the CSV header.
total_columns = 0
for col in ones:
    # One array of index values per (mask value, run id) group of this column.
    dframe = ones_with_blocks.reset_index().groupby([col,col+'b'])['ix'].apply(np.array)
    total_columns += dframe.size
total_columns

In [None]:
# All-NaN frame with as many rows as `ones` and one zero-padded "cNNN" column
# per counted group.
blocks = pd.DataFrame(
    np.full((ones.index.size, total_columns), np.nan),
    columns=["c{:03d}".format(number) for number in range(total_columns)],
)
blocks.head()

In [None]:
def create_blocks(ones):
    """Cut every column of ``ones`` into runs and return them as aligned columns.

    Each column is split into maximal runs of consecutive rows lying on the same
    side of the 0.5 threshold. Every run becomes one NaN-padded column named
    ``c000``, ``c001``, ... (numbered in creation order, before any run is
    dropped). Within one source column the runs at or below the threshold are
    emitted before the runs above it, each group in row order.

    Each run is then shifted up so that it starts at its first value strictly
    between 0 and 1; runs that contain no such value are dropped. Runs that
    start above 0.5 are mirrored to ``1 - value``.

    Parameters
    ----------
    ones : pandas.DataFrame
        Numeric frame with any number of rows. Rows are addressed by position,
        so the index does not need a particular name.

    Returns
    -------
    pandas.DataFrame
        One column per surviving run, with as many rows as ``ones`` and a
        default integer index.

    Raises
    ------
    AssertionError
        If a returned column still starts above 0.5.
    """
    above = ones > .5
    # Run id per column: the counter advances whenever a row differs from the row above it.
    run_ids = (above.shift(1) != above).astype(int).cumsum()

    n_rows = ones.index.size
    row_index = pd.RangeIndex(n_rows)

    # Build the NaN-padded run columns in a single pass over the groups.
    columns = {}
    for col in ones:
        values = ones[col].values
        keyed = pd.DataFrame({
            'above': above[col].values,
            'run': run_ids[col].values,
            'pos': np.arange(n_rows),
        })
        # groupby sorts its keys, so runs with above == False come first.
        for _, run in keyed.groupby(['above', 'run']):
            padded = np.full(n_rows, np.nan)
            padded[:len(run)] = values[run['pos'].values]
            columns["c{:03d}".format(len(columns))] = padded
    blocks = pd.DataFrame(columns, index=row_index)

    kept = {}
    for name in blocks.columns:
        column = blocks[name]
        # Comparisons with NaN are False, so padding never counts as a start.
        inside = ((column > 0) & (column < 1)).values
        if not inside.any():
            # No value strictly inside (0, 1): nothing to align on, drop the run.
            continue
        column = column.shift(-int(inside.argmax()))

        first = column.iloc[0]
        second = column.iloc[1] if n_rows > 1 else np.nan
        # Mirror runs that start high (second value high as well, or missing).
        if first > .5 and (second > .5 or np.isnan(second)):
            column = 1 - column
        kept[name] = column

    result = pd.DataFrame(kept, index=row_index)
    assert result.empty or (result.iloc[0, :] > .5).sum() == 0, "Woooohooooo///"
    return result

In [None]:
# Copy every (mask value, run id) group of every column of `ones` into the top
# rows of the next "cNNN" column of `blocks`.
ix = 0
get_rid = []
for col in ones:
    dframe = ones_with_blocks.reset_index().groupby([col,col+'b'])['ix'].apply(np.array)
    for indices in dframe:
        # Write through a single .iloc call: the chained form blocks[name][rows] = ...
        # assigns into a temporary and is silently lost under pandas Copy-on-Write.
        target = blocks.columns.get_loc("c{:03d}".format(ix))
        blocks.iloc[:indices.size, target] = ones[col][indices].values
        ix+=1

In [None]:
# valid_entries = blocks.loc[:,blocks.index.size - blocks.isna().sum() > 10]
# valid_entries.loc[:, valid_entries.corr().isna().sum() > 1]

In [None]:
def get_series_with_at_least_k_valid_values(df, k):
    """Return the columns of ``df`` that hold more than ``k`` non-NaN entries.

    Note that the comparison is strict: a column with exactly ``k`` valid
    values is not selected, despite the function name.
    """
    valid_per_column = df.notna().sum()
    enough = valid_per_column > k
    return df.loc[:, enough]

In [None]:
# get_series_with_at_least_k_valid_values(blocks, 10).diff().cumsum()
# blocks[blocks.diff().cumsum() == 0] = np.nan

In [None]:
# Shift every column up so that it starts at its first value strictly between
# 0 and 1, and drop the columns that contain no such value.
# The "not found" case is detected from the data itself instead of comparing the
# scan counter with a hard-coded 32, which is only right for 32-row frames.
unaligned = []
for col in list(blocks.columns):
    column = blocks[col]
    # Comparisons with NaN are False, so padding never counts as a start.
    inside = ((column > 0) & (column < 1)).values
    if inside.any():
        blocks[col] = column.shift(-int(inside.argmax()))
    else:
        unaligned.append(col)
# Drop after the loop so the frame is not modified while it is being iterated.
blocks = blocks.drop(columns=unaligned)

In [None]:
# Mirror (1 - value) every column whose first two rows are both above 0.5, or
# whose first row is above 0.5 while the second row is NaN.
for col in blocks:
    column = blocks[col]
    if (column[:2] > .5).all() or (column[0] > .5 and np.isnan(column[1])):
        blocks[col] = 1 - column

In [None]:
# True when no column of `blocks` has a first-row value above 0.5.
(blocks.iloc[0,:] > .5).sum() == 0

In [None]:
def nan_equal(a,b):
    """Report whether ``np.testing.assert_equal`` accepts ``a`` and ``b``.

    Returns True when the assertion passes and False when it raises
    ``AssertionError``; any other exception propagates.
    """
    matches = True
    try:
        np.testing.assert_equal(a, b)
    except AssertionError:
        matches = False
    return matches

In [None]:
# Check that the cell-by-cell `blocks` matches what create_blocks(ones) produces.
nan_equal(blocks.values, create_blocks(ones).values)

In [None]:
def split_columns(blocks, sinks, col, sink):
    """Split one column of ``blocks`` into its sinking and rising pieces.

    Parameters
    ----------
    blocks : pandas.DataFrame
        Frame holding the values; only ``blocks[col]`` is read.
    sinks : pandas.DataFrame
        Frame of direction flags; only ``sinks[col]`` is read.
    col : hashable
        Name of the column to process in both frames.
    sink : object
        Flag value that marks a row as sinking; every other value is rising.

    Returns
    -------
    dict
        Holds a ``'sinking'`` and/or a ``'rising'`` entry, each being the list
        returned by ``split_and_fill`` for the row positions carrying that flag.
        With more than two distinct flag values, later non-sink groups
        overwrite the ``'rising'`` entry.
    """
    result = dict()
    # reset_index() yields a 0..n-1 row index, so each group's index values are
    # the row positions at which that flag occurs.
    for bit, rows in sinks[col].reset_index().groupby(col):
        positions = rows.index.values
        key = 'sinking' if bit == sink else 'rising'
        result[key] = split_and_fill(blocks[col], positions)
    return result

In [None]:
def split_by_continues_behaviour(indices):
    """Group ``indices`` into runs of consecutive values.

    A new run starts whenever a value is not exactly one more than the previous
    one. Every run is prefixed with the value just before its first element,
    except that a first run starting at 0 gets no prefix. Processing stops at
    the first NaN after the first element.

    Parameters
    ----------
    indices : sequence of numbers
        Values to group; may be empty.

    Returns
    -------
    list of list
        The runs, in order of appearance; empty when ``indices`` is empty.
    """
    # Empty input has no first element to seed a run with.
    if len(indices) == 0:
        return []

    splits = []
    subset = [indices[0]-1, indices[0]] if indices[0] != 0 else [indices[0]]
    for v in indices[1:]:
        if np.isnan(v):
            break
        if v == subset[-1]+1:
            subset.append(v)
        else:
            # Gap found: close the current run and seed the next one with the
            # value preceding it.
            splits.append(subset)
            subset = [v-1, v]
    splits.append(subset)
    return splits

In [None]:
def add_nans(series, splits):
    """Turn every entry of ``splits`` into a NaN-padded array of ``series.size``.

    For each list of labels in ``splits`` the matching values of ``series`` are
    written to the start of a fresh all-NaN array. Returns the arrays as a
    list, in the order of ``splits``.
    """
    length = series.size
    padded = []
    for labels in splits:
        row = np.full(length, np.nan)
        row[np.arange(len(labels))] = series[labels]
        padded.append(row)
    return padded

In [None]:
def split_and_fill(series, indices):
    """Split ``indices`` into runs and return one NaN-padded array per run.

    Chains ``split_by_continues_behaviour`` and ``add_nans``.
    """
    return add_nans(series, split_by_continues_behaviour(indices))

In [None]:
def create_sinking_df(df):
    """Flag, for every cell of ``df``, whether the value did not rise.

    A cell is 1 when the value in the row above is greater than or equal to it
    and 0 otherwise (comparisons involving NaN give 0). The first row has no
    predecessor, so it copies the flags of the second row when there is one.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to analyse; it is not modified.

    Returns
    -------
    tuple
        ``(flags, sink)``: the integer flag frame and the flag value (1) that
        denotes "sinking".
    """
    sink = 1
    # Use the argument rather than a notebook-level global so the function
    # works on whatever frame the caller passes in.
    sinking = (df.shift(1) >= df).astype(int)  # 1 if it is sinking
    if len(sinking) > 1:
        sinking.iloc[0,:] = sinking.iloc[1,:]
    return sinking, sink

# Keep only the flag frame (first element of the returned tuple).
sinking = create_sinking_df(blocks)[0]

In [None]:
from itertools import chain
from functools import namedtuple

def final_function(ones):
    """Run the whole pipeline on ``ones`` and return its rising and sinking pieces.

    Builds the block frame with ``create_blocks``, derives the direction flags
    with ``create_sinking_df``, splits every block column with
    ``split_columns`` and stacks the resulting arrays.

    Parameters
    ----------
    ones : pandas.DataFrame
        Input frame, passed unchanged to ``create_blocks``.

    Returns
    -------
    result
        Named tuple ``(rising, sinking)`` of DataFrames holding one column per
        piece (the stacked arrays are transposed).
    """
    # namedtuple is documented in collections; other modules only re-export it
    # as an implementation detail.
    from collections import namedtuple

    block_frame = create_blocks(ones)
    flags, sink = create_sinking_df(block_frame)
    # Split the frame built above, not a notebook-level global, so the result
    # depends only on the `ones` argument.
    pieces = [split_columns(block_frame, flags, col, sink) for col in flags]

    def _stack(key):
        # Collect the arrays stored under `key` for all columns, one per output column.
        rows = chain.from_iterable(entry[key] for entry in pieces if key in entry)
        return pd.DataFrame(list(rows)).T

    result = namedtuple('result', 'rising, sinking')
    return result(_stack('rising'), _stack('sinking'))

In [None]:
# Display the `sinking` field of the pipeline result for `ones`.
final_function(ones).sinking