In [1]:
# These are the basic imports (many fallbacks to numpy)
import numpy as np
import pandas as pd

### Column dependency detection

Via factorization: factorizing a column assigns an integer code to each unique value (and returns a map from those codes back to the actual values).

If two columns determine the value of another column (or group of columns), both sides will have an identical factorization, provided they are processed in the same row order, i.e. they come from the same dataframe.

In [2]:
# Toy data, one tuple per row:
#   day   - several cars run on each day
#   car   - each (day, car) pair has several rows of 'noise'
#   noise - arbitrary payload, unrelated to the other columns
#   tour  - exactly as unique as the (day, car) pair
df = pd.DataFrame(
    [
        (1, 1, 4, 1),
        (1, 1, 3, 1),
        (1, 2, 4, 2),
        (1, 2, 2, 2),
        (2, 1, 1, 3),
        (2, 1, 4, 3),
        (2, 2, 5, 4),
        (2, 2, 3, 4),
    ],
    columns=['day', 'car', 'noise', 'tour'],
)
df

Unnamed: 0,day,car,noise,tour
0,1,1,4,1
1,1,1,3,1
2,1,2,4,2
3,1,2,2,2
4,2,1,1,3
5,2,1,4,3
6,2,2,5,4
7,2,2,3,4


In [3]:
# Factorize the 'tour' column: returns (integer codes per row, unique values)
df['tour'].factorize()

(array([0, 0, 1, 1, 2, 2, 3, 3]), Int64Index([1, 2, 3, 4], dtype='int64'))

In [4]:
# Re-index the frame by the candidate key columns (day, car)
df_by_dc = df.set_index(keys=['day', 'car'])

In [5]:
# Show the index that was built from the 'day' and 'car' columns
df_by_dc.index

MultiIndex(levels=[[1, 2], [1, 2]],
           labels=[[0, 0, 0, 0, 1, 1, 1, 1], [0, 0, 1, 1, 0, 0, 1, 1]],
           names=['day', 'car'])

In [6]:
# Factorize the (day, car) index: every distinct pair gets one integer code
df_by_dc.index.factorize()

(array([0, 0, 1, 1, 2, 2, 3, 3]), MultiIndex(levels=[[1, 2], [1, 2]],
            labels=[[0, 0, 1, 1], [0, 1, 0, 1]]))

In [7]:
# Factorize 'tour' again so its codes can be compared with the (day, car) codes
df['tour'].factorize()

(array([0, 0, 1, 1, 2, 2, 3, 3]), Int64Index([1, 2, 3, 4], dtype='int64'))

In [8]:
def are_dependent_columns(df, left, right):
    """Check whether two column selections of ``df`` factorize identically.

    Each selection is turned into an index via ``df.set_index`` and then
    factorized; the per-row integer codes of both sides are compared
    element-wise.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns named by ``left`` and ``right``.
    left, right : column label or list of column labels
        Passed unchanged to ``df.set_index``.

    Returns
    -------
    bool-like
        True when every row carries the same factorization code on both
        sides, False otherwise.
    """
    def _row_codes(columns):
        # factorize() yields (codes, uniques); only the per-row codes matter
        codes, _uniques = df.set_index(columns).index.factorize()
        return codes

    same_code = _row_codes(left) == _row_codes(right)
    return same_code.all()

In [9]:
# On the clean frame, the (car, day) pair and 'tour' should factorize identically
are_dependent_columns(df, left=['car', 'day'], right='tour')

True

In [10]:
# Perturbed copy of the toy data: the last row's tour (5) is wrong, so
# 'tour' no longer lines up with the (car, day) pair.
df2 = pd.DataFrame(
    [
        (1, 1, 4, 1),
        (1, 1, 3, 1),
        (1, 2, 4, 2),
        (1, 2, 2, 2),
        (2, 1, 1, 3),
        (2, 1, 4, 3),
        (2, 2, 5, 4),
        (2, 2, 3, 5),  # tour 5 is wrong
    ],
    columns=['day', 'car', 'noise', 'tour'],
)
are_dependent_columns(df2, ['car', 'day'], 'tour')

False

In [11]:
# Second perturbed version: this time only 'car' is changed and 'tour' is
# left intact, so a False result is caused by the car perturbation alone.
df3 = pd.DataFrame({
    'day': [1, 1, 1, 1, 2, 2, 2, 2],
    'car': [1, 3, 2, 2, 1, 1, 2, 2], # car 3 is wrong
    'noise': [4, 3, 4, 2, 1, 4, 5, 3],
    'tour': [1, 1, 2, 2, 3, 3, 4, 4],
})
# Check df3 itself (the frame built in this cell)
are_dependent_columns(df3, ['car', 'day'], 'tour')

False