In [1]:
import pandas as pd
import numpy as np

from TrainPredictSelf import TrainPredictSelf
from InterpretPredictions import InterpretPredictions
from ExplainPredictions import ExplainPredictions

import logging
import sys
# Send INFO-and-above log records to stdout, each prefixed with the time of day
# (millisecond precision) and the file:line that emitted it.
logging.basicConfig(
    format='%(asctime)s.%(msecs)03d - %(filename)s:%(lineno)d - %(message)s',
    datefmt='%H:%M:%S',
    level=logging.INFO,
    stream=sys.stdout,
)

In [2]:
# Small synthetic table used to demonstrate the error-detection workflow.
# Most rows follow a consistent country / currency / city pattern; three rows
# deliberately break it and are the ones examined further down:
#   row 5  -> currency 'MXN' for a Germany / Frankfurt am Main row
#   row 13 -> country 'Germany' for a GBP / London row
#   row 20 -> city 'Bad' for a LUX / EUR row
N_ROWS = 21

# Seeded local generator so the 'rand' column (and therefore everything trained
# on it) is identical on every Restart & Run All. It does not touch NumPy's
# global random state.
RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)

observed = { 'country': ['Germany','Germany','Germany','Germany','Germany', 'Germany',
                     'US', 'US', 'US', 'US', 'US',
                     'UK', 'UK', 'Germany', 'UK', 'UK', 
                     'LUX','LUX','LUX','LUX','LUX'],
         'currency':['EUR','EUR','EUR','EUR','EUR','MXN',
                     'USD','USD','USD','USD','USD',
                     'GBP','GBP','GBP','GBP','GBP',
                     'EUR','EUR','EUR','EUR','EUR'],
         'manyvalues':['A', 'A', 'B', 'B', 'C', 'C', 'D', 'D', 'E', 'E', 'E', 'K', 'L', 'M', 'N', 'O',
                      'Z', 'Z', 'Z', 'Z', 'Y'],
         # Uniform noise in [100, 200), drawn from the seeded generator above.
         'rand': rng.uniform(100, 200, size=N_ROWS).tolist(),
         # Simply the row number: 0 .. N_ROWS - 1.
         'linear': list(range(N_ROWS)),
         'city': ['Frankfurt am Main'] * 6 + ['Los Angeles'] * 5 + ['London'] * 5 + ['Luxembourg'] * 4 + ['Bad'] * 1}
dfobserved = pd.DataFrame(observed) 
print(dfobserved)

    country currency manyvalues        rand  linear               city
0   Germany      EUR          A  179.015394       0  Frankfurt am Main
1   Germany      EUR          A  192.992884       1  Frankfurt am Main
2   Germany      EUR          B  171.814989       2  Frankfurt am Main
3   Germany      EUR          B  109.669376       3  Frankfurt am Main
4   Germany      EUR          C  196.639554       4  Frankfurt am Main
5   Germany      MXN          C  137.445375       5  Frankfurt am Main
6        US      USD          D  123.185703       6        Los Angeles
7        US      USD          D  187.781268       7        Los Angeles
8        US      USD          E  138.760506       8        Los Angeles
9        US      USD          E  153.586893       9        Los Angeles
10       US      USD          E  144.314993      10        Los Angeles
11       UK      GBP          K  126.552159      11             London
12       UK      GBP          L  141.377238      12             London
13  Ge

In [3]:
# Train on the observed frame. Train() hands back two objects, bound here as
# labels and probabilities; their exact structure is defined in TrainPredictSelf
# (not visible in this notebook) — TODO confirm before relying on their shape.
tps = TrainPredictSelf()
results_labels, results_proba = tps.Train(dfobserved)

# Post-process the raw training results:
#  - SinglePredictionPerCell yields two frames (predicted values and, presumably,
#    the matching probabilities — verify against InterpretPredictions).
#  - boolDifferences takes the observed data together with the raw results;
#    judging by the name it flags cells where prediction and observation differ.
ip = InterpretPredictions()
dfpredicted, dfprobas = ip.SinglePredictionPerCell(results_labels, results_proba)
dfBoolDifferences =  ip.boolDifferences(dfobserved, results_labels, results_proba)
print(dfpredicted)
print(dfBoolDifferences)


    country currency manyvalues        rand  linear               city
0   Germany      EUR          B  119.964357     2.0  Frankfurt am Main
1   Germany      EUR          B  119.964357     2.0  Frankfurt am Main
2   Germany      EUR          A  189.312847     2.0  Frankfurt am Main
3   Germany      MXN          C  189.312847     2.0  Frankfurt am Main
4   Germany      EUR          A  119.964357     6.0  Frankfurt am Main
5   Germany      EUR          D  189.312847     6.0             London
6        US      USD          E  189.312847    10.0        Los Angeles
7        US      USD          E  119.964357     6.0        Los Angeles
8        US      USD          E  149.545696    10.0        Los Angeles
9        US      USD          E  149.545696    10.0        Los Angeles
10       US      USD          E  149.545696    10.0        Los Angeles
11  Germany      GBP          L  149.545696    14.0             London
12       UK      GBP          E  119.964357    14.0             London
13    

In [4]:
# Drill into a single cell: column 'country', row 13 (observed 'Germany' on a
# GBP / London row). The call returns three objects: candidate labels, their
# probabilities, and per-feature importances.
# NOTE(review): this rebinds results_labels / results_proba from the earlier
# tps.Train() call. Re-running the InterpretPredictions cell after this one
# would silently use these single-cell values — consider distinct names.
results_labels, results_proba, results_feature_importances = tps.TrainPredictSingleCell('country', 13)
print(results_labels, '\n', results_proba, '\n', results_feature_importances)

['Germany' 'LUX' 'UK' 'US'] 
 [[0.04321751 0.04003536 0.8769222  0.03982493]] 
 [('currency', 0.27125418), ('manyvalues', 0.2777115), ('rand', 0.00089120003), ('linear', 0.16927323), ('city', 0.28086984)]


In [5]:
# Build the explainer from the trained predictor plus the observed and
# predicted frames; it is reused by the ExplainOneDifference calls below.
ep = ExplainPredictions(tps, dfobserved, dfpredicted)

In [6]:
# Explain the suspected error in column 'country' at row 13
# (observed 'Germany', while the row's currency is GBP and its city is London).
ep.ExplainOneDifference('country', 13)

We think column country row 13 is wrong
Observed :  Germany
We think the most likely are:
UK  :  87.69221901893616 %
Germany  :  4.321750998497009 %
LUX  :  4.003536328673363 %
Reason1:
	When city = London , like this row
	Then:
		 country is UK 80.0 % of the time
		 country is Germany 20.0 % of the time
Reason2:
	When city = London , and manyvalues = M ,like this row
	Then:
		 country is Germany 100.0 % of the time


In [7]:
# Explain the suspected error in column 'currency' at row 5
# (observed 'MXN' on a Germany / Frankfurt am Main row).
ep.ExplainOneDifference('currency', 5)

We think column currency row 5 is wrong
Observed :  MXN
We think the most likely are:
EUR  :  89.85338807106018 %
MXN  :  5.566785484552383 %
GBP  :  4.579824581742287 %
Reason1:
	When country = Germany , like this row
	Then:
		 currency is EUR 71.42857142857143 % of the time
		 currency is MXN 14.285714285714285 % of the time
		 currency is GBP 14.285714285714285 % of the time
Reason2:
	When country = Germany , and city = Frankfurt am Main ,like this row
	Then:
		 currency is EUR 83.33333333333334 % of the time
		 currency is MXN 16.666666666666664 % of the time


In [8]:
# Explain the suspected error in column 'city' at row 20
# (observed 'Bad' on a LUX / EUR row; the other LUX rows use 'Luxembourg').
ep.ExplainOneDifference('city', 20)

We think column city row 20 is wrong
Observed :  Bad
We think the most likely are:
Los Angeles  :  85.17017364501953 %
Frankfurt am Main  :  7.9596541821956635 %
Reason1:
	When currency = EUR , like this row
	Then:
		 city is Frankfurt am Main 50.0 % of the time
		 city is Luxembourg 40.0 % of the time
		 city is Bad 10.0 % of the time
Reason2:
	When currency = EUR , and country = LUX ,like this row
	Then:
		 city is Luxembourg 80.0 % of the time
		 city is Bad 20.0 % of the time
