In [34]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
from IPython.display import display, clear_output

In [16]:
import sklearn.ensemble

In [17]:
!head -n 3 ../train1.csv

V1,V2,V3,V4,V5,V6,V7,V8,V9,V10
x,x,x,x,o,o,x,o,o,positive
x,x,x,x,o,o,o,o,x,positive


In [18]:
def preprocess(df):
    df = df.copy()
    last_col = list(df)[-1]
    for col in list(df)[:-1]:
        df[col + "_0"] = (df[col] == 'x') | (df[col] == 'o')
        df[col + "_1"] = (df[col] == 'o')
        df = df.drop([col], axis=1)
    df["y"] = (df[last_col] == 'positive')
    df = df.drop([last_col], axis=1)
    return df

def get_common_features(X):
    if len(X) == 0:
        return None, None
    else:
        mask = (X == X[0]).sum(axis=0) == len(X)
        return mask, X[0][mask]

In [19]:
class FCAPredictor:
    def __init__(self, min_support=0.001, min_confidence=0.001, min_intersect=3):
        for name in self.__init__.__code__.co_varnames:
            if name != 'self':
                setattr(self, name, locals()[name])
        
    def fit(self, X, y):
        X = np.array(X)  # L x F
        y = (np.array(y) != 0)  # L
        
        self.examples = [X[~y], X[y]]  # negative and positive examples 
        self.examples_X = X
        self.examples_y = y
        
        self.targets_sums = [(~y).sum(), y.sum()]
        
    def predict_one(self, x):
        x = np.array(x)  # F
        
        # masks of equal features
        intersections = (self.examples_X == x)
        
        results = []
        
        rating2 = [0, 0]
        rating = [0, 0]
        for elem, cand, t in zip(self.examples_X, intersections, self.examples_y):
            min_subset_cardinality = cand.sum()
            if min_subset_cardinality < self.min_intersect:
                continue
            including_mask = ((intersections & cand).sum(axis=1) >= min_subset_cardinality)
            
            corresponding_targets = (self.examples_y[including_mask] == t)
            corresponding_X = self.examples_X[including_mask]
            
            support = corresponding_targets.sum() / self.targets_sums[int(t)]
            confidence = (~corresponding_targets).sum() / self.targets_sums[int(~t)]
    
            common_features_mask, common_features_values = \
                get_common_features(corresponding_X[~corresponding_targets])
            
            if common_features_mask is not None and np.all(x[common_features_mask] == common_features_values) \
                and support > self.min_support and confidence > self.min_confidence:
                rating2[int(~t)] += (common_features_mask.sum() - min_subset_cardinality) * support
             
            #rating[int(t)] += (support > self.min_support) & (unconfidence < self.max_unconfidence)
            #rating[int(t)] += ((support > self.min_support) & (support > 3 * unconfidence)) * intersection_cardinality
            rating[int(t)] += support > 2 * confidence
        
        rating2[1] /= self.targets_sums[1]
        rating2[0] /= self.targets_sums[0]
        return rating2[1] >= rating2[0]
            
        rating[1] /= self.targets_sums[1]
        rating[0] /= self.targets_sums[0]
        return rating[1] >= rating[0]
        
    def predict(self, X):
        X = np.array(X)  # L' x F
        y = np.array([self.predict_one(x) for x in X], dtype=np.bool)  # L
        return y        

In [20]:
a = np.array([[1, 0], [0, 4]], dtype=np.bool)
a == np.array([False, False])

array([[False,  True],
       [ True, False]], dtype=bool)

In [25]:
def calc_metrics(y_pred, y_real):
    metrics = dict(
        TP=(y_pred & y_real).sum(),
        FP=(y_pred & ~y_real).sum(),
        TN=(~y_pred & ~y_real).sum(),
        FN=(~y_pred & y_real).sum(),
        accuracy = (y_pred == y_real).sum() / len(y_pred),
    )
    metrics.update(
        precision=metrics["TP"] / (metrics["TP"] + metrics["FP"]),
        recall=metrics["TP"] / (metrics["TP"] + metrics["FN"]),
    )
    return metrics


def try_it(train, test, predictor=None):
    train = preprocess(train)
    test = preprocess(test)
    predictor = predictor if predictor is not None else FCAPredictor()
    predictor.fit(train.drop(["y"], axis=1), train["y"])
    predictions = predictor.predict(test.drop(["y"], axis=1))

    
    real_targets = np.array(test["y"])
    return calc_metrics(predictions, real_targets)

def show_on_set(i, predictor=None):
    predictor = predictor if predictor is not None else FCAPredictor(min_intersect=12)
    return try_it(pd.read_csv("../train%d.csv" % i), pd.read_csv("../test%d.csv" % i), predictor)

In [37]:
rows = []
for i in range(1, 11):
    pref = "%d_" % i
    row = {
        pref + "FCA": show_on_set(i, FCAPredictor(min_intersect=12)), 
        pref + "RANDOMFOREST": show_on_set(i, sklearn.ensemble.RandomForestClassifier(n_estimators=200)),
        pref + "GRADIENT_BOOSTING": show_on_set(i, sklearn.ensemble.GradientBoostingClassifier(n_estimators=200))
    }
    display(row)
    for k, v in row.items():
        v["run_id"] = k
    rows.append(row)
clear_output(wait=True)
display(pd.DataFrame(rows))

{'1_FCA': {'FN': 0,
  'FP': 0,
  'TN': 32,
  'TP': 61,
  'accuracy': 1.0,
  'precision': 1.0,
  'recall': 1.0},
 '1_GRADIENT_BOOSTING': {'FN': 1,
  'FP': 0,
  'TN': 32,
  'TP': 60,
  'accuracy': 0.989247311827957,
  'precision': 1.0,
  'recall': 0.98360655737704916},
 '1_RANDOMFOREST': {'FN': 5,
  'FP': 2,
  'TN': 30,
  'TP': 56,
  'accuracy': 0.92473118279569888,
  'precision': 0.96551724137931039,
  'recall': 0.91803278688524592}}

AttributeError: 'dict' object has no attribute 'iteritems'

In [28]:
log_file = "log_txt"
with open(log_file, 'w') as f:
    f.write("\n")
for min_support in [0.001, 0.003, 0.005]:
    for min_confidence in [0.001, 0.003, 0.005]:
        for min_intersect in [12]:
            params = {
                "min_support": min_support,
                "min_confidence": max_unconfidence,
                "min_intersect": min_intersect,
            }
            predictor = FCAPredictor(**params)
            params.update(
                try_it(pd.read_csv("../train2.csv"), pd.read_csv("../test2.csv"), predictor)
            )
            with open(log_file, 'a') as f:
                f.write(str(params) + "\n")
            print(params)

NameError: name 'max_unconfidence' is not defined

In [300]:
df = pd.read_csv("../train1.csv")
display(df.head())
set(df["V2"])
df = preprocess(df)
display(df.head())
np.array((df.head() == 0))

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10
0,x,x,x,x,o,o,x,o,o,positive
1,x,x,x,x,o,o,o,o,x,positive
2,x,x,x,x,o,o,o,b,b,positive
3,x,x,x,x,o,o,b,o,b,positive
4,x,x,x,x,o,o,b,b,o,positive


Unnamed: 0,V1_0,V1_1,V2_0,V2_1,V3_0,V3_1,V4_0,V4_1,V5_0,V5_1,V6_0,V6_1,V7_0,V7_1,V8_0,V8_1,V9_0,V9_1,y
0,True,False,True,False,True,False,True,False,True,True,True,True,True,False,True,True,True,True,True
1,True,False,True,False,True,False,True,False,True,True,True,True,True,True,True,True,True,False,True
2,True,False,True,False,True,False,True,False,True,True,True,True,True,True,False,False,False,False,True
3,True,False,True,False,True,False,True,False,True,True,True,True,False,False,True,True,False,False,True
4,True,False,True,False,True,False,True,False,True,True,True,True,False,False,False,False,True,True,True


array([[False,  True, False,  True, False,  True, False,  True, False,
        False, False, False, False,  True, False, False, False, False,
        False],
       [False,  True, False,  True, False,  True, False,  True, False,
        False, False, False, False, False, False, False, False,  True,
        False],
       [False,  True, False,  True, False,  True, False,  True, False,
        False, False, False, False, False,  True,  True,  True,  True,
        False],
       [False,  True, False,  True, False,  True, False,  True, False,
        False, False, False,  True,  True, False, False,  True,  True,
        False],
       [False,  True, False,  True, False,  True, False,  True, False,
        False, False, False,  True,  True,  True,  True, False, False,
        False]], dtype=bool)

In [33]:
pd.DataFrame([{"a":1}, {"a":3}])

Unnamed: 0,a
0,1
1,3
