In [262]:
import fcalc
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder


# Binarized data

In [12]:
column_names = [
        'top-left-square', 'top-middle-square', 'top-right-square',
        'middle-left-square', 'middle-middle-square', 'middle-right-square',
        'bottom-left-square', 'bottom-middle-square', 'bottom-right-square',
        'Class'
    ]
df = pd.read_csv('data_sets/tic-tac-toe.data', names = column_names)
df['Class'] = [x == 'positive' for x in df['Class']]
df.head()

Unnamed: 0,top-left-square,top-middle-square,top-right-square,middle-left-square,middle-middle-square,middle-right-square,bottom-left-square,bottom-middle-square,bottom-right-square,Class
0,x,x,x,x,o,o,x,o,o,True
1,x,x,x,x,o,o,o,x,o,True
2,x,x,x,x,o,o,o,o,x,True
3,x,x,x,x,o,o,o,b,b,True
4,x,x,x,x,o,o,b,o,b,True


In [13]:
X = pd.get_dummies(df[column_names[:-1]], prefix=column_names[:-1]).astype(bool)
y = df['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [14]:
bin_cls = fcalc.classifier.BinarizedBinaryClassifier(X_train.values, y_train.to_numpy(), method="standard-support")

In [15]:
bin_cls.predict(X_test.values)

In [16]:
from sklearn.metrics import accuracy_score, f1_score

print(accuracy_score(y_test, bin_cls.predictions))
print(f1_score(y_test, bin_cls.predictions))

0.9965277777777778
0.9974160206718347


In [74]:
X = df.iloc[:,:-1]
y = df['deposit_yes']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [75]:
pat_cls = fcalc.classifier.BinarizedBinaryClassifier(X_train.values, y_train.to_numpy())

In [76]:
pat_cls.predict(X_test.values)

In [77]:
from sklearn.metrics import accuracy_score, f1_score

print(accuracy_score(y_test, pat_cls.predictions))
print(f1_score(y_test, pat_cls.predictions))

0.8222222222222222
0.8181818181818181


# BinarizedClassifier

## Bank 

In [352]:
df = pd.read_csv('data_sets/bank.csv')

In [353]:
categorical_columns = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome', 'deposit']
unique_values = {column: df[column].unique() for column in categorical_columns}
unique_values

{'job': array(['admin.', 'technician', 'services', 'management', 'retired',
        'blue-collar', 'unemployed', 'entrepreneur', 'housemaid',
        'unknown', 'self-employed', 'student'], dtype=object),
 'marital': array(['married', 'single', 'divorced'], dtype=object),
 'education': array(['secondary', 'tertiary', 'primary', 'unknown'], dtype=object),
 'default': array(['no', 'yes'], dtype=object),
 'housing': array(['yes', 'no'], dtype=object),
 'loan': array(['no', 'yes'], dtype=object),
 'contact': array(['unknown', 'cellular', 'telephone'], dtype=object),
 'month': array(['may', 'jun', 'jul', 'aug', 'oct', 'nov', 'dec', 'jan', 'feb',
        'mar', 'apr', 'sep'], dtype=object),
 'poutcome': array(['unknown', 'other', 'failure', 'success'], dtype=object),
 'deposit': array(['yes', 'no'], dtype=object)}

In [354]:
numerical_columns = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
numerical_stats = df[numerical_columns].describe()
numerical_stats

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,11162.0,11162.0,11162.0,11162.0,11162.0,11162.0,11162.0
mean,41.231948,1528.538524,15.658036,371.993818,2.508421,51.330407,0.832557
std,11.913369,3225.413326,8.42074,347.128386,2.722077,108.758282,2.292007
min,18.0,-6847.0,1.0,2.0,1.0,-1.0,0.0
25%,32.0,122.0,8.0,138.0,1.0,-1.0,0.0
50%,39.0,550.0,15.0,255.0,2.0,-1.0,0.0
75%,49.0,1708.0,22.0,496.0,3.0,20.75,1.0
max,95.0,81204.0,31.0,3881.0,63.0,854.0,58.0


In [355]:
def bin_numerical_data(data, column, bins, labels):
    data[column] = pd.cut(data[column], bins=bins, labels=labels, right=False)
    return pd.get_dummies(data, columns=[column])

# Определение диапазонов и меток для каждой числовой переменной
age_bins = [18, 31, 41, 51, 61, 96]
age_labels = ['18-30', '31-40', '41-50', '51-60', '61+']

balance_bins = [-float('inf'), 0, 1001, 3001, 5001, float('inf')]
balance_labels = ['negative', '0-1000', '1001-3000', '3001-5000', '5001+']

day_bins = [1, 11, 21, 32]
day_labels = ['1-10', '11-20', '21-31']

duration_bins = [0, 101, 201, 301, 401, float('inf')]
duration_labels = ['0-100', '101-200', '201-300', '301-400', '401+']

campaign_bins = [1, 3, 5, float('inf')]
campaign_labels = ['1-2', '3-4', '5+']

pdays_bins = [-float('inf'), 0, 101, 201, float('inf')]
pdays_labels = ['not_contacted', '0-100', '101-200', '201+']

previous_bins = [0, 1, 3, float('inf')]
previous_labels = ['0', '1-2', '3+']
for column, bins, labels in zip(numerical_columns, 
                                [age_bins, balance_bins, day_bins, duration_bins, campaign_bins, pdays_bins, previous_bins], 
                                [age_labels, balance_labels, day_labels, duration_labels, campaign_labels, pdays_labels, previous_labels]):
    df = bin_numerical_data(df, column, bins, labels)
df.head()

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,poutcome,deposit,...,campaign_1-2,campaign_3-4,campaign_5+,pdays_not_contacted,pdays_0-100,pdays_101-200,pdays_201+,previous_0,previous_1-2,previous_3+
0,admin.,married,secondary,no,yes,no,unknown,may,unknown,yes,...,True,False,False,True,False,False,False,True,False,False
1,admin.,married,secondary,no,no,no,unknown,may,unknown,yes,...,True,False,False,True,False,False,False,True,False,False
2,technician,married,secondary,no,yes,no,unknown,may,unknown,yes,...,True,False,False,True,False,False,False,True,False,False
3,services,married,secondary,no,yes,no,unknown,may,unknown,yes,...,True,False,False,True,False,False,False,True,False,False
4,admin.,married,tertiary,no,no,no,unknown,may,unknown,yes,...,True,False,False,True,False,False,False,True,False,False


In [356]:
df = pd.get_dummies(df, columns=categorical_columns)
df.head()

Unnamed: 0,age_18-30,age_31-40,age_41-50,age_51-60,age_61+,balance_negative,balance_0-1000,balance_1001-3000,balance_3001-5000,balance_5001+,...,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown,deposit_no,deposit_yes
0,False,False,False,True,False,False,False,True,False,False,...,True,False,False,False,False,False,False,True,False,True
1,False,False,False,True,False,False,True,False,False,False,...,True,False,False,False,False,False,False,True,False,True
2,False,False,True,False,False,False,False,True,False,False,...,True,False,False,False,False,False,False,True,False,True
3,False,False,False,True,False,False,False,True,False,False,...,True,False,False,False,False,False,False,True,False,True
4,False,False,False,True,False,False,True,False,False,False,...,True,False,False,False,False,False,False,True,False,True


In [357]:
df = df.sample(150)

In [358]:
X = df.iloc[:,:-1]
y = df['deposit_yes']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [359]:
pat_cls = fcalc.classifier.BinarizedBinaryClassifier(X_train.values, y_train.to_numpy())

In [360]:
pat_cls.predict(X_test.values)

In [361]:
from sklearn.metrics import accuracy_score, f1_score

print(accuracy_score(y_test, pat_cls.predictions))
print(f1_score(y_test, pat_cls.predictions))

0.9111111111111111
0.8823529411764706


In [362]:
def cross_validation_fca(model_class, context, labels, k=5, alphas=[0.], methods=['standard']):
    kf = KFold(n_splits=k)
    results = []

    for alpha in alphas:
        for method in methods:
            accuracies = []
            for train_index, test_index in kf.split(context):
                X_train, X_test = context[train_index], context[test_index]
                y_train, y_test = labels[train_index], labels[test_index]
                model = model_class(X_train, y_train, alpha=alpha, method=method)
                model.predict(X_test) 
                predictions = model.predictions  
                accuracy = np.mean(predictions == y_test)
                accuracies.append(accuracy)
            results.append({'alpha': alpha, 'method': method, 'accuracies': accuracies, 'average_accuracy': np.mean(accuracies)})

    return results

# Example usage:
results = cross_validation_fca(
    model_class=fcalc.classifier.BinarizedBinaryClassifier, 
    context=X.values, 
    labels=y.to_numpy(), 
    k=5, 
    alphas=[0.1, 0.2, 0.3], 
    methods=['standard', 'standard-support', 'ratio-support']
)

for result in results:
    print(f"Alpha: {result['alpha']}, Method: {result['method']}, Average Accuracy: {result['average_accuracy']}")


Alpha: 0.1, Method: standard, Average Accuracy: 0.9066666666666666
Alpha: 0.1, Method: standard-support, Average Accuracy: 0.9333333333333333
Alpha: 0.1, Method: ratio-support, Average Accuracy: 0.9666666666666666
Alpha: 0.2, Method: standard, Average Accuracy: 0.9199999999999999
Alpha: 0.2, Method: standard-support, Average Accuracy: 0.8933333333333333
Alpha: 0.2, Method: ratio-support, Average Accuracy: 0.9666666666666666
Alpha: 0.3, Method: standard, Average Accuracy: 0.8866666666666667
Alpha: 0.3, Method: standard-support, Average Accuracy: 0.7
Alpha: 0.3, Method: ratio-support, Average Accuracy: 0.96


## Winequality

In [343]:
df = pd.read_csv('data_sets/winequality-red.csv')

In [344]:
quartiles = df.drop(columns=['quality']).quantile([0.25, 0.5, 0.75])
binarized_dataset = pd.DataFrame()
for column in df.columns:
    if column != 'quality':
        binarized_dataset[f'{column} Q1'] = (df[column] <= quartiles.loc[0.25, column]).astype(int)
        binarized_dataset[f'{column} Q2'] = ((df[column] > quartiles.loc[0.25, column]) & (df[column] <= quartiles.loc[0.5, column])).astype(int)
        binarized_dataset[f'{column} Q3'] = ((df[column] > quartiles.loc[0.5, column]) & (df[column] <= quartiles.loc[0.75, column])).astype(int)
        binarized_dataset[f'{column} Q4'] = (df[column] > quartiles.loc[0.75, column]).astype(int)
binarized_dataset['quality'] = (df['quality'] > 7).astype(int)
binarized_dataset.head()

Unnamed: 0,fixed acidity Q1,fixed acidity Q2,fixed acidity Q3,fixed acidity Q4,volatile acidity Q1,volatile acidity Q2,volatile acidity Q3,volatile acidity Q4,citric acid Q1,citric acid Q2,...,pH Q4,sulphates Q1,sulphates Q2,sulphates Q3,sulphates Q4,alcohol Q1,alcohol Q2,alcohol Q3,alcohol Q4,quality
0,0,1,0,0,0,0,0,1,1,0,...,1,0,1,0,0,1,0,0,0,0
1,0,1,0,0,0,0,0,1,1,0,...,0,0,0,1,0,0,1,0,0,0
2,0,1,0,0,0,0,0,1,1,0,...,0,0,0,1,0,0,1,0,0,0
3,0,0,0,1,1,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
4,0,1,0,0,0,0,0,1,1,0,...,1,0,1,0,0,1,0,0,0,0


In [345]:
binarized_dataset['quality'].unique()

array([0, 1])

In [346]:
binarized_dataset  = binarized_dataset.sample(150)

In [347]:
X = binarized_dataset.iloc[:,:-1]
y = binarized_dataset['quality']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

In [348]:
pat_cls = fcalc.classifier.BinarizedBinaryClassifier(X_train.values, y_train.to_numpy(), method="support", alpha=1)

In [349]:
pat_cls.predict(X_test.values)

In [350]:
print("accuracy:", round(accuracy_score(y_test, pat_cls.predictions), 4))
print("f1 score:", round(f1_score(y_test, pat_cls.predictions, average='weighted'), 4))

accuracy: 0.9833
f1 score: 0.9751


In [351]:
from sklearn.model_selection import KFold
import numpy as np

def cross_validation_fca(model_class, context, labels, k=5, alphas=[0.], methods=['standard']):
    kf = KFold(n_splits=k)
    results = []

    for alpha in alphas:
        for method in methods:
            accuracies = []
            for train_index, test_index in kf.split(context):
                X_train, X_test = context[train_index], context[test_index]
                y_train, y_test = labels[train_index], labels[test_index]
                model = model_class(X_train, y_train, alpha=alpha, method=method)
                model.predict(X_test) 
                predictions = model.predictions  
                accuracy = np.mean(predictions == y_test)
                accuracies.append(accuracy)
            results.append({'alpha': alpha, 'method': method, 'accuracies': accuracies, 'average_accuracy': np.mean(accuracies)})

    return results

# Example usage:
results = cross_validation_fca(
    model_class=fcalc.classifier.BinarizedBinaryClassifier, 
    context=X.values, 
    labels=y.to_numpy(), 
    k=5, 
    alphas=[0.1, 0.2, 0.3], 
    methods=['standard', 'standard-support', 'ratio-support']
)

for result in results:
    print(f"Alpha: {result['alpha']}, Method: {result['method']}, Average Accuracy: {result['average_accuracy']}")


Alpha: 0.1, Method: standard, Average Accuracy: 0.12666666666666665
Alpha: 0.1, Method: standard-support, Average Accuracy: 0.0
Alpha: 0.1, Method: ratio-support, Average Accuracy: 0.9733333333333334
Alpha: 0.2, Method: standard, Average Accuracy: 0.12666666666666665
Alpha: 0.2, Method: standard-support, Average Accuracy: 0.0
Alpha: 0.2, Method: ratio-support, Average Accuracy: 0.9733333333333334
Alpha: 0.3, Method: standard, Average Accuracy: 0.12666666666666665
Alpha: 0.3, Method: standard-support, Average Accuracy: 0.0
Alpha: 0.3, Method: ratio-support, Average Accuracy: 0.9733333333333334


## Diabetes

In [363]:
df = pd.read_csv('data_sets/diabetes.csv')

In [364]:
quartiles = df.quantile([0.25, 0.5, 0.75])
binarized_dataset = pd.DataFrame()
for column in df.columns:
    if column != 'Outcome':
        binarized_dataset[f'{column} Q1'] = (df[column] <= quartiles.loc[0.25, column]).astype(int)
        binarized_dataset[f'{column} Q2'] = ((df[column] > quartiles.loc[0.25, column]) & (df[column] <= quartiles.loc[0.5, column])).astype(int)
        binarized_dataset[f'{column} Q3'] = ((df[column] > quartiles.loc[0.5, column]) & (df[column] <= quartiles.loc[0.75, column])).astype(int)
        binarized_dataset[f'{column} Q4'] = (df[column] > quartiles.loc[0.75, column]).astype(int)
binarized_dataset['Outcome'] = df['Outcome']
binarized_dataset.head()

Unnamed: 0,Pregnancies Q1,Pregnancies Q2,Pregnancies Q3,Pregnancies Q4,Glucose Q1,Glucose Q2,Glucose Q3,Glucose Q4,BloodPressure Q1,BloodPressure Q2,...,BMI Q4,DiabetesPedigreeFunction Q1,DiabetesPedigreeFunction Q2,DiabetesPedigreeFunction Q3,DiabetesPedigreeFunction Q4,Age Q1,Age Q2,Age Q3,Age Q4,Outcome
0,0,0,1,0,0,0,0,1,0,1,...,0,0,0,0,1,0,0,0,1,1
1,1,0,0,0,1,0,0,0,0,1,...,0,0,1,0,0,0,0,1,0,0
2,0,0,0,1,0,0,0,1,0,1,...,0,0,0,0,1,0,0,1,0,1
3,1,0,0,0,1,0,0,0,0,1,...,0,1,0,0,0,1,0,0,0,0
4,1,0,0,0,0,0,1,0,1,0,...,1,0,0,0,1,0,0,1,0,1


In [365]:
X = binarized_dataset.iloc[:,:-1]
y = binarized_dataset['Outcome']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

In [366]:
pat_cls = fcalc.classifier.BinarizedBinaryClassifier(X_train.values, y_train.to_numpy(), method="support", alpha=1)

In [367]:
pat_cls.predict(X_test.values)

In [368]:
print("accuracy:", round(accuracy_score(y_test, pat_cls.predictions), 4))
print("f1 score:", round(f1_score(y_test, pat_cls.predictions, average='weighted'), 4))

accuracy: 0.6688
f1 score: 0.5361


In [369]:
from sklearn.model_selection import KFold
import numpy as np

def cross_validation_fca(model_class, context, labels, k=5, alphas=[0.], methods=['standard']):
    kf = KFold(n_splits=k)
    results = []

    for alpha in alphas:
        for method in methods:
            accuracies = []
            for train_index, test_index in kf.split(context):
                X_train, X_test = context[train_index], context[test_index]
                y_train, y_test = labels[train_index], labels[test_index]
                model = model_class(X_train, y_train, alpha=alpha, method=method)
                model.predict(X_test) 
                predictions = model.predictions  
                accuracy = np.mean(predictions == y_test)
                accuracies.append(accuracy)
            results.append({'alpha': alpha, 'method': method, 'accuracies': accuracies, 'average_accuracy': np.mean(accuracies)})

    return results

# Example usage:
results = cross_validation_fca(
    model_class=fcalc.classifier.BinarizedBinaryClassifier, 
    context=X.values, 
    labels=y.to_numpy(), 
    k=5, 
    alphas=[0.1, 0.2, 0.3], 
    methods=['standard', 'standard-support', 'ratio-support']
)

for result in results:
    print(f"Alpha: {result['alpha']}, Method: {result['method']}, Average Accuracy: {result['average_accuracy']}")


Alpha: 0.1, Method: standard, Average Accuracy: 0.7084033613445377
Alpha: 0.1, Method: standard-support, Average Accuracy: 0.0
Alpha: 0.1, Method: ratio-support, Average Accuracy: 0.44143111790170614
Alpha: 0.2, Method: standard, Average Accuracy: 0.7084033613445377
Alpha: 0.2, Method: standard-support, Average Accuracy: 0.0
Alpha: 0.2, Method: ratio-support, Average Accuracy: 0.44143111790170614
Alpha: 0.3, Method: standard, Average Accuracy: 0.7084033613445377
Alpha: 0.3, Method: standard-support, Average Accuracy: 0.0
Alpha: 0.3, Method: ratio-support, Average Accuracy: 0.44143111790170614


# PatternBinaryClassifier

## Bank 

In [370]:
df = pd.read_csv('data_sets/bank.csv')

In [371]:
df = df.sample(1000)

In [372]:
df

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
10538,51,management,married,tertiary,yes,0,no,yes,cellular,29,jul,204,6,-1,0,unknown,no
438,44,admin.,married,secondary,no,1074,yes,no,cellular,11,jul,911,2,-1,0,unknown,yes
3564,28,technician,married,secondary,no,742,no,no,cellular,26,may,236,1,-1,0,unknown,yes
4073,47,admin.,single,secondary,no,1693,yes,no,cellular,22,oct,608,2,92,4,other,yes
2974,31,technician,married,secondary,no,318,no,no,cellular,2,dec,310,1,-1,0,unknown,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7301,31,services,single,secondary,no,222,yes,no,cellular,13,may,168,2,-1,0,unknown,no
6046,59,retired,married,primary,no,-411,no,no,cellular,29,jan,225,1,-1,0,unknown,no
5306,34,blue-collar,single,secondary,no,3723,yes,no,unknown,27,may,47,5,-1,0,unknown,no
5427,35,technician,married,secondary,no,840,yes,no,cellular,29,jul,281,4,-1,0,unknown,no


In [373]:
categorical_columns = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome', 'deposit']
label_encoder = LabelEncoder()
for col in categorical_columns:
    df[col] = label_encoder.fit_transform(df[col])

In [374]:
df['deposit'].unique()

array([0, 1])

In [375]:
X = df.iloc[:,:-1]
y = df['deposit']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [376]:
pat_cls = fcalc.classifier.PatternBinaryClassifier(X_train.values, y_train.to_numpy(), )
pat_cls.predict(X_test.values)

In [377]:
print(accuracy_score(y_test, pat_cls.predictions))
print(f1_score(y_test, pat_cls.predictions))

0.7133333333333334
0.6692307692307692


In [378]:
from sklearn.model_selection import KFold
import numpy as np

def cross_validation_fca(model_class, context, labels, k=5, alphas=[0.], methods=['standard']):
    kf = KFold(n_splits=k)
    results = []

    for alpha in alphas:
        for method in methods:
            accuracies = []
            for train_index, test_index in kf.split(context):
                X_train, X_test = context[train_index], context[test_index]
                y_train, y_test = labels[train_index], labels[test_index]
                model = model_class(X_train, y_train, alpha=alpha, method=method)
                model.predict(X_test) 
                predictions = model.predictions  
                accuracy = np.mean(predictions == y_test)
                accuracies.append(accuracy)
            results.append({'alpha': alpha, 'method': method, 'accuracies': accuracies, 'average_accuracy': np.mean(accuracies)})

    return results

# Example usage:
results = cross_validation_fca(
    model_class=fcalc.classifier.PatternBinaryClassifier, 
    context=X.values, 
    labels=y.to_numpy(), 
    k=5, 
    alphas=[0.1, 0.2, 0.3], 
    methods=['standard', 'standard-support', 'ratio-support']
)

for result in results:
    print(f"Alpha: {result['alpha']}, Method: {result['method']}, Average Accuracy: {result['average_accuracy']}")


Alpha: 0.1, Method: standard, Average Accuracy: 0.0
Alpha: 0.1, Method: standard-support, Average Accuracy: 0.471
Alpha: 0.1, Method: ratio-support, Average Accuracy: 0.752
Alpha: 0.2, Method: standard, Average Accuracy: 0.0
Alpha: 0.2, Method: standard-support, Average Accuracy: 0.471
Alpha: 0.2, Method: ratio-support, Average Accuracy: 0.753
Alpha: 0.3, Method: standard, Average Accuracy: 0.0
Alpha: 0.3, Method: standard-support, Average Accuracy: 0.471
Alpha: 0.3, Method: ratio-support, Average Accuracy: 0.752


## Winequality

In [407]:
df = pd.read_csv('data_sets/winequality-red.csv')


In [408]:
df['quality'] = (df['quality'] > 7).astype(int)
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,0
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,0


In [409]:
df = df.sample(300)

In [410]:
X = df.iloc[:,:-1]
y = df['quality']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [412]:
pat_cls = fcalc.classifier.PatternBinaryClassifier(X_train.values, y_train.to_numpy(), method='ratio-support')
pat_cls.predict(X_test.values)

In [413]:
print("accuracy:", round(accuracy_score(y_test, pat_cls.predictions), 4))
print("f1 score:", round(f1_score(y_test, pat_cls.predictions, average='weighted'), 4))

accuracy: 0.9111
f1 score: 0.9362


In [414]:
from sklearn.model_selection import KFold
import numpy as np

def cross_validation_fca(model_class, context, labels, k=5, alphas=[0.], methods=['standard']):
    kf = KFold(n_splits=k)
    results = []

    for alpha in alphas:
        for method in methods:
            accuracies = []
            for train_index, test_index in kf.split(context):
                X_train, X_test = context[train_index], context[test_index]
                y_train, y_test = labels[train_index], labels[test_index]
                model = model_class(X_train, y_train, alpha=alpha, method=method)
                model.predict(X_test) 
                predictions = model.predictions  
                accuracy = np.mean(predictions == y_test)
                accuracies.append(accuracy)
            results.append({'alpha': alpha, 'method': method, 'accuracies': accuracies, 'average_accuracy': np.mean(accuracies)})

    return results

# Example usage:
results = cross_validation_fca(
    model_class=fcalc.classifier.PatternBinaryClassifier, 
    context=X.values, 
    labels=y.to_numpy(), 
    k=5, 
    alphas=[0.1, 0.2, 0.3], 
    methods=['standard', 'standard-support', 'ratio-support']
)

for result in results:
    print(f"Alpha: {result['alpha']}, Method: {result['method']}, Average Accuracy: {result['average_accuracy']}")


Alpha: 0.1, Method: standard, Average Accuracy: 0.003333333333333333
Alpha: 0.1, Method: standard-support, Average Accuracy: 0.013333333333333332
Alpha: 0.1, Method: ratio-support, Average Accuracy: 0.9800000000000001
Alpha: 0.2, Method: standard, Average Accuracy: 0.003333333333333333
Alpha: 0.2, Method: standard-support, Average Accuracy: 0.013333333333333332
Alpha: 0.2, Method: ratio-support, Average Accuracy: 0.9833333333333334
Alpha: 0.3, Method: standard, Average Accuracy: 0.003333333333333333
Alpha: 0.3, Method: standard-support, Average Accuracy: 0.013333333333333332
Alpha: 0.3, Method: ratio-support, Average Accuracy: 0.9833333333333334


## Diabetes

In [428]:
df = pd.read_csv('data_sets/diabetes.csv')

In [429]:
X = df.iloc[:,:-1]
y = df['Outcome']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

In [430]:
df['Outcome'].unique()

array([1, 0])

In [431]:
pat_cls = fcalc.classifier.PatternBinaryClassifier(X_train.values, y_train.to_numpy(), method="support", alpha=1)

In [432]:
pat_cls.predict(X_test.values)

In [434]:
print("accuracy:", round(accuracy_score(y_test, pat_cls.predictions), 4))
print("f1 score:", round(f1_score(y_test, pat_cls.predictions, average='weighted'), 4))

accuracy: 0.6688
f1 score: 0.5361


In [435]:
from sklearn.model_selection import KFold
import numpy as np

def cross_validation_fca(model_class, context, labels, k=5, alphas=[0.], methods=['standard']):
    kf = KFold(n_splits=k)
    results = []

    for alpha in alphas:
        for method in methods:
            accuracies = []
            for train_index, test_index in kf.split(context):
                X_train, X_test = context[train_index], context[test_index]
                y_train, y_test = labels[train_index], labels[test_index]
                model = model_class(X_train, y_train, alpha=alpha, method=method)
                model.predict(X_test) 
                predictions = model.predictions  
                accuracy = np.mean(predictions == y_test)
                accuracies.append(accuracy)
            results.append({'alpha': alpha, 'method': method, 'accuracies': accuracies, 'average_accuracy': np.mean(accuracies)})

    return results

results = cross_validation_fca(
    model_class=fcalc.classifier.PatternBinaryClassifier, 
    context=X.values, 
    labels=y.to_numpy(), 
    k=5, 
    alphas=[0.1, 0.2, 0.3], 
    methods=['standard', 'standard-support', 'ratio-support']
)

for result in results:
    print(f"Alpha: {result['alpha']}, Method: {result['method']}, Average Accuracy: {result['average_accuracy']}")


Alpha: 0.1, Method: standard, Average Accuracy: 0.04425770308123249
Alpha: 0.1, Method: standard-support, Average Accuracy: 0.32539682539682535
Alpha: 0.1, Method: ratio-support, Average Accuracy: 0.6366267719208897
Alpha: 0.2, Method: standard, Average Accuracy: 0.006493506493506494
Alpha: 0.2, Method: standard-support, Average Accuracy: 0.32019353195823785
Alpha: 0.2, Method: ratio-support, Average Accuracy: 0.6418470418470419
Alpha: 0.3, Method: standard, Average Accuracy: 0.0012987012987012987
Alpha: 0.3, Method: standard-support, Average Accuracy: 0.31889483065953655
Alpha: 0.3, Method: ratio-support, Average Accuracy: 0.6522960699431287
