In [11]:
import fcalc
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Binarized data

In [12]:
column_names = [
        'top-left-square', 'top-middle-square', 'top-right-square',
        'middle-left-square', 'middle-middle-square', 'middle-right-square',
        'bottom-left-square', 'bottom-middle-square', 'bottom-right-square',
        'Class'
    ]
df = pd.read_csv('data_sets/tic-tac-toe.data', names = column_names)
df['Class'] = [x == 'positive' for x in df['Class']]
df.head()

Unnamed: 0,top-left-square,top-middle-square,top-right-square,middle-left-square,middle-middle-square,middle-right-square,bottom-left-square,bottom-middle-square,bottom-right-square,Class
0,x,x,x,x,o,o,x,o,o,True
1,x,x,x,x,o,o,o,x,o,True
2,x,x,x,x,o,o,o,o,x,True
3,x,x,x,x,o,o,o,b,b,True
4,x,x,x,x,o,o,b,o,b,True


In [13]:
X = pd.get_dummies(df[column_names[:-1]], prefix=column_names[:-1]).astype(bool)
y = df['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [14]:
bin_cls = fcalc.classifier.BinarizedBinaryClassifier(X_train.values, y_train.to_numpy(), method="standard-support")

In [15]:
bin_cls.predict(X_test.values)

In [16]:
from sklearn.metrics import accuracy_score, f1_score

print(accuracy_score(y_test, bin_cls.predictions))
print(f1_score(y_test, bin_cls.predictions))

0.9965277777777778
0.9974160206718347


# Pattern structures

In [17]:
column_names = [
        'top-left-square', 'top-middle-square', 'top-right-square',
        'middle-left-square', 'middle-middle-square', 'middle-right-square',
        'bottom-left-square', 'bottom-middle-square', 'bottom-right-square',
        'Class'
    ]
df = pd.read_csv('data_sets/tic-tac-toe.data', names = column_names)
df['Class'] = [x == 'positive' for x in df['Class']]
df.head()

Unnamed: 0,top-left-square,top-middle-square,top-right-square,middle-left-square,middle-middle-square,middle-right-square,bottom-left-square,bottom-middle-square,bottom-right-square,Class
0,x,x,x,x,o,o,x,o,o,True
1,x,x,x,x,o,o,o,x,o,True
2,x,x,x,x,o,o,o,o,x,True
3,x,x,x,x,o,o,o,b,b,True
4,x,x,x,x,o,o,b,o,b,True


In [18]:
X = df[column_names[:-1]]
y = df['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [19]:
pat_cls = fcalc.classifier.PatternBinaryClassifier(X_train.values, y_train.to_numpy(), 
                                             categorical=np.arange(X_train.shape[1]))

In [20]:
pat_cls.predict(X_test.values)

In [21]:
from sklearn.metrics import accuracy_score, f1_score

print(accuracy_score(y_test, pat_cls.predictions))
print(f1_score(y_test, pat_cls.predictions))

0.9930555555555556
0.9948453608247423


In [38]:
df = pd.read_csv('data_sets/iris.data', names=['sepal_length',	'sepal_width',	'petal_length',	'petal_width','species'])
df['species'] = [x == 'Iris-setosa' for x in df['species']]
df.sample(10)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
141,6.9,3.1,5.1,2.3,False
23,5.1,3.3,1.7,0.5,True
124,6.7,3.3,5.7,2.1,False
115,6.4,3.2,5.3,2.3,False
15,5.7,4.4,1.5,0.4,True
50,7.0,3.2,4.7,1.4,False
12,4.8,3.0,1.4,0.1,True
133,6.3,2.8,5.1,1.5,False
29,4.7,3.2,1.6,0.2,True
87,6.3,2.3,4.4,1.3,False


In [39]:
X = df.iloc[:,:-1]
y = df['species']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [40]:
pat_cls = fcalc.classifier.PatternBinaryClassifier(X_train.values, y_train.to_numpy())

In [41]:
pat_cls.predict(X_test.values)

In [42]:
from sklearn.metrics import accuracy_score, f1_score
print("accuracy:",round(accuracy_score(y_test, pat_cls.predictions),4))
print("f1 score:",round(f1_score(y_test, pat_cls.predictions),4))

accuracy: 1.0
f1 score: 1.0


In [45]:
from sklearn.model_selection import KFold
import numpy as np

def cross_validation_fca(model_class, context, labels, k=5):
    kf = KFold(n_splits=k)
    accuracies = []

    for train_index, test_index in kf.split(context):
        X_train, X_test = context[train_index], context[test_index]
        y_train, y_test = labels[train_index], labels[test_index]
        model = model_class(X_train, y_train)
        model.predict(X_test) 
        predictions = model.predictions  
        accuracy = np.mean(predictions == y_test)
        accuracies.append(accuracy)

    return accuracies

accuracies = cross_validation_fca(fcalc.classifier.PatternBinaryClassifier, X.values, y.to_numpy(), k=5)
print("Cross-validation accuracies:", accuracies)
print("Average accuracy:", np.mean(accuracies))


Cross-validation accuracies: [0.9333333333333333, 0.9, 1.0, 0.9666666666666667, 1.0]
Average accuracy: 0.9600000000000002


In [27]:
df = pd.read_csv('data_sets/heart_failure_clinical_records_dataset.csv')
df.sample(5)

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
259,53.0,0,56,0,50,0,308000.0,0.7,135,1,1,231,0
169,70.0,0,835,0,35,1,305000.0,0.8,133,0,0,145,0
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
166,53.0,0,196,0,60,0,220000.0,0.7,133,1,1,134,0
225,75.0,0,675,1,60,0,265000.0,1.4,125,0,0,205,0


In [28]:
X = df.iloc[:,:-1]
y = df['DEATH_EVENT']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [29]:
pat_cls = fcalc.classifier.PatternBinaryClassifier(X_train.values, y_train.to_numpy(), 
                                                   categorical=np.array([1,3,5,9,10]))

In [30]:
pat_cls.predict(X_test.values)

In [33]:
print("accuracy:", round(accuracy_score(y_test, pat_cls.predictions), 4))
print("f1 score:", round(f1_score(y_test, pat_cls.predictions, average='weighted'), 4))

accuracy: 0.6778
f1 score: 0.6652


# Bank 

In [61]:
df = pd.read_csv('data_sets/bank.csv')

In [63]:
categorical_columns = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome', 'deposit']
unique_values = {column: df[column].unique() for column in categorical_columns}
unique_values

{'job': array(['admin.', 'technician', 'services', 'management', 'retired',
        'blue-collar', 'unemployed', 'entrepreneur', 'housemaid',
        'unknown', 'self-employed', 'student'], dtype=object),
 'marital': array(['married', 'single', 'divorced'], dtype=object),
 'education': array(['secondary', 'tertiary', 'primary', 'unknown'], dtype=object),
 'default': array(['no', 'yes'], dtype=object),
 'housing': array(['yes', 'no'], dtype=object),
 'loan': array(['no', 'yes'], dtype=object),
 'contact': array(['unknown', 'cellular', 'telephone'], dtype=object),
 'month': array(['may', 'jun', 'jul', 'aug', 'oct', 'nov', 'dec', 'jan', 'feb',
        'mar', 'apr', 'sep'], dtype=object),
 'poutcome': array(['unknown', 'other', 'failure', 'success'], dtype=object),
 'deposit': array(['yes', 'no'], dtype=object)}

In [65]:
numerical_columns = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
numerical_stats = df[numerical_columns].describe()
numerical_stats

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,11162.0,11162.0,11162.0,11162.0,11162.0,11162.0,11162.0
mean,41.231948,1528.538524,15.658036,371.993818,2.508421,51.330407,0.832557
std,11.913369,3225.413326,8.42074,347.128386,2.722077,108.758282,2.292007
min,18.0,-6847.0,1.0,2.0,1.0,-1.0,0.0
25%,32.0,122.0,8.0,138.0,1.0,-1.0,0.0
50%,39.0,550.0,15.0,255.0,2.0,-1.0,0.0
75%,49.0,1708.0,22.0,496.0,3.0,20.75,1.0
max,95.0,81204.0,31.0,3881.0,63.0,854.0,58.0


In [67]:
def bin_numerical_data(data, column, bins, labels):
    data[column] = pd.cut(data[column], bins=bins, labels=labels, right=False)
    return pd.get_dummies(data, columns=[column])

# Определение диапазонов и меток для каждой числовой переменной
age_bins = [18, 31, 41, 51, 61, 96]
age_labels = ['18-30', '31-40', '41-50', '51-60', '61+']

balance_bins = [-float('inf'), 0, 1001, 3001, 5001, float('inf')]
balance_labels = ['negative', '0-1000', '1001-3000', '3001-5000', '5001+']

day_bins = [1, 11, 21, 32]
day_labels = ['1-10', '11-20', '21-31']

duration_bins = [0, 101, 201, 301, 401, float('inf')]
duration_labels = ['0-100', '101-200', '201-300', '301-400', '401+']

campaign_bins = [1, 3, 5, float('inf')]
campaign_labels = ['1-2', '3-4', '5+']

pdays_bins = [-float('inf'), 0, 101, 201, float('inf')]
pdays_labels = ['not_contacted', '0-100', '101-200', '201+']

previous_bins = [0, 1, 3, float('inf')]
previous_labels = ['0', '1-2', '3+']
for column, bins, labels in zip(numerical_columns, 
                                [age_bins, balance_bins, day_bins, duration_bins, campaign_bins, pdays_bins, previous_bins], 
                                [age_labels, balance_labels, day_labels, duration_labels, campaign_labels, pdays_labels, previous_labels]):
    df = bin_numerical_data(df, column, bins, labels)
df.head()

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,poutcome,deposit,...,campaign_1-2,campaign_3-4,campaign_5+,pdays_not_contacted,pdays_0-100,pdays_101-200,pdays_201+,previous_0,previous_1-2,previous_3+
0,admin.,married,secondary,no,yes,no,unknown,may,unknown,yes,...,True,False,False,True,False,False,False,True,False,False
1,admin.,married,secondary,no,no,no,unknown,may,unknown,yes,...,True,False,False,True,False,False,False,True,False,False
2,technician,married,secondary,no,yes,no,unknown,may,unknown,yes,...,True,False,False,True,False,False,False,True,False,False
3,services,married,secondary,no,yes,no,unknown,may,unknown,yes,...,True,False,False,True,False,False,False,True,False,False
4,admin.,married,tertiary,no,no,no,unknown,may,unknown,yes,...,True,False,False,True,False,False,False,True,False,False


In [68]:
df = pd.get_dummies(df, columns=categorical_columns)
df.head()

Unnamed: 0,age_18-30,age_31-40,age_41-50,age_51-60,age_61+,balance_negative,balance_0-1000,balance_1001-3000,balance_3001-5000,balance_5001+,...,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown,deposit_no,deposit_yes
0,False,False,False,True,False,False,False,True,False,False,...,True,False,False,False,False,False,False,True,False,True
1,False,False,False,True,False,False,True,False,False,False,...,True,False,False,False,False,False,False,True,False,True
2,False,False,True,False,False,False,False,True,False,False,...,True,False,False,False,False,False,False,True,False,True
3,False,False,False,True,False,False,False,True,False,False,...,True,False,False,False,False,False,False,True,False,True
4,False,False,False,True,False,False,True,False,False,False,...,True,False,False,False,False,False,False,True,False,True


In [73]:
df = df.sample(150)

In [74]:
X = df.iloc[:,:-1]
y = df['deposit_yes']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [75]:
pat_cls = fcalc.classifier.BinarizedBinaryClassifier(X_train.values, y_train.to_numpy())

In [76]:
pat_cls.predict(X_test.values)

In [77]:
from sklearn.metrics import accuracy_score, f1_score

print(accuracy_score(y_test, pat_cls.predictions))
print(f1_score(y_test, pat_cls.predictions))

0.8222222222222222
0.8181818181818181


In [78]:
from sklearn.model_selection import KFold
import numpy as np

def cross_validation_fca(model_class, context, labels, k=5, **model_kwargs):
    kf = KFold(n_splits=k)
    accuracies = []

    for train_index, test_index in kf.split(context):
        X_train, X_test = context[train_index], context[test_index]
        y_train, y_test = labels[train_index], labels[test_index]
        model = model_class(X_train, y_train, **model_kwargs)
        model.predict(X_test) 
        predictions = model.predictions  
        accuracy = np.mean(predictions == y_test)
        accuracies.append(accuracy)

    return accuracies

accuracies = cross_validation_fca(fcalc.classifier.BinarizedBinaryClassifier, X.values, y.to_numpy(), k=5, alpha=0.1)
print("Cross-validation accuracies:", accuracies)
print("Average accuracy:", np.mean(accuracies))


Cross-validation accuracies: [0.9333333333333333, 0.7666666666666667, 0.9333333333333333, 0.9333333333333333, 0.9333333333333333]
Average accuracy: 0.9000000000000001


# Winequality

In [105]:
df = pd.read_csv('data_sets/winequality-red.csv')

In [106]:
df['quality'] = (df['quality'] >= 7).astype(bool)

In [107]:
X = df.iloc[:,:-1]
y = df['quality']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [108]:
pat_cls = fcalc.classifier.PatternBinaryClassifier(X_train.values, y_train.to_numpy())

In [109]:
pat_cls.predict(X_test.values)

In [110]:
print("accuracy:", round(accuracy_score(y_test, pat_cls.predictions), 4))
print("f1 score:", round(f1_score(y_test, pat_cls.predictions, average='weighted'), 4))

accuracy: 0.8729
f1 score: 0.8708


In [111]:
from sklearn.model_selection import KFold
import numpy as np

def cross_validation_fca(model_class, context, labels, k=5, **model_kwargs):
    kf = KFold(n_splits=k)
    accuracies = []

    for train_index, test_index in kf.split(context):
        X_train, X_test = context[train_index], context[test_index]
        y_train, y_test = labels[train_index], labels[test_index]
        model = model_class(X_train, y_train, **model_kwargs)
        model.predict(X_test) 
        predictions = model.predictions  
        accuracy = np.mean(predictions == y_test)
        accuracies.append(accuracy)

    return accuracies

accuracies = cross_validation_fca(fcalc.classifier.PatternBinaryClassifier, X.values, y.to_numpy(), k=5, alpha=0.001)
print("Cross-validation accuracies:", accuracies)
print("Average accuracy:", np.mean(accuracies))


Cross-validation accuracies: [0.8375, 0.715625, 0.809375, 0.653125, 0.7429467084639498]
Average accuracy: 0.75171434169279


# Diabetes

In [92]:
df = pd.read_csv('data_sets/diabetes.csv')

In [94]:
df['Outcome'] = df['Outcome'].astype(bool)

In [98]:
X = df.iloc[:, :-1]
y = df['Outcome']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [99]:
pat_cls = fcalc.classifier.PatternBinaryClassifier(X_train.values, y_train.to_numpy())

In [100]:
pat_cls.predict(X_test.values)

In [101]:
print("accuracy:", round(accuracy_score(y_test, pat_cls.predictions), 4))
print("f1 score:", round(f1_score(y_test, pat_cls.predictions, average='weighted'), 4))

accuracy: 0.697
f1 score: 0.7023


In [104]:
from sklearn.model_selection import KFold
import numpy as np

def cross_validation_fca(model_class, context, labels, k=5, **model_kwargs):
    kf = KFold(n_splits=k)
    accuracies = []

    for train_index, test_index in kf.split(context):
        X_train, X_test = context[train_index], context[test_index]
        y_train, y_test = labels[train_index], labels[test_index]
        model = model_class(X_train, y_train, **model_kwargs)
        model.predict(X_test) 
        predictions = model.predictions  
        accuracy = np.mean(predictions == y_test)
        accuracies.append(accuracy)

    return accuracies

accuracies = cross_validation_fca(fcalc.classifier.PatternBinaryClassifier, X.values, y.to_numpy(), k=5, alpha=0.001)
print("Cross-validation accuracies:", accuracies)
print("Average accuracy:", np.mean(accuracies))


Cross-validation accuracies: [0.7597402597402597, 0.6883116883116883, 0.7792207792207793, 0.8235294117647058, 0.7908496732026143]
Average accuracy: 0.7683303624480095
