In [1]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
from sklearn.preprocessing import KBinsDiscretizer
import numpy as np
from sklearn.preprocessing import StandardScaler

# Relative location of the merged CSV file that is loaded with `pd.read_csv` below.
path = "../../dane/8CPU_20RAM/3600s/2repl/merged.csv"

### Wczytanie danych

In [2]:
# Read the CSV at `path` and remove, in a single call, the columns that are
# excluded from the rest of the analysis.
unused_columns = [
    'replicaId',
    "applicationTime_methods",
    "databaseTime_methods",
    "databaseTime_trading",
    "applicationTime_trading",
    "timestamp",
]
data = pd.read_csv(path).drop(columns=unused_columns)

### Kodowanie endpointów i zaokrąglenie wartości

In [3]:
# Endpoint names in the order of the integer codes (0, 1, 2, ...) assigned to them.
endpoint_names = (
    'do-register',
    'add-sell-offer',
    'add-buy-offer',
    'get-stock-data',
    'add-company',
    'get-stock-users-and-companies',
)
endpoint_map = {name: code for code, name in enumerate(endpoint_names)}

# Substitute every endpoint name listed in the map with its integer code.
data['endpointUrl_methods'] = data['endpointUrl_methods'].replace(endpoint_map)

# Round the CPU and memory usage columns to two decimal places.
for usage_column in (
    'cpuUsage_stock',
    'memoryUsage_stock',
    'cpuUsage_traffic',
    'memoryUsage_traffic',
):
    data[usage_column] = data[usage_column].round(2)

### Dyskretyzacja wartości ciągłych

In [4]:
# Discretize every column of `data` with a k-means binning strategy (n_bins=5)
# and dense one-hot encoding, so each bin becomes its own indicator column.
# NOTE(review): `data` still contains the integer-coded `endpointUrl_methods`
# column, so the endpoint codes are binned like a continuous quantity —
# confirm this is intended.
transformer = KBinsDiscretizer(n_bins=5, encode='onehot-dense', strategy='kmeans')
onehot_bins = transformer.fit_transform(data)

# Store the indicators compactly as int8 in a new frame (default integer
# column labels and a fresh default index).
df_discret = pd.DataFrame(onehot_bins.astype(np.int8))

### Utworzenie nowych nazw cech

In [5]:
# Label each indicator column as "<feature>: <lower edge> - <upper edge>",
# walking the features in order and pairing consecutive bin edges of each one.
bin_labels = []
for feature, edges in zip(data.columns, transformer.bin_edges_):
    for lower, upper in zip(edges[:-1], edges[1:]):
        bin_labels.append(f"{feature}: {round(lower, 4)} - {round(upper, 4)}")
df_discret.columns = bin_labels

# Drop discretized features with zero variance.
has_variance = df_discret.var() != 0
df_discret = df_discret.loc[:, has_variance]

### Apriori

In [7]:

# Mine frequent itemsets (support >= 0.1, at most 4 items per itemset) from the
# one-hot bin indicators. mlxtend's apriori expects a boolean frame and emits a
# deprecation warning (and takes a slower path) for other dtypes, so the 0/1
# indicators are cast to bool for this call only; the itemsets are unchanged.
frequent_itemsets = apriori(
    df_discret.astype(bool), min_support=0.1, use_colnames=True, max_len=4
)

# Derive association rules using lift as the filtering metric (library-default
# threshold) and order them from the highest to the lowest lift.
test = association_rules(frequent_itemsets, metric='lift').sort_values(by='lift', ascending=False)

# Persist all rules to an Excel workbook without the index column.
test.to_excel("test.xlsx", index=False)

# Preview the strongest rules. A bare expression is rendered by the notebook
# only when it is the last statement of the cell, so it is placed at the end.
test.head(10)

