In [53]:
import pandas as pd
import numpy as np
import ast
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules


In [54]:
df = pd.read_csv("omniart-paintings-filtered-clean.csv")

In [55]:
import random

# Defina quantos clusters fictícios você quer simular
num_fake_clusters = 10
fake_cluster_labels = [f"color_cluster_{i}" for i in range(num_fake_clusters)]

def random_clusters_for_painting():
    # Sorteia entre 1 e 3 clusters aleatórios para cada pintura
    n = random.randint(1, 3)
    return random.sample(fake_cluster_labels, n)

# Cria a coluna com listas de clusters aleatórios
df['painting_cluster_labels_list'] = df.apply(lambda row: random_clusters_for_painting(), axis=1)
df.to_csv("omniart-paintings-with-fake-clusters.csv", index=False)

In [56]:
# --- Data Cleaning and Preprocessing ---
CLUSTER_COLUMN_NAME = 'painting_cluster_labels_list'

required_columns = [CLUSTER_COLUMN_NAME, 'school', 'creation_year']

df_clean = df.dropna(subset=required_columns).copy()
print(f"Linhas após remover valores nulos: {df_clean.shape[0]}")

try:
    df_clean[CLUSTER_COLUMN_NAME] = df_clean[CLUSTER_COLUMN_NAME].apply(ast.literal_eval)
    print("\nColuna de clusters convertida de string para lista com sucesso.")
except (ValueError, SyntaxError) as e:
    print(f"\nErro ao converter a coluna de clusters: {e}")
    print("Verifique se a coluna contém strings de listas Python válidas.")

print(df_clean[[CLUSTER_COLUMN_NAME, CLUSTER_COLUMN_NAME, 'school', 'creation_year']].head())

Linhas após remover valores nulos: 264247

Erro ao converter a coluna de clusters: malformed node or string: ['color_cluster_3']
Verifique se a coluna contém strings de listas Python válidas.
  painting_cluster_labels_list painting_cluster_labels_list   school  \
0            [color_cluster_3]            [color_cluster_3]   modern   
1            [color_cluster_4]            [color_cluster_4]    Italy   
2            [color_cluster_0]            [color_cluster_0]    Italy   
3            [color_cluster_6]            [color_cluster_6]   modern   
4            [color_cluster_6]            [color_cluster_6]   modern   

   creation_year  
0         2007.0  
1         1366.0  
2         1754.0  
3         2010.0  
4         2009.0  


In [57]:
# --- Feature Engineering (Creating Items) ---

# 1. Criar item para a escola artística
df_clean['school_item'] = 'school_' + df_clean['school'].str.lower().str.replace(' ', '_', regex=False)

# 2. Criar item para o século
def year_to_century_item(year):
    if pd.isna(year) or not str(year).isdigit():
        return None
    return f"century_{int(year) // 100 + 1}"

df_clean['century_item'] = df_clean['creation_year'].apply(year_to_century_item)

print("Colunas 'school_item' e 'century_item' criadas.")
print("\nExemplo do DataFrame com os novos itens:")
print(df_clean[['school', 'school_item', 'creation_year', 'century_item']].head())

Colunas 'school_item' e 'century_item' criadas.

Exemplo do DataFrame com os novos itens:
    school     school_item  creation_year century_item
0   modern  school__modern         2007.0         None
1    Italy   school__italy         1366.0         None
2    Italy   school__italy         1754.0         None
3   modern  school__modern         2010.0         None
4   modern  school__modern         2009.0         None


In [58]:
# --- Build Transactions ---

# Criar a lista de transações combinando os itens de cada pintura
transactions = []
for index, row in df_clean.iterrows():
    transaction = row[PARSED_CLUSTER_COLUMN_NAME]
    transaction.append(row['school_item'])
    if row['century_item']:
        transaction.append(row['century_item'])
    transactions.append(transaction)

print("Exemplo das 5 primeiras transações:")
for i in range(min(5, len(transactions))):
    print(transactions[i])

Exemplo das 5 primeiras transações:
['color_cluster_3', 'school__modern']
['color_cluster_4', 'school__italy']
['color_cluster_0', 'school__italy']
['color_cluster_6', 'school__modern']
['color_cluster_6', 'school__modern']


In [59]:
# --- Find Frequent Itemsets (Apriori Algorithm) ---

te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
df_encoded = pd.DataFrame(te_ary, columns=te.columns_)

frequent_itemsets = apriori(df_encoded, min_support=0.01, use_colnames=True)

frequent_itemsets = frequent_itemsets.sort_values(by='support', ascending=False)

print(f"Encontrados {len(frequent_itemsets)} itemsets frequentes com suporte mínimo de 0.01")
print("\n15 itemsets mais frequentes:")
print(frequent_itemsets.head(15))

Encontrados 129 itemsets frequentes com suporte mínimo de 0.01

15 itemsets mais frequentes:
     support                           itemsets
13  0.696950                   (school__modern)
9   0.200956                  (color_cluster_9)
0   0.200736                  (color_cluster_0)
5   0.200653                  (color_cluster_5)
8   0.200566                  (color_cluster_8)
7   0.199946                  (color_cluster_7)
3   0.199919                  (color_cluster_3)
2   0.199529                  (color_cluster_2)
4   0.199257                  (color_cluster_4)
1   0.199230                  (color_cluster_1)
6   0.198205                  (color_cluster_6)
15  0.177334                  (school__unknown)
82  0.141175  (color_cluster_9, school__modern)
26  0.140429  (school__modern, color_cluster_0)
68  0.139551  (school__modern, color_cluster_5)


In [60]:
# --- Generate and Analyze Association Rules ---

# Gerar regras a partir dos itemsets frequentes
# Usamos a métrica 'lift' e um limiar de 1.0, pois regras com lift > 1 são geralmente consideradas interessantes.
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.0)

# Filtrar e ordenar as regras para melhor visualização
# Ordenamos por 'lift' e 'confidence' para ver as regras mais fortes primeiro.
sorted_rules = rules.sort_values(by=['lift', 'confidence'], ascending=[False, False])

print(f"Geradas {len(sorted_rules)} regras de associação com lift >= 1.0")
print("\nAs 20 regras mais fortes encontradas (ordenadas por Lift e Confiança):")

# Exibir as colunas mais importantes
print(sorted_rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']].head(20))

Geradas 68 regras de associação com lift >= 1.0

As 20 regras mais fortes encontradas (ordenadas por Lift e Confiança):
                           antecedents                         consequents  \
63                     (school__italy)                   (color_cluster_2)   
62                   (color_cluster_2)                     (school__italy)   
22  (color_cluster_9, color_cluster_4)                    (school__modern)   
23                    (school__modern)  (color_cluster_9, color_cluster_4)   
11                   (school__unknown)                   (color_cluster_7)   
10                   (color_cluster_7)                   (school__unknown)   
65                     (school__italy)                   (color_cluster_0)   
64                   (color_cluster_0)                     (school__italy)   
46  (color_cluster_6, color_cluster_5)                    (school__modern)   
47                    (school__modern)  (color_cluster_6, color_cluster_5)   
67                    

In [61]:
# --- Visualização e Filtragem Avançada das Regras ---

viz_rules = sorted_rules.copy()

viz_rules['antecedents'] = viz_rules['antecedents'].apply(lambda a: ', '.join(list(a)))
viz_rules['consequents'] = viz_rules['consequents'].apply(lambda c: ', '.join(list(c)))

print("--- Top 20 Regras (Formato Limpo) ---")
display(viz_rules.head(20))

--- Top 20 Regras (Formato Limpo) ---


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
63,school__italy,color_cluster_2,0.049329,0.199529,0.010097,0.20468,1.025813,1.0,0.000254,1.006476,0.026469,0.042287,0.006434,0.127641
62,color_cluster_2,school__italy,0.199529,0.049329,0.010097,0.050602,1.025813,1.0,0.000254,1.001341,0.031436,0.042287,0.001339,0.127641
22,"color_cluster_9, color_cluster_4",school__modern,0.029957,0.69695,0.021276,0.710207,1.019021,1.0,0.000397,1.045746,0.019243,0.030151,0.043745,0.370367
23,school__modern,"color_cluster_9, color_cluster_4",0.69695,0.029957,0.021276,0.030527,1.019021,1.0,0.000397,1.000588,0.061595,0.030151,0.000587,0.370367
11,school__unknown,color_cluster_7,0.177334,0.199946,0.036008,0.203052,1.015535,1.0,0.000551,1.003898,0.018595,0.105511,0.003882,0.19157
10,color_cluster_7,school__unknown,0.199946,0.177334,0.036008,0.180089,1.015535,1.0,0.000551,1.00336,0.01912,0.105511,0.003349,0.19157
65,school__italy,color_cluster_0,0.049329,0.200736,0.010055,0.203836,1.01544,1.0,0.000153,1.003893,0.015994,0.041894,0.003878,0.126963
64,color_cluster_0,school__italy,0.200736,0.049329,0.010055,0.05009,1.01544,1.0,0.000153,1.000802,0.019024,0.041894,0.000801,0.126963
46,"color_cluster_6, color_cluster_5",school__modern,0.029283,0.69695,0.020704,0.70703,1.014463,1.0,0.000295,1.034406,0.014687,0.029346,0.033262,0.368368
47,school__modern,"color_cluster_6, color_cluster_5",0.69695,0.029283,0.020704,0.029707,1.014463,1.0,0.000295,1.000436,0.047045,0.029346,0.000436,0.368368
