In [95]:
import pandas as pd
import numpy as np
import skfuzzy as fuzz
import sklearn as sklearn
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler


In [96]:
def fuzzyEvaluator(data):
    # Load your data and select relevant columns

    selected_columns = [
        'BIA-BIA_ICW',
        'BIA-BIA_TBW',
        'BIA-BIA_FFM',
        'BIA-BIA_BMR',
        'BIA-BIA_LST'
    ]

    # selected_columns = [
    #     'BIA-BIA_ICW',
    #     'BIA-BIA_TBW',
    #     'BIA-BIA_FFM',
    #     'BIA-BIA_BMR',
    #     'BIA-BIA_LST',
    #     'Physical-Weight',
    #     'Physical-Height',
    #     'Physical-BMI',
    #     'BIA-BIA_BMI',
    #     'Basic_Demos-Age'
    # ]

    
    subset_df = data[selected_columns].copy()

    # Impute missing values and scale the data
    imputer = SimpleImputer(strategy='median')
    subset_df_imputed = imputer.fit_transform(subset_df)
    scaler = StandardScaler()
    subset_df_scaled = scaler.fit_transform(subset_df_imputed)

    # Transpose data for skfuzzy (expects features as rows)
    subset_df_scaled_T = subset_df_scaled.T

    # Perform fuzzy C-means clustering
    n_clusters = 4
    cntr, u, _, _, _, _, _ = fuzz.cluster.cmeans(
        subset_df_scaled_T, n_clusters, m=2, error=0.005, maxiter=1000, init=None, seed=42
    )

    # Assign each sample to the cluster with the highest membership probability
    fuzzy_clusters = np.argmax(u, axis=0)
    data['fuzzy_cluster'] = fuzzy_clusters

    # # View clustering results
    # print(train_df[['id', 'fuzzy_cluster'] + selected_columns].head(20))

    cluster_result = data[['id', 'fuzzy_cluster']]

    # Display the first few rows to verify
    print(cluster_result.head())
    return cluster_result

In [97]:
train_df = pd.read_csv('train.csv')
train_df = train_df.dropna(subset=['PCIAT-PCIAT_Total'])
PCIAT_Values = train_df
cluster_result = fuzzyEvaluator(train_df)

         id  fuzzy_cluster
0  00008ff9              2
1  000fd460              2
2  00105258              0
3  00115b9f              0
5  001f3379              0


In [98]:
# PCIAT_Values = train_df['PCIAT-PCIAT_Total']


bins = [-1, 30, 49, 79, 100]
labels = [0, 1, 2, 3]

# Categorize and assign back to the original column
PCIAT_Values['PCIAT-PCIAT_Total'] = pd.cut(PCIAT_Values['PCIAT-PCIAT_Total'], bins=bins, labels=labels, include_lowest=True)
# print(PCIAT_Values['PCIAT-PCIAT_Total'])

# print(cluster_result.shape)
# print(PCIAT_Values.shape)

# create a new column match in cluster_result dataframe called 'match' if row in cluster_result['fuzzy_cluster'] == PCIAT_Values['PCIAT-PCIAT_Total']
cluster_result['match'] = np.where(cluster_result['fuzzy_cluster'] == PCIAT_Values['PCIAT-PCIAT_Total'], 1, 0)
# print(cluster_result['match'])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cluster_result['match'] = np.where(cluster_result['fuzzy_cluster'] == PCIAT_Values['PCIAT-PCIAT_Total'], 1, 0)


In [99]:
print("number correct", cluster_result['match'].sum())
print("total valid samples", cluster_result.shape[0])
print("testing accuracy", cluster_result['match'].sum()/cluster_result.shape[0])

number correct 1086
total valid samples 2736
testing accuracy 0.3969298245614035


In [79]:
test_df = pd.read_csv('test.csv')
fuzzyEvaluator(test_df)

         id  fuzzy_cluster
0  00008ff9              0
1  000fd460              0
2  00105258              3
3  00115b9f              3
4  0016bb22              3


Unnamed: 0,id,fuzzy_cluster
0,00008ff9,0
1,000fd460,0
2,00105258,3
3,00115b9f,3
4,0016bb22,3
5,001f3379,2
6,0038ba98,3
7,0068a485,3
8,0069fbed,3
9,0083e397,3
