In [8]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import dash
from dash import dcc, html
from dash.dependencies import Input, Output
import plotly.express as px

# Load and preprocess the dataset.
# NOTE: the path below is an absolute, machine-specific location; adjust it
# when running this script on another machine.
df = pd.read_csv(r'C:/Users/chara/Downloads/ifood_df.csv')

# Remove duplicate rows
df = df.drop_duplicates()

# Fix negative values in 'MntRegularProds' by flooring them at zero.
# Series.clip is vectorised, so it avoids a Python-level call per row.
df['MntRegularProds'] = df['MntRegularProds'].clip(lower=0)

# Add Total Money Spent: row-wise sum of the six product-category columns.
# This overwrites any 'MntTotal' column already present in the CSV.
df['MntTotal'] = df[['MntWines', 'MntFruits', 'MntMeatProducts',
                     'MntFishProducts', 'MntSweetProducts', 'MntGoldProds']].sum(axis=1)

# Select features for clustering (also reused as the dashboard axis choices
# and as the scatter plot's hover columns)
features = ['Income', 'Kidhome', 'Teenhome', 'Recency',
            'MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts',
            'MntSweetProducts', 'MntGoldProds', 'NumDealsPurchases',
            'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases',
            'NumWebVisitsMonth', 'MntTotal']

# Normalize the features so that large-valued columns such as 'Income' do not
# dominate the Euclidean distances used by K-means
scaler = StandardScaler()
normalized_features = scaler.fit_transform(df[features])

# Perform K-means clustering.
# n_init is given explicitly because its default differs between scikit-learn
# releases (10 vs. 'auto'); pinning it keeps cluster assignments reproducible
# and avoids the FutureWarning raised when it is omitted.
# NOTE: on Windows with MKL, scikit-learn may warn about a KMeans memory leak;
# as that warning suggests, set the OMP_NUM_THREADS environment variable
# before starting Python to avoid it.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
df['Cluster'] = kmeans.fit_predict(normalized_features)

# Start Dash app
app = dash.Dash(__name__)
app.title = 'Customer Segmentation Dashboard'


def _axis_dropdown(component_id, default_column):
    """Return a dropdown offering every clustering feature as an axis choice."""
    choices = []
    for column in features:
        choices.append({'label': column, 'value': column})
    return dcc.Dropdown(id=component_id, options=choices, value=default_column)


# Layout: the individual pieces are named first, then stacked top to bottom
_page_heading = html.H1(
    'Customer Segmentation Analysis',
    style={'textAlign': 'center'},
)

_scatter_panel = dcc.Graph(id='cluster-scatter')

_axis_controls = html.Div(
    [
        html.Label('Select X-axis:'),
        _axis_dropdown('x-axis', 'Income'),
        html.Label('Select Y-axis:'),
        _axis_dropdown('y-axis', 'MntTotal'),
    ],
    style={'width': '48%', 'display': 'inline-block'},
)

_stats_panel = html.Div([dcc.Graph(id='cluster-stats')])

app.layout = html.Div([
    _page_heading,
    _scatter_panel,
    _axis_controls,
    _stats_panel,
])

# Callbacks
@app.callback(
    Output('cluster-scatter', 'figure'),
    [Input('x-axis', 'value'),
     Input('y-axis', 'value')]
)
def update_scatter(x_col, y_col):
    """Redraw the cluster scatter plot for the selected axes.

    Args:
        x_col: Column name chosen in the 'x-axis' dropdown, or None when the
            dropdown has been cleared.
        y_col: Column name chosen in the 'y-axis' dropdown, or None when the
            dropdown has been cleared.

    Returns:
        A Plotly figure with one point per row of ``df``, coloured by cluster,
        or ``dash.no_update`` when either axis is unset.
    """
    # A cleared dropdown yields None; keep the current figure instead of
    # drawing a chart against a missing axis with "None" in its title.
    if x_col is None or y_col is None:
        return dash.no_update

    # Cluster ids are integers, which Plotly Express would map onto a
    # continuous colour scale. Casting them to strings gives each cluster its
    # own discrete colour and legend entry.
    cluster_ids = sorted(df['Cluster'].unique())
    plot_df = df.assign(Cluster=df['Cluster'].astype(str))

    fig = px.scatter(
        plot_df, x=x_col, y=y_col, color='Cluster',
        # Keep legend entries in numeric cluster order.
        category_orders={'Cluster': [str(cluster_id) for cluster_id in cluster_ids]},
        title=f'Clusters by {x_col} and {y_col}',
        labels={'Cluster': 'Cluster'},
        hover_data=features
    )
    return fig

@app.callback(
    Output('cluster-stats', 'figure'),
    [Input('x-axis', 'value'),
     Input('y-axis', 'value')]
)
def update_cluster_stats(x_col, y_col):
    """Draw the average total spending of each cluster as a bar chart.

    Args:
        x_col: Current 'x-axis' dropdown value. Unused; it only serves as a
            trigger for the callback.
        y_col: Current 'y-axis' dropdown value. Unused; it only serves as a
            trigger for the callback.

    Returns:
        A Plotly bar figure with one bar per cluster showing mean 'MntTotal'.
    """
    # Aggregate only the column that is plotted. Averaging every column is
    # wasted work and raises a TypeError on pandas >= 2.0 if the frame holds
    # any non-numeric column.
    cluster_summary = df.groupby('Cluster')['MntTotal'].mean().reset_index()
    fig = px.bar(
        cluster_summary, x='Cluster', y='MntTotal',
        title='Average Total Spending by Cluster',
        labels={'MntTotal': 'Average Spending'}
    )
    return fig

# Run the app
if __name__ == '__main__':
    # Newer Dash releases expose Dash.run and deprecate run_server; prefer
    # run when it exists and fall back to run_server on older releases.
    # The `or` short-circuits, so run_server is never touched when run exists.
    start_server = getattr(app, 'run', None) or app.run_server
    start_server(debug=True)



KMeans is known to have a memory leak on Windows with MKL when there are fewer chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=8.

