In [None]:
# The efficiency column will be created here --> the feature column will be created later on

In [None]:
# Preview the first rows of df to check columns and sample values.
df.head()

📌 1. Basic Summary & Distributions --> Numeric cols

Why: Understand ranges, outliers, and skewed variables.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max); with default
# arguments describe() reports on the numeric columns.
df.describe()

In [None]:
# Inspect the rows whose income is below -20,000 (strongly negative values).
df.loc[df['income'] < -20000]

In [None]:
# Histogram (with KDE overlay) of the income column.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(df['income'], bins=50, kde=True, ax=ax)
ax.set_title('Distribution of income')
ax.set_xlabel('income')
ax.set_ylabel('Frequency')
fig.tight_layout()
plt.show()

# Open question: what does a negative income mean? Does it balance out with
# the positive income once the data is aggregated?


In [None]:
# Histogram (with KDE overlay) of the number_of_orders column.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(df['number_of_orders'], bins=50, kde=True, ax=ax)
ax.set(
    title='Distribution of number_of_orders',
    xlabel='number_of_orders',
    ylabel='Frequency',
)
fig.tight_layout()
plt.show()



In [None]:
# Histogram (with KDE overlay) of the median_ticket column.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(df['median_ticket'], bins=50, kde=True, ax=ax)
ax.set_title('Distribution of median ticket')
ax.set_xlabel('median_ticket')
ax.set_ylabel('Frequency')
fig.tight_layout()
plt.show()

# Same pattern as with income --> not sure whether this balances out later on.

In [None]:
# Histogram (with KDE overlay) of the prom_contacts_month column.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(df['prom_contacts_month'], bins=50, kde=True, ax=ax)
ax.set(
    title='Distribution of prom_contacts_month',
    xlabel='prom_contacts_month',
    ylabel='Frequency',
)
fig.tight_layout()
plt.show()

# Most contacts fall between 0 and 5 per month --> higher values might be outliers?

In [None]:
# Histogram (with KDE overlay) of the tel_contacts_month column.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(df['tel_contacts_month'], bins=50, kde=True, ax=ax)
ax.set_title('Distribution of tel_contacts_month')
ax.set_xlabel('tel_contacts_month')
ax.set_ylabel('Frequency')
fig.tight_layout()
plt.show()

📌 2. Efficiency Analysis

Why: Your objective includes increasing promotor efficiency.

# Efficiency = orders per promotor contact, computed row by row.
# Rows with zero promotor contacts yield +/-inf (or NaN for 0/0).
df['efficiency'] = df['number_of_orders'] / df['prom_contacts_month']

# Handle division by zero: mark infinite ratios as missing.
# float('nan') is used rather than None: passing None as the replacement value
# is version-dependent in pandas (older releases pad-fill from the previous
# row, newer ones insert None objects and can turn the column into object
# dtype), whereas NaN keeps the column numeric.
df['efficiency'] = df['efficiency'].replace(
    [float('inf'), -float('inf')], float('nan')
)

# Visualize
plt.figure(figsize=(6,4))
sns.histplot(df['efficiency'], bins=50, kde=True)
plt.title('Efficiency (Orders per Promotor Contact)')
plt.xlabel('Efficiency')
plt.ylabel('Frequency')
plt.tight_layout()
plt.show()

# In theory, the number of contacts and orders should be aligned --> efficiency should equal 1
## Less than 1 (most cases) --> fewer orders than contacts, i.e. team inefficiency

# Rows with missing (NaN) efficiency compare as False and are therefore not counted here.
df[df['efficiency'] < 1].shape  # Inefficient clients

📌 3. Low Ticket Clients

Why: Ticket < €80 is flagged as risky in the business rules

In [None]:
# Clients whose median ticket is below the 80 threshold.
low_ticket_clients = df.loc[df['median_ticket'] < 80]
print(f"Number of clients with median ticket < 80: {low_ticket_clients.shape[0]}")

# Distribution of ticket sizes with the threshold drawn as a dashed red line.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(df['median_ticket'], bins=50, kde=True, ax=ax)
ax.axvline(80, color='red', linestyle='--')
ax.set(
    title='Median Ticket per Client',
    xlabel='Median Ticket (€)',
    ylabel='Number of Clients',
)
fig.tight_layout()
plt.show()


📌 4. Contact vs Orders Alignment

Why: Misalignment = inefficiency, key optimization goal.

In [None]:
# Scatter of orders against promotor contacts, coloured by channel.
fig, ax = plt.subplots(figsize=(7, 5))
sns.scatterplot(
    data=df,
    x='prom_contacts_month',
    y='number_of_orders',
    hue='channel',
    alpha=0.6,
    ax=ax,
)

# Dashed y = x reference line, drawn from the origin to the largest contact count.
max_contacts = df['prom_contacts_month'].max()
ax.plot([0, max_contacts], [0, max_contacts], '--', color='grey')

ax.set_title('Orders vs Promotor Contacts')
ax.set_xlabel('Promotor Contacts per Month')
ax.set_ylabel('Number of Orders per Month')
fig.tight_layout()
plt.show()


This shows you:

How many clients are below the diagonal = more contacts than orders = inefficiency.

In [None]:
# Boxplot of income to spot outliers.
fig, ax = plt.subplots(figsize=(6, 4))
sns.boxplot(data=df, x='income', ax=ax)
ax.set_title('Boxplot of Income')
fig.tight_layout()
plt.show()


In [None]:
# Question --> are you sure about the aggregation? Why would you sum the monthly tel/prom contacts? Aren't they already monthly values?

In [None]:
# Preview the first rows of df again.
df.head()

In [None]:
# Display the rows ordered by client and then by date (result is not stored).
df.sort_values(['client_id', 'date'])

aggregation

In [None]:
# 📦 Pascual Capstone: Aggregation + Feature Creation
# Step-by-step structured flow from clean daily data to aggregated client-level dataset

import pandas as pd

# --- STEP 0: Load Cleaned Daily-Level Data ---
df = pd.read_csv('clean_orders_data.csv', parse_dates=['date'])

# --- STEP 1: Create `frequency` (median orders per month per client) ---
# Calendar month of each row, used as the grouping key for monthly totals.
df['month'] = df['date'].dt.to_period('M')

# Total orders per client and month.
monthly_orders = (
    df.groupby(['client_id', 'month'])['number_of_orders']
    .sum()
    .reset_index()
)

# Median of those monthly totals per client. Only months that have at least
# one row for the client contribute to the median.
frequency_df = (
    monthly_orders.groupby('client_id')['number_of_orders']
    .median()
    .reset_index()
    .rename(columns={'number_of_orders': 'frequency'})
)

# --- STEP 1.5: Validate that 'channel' and 'city' are unique per client ---
# Step 2 takes the 'first' channel/city per client, which is only meaningful
# if each client has exactly one of each; the counts printed here should be 0.
multi_channel = df.groupby('client_id')['channel'].nunique()
print("Clients with >1 unique channel:", (multi_channel > 1).sum())

multi_city = df.groupby('client_id')['city'].nunique()
print("Clients with >1 unique city:", (multi_city > 1).sum())

# Optional: Investigate any client_ids that violate uniqueness
multi_channel_clients = multi_channel[multi_channel > 1].index.tolist()
multi_city_clients = multi_city[multi_city > 1].index.tolist()

# Uncomment to inspect those rows
# print(df[df['client_id'].isin(multi_channel_clients)])
# print(df[df['client_id'].isin(multi_city_clients)])

# --- STEP 2: Aggregate Remaining Data Per Client ---
# Notes:
# - 'income', 'volume', 'number_of_orders' -> summed: represents cumulative behavior
# - 'prom_contacts_month', 'tel_contacts_month' -> averaged: avoid inflation from repetition
# - 'median_ticket' -> median to reduce outlier skew
# - 'channel', 'city' -> assumed to be static, validated above

client_df = df.groupby('client_id').agg({
    'income': 'sum',
    'volume': 'sum',  # volume = total weight/space across orders, relevant for logistics
    'number_of_orders': 'sum',
    'prom_contacts_month': 'mean',
    'tel_contacts_month': 'mean',
    'median_ticket': 'median',
    'channel': 'first',
    'city': 'first'
}).reset_index()

# --- STEP 3: Merge Frequency ---
# Left join keeps every aggregated client; frequency_df has one row per client_id.
client_df = client_df.merge(frequency_df, on='client_id', how='left')

# --- STEP 4: Create Efficiency Feature ---
# NOTE(review): the numerator is the client's total orders over the whole
# dataset while the denominator is the mean *monthly* promotor contacts, so
# the ratio grows with the length of the observation window. Confirm whether
# 'frequency' (monthly orders) was the intended numerator.
client_df['efficiency'] = client_df['number_of_orders'] / client_df['prom_contacts_month']

# --- STEP 5: Handle Division by Zero or NaNs ---
# Infinite ratios (zero contacts) are first marked as NaN, then all missing
# values are set to 0. float('nan') is used rather than None: passing None as
# the replacement value is version-dependent in pandas (older releases pad-fill
# from the previous row, newer ones insert None objects and can turn the
# column into object dtype), whereas NaN keeps the column float.
client_df['efficiency'] = (
    client_df['efficiency']
    .replace([float('inf'), -float('inf')], float('nan'))
    .fillna(0)
)

# --- STEP 6: Save Aggregated Dataset ---
client_df.to_csv('aggregated_client_data.csv', index=False)

print("\u2705 Aggregated client dataset saved as 'aggregated_client_data.csv'")
