In [None]:
#here will create efficiency column --> will create feature column later on

In [None]:
# Preview the first rows of df to check columns and value formats.
df.head()

📌 1. Basic Summary & Distributions --> Numeric cols

Why: Understand ranges, outliers, and skewed variables.

In [None]:
# Summary statistics of df's columns (count, mean, std, min, quartiles, max),
# used to spot ranges and outliers before plotting.
df.describe()

In [None]:
# Inspect the rows whose income is below -20,000 (extreme negative values).
df[df['income'] < -20000]

In [None]:
# Histogram of `income` (50 bins) with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(df['income'], bins=50, kde=True, ax=ax)
ax.set_title('Distribution of income')
ax.set_xlabel('income')
ax.set_ylabel('Frequency')
fig.tight_layout()
plt.show()

# Open question: what does a negative income mean, and does it net out
# against positive income once rows are aggregated?


In [None]:
# Histogram of `number_of_orders` (50 bins) with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(df['number_of_orders'], bins=50, kde=True, ax=ax)
ax.set_title('Distribution of number_of_orders')
ax.set_xlabel('number_of_orders')
ax.set_ylabel('Frequency')
fig.tight_layout()
plt.show()



In [None]:
# Histogram of `median_ticket` (50 bins) with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(df['median_ticket'], bins=50, kde=True, ax=ax)
ax.set_title('Distribution of median ticket')
ax.set_xlabel('median_ticket')
ax.set_ylabel('Frequency')
fig.tight_layout()
plt.show()

# Same pattern as income (negative values) --> unclear whether it nets out
# after aggregation.

In [None]:
# Histogram of `prom_contacts_month` (50 bins) with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(df['prom_contacts_month'], bins=50, kde=True, ax=ax)
ax.set_title('Distribution of prom_contacts_month')
ax.set_xlabel('prom_contacts_month')
ax.set_ylabel('Frequency')
fig.tight_layout()
plt.show()

# Most contacts fall between 0 and 5 per month --> larger values may be outliers.

In [None]:
# Histogram of `tel_contacts_month` (50 bins) with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(df['tel_contacts_month'], bins=50, kde=True, ax=ax)
ax.set_title('Distribution of tel_contacts_month')
ax.set_xlabel('tel_contacts_month')
ax.set_ylabel('Frequency')
fig.tight_layout()
plt.show()

📌 2. Efficiency Analysis

Why: Your objective includes increasing promotor efficiency.

# Orders per promotor contact, computed row by row. A zero contact count
# produces +/-inf (or NaN when the order count is zero as well).
df['efficiency'] = df['number_of_orders'] / df['prom_contacts_month']

# Handle division by zero: turn the infinities into NaN.
# NaN is passed explicitly rather than None because `replace(..., None)` is
# version-dependent in pandas: older releases forward-fill from the previous
# row, newer ones insert None and turn the column into object dtype.
df['efficiency'] = df['efficiency'].replace([float('inf'), -float('inf')], float('nan'))

# Visualize
plt.figure(figsize=(6,4))
sns.histplot(df['efficiency'], bins=50, kde=True)
plt.title('Efficiency (Orders per Promotor Contact)')
plt.xlabel('Efficiency')
plt.ylabel('Frequency')
plt.tight_layout()
plt.show()

# In theory, the number of contacts and orders should be aligned --> efficiency should equal 1
## Less than 1 (most cases) --> fewer orders than contacts, i.e. team inefficiency

df[df['efficiency'] < 1].shape  # Inefficient rows (NaN efficiency is excluded by the comparison)

📌 3. Low Ticket Clients

Why: Ticket < €80 is flagged as risky in the business rules

In [None]:
# Rows whose median ticket is under the 80 EUR threshold.
# NOTE(review): len() below counts rows of df; if df holds several rows per
# client, this is a row count rather than a client count -- confirm.
below_threshold = df['median_ticket'].lt(80)
low_ticket_clients = df.loc[below_threshold]
print(f"Number of clients with median ticket < 80: {low_ticket_clients.shape[0]}")

# Ticket-size histogram with the 80 EUR threshold drawn as a dashed red line.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(df['median_ticket'], bins=50, kde=True, ax=ax)
ax.axvline(80, color='red', linestyle='--')
ax.set_title('Median Ticket per Client')
ax.set_xlabel('Median Ticket (€)')
ax.set_ylabel('Number of Clients')
fig.tight_layout()
plt.show()


📌 4. Contact vs Orders Alignment

Why: Misalignment = inefficiency, key optimization goal.

In [None]:
# Orders against promotor contacts, coloured by channel, with a dashed grey
# y = x reference line from the origin to the largest contact count.
fig, ax = plt.subplots(figsize=(7, 5))
sns.scatterplot(
    data=df,
    x='prom_contacts_month',
    y='number_of_orders',
    hue='channel',
    alpha=0.6,
    ax=ax,
)
diagonal_end = df['prom_contacts_month'].max()
ax.plot([0, diagonal_end], [0, diagonal_end], '--', color='grey')
ax.set_title('Orders vs Promotor Contacts')
ax.set_xlabel('Promotor Contacts per Month')
ax.set_ylabel('Number of Orders per Month')
fig.tight_layout()
plt.show()


This shows you:

How many clients are below the diagonal = more contacts than orders = inefficiency.

In [None]:
# Horizontal boxplot of `income` to make outliers visible.
fig, ax = plt.subplots(figsize=(6, 4))
sns.boxplot(data=df, x='income', ax=ax)
ax.set_title('Boxplot of Income')
fig.tight_layout()
plt.show()


In [None]:
# Question --> are we sure the aggregation is right? Why would we sum the monthly tel/prom contacts? Aren't they already monthly values?

In [None]:
# Preview the first rows of df again before aggregating.
df.head()

In [None]:
# Display df ordered by client and then date. The sorted copy is not assigned,
# so df itself keeps its original row order.
df.sort_values(by = ['client_id', 'date'])

aggregation

In [None]:
# 📦 Pascual Capstone: Aggregation + Feature Creation
# Step-by-step structured flow from clean daily data to aggregated client-level dataset

import pandas as pd

# --- STEP 0: Load Cleaned Daily-Level Data ---
# Re-reads the cleaned CSV, replacing whatever `df` currently holds in memory.
df = pd.read_csv('clean_orders_data.csv', parse_dates=['date'])

# --- STEP 1: Create `frequency` (median orders per month per client) ---
# Calendar-month period of each row, used as the monthly grouping key.
df['month'] = df['date'].dt.to_period('M')

# Total orders per (client, month).
monthly_orders = (
    df.groupby(['client_id', 'month'])['number_of_orders']
    .sum()
    .reset_index()
)

# Median of those monthly totals per client. Only months that have rows for
# the client take part in the median.
frequency_df = (
    monthly_orders.groupby('client_id')['number_of_orders']
    .median()
    .reset_index()
    .rename(columns={'number_of_orders': 'frequency'})
)

# --- STEP 1.5: Validate that 'channel' and 'city' are unique per client ---
# Number of distinct channels / cities seen for each client.
rows_by_client = df.groupby('client_id')
multi_channel = rows_by_client['channel'].nunique()
multi_city = rows_by_client['city'].nunique()

print("Clients with >1 unique channel:", multi_channel.gt(1).sum())
print("Clients with >1 unique city:", multi_city.gt(1).sum())

# Optional: client_ids that violate uniqueness, kept for manual inspection.
multi_channel_clients = multi_channel.loc[multi_channel.gt(1)].index.tolist()
multi_city_clients = multi_city.loc[multi_city.gt(1)].index.tolist()

# Uncomment to inspect those rows
# print(df[df['client_id'].isin(multi_channel_clients)])
# print(df[df['client_id'].isin(multi_city_clients)])

# --- STEP 2: Aggregate Remaining Data Per Client ---
# How each column is collapsed to one value per client:
# - 'income', 'volume', 'number_of_orders' -> sum: cumulative behaviour
# - 'prom_contacts_month', 'tel_contacts_month' -> mean: the value repeats on
#   every row, so summing would inflate it
# - 'median_ticket' -> median: less sensitive to outliers
# - 'channel', 'city' -> first: assumed static per client (validated in STEP 1.5)
per_client_aggregations = {
    'income': 'sum',
    'volume': 'sum',  # volume = total weight/space across orders, relevant for logistics
    'number_of_orders': 'sum',
    'prom_contacts_month': 'mean',
    'tel_contacts_month': 'mean',
    'median_ticket': 'median',
    'channel': 'first',
    'city': 'first',
}

# --- STEP 3: Merge Frequency ---
# One row per client, then a left join so every aggregated client is kept
# even if it has no entry in frequency_df.
client_df = (
    df.groupby('client_id')
    .agg(per_client_aggregations)
    .reset_index()
    .merge(frequency_df, on='client_id', how='left')
)

# --- STEP 4: Create Efficiency Feature ---
# Orders divided by promotor contacts, per client.
# NOTE(review): number_of_orders is the total over all rows while
# prom_contacts_month is a per-row mean (see STEP 2), so numerator and
# denominator are not on the same time basis -- confirm this is intended
# (frequency / prom_contacts_month may be what is wanted).
client_df['efficiency'] = client_df['number_of_orders'] / client_df['prom_contacts_month']

# --- STEP 5: Handle Division by Zero or NaNs ---
# Map +/-inf (contacts == 0) to NaN explicitly, then set every NaN to 0.
# NaN is used instead of None because `replace(..., None)` is version-dependent
# in pandas: older releases forward-fill from the previous client's value,
# newer ones insert None and leave an object-dtype column.
# NOTE(review): after fillna(0), "no promotor contacts" and "zero orders" are
# indistinguishable -- confirm that is acceptable downstream.
client_df['efficiency'] = (
    client_df['efficiency']
    .replace([float('inf'), -float('inf')], float('nan'))
    .fillna(0)
)

# --- STEP 6: Save Aggregated Dataset ---
client_df.to_csv('aggregated_client_data.csv', index=False)

print("\u2705 Aggregated client dataset saved as 'aggregated_client_data.csv'")


In [None]:
# Check whether each client is tied to a single promotor_id:
# count the distinct promotors seen for every client.
promotors_per_client = df.groupby('client_id')['promotor_id'].nunique()

# Clients served by more than one promotor.
clients_with_multiple_promotors = promotors_per_client.loc[promotors_per_client.gt(1)]

# Report the outcome, listing the offending clients if there are any.
if not clients_with_multiple_promotors.empty:
    print("❌ Some clients have more than one promotor_id assigned.")
    print(clients_with_multiple_promotors)
else:
    print("✅ Each client has only one unique promotor_id assigned.")

# Reverse direction: distinct clients handled by each promotor.
clients_per_promotor = df.groupby('promotor_id')['client_id'].nunique()

# Promotors responsible for more than one client.
promotors_with_multiple_clients = clients_per_promotor.loc[clients_per_promotor.gt(1)]

print(f"Number of promotors assigned to more than one client: {len(promotors_with_multiple_clients)}")
df.head()

 * [Part 4.1.4 - Validation of Unique `tel_contacts_month` Assignments](#4.1.4)

<a id='4.1.4'></a>
#### Part 4.1.4 – Validation of Unique `tel_contacts_month` Assignments --> GO OVER

Although the number of monthly telephone contacts (`tel_contacts_month`) was not critical to achieving the project’s primary objective, we considered it potentially useful for future stages of the analysis. Therefore, we performed a validation step to ensure that the number of monthly telephone contacts was consistent for each client across the entire dataset. As with the previous checks, our goal was to confirm that each client had a single `tel_contacts_month` value assigned, which would allow us to reliably aggregate clients using this variable if needed.

In [None]:
# Distinct tel_contacts_month values per client, as a two-column frame.
tel_contact_variability = (
    df.groupby('client_id')['tel_contacts_month']
    .nunique()
    .rename('unique_tel_contacts_values')
    .reset_index()
)

# Keep only the clients that carry more than one distinct value.
has_several_values = tel_contact_variability['unique_tel_contacts_values'].gt(1)
inconsistent_clients_tel = tel_contact_variability[has_several_values]

print("Number of clients with inconsistent tel_contacts_month:", len(inconsistent_clients_tel))
print("List of clients with inconsistencies:")
print(inconsistent_clients_tel)

Number of clients with inconsistent tel_contacts_month: 532
List of clients with inconsistencies:
       client_id  unique_tel_contacts_values
163    103024993                           2
279    105568281                           2
356    107224090                           2
431    108470306                           2
747    115583694                           2
...          ...                         ...
41578  994146921                           2
41704  996820053                           2
41764  998060984                           2
41839  999413800                           2
41869  999941988                           2

[532 rows x 2 columns]


In [None]:
# How many inconsistent clients have 2, 3, ... distinct tel_contacts_month values.
inconsistent_clients_tel['unique_tel_contacts_values'].value_counts()

unique_tel_contacts_values
2    532
Name: count, dtype: int64

In [None]:
# Clients with more than one distinct tel_contacts_month value.
tel_values_per_client = df.groupby('client_id')['tel_contacts_month'].nunique()
clients_with_2_tel_values = tel_values_per_client[tel_values_per_client > 1].index

# Verify that 0 is among the distinct values of every such client;
# any client without a 0 is reported and clears the flag.
has_only_0_and_one_other_tel = True

for client in clients_with_2_tel_values:
    values_tel = df.loc[df['client_id'] == client, 'tel_contacts_month'].unique()
    if 0 in values_tel:
        continue
    print(f"⚠️ Client {client} does NOT have 0 as one of the values. Values: {values_tel}")
    has_only_0_and_one_other_tel = False

if not has_only_0_and_one_other_tel:
    print("❌ Some clients have 2 values, but one of them is not 0.")
else:
    print("✅ All clients with 2 values have 0 as one of them.")

✅ All clients with 2 values have 0 as one of them.


We observed that the same issue affecting `prom_contacts_month` was also present in the `tel_contacts_month` variable. For all clients with more than one unique value, the additional (non-assigned) value was always zero. Based on this consistent pattern, we applied the same methodology as before: replacing the zero values with the corresponding non-zero value specific to each client to ensure consistency across the dataset.

In [None]:
# For each client with inconsistent values, identify the correct (non-zero)
# tel_contacts_month value.
client_correct_value_tel = {}

for client in clients_with_2_tel_values:
    values_tel = df[df['client_id'] == client]['tel_contacts_month'].unique()
    # Read from values_tel (this client's own values). The earlier version
    # iterated over a stale `values` variable left over from another cell, so
    # every client received the same, unrelated replacement value.
    non_zero_value = [v for v in values_tel if v != 0][0]  # the correct value
    client_correct_value_tel[client] = non_zero_value


def replace_zero_with_correct_tel(row):
    """Return the tel_contacts_month value to store for one row of df.

    If the row's client is in client_correct_value_tel and the row currently
    holds 0, the client's non-zero value is returned; otherwise the row's
    existing value is returned unchanged.
    """
    if row['client_id'] in client_correct_value_tel and row['tel_contacts_month'] == 0:
        return client_correct_value_tel[row['client_id']]
    return row['tel_contacts_month']


# Apply the replacement row by row, overwriting the column in place.
df['tel_contacts_month'] = df.apply(replace_zero_with_correct_tel, axis=1)

In [None]:
# Re-run the consistency check after the zero replacement:
# distinct tel_contacts_month values per client, as a two-column frame.
tel_contact_variability = (
    df.groupby('client_id')['tel_contacts_month']
    .nunique()
    .rename('unique_tel_contacts_values')
    .reset_index()
)

# Clients that still carry more than one distinct value.
still_inconsistent = tel_contact_variability['unique_tel_contacts_values'].gt(1)
inconsistent_clients_tel = tel_contact_variability[still_inconsistent]

print("Number of clients with inconsistent tel_contacts_month:", len(inconsistent_clients_tel))
print("List of clients with inconsistencies:")
print(inconsistent_clients_tel)

Number of clients with inconsistent tel_contacts_month: 341
List of clients with inconsistencies:
       client_id  unique_tel_contacts_values
356    107224090                           2
431    108470306                           2
747    115583694                           2
994    120746611                           2
1100   122763063                           2
...          ...                         ...
41258  986864675                           2
41458  991748575                           2
41704  996820053                           2
41764  998060984                           2
41839  999413800                           2

[341 rows x 2 columns]


In [None]:
# Spot-check all rows of one client that is still flagged as inconsistent.
df[df['client_id']=='108470306']

Unnamed: 0,date,city,channel,client_id,promotor_id,volume,income,number_of_orders,median_ticket,prom_contacts_month,tel_contacts_month,month
137415,2024-03-28,Barcelona,AR,108470306,9820087,3294.0,927.36,1,927.36,1,3,2024-03
146392,2024-04-16,Barcelona,AR,108470306,9820087,1465.24,1249.38,1,1249.38,1,3,2024-04
147270,2024-04-17,Barcelona,AR,108470306,9820087,3294.0,927.36,1,927.36,1,3,2024-04
150701,2024-04-23,Barcelona,AR,108470306,9820087,48.0,0.0,1,0.0,1,2,2024-04
159051,2024-05-08,Barcelona,AR,108470306,9820087,4941.0,1391.04,1,1391.04,1,3,2024-05
164194,2024-05-16,Barcelona,AR,108470306,9820087,567.84,1600.92,1,1600.92,1,3,2024-05
165159,2024-05-17,Barcelona,AR,108470306,9820087,511.08,1600.92,1,1600.92,1,3,2024-05
166827,2024-05-22,Barcelona,AR,108470306,9820087,624.6,1600.92,1,1600.92,1,3,2024-05
167969,2024-05-23,Barcelona,AR,108470306,9820087,5508.84,2991.96,1,1495.98,1,3,2024-05
168973,2024-05-24,Barcelona,AR,108470306,9820087,567.84,1600.92,1,1600.92,1,3,2024-05


Having sorted this out, we were able to proceed with the next validation step.

In [None]:
# 4,3 yearly - final aggregated table

# ROUGH BELOW 

In [None]:
# Column dtypes and non-null counts of the yearly table.
# NOTE(review): yearly_df is not created in any cell shown here -- confirm it
# is defined before this cell on a fresh Restart & Run All.
yearly_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41871 entries, 0 to 41870
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   client_id                41871 non-null  object 
 1   city                     41871 non-null  object 
 2   channel                  41871 non-null  object 
 3   promotor_id              41871 non-null  object 
 4   yearly_volume            41871 non-null  float64
 5   yearly_income            41871 non-null  float64
 6   yearly_number_of_orders  41871 non-null  int64  
 7   median_ticket            41871 non-null  float64
 8   prom_contacts_month      41871 non-null  int64  
 9   tel_contacts_month       41871 non-null  int64  
 10  frequency                41871 non-null  float64
 11  efficiency               41871 non-null  float64
 12  logistics_cost           41871 non-null  int64  
 13  visit_cost               41871 non-null  int64  
 14  total_cost            

In [None]:
# Missing values per column of yearly_df.
yearly_df.isna().sum()

client_id                  0
city                       0
channel                    0
promotor_id                0
yearly_volume              0
yearly_income              0
yearly_number_of_orders    0
median_ticket              0
prom_contacts_month        0
tel_contacts_month         0
frequency                  0
efficiency                 0
logistics_cost             0
visit_cost                 0
total_cost                 0
dtype: int64

In [None]:
# True if any row of yearly_df is an exact duplicate of an earlier row.
yearly_df.duplicated().any()

False

In [None]:
# Check that no client has a negative yearly income: an empty result means
# the check passes.
yearly_df[yearly_df['yearly_income'] < 0]

Unnamed: 0,client_id,city,channel,promotor_id,yearly_volume,yearly_income,yearly_number_of_orders,median_ticket,prom_contacts_month,tel_contacts_month,frequency,efficiency,logistics_cost,visit_cost,total_cost


In [None]:
# Check that no client has a negative yearly volume: an empty result means
# the check passes.
yearly_df[yearly_df['yearly_volume'] < 0]

Unnamed: 0,client_id,city,channel,promotor_id,yearly_volume,yearly_income,yearly_number_of_orders,median_ticket,prom_contacts_month,tel_contacts_month,frequency,efficiency,logistics_cost,visit_cost,total_cost


In [None]:
# Spot-check a single client in the aggregated yearly table.
yearly_df[yearly_df['client_id']=='999976985']

In [None]:
# Same client in the row-level data, for comparison with the yearly row.
df[df['client_id']=='999976985']

In [None]:
#save to csv

<a id='5'></a>
## Part 5 - Exploratory Data Analysis

In [None]:
# Set plot style: apply seaborn's 'whitegrid' style to the plots that follow.
sns.set(style='whitegrid')

### Income & Median ticket per client

In [None]:
# Summary statistics of income per client in the aggregated table.
client_df['income'].describe()

In [None]:
# 1. Distribution of Total Income
# Histogram (50 bins) with a KDE curve of the income column of client_df.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(client_df['income'], bins=50, kde=True, ax=ax)
ax.set_title('Distribution of Total Income per Client')
ax.set_xlabel('Total Income (€)')
ax.set_ylabel('Number of Clients')
fig.tight_layout()
plt.show()

In [None]:
# Boolean masks for clients with negative income / negative median ticket.
income_is_negative = client_df['income'].lt(0)
ticket_is_negative = client_df['median_ticket'].lt(0)

neg_income = client_df[income_is_negative]
neg_ticket = client_df[ticket_is_negative]
print(f"Negative income clients: {len(neg_income)}")
print(f"Negative ticket clients: {len(neg_ticket)}")

# Optional: clients that are negative on both measures.
neg_both = client_df[income_is_negative & ticket_is_negative]

In [None]:
# Number of clients with both negative income and negative median ticket.
len(neg_both)

In [None]:
#to drop
#client_df = client_df[(client_df['income'] >= 0) & (client_df['median_ticket'] >= 0)]


In [None]:
# Summary statistics of the per-client median ticket.
client_df['median_ticket'].describe()

In [None]:
# 2. Distribution of Median Ticket
# Histogram (50 bins) with KDE; the dashed red line marks the 80 EUR threshold.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(client_df['median_ticket'], bins=50, kde=True, ax=ax)
ax.axvline(80, color='red', linestyle='--', label='Ticket Threshold (80€)')
ax.set_title('Distribution of Median Ticket per Client')
ax.set_xlabel('Median Ticket (€)')
ax.set_ylabel('Number of Clients')
ax.legend()
fig.tight_layout()
plt.show()

# 3. Efficiency Distribution

In [None]:
# 3. Efficiency Distribution
# Histogram (50 bins) with a KDE curve of the efficiency column of client_df.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(client_df['efficiency'], bins=50, kde=True, ax=ax)
ax.set_title('Distribution of Client Efficiency (Orders / Physical Contacts)')
ax.set_xlabel('Efficiency')
ax.set_ylabel('Number of Clients')
fig.tight_layout()
plt.show()

# 4. Orders vs Promotor Contacts (Scatter)

In [None]:
# 4. Orders vs Promotor Contacts (Scatter)
# One point per client, coloured by channel. In client_df, number_of_orders is
# a per-client total while prom_contacts_month is a per-client monthly mean,
# so the x-axis is labelled as a monthly figure rather than a total.
# NOTE(review): because of that unit difference, the dashed y = x line is not
# a like-for-like "1 order per contact" reference -- confirm whether
# frequency (orders per month) should be plotted on the y-axis instead.
plt.figure(figsize=(6, 5))
sns.scatterplot(data=client_df, x='prom_contacts_month', y='number_of_orders', hue='channel', alpha=0.6, edgecolor='w')
max_contacts = client_df['prom_contacts_month'].max()
plt.plot([0, max_contacts], [0, max_contacts], '--', color='grey', label='Ideal 1:1 Line')
plt.title('Number of Orders vs. Promotor Contacts')
plt.xlabel('Promotor Contacts per Month')
plt.ylabel('Total Orders')
plt.legend()
plt.tight_layout()
plt.show()

# 5. Average Efficiency by Channel (Fixed)

In [None]:
# 5. Average Efficiency by Channel (Fixed)
# One bar per channel; bar height is the mean efficiency of its clients.
fig, ax = plt.subplots(figsize=(6, 4))
sns.barplot(data=client_df, x='channel', y='efficiency', estimator=np.mean, ax=ax)
ax.set_title('Average Efficiency by Channel')
ax.set_xlabel('Channel')
ax.set_ylabel('Avg Efficiency')
fig.tight_layout()
plt.show()

# 6. Correlation Matrix

In [None]:
# 6. Correlation Matrix
# Pairwise correlations between the numeric client-level features,
# drawn as an annotated heatmap.
correlation_columns = [
    'income',
    'volume',
    'number_of_orders',
    'prom_contacts_month',
    'median_ticket',
    'frequency',
    'efficiency',
]
fig, ax = plt.subplots(figsize=(8, 6))
corr = client_df[correlation_columns].corr()
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Correlation Matrix')
fig.tight_layout()
plt.show()

---