In [1]:
import pandas as pd

In [2]:
# Read the processed, merged dataset from disk into a DataFrame
PROCESSED_DATA_PATH = '../etl/data/processed/merged_data.csv'
df = pd.read_csv(PROCESSED_DATA_PATH)

In [3]:
# Per-customer spend: sum 'amount' within each customer_id group, name the
# result 'total_spent', then turn customer_id back into a regular column
total_spent = (
    df.groupby('customer_id')['amount']
    .sum()
    .rename('total_spent')
    .reset_index()
)

# Show the per-customer totals
print(total_spent)

      customer_id  total_spent
0               1       664.27
1               2        41.98
2               3       326.52
3               4       373.57
4               5       540.85
...           ...          ...
3700         5192      1477.32
3701         5193       736.36
3702         5195       671.70
3703         5197      1173.74
3704         5200       332.39

[3705 rows x 2 columns]


In [4]:
# Per-customer average: mean of 'amount' within each customer_id group, named
# 'avg_transaction_value', with customer_id restored as a regular column
avg_transaction_value = (
    df.groupby('customer_id')['amount']
    .mean()
    .rename('avg_transaction_value')
    .reset_index()
)

# Show the per-customer averages
print(avg_transaction_value)

      customer_id  avg_transaction_value
0               1             221.423333
1               2              41.980000
2               3             163.260000
3               4             186.785000
4               5             270.425000
...           ...                    ...
3700         5192             369.330000
3701         5193             245.453333
3702         5195             167.925000
3703         5197             391.246667
3704         5200             332.390000

[3705 rows x 2 columns]


In [5]:
# Merge the total_spent and avg_transaction_value datasets on 'customer_id'.
# Stored under its own name so the richer aggregation below does not
# overwrite it within the same cell.
merged_summary = pd.merge(total_spent, avg_transaction_value, on='customer_id')

# Display the merged dataset
print(merged_summary)

# Calculate the total amount spent and average transaction value for each
# customer directly from the original dataset, carrying along the first
# non-null name/age/country/is_active value found in each customer's rows
combined_df = df.groupby('customer_id').agg(
    total_spent=('amount', 'sum'),
    avg_transaction_value=('amount', 'mean'),
    name=('name', 'first'),
    age=('age', 'first'),
    country=('country', 'first'),
    is_active=('is_active', 'first')
).reset_index()

# Sanity check: the direct aggregation must agree with the merged result on
# the columns they share, since combined_df is later saved as expected test data
pd.testing.assert_frame_equal(
    combined_df[merged_summary.columns.tolist()],
    merged_summary,
)

# Display the combined dataset
print(combined_df)

      customer_id  total_spent  avg_transaction_value
0               1       664.27             221.423333
1               2        41.98              41.980000
2               3       326.52             163.260000
3               4       373.57             186.785000
4               5       540.85             270.425000
...           ...          ...                    ...
3700         5192      1477.32             369.330000
3701         5193       736.36             245.453333
3702         5195       671.70             167.925000
3703         5197      1173.74             391.246667
3704         5200       332.39             332.390000

[3705 rows x 3 columns]
      customer_id  total_spent  avg_transaction_value               name  \
0               1       664.27             221.423333       Allison Hill   
1               2        41.98              41.980000         Brian Yang   
2               3       326.52             163.260000     Javier Johnson   
3               4      

In [6]:
# Write combined_df to the CSV used as expected test results (row index omitted)
EXPECTED_RESULTS_PATH = '../tests/test_data/expected_high_value_customers.csv'
combined_df.to_csv(EXPECTED_RESULTS_PATH, index=False)