In [12]:
"""
Pandas fundamentals practice.

Basic exercises with filtering and groupby.
Used to understand how pandas helps answer simple business questions.
"""


import pandas as pd

# Sample dataset: one row per user, with a country code and a revenue amount.
# User 2 has zero revenue, which matters for the filtered averages below.
df = pd.DataFrame(
    data={
        "user_id": list(range(1, 7)),
        "country": ["PL", "PL", "DE", "PL", "DE", "PL"],
        "revenue": [50, 0, 200, 300, 80, 150],
    },
    columns=["user_id", "country", "revenue"],
)

# Only six rows, so displaying the whole frame is fine here.
df


Unnamed: 0,user_id,country,revenue
0,1,PL,50
1,2,PL,0
2,3,DE,200
3,4,PL,300
4,5,DE,80
5,6,PL,150


In [11]:
# Business question 1:
# What is the average revenue by country among users who generated revenue?
#
# Rows with revenue == 0 are dropped before grouping, so the mean is taken
# over revenue-generating users only, not over every user in the country.

avg_revenue_by_country = (
    df.query("revenue > 0")
    .groupby("country")["revenue"]
    .mean()
)

avg_revenue_by_country


Unnamed: 0_level_0,revenue
country,Unnamed: 1_level_1
DE,140.0
PL,166.666667


In [10]:
# Business question 2:
# What is the total and average revenue by country?
#
# Only rows with revenue > 0 are kept, so user_count is the number of
# revenue-generating users per country rather than the number of all users.

revenue_stats = (
    df.query("revenue > 0")
    .groupby("country")
    .agg(
        total_revenue=("revenue", "sum"),
        avg_revenue=("revenue", "mean"),
        user_count=("revenue", "count"),
    )
)

revenue_stats


Unnamed: 0_level_0,total_revenue,avg_revenue,user_count
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
DE,280,140.0,2
PL,500,166.666667,3


In [5]:
# Business question 3:
# Which users have revenue >= 150?
#
# The threshold is inclusive: a user with revenue of exactly 150 is selected.

high_value_users = df.loc[df["revenue"].ge(150)]
high_value_users


Unnamed: 0,user_id,country,revenue
2,3,DE,200
3,4,PL,300
5,6,PL,150


In [9]:

# Comparison: manual loop vs pandas


# Manual loop (not recommended for analytics).
# Walks the frame row by row and accumulates revenue for Polish users only;
# kept deliberately as the "slow way" to contrast with the pandas one-liner.
total_revenue_pl = 0
for _, record in df.iterrows():
    # Guard clause: skip every row that is not from PL.
    if record["country"] != "PL":
        continue
    total_revenue_pl += record["revenue"]

total_revenue_pl


500

In [7]:
# Same result using pandas (recommended).
# A single .loc call selects the PL rows and the revenue column in one step,
# instead of chaining df[mask]["revenue"], which builds an intermediate frame.
df.loc[df["country"] == "PL", "revenue"].sum()


np.int64(500)