# Exercise 9: Data Analysis with Pandas

In [3]:
import pandas as pd
import numpy as np

# --- 1. Create a sample DataFrame ---
# Seven toy rows with three columns: an `event` label (A/B/C), a numeric
# `value`, and a numeric `error`. Every later section reads from this frame.
df = pd.DataFrame(
    dict(
        event=list("ABACBAC"),
        value=[10, 20, 15, 30, 25, 12, 28],
        error=[1.0, 2.0, 1.5, 3.0, 2.5, 1.2, 2.8],
    )
)

# One call emits the header and the table; sep="\n" puts each on its own line.
print("DataFrame:", df, sep="\n")


DataFrame:
  event  value  error
0     A     10    1.0
1     B     20    2.0
2     A     15    1.5
3     C     30    3.0
4     B     25    2.5
5     A     12    1.2
6     C     28    2.8


In [4]:
# --- 2. Grouping and aggregation ---
# Build the per-event view of `value` once and reuse it for both summaries.
value_by_event = df.groupby("event")["value"]

# A single statistic comes back as a Series indexed by event.
print("\nGroup by event (mean value):", value_by_event.mean(), sep="\n")

# A list of statistics comes back as a DataFrame, one column per statistic.
print("\nGroup by event (mean and std):", value_by_event.agg(["mean", "std"]), sep="\n")



Group by event (mean value):
event
A    12.333333
B    22.500000
C    29.000000
Name: value, dtype: float64

Group by event (mean and std):
            mean       std
event                     
A      12.333333  2.516611
B      22.500000  3.535534
C      29.000000  1.414214


In [5]:
# --- 3. Sorting and filtering ---
# sort_values hands back a new, ascending-ordered frame; `df` keeps its row order.
ascending_by_value = df.sort_values(by="value")
print("\nSorted by value:", ascending_by_value, sep="\n")

# Boolean mask: True where value is strictly greater than 20, so a row with
# exactly 20 is left out.
above_20 = df["value"].gt(20)
print("\nFilter rows where value > 20:", df[above_20], sep="\n")


Sorted by value:
  event  value  error
0     A     10    1.0
5     A     12    1.2
2     A     15    1.5
1     B     20    2.0
4     B     25    2.5
6     C     28    2.8
3     C     30    3.0

Filter rows where value > 20:
  event  value  error
3     C     30    3.0
4     B     25    2.5
6     C     28    2.8


In [6]:
# --- 4. Merging DataFrames ---
# Lookup table: one `systematic_error` entry per event label.
errors = pd.DataFrame(
    dict(
        event=["A", "B", "C"],
        systematic_error=[0.5, 1.0, 0.8],
    )
)

# Attach the matching systematic_error to each row of `df` via the shared
# `event` column. how="inner" is the default join type, written out for clarity.
merged = df.merge(errors, on="event", how="inner")
print("\nMerged DataFrame:", merged, sep="\n")



Merged DataFrame:
  event  value  error  systematic_error
0     A     10    1.0               0.5
1     B     20    2.0               1.0
2     A     15    1.5               0.5
3     C     30    3.0               0.8
4     B     25    2.5               1.0
5     A     12    1.2               0.5
6     C     28    2.8               0.8


In [10]:
# --- 5. Selecting rows ---
# Every lookup below goes through the label-based .loc accessor.

# A single label returns that row as a Series.
print("Select a single row (index=2):", df.loc[2], sep="\n")

# A list of labels returns a DataFrame holding just those rows.
wanted_rows = [0, 3]
print("\nSelect multiple rows (index 0 and 3):", df.loc[wanted_rows], sep="\n")

# A boolean mask keeps the rows where it is True.
value_above_20 = df["value"] > 20
print("\nSelect rows with condition (value > 20):", df.loc[value_above_20], sep="\n")

# Row slice plus column list. .loc slices by label and includes the end
# label, so 0:2 yields rows 0, 1 and 2.
wanted_columns = ["event", "value"]
print("\nSelect specific rows and columns:", df.loc[0:2, wanted_columns], sep="\n")

# NOTE: this assignment writes into `df` itself, so every cell executed after
# this one sees 99 in row 1 instead of the original 20.
print("\nUpdate a cell (row 1, column 'value'):")
df.loc[1, "value"] = 99
print(df)

Select a single row (index=2):
event      A
value     15
error    1.5
Name: 2, dtype: object

Select multiple rows (index 0 and 3):
  event  value  error
0     A     10    1.0
3     C     30    3.0

Select rows with condition (value > 20):
  event  value  error
3     C     30    3.0
4     B     25    2.5
6     C     28    2.8

Select specific rows and columns:
  event  value
0     A     10
1     B     20
2     A     15

Update a cell (row 1, column 'value'):
  event  value  error
0     A     10    1.0
1     B     99    2.0
2     A     15    1.5
3     C     30    3.0
4     B     25    2.5
5     A     12    1.2
6     C     28    2.8


In [14]:
# --- 6. Handling missing data ---
# astype returns a new frame, so `df` is left untouched. `value` is cast to
# float up front because an integer column cannot hold NaN, and recent pandas
# releases deprecate the implicit int -> float upcast on assignment.
df_missing = df.astype({"value": "float64"})
# Introduce NaN
df_missing.loc[2, "value"] = np.nan
print("\nDataFrame with missing value:")
print(df_missing)

# dropna() with no arguments removes every row that has a NaN in any column.
print("\nDrop missing:")
print(df_missing.dropna())

# Fill per column: the dict limits the replacement to `value`. Passing the
# bare scalar to fillna would also write the mean of `value` into gaps in
# any other column.
print("\nFill missing with mean:")
value_mean = df_missing["value"].mean()
print(df_missing.fillna({"value": value_mean}))



DataFrame with missing value:
  event  value  error
0     A   10.0    1.0
1     B   99.0    2.0
2     A    NaN    1.5
3     C   30.0    3.0
4     B   25.0    2.5
5     A   12.0    1.2
6     C   28.0    2.8

Drop missing:
  event  value  error
0     A   10.0    1.0
1     B   99.0    2.0
3     C   30.0    3.0
4     B   25.0    2.5
5     A   12.0    1.2
6     C   28.0    2.8

Fill missing with mean:
  event  value  error
0     A   10.0    1.0
1     B   99.0    2.0
2     A   34.0    1.5
3     C   30.0    3.0
4     B   25.0    2.5
5     A   12.0    1.2
6     C   28.0    2.8


In [13]:
# --- 7. Mini Exercise ---
# Task: with several experimental runs per event, average `value` within each
# event, then rescale the averages so that the largest one becomes 1.
#
# NOTE: `df` still holds the 99 written into row 1 in section 5, so that
# entry is part of event B's average here.

value_groups = df.groupby("event")["value"]
grouped = value_groups.mean()
normalized = grouped.div(grouped.max())

print("\nNormalized mean values per event:", normalized, sep="\n")



Normalized mean values per event:
event
A    0.198925
B    1.000000
C    0.467742
Name: value, dtype: float64
