# Lecture 9: Data Analysis with Pandas

In [4]:
import pandas as pd
import numpy as np

# --- 1. Create a sample DataFrame ---
# Three equally long columns: a categorical label ("event"), an integer
# column ("value") and a float column ("error"). Every later cell reads df.
df = pd.DataFrame(
    dict(
        event=["A", "B", "A", "C", "B", "A", "C"],
        value=[10, 20, 15, 30, 25, 12, 28],
        error=[1.0, 2.0, 1.5, 3.0, 2.5, 1.2, 2.8],
    )
)

# The table has only seven rows, so it is shown in full.
print("DataFrame:", df, sep="\n")


DataFrame:
  event  value  error
0     A     10    1.0
1     B     20    2.0
2     A     15    1.5
3     C     30    3.0
4     B     25    2.5
5     A     12    1.2
6     C     28    2.8


In [8]:
# --- 2. Grouping and aggregation ---
# Split the rows by their "event" label, then reduce the "value" column of
# each group to a single number.
print(
    "\nGroup by event (mean value):",
    df.groupby("event")["value"].mean(),
    sep="\n",
)

# Passing a list of reducers yields one output column per reducer.
# "std" is the sample standard deviation (ddof=1).
print(
    "\nGroup by event (mean and std):",
    df.groupby("event")["value"].aggregate(["mean", "std"]),
    sep="\n",
)



Group by event (mean value):
event
A    12.333333
B    22.500000
C    29.000000
Name: value, dtype: float64

Group by event (mean and std):
            mean       std
event                     
A      12.333333  2.516611
B      22.500000  3.535534
C      29.000000  1.414214


In [9]:
# --- 3. Sorting and filtering ---
# sort_values returns a reordered copy (ascending by default); df keeps its
# original row order, and the original index labels travel with the rows.
print("\nSorted by value:", df.sort_values(by="value"), sep="\n")

# Boolean-mask selection: keep only the rows for which the condition is True.
print("\nFilter rows where value > 20:", df.loc[df["value"] > 20], sep="\n")


Sorted by value:
  event  value  error
0     A     10    1.0
5     A     12    1.2
2     A     15    1.5
1     B     20    2.0
4     B     25    2.5
6     C     28    2.8
3     C     30    3.0

Filter rows where value > 20:
  event  value  error
3     C     30    3.0
4     B     25    2.5
6     C     28    2.8


In [10]:
# --- 4. Merging DataFrames ---
# Lookup table with one systematic error per event type.
errors = pd.DataFrame({
    "event": ["A", "B", "C"],
    "systematic_error": [0.5, 1.0, 0.8]
})

# how="left": keep every row of df. An event with no entry in `errors` then
#   shows up with NaN instead of silently vanishing, as it would with the
#   default inner join.
# validate="many_to_one": raise MergeError if `errors` ever lists an event
#   twice, which would otherwise silently duplicate rows of df.
merged = pd.merge(df, errors, on="event", how="left", validate="many_to_one")
print("\nMerged DataFrame:")
print(merged)



Merged DataFrame:
  event  value  error  systematic_error
0     A     10    1.0               0.5
1     B     20    2.0               1.0
2     A     15    1.5               0.5
3     C     30    3.0               0.8
4     B     25    2.5               1.0
5     A     12    1.2               0.5
6     C     28    2.8               0.8


In [11]:
# --- 5. Handling missing data ---
# Introduce NaN
# "value" is an integer column and integers cannot represent NaN. Cast it to
# float explicitly (astype returns a new frame, so df stays untouched) rather
# than relying on the implicit int -> float upcast on assignment, which recent
# pandas versions deprecate.
df_missing = df.astype({"value": "float64"})
df_missing.loc[2, "value"] = np.nan # loc: label-based selection [row label, column label]; use iloc for integer positions
print("\nDataFrame with missing value:")
print(df_missing)

print("\nDrop missing:")
print(df_missing.dropna()) # dropna(): drop rows containing NaN in any column (simple but reduces information)

print("\nFill missing with mean:")
# fillna(): fill NaN with something (mean, median, forward/backward fill, etc.)
# Pass a {column: fill_value} mapping so that only "value" is filled; a bare
# scalar would also be written into NaNs of every other column.
print(df_missing.fillna({"value": df_missing["value"].mean()}))



DataFrame with missing value:
  event  value  error
0     A   10.0    1.0
1     B   20.0    2.0
2     A    NaN    1.5
3     C   30.0    3.0
4     B   25.0    2.5
5     A   12.0    1.2
6     C   28.0    2.8

Drop missing:
  event  value  error
0     A   10.0    1.0
1     B   20.0    2.0
3     C   30.0    3.0
4     B   25.0    2.5
5     A   12.0    1.2
6     C   28.0    2.8

Fill missing with mean:
  event      value  error
0     A  10.000000    1.0
1     B  20.000000    2.0
2     A  20.833333    1.5
3     C  30.000000    3.0
4     B  25.000000    2.5
5     A  12.000000    1.2
6     C  28.000000    2.8


In [12]:
# --- 6. Mini Exercise ---
# Task: given several experimental runs per event, compute the average value
# for each event and rescale the averages so that the largest one equals 1.

# Per-event mean of "value" (a Series indexed by event label).
grouped = df.groupby("event")["value"].agg("mean")

# Divide every mean by the largest mean.
normalized = grouped.div(grouped.max())

print("\nNormalized mean values per event:", normalized, sep="\n")



Normalized mean values per event:
event
A    0.425287
B    0.775862
C    1.000000
Name: value, dtype: float64
