<a href="https://colab.research.google.com/github/dlont/hep/blob/main/pandas/pandas_na_duplicates.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
# Allow up to 100 rows to be printed in full; the frames in this notebook are shorter than that
pd.options.display.max_rows = 100

Generate a time range representing one hour of data with measurements every minute

In [None]:
# Both endpoints are included, so this yields 61 timestamps: 00:00, 00:01, ..., 01:00
time_range = pd.date_range(
    "2024-01-01 00:00",
    "2024-01-01 01:00",
    freq="min",
)

Simulate random event rates (number of triggered events) per minute

In [None]:
# Seed NumPy's global generator so every run draws the same counts
np.random.seed(42)
# One Poisson-distributed count per timestamp, with a mean of 20 events per minute
event_rates = np.random.poisson(20, time_range.size)

Create a DataFrame with time-series data

In [None]:
# One row per minute: the timestamp and the event count simulated for it
df = pd.DataFrame(
    data={'Timestamp': time_range, 'Event Rate': event_rates},
    columns=['Timestamp', 'Event Rate'],
)

Set 'Timestamp' as the index

In [None]:
# Move 'Timestamp' out of the columns and into the index so rows can be addressed by time
df = df.set_index('Timestamp')

Simulate missing values in the 'Event Rate' column

In [None]:
# The counts come from np.random.poisson and are integer-typed, and an integer column
# cannot hold NaN. Cast to float explicitly first: recent pandas versions warn about
# (and are moving towards rejecting) the silent int -> float upcast that assigning NaN
# would otherwise trigger.
df['Event Rate'] = df['Event Rate'].astype('float64')
# Label slices on a DatetimeIndex include both ends: this blanks 00:05 through 00:10
df.loc['2024-01-01 00:05':'2024-01-01 00:10', 'Event Rate'] = np.nan  # Missing data for some minutes

Simulate a duplicate row (append a copy of the measurement taken at 00:20)

In [None]:
# Select the 00:20 row with a one-element list so .loc returns a DataFrame rather than
# a Series. Unlike a Series -> to_frame() -> transpose round trip, this keeps the index
# name and the column dtypes, so the concatenated frame still has a 'Timestamp' index.
df = pd.concat([df, df.loc[[pd.Timestamp('2024-01-01 00:20')]]])  # Duplicating the row at 00:20
# Sort by the DatetimeIndex so the appended copy sits next to the original row
df = df.sort_index()

In [None]:
# Header line followed by the first 100 rows (which is the whole frame here)
print(
    "Original DataFrame with missing values and duplicates:",
    df.head(100),
    sep="\n",
)

Fill missing values (NaN) with the mean of the 'Event Rate'

In [None]:
# Compute the fill value from unique measurements only: at this point df still holds
# the duplicated 00:20 row, and counting that reading twice would bias the mean.
# reset_index() turns the timestamp into a column so it takes part in the comparison;
# Series.mean() skips NaN by default.
mean_event_rate = df.reset_index().drop_duplicates()['Event Rate'].mean()
df['Event Rate'] = df['Event Rate'].fillna(mean_event_rate)

In [None]:
# Same view as before, now with the NaN gap replaced by the mean
print(
    "\nDataFrame after handling missing values (filling NaN with mean):",
    df.head(100),
    sep="\n",
)

Remove duplicate rows<br>
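Drop duplicates by temporarily resetting the index, so that the timestamp is compared together with the event rate (`drop_duplicates` only looks at columns, not at the index)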

In [None]:
df_no_duplicates = (
    df
    .reset_index()                           # timestamp becomes an ordinary column
    .drop_duplicates()                       # keep the first of each fully identical row
    .rename(columns={'index': 'Timestamp'})  # an unnamed index comes back as 'index'
    .set_index('Timestamp')                  # restore the time index
)
print(df_no_duplicates.head(100))

In [None]:
# Show only the window around 00:20, where the duplicate used to be
print(
    "\nDataFrame after removing duplicate rows:",
    df_no_duplicates.loc['2024-01-01 00:15':'2024-01-01 00:25'],
    sep="\n",
)