In [None]:
import pandas as pd
from pathlib import Path

# loading data
# Build the path from its components so it resolves on every OS; a literal
# backslash is only treated as a separator on Windows.
df = pd.read_csv(Path('data_source') / 'all_data.csv')

# month category
df['created_at'] = pd.to_datetime(df['created_at'])
# Zero-padded 'YYYY-MM' labels sort chronologically ('2023-02' < '2023-10'),
# which keeps the per-month grouping and the exported rows in calendar order.
# Rows with a missing timestamp are labelled 'nan-nan' so they still form
# their own stratum instead of being dropped by groupby.
df['month'] = df['created_at'].dt.strftime('%Y-%m').fillna('nan-nan')

# seed
RANDOM_SEED = 42

# function for sampling
def stratified_sample(
    df: pd.DataFrame,
    stratify_col: str,
    sample_size: int,
    random_state: int,
) -> pd.DataFrame:
    """Draw up to ``sample_size`` random rows from every group of ``stratify_col``.

    Args:
        df: Frame to sample from; it is not modified.
        stratify_col: Column whose distinct values define the strata.
        sample_size: Maximum number of rows taken per stratum. A stratum
            with fewer rows than this is returned whole.
        random_state: Seed passed to ``DataFrame.sample`` for every
            stratum, so repeated calls return the same rows.

    Returns:
        The sampled rows concatenated stratum by stratum (in groupby
        order), keeping their original index labels. When there is
        nothing to sample, an empty frame with the columns of ``df``.
    """
    samples = [
        group.sample(min(sample_size, len(group)), random_state=random_state)
        for _, group in df.groupby(stratify_col)
    ]

    # pd.concat raises ValueError on an empty list, which happens when the
    # groupby yields no groups (e.g. df has no rows).
    if not samples:
        return df.iloc[0:0].copy()

    return pd.concat(samples)

# main: take at most 100 rows from each month, seeded for reproducibility
sampled_df = stratified_sample(
    df=df,
    stratify_col='month',
    sample_size=100,
    random_state=RANDOM_SEED,
)

# report the row counts before and after sampling
print(
    f"原始数据行数: {len(df)}",
    f"抽样后数据行数: {len(sampled_df)}",
    sep="\n",
)

# per-month breakdown of the sampled rows
print("\n各月份抽样数量:")
print(sampled_df['month'].value_counts())

# write the sample (including its original index) to an Excel workbook
sampled_df.to_excel(excel_writer=Path(r'storage.xlsx'))