# Create a monthly binned posts counts of all StackOverflow posts

- Create a Parquet file that contains only id/creation date series
- This allows us to plot a baseline of StackOverflow raise and decline 
  over time
- We can use this to compare blockchain question popularity to overall StackOverflow popularity

In [1]:
import pandas as pd
from tqdm.auto import tqdm
from pandas.io.parsers.readers import TextFileReader

chunk_size = 2**16  # 64k rows at a time
result_df: pd.DataFrame = None
matched_chunks: list[pd.DataFrame] = []
match_count = row_count = 0

with tqdm() as progress_bar:

    reader: TextFileReader

    rows_read = 0

    with pd.read_csv("csv/Posts.csv", chunksize=chunk_size) as reader:
        chunk: pd.DataFrame
        for chunk in reader:
                        
            # Find posts in this chunk that match our tag filter
            matched_chunk = pd.DataFrame()
            # Parse the dates 
            # https://stackoverflow.com/a/61959823/315168
            matched_chunk["CreationDate"] = pd.to_datetime(chunk["CreationDate"]).dt.strftime('%Y-%m-%dT%H:%M:%SZ')
            matched_chunk["Id"] = chunk["Id"]
            matched_chunk.set_index("Id")
            
            matched_chunks.append(matched_chunk)
            row_count += len(chunk)

            last = chunk.iloc[-1]

            # Show the date where the filter progres is going.
            # We are finished when reaching 2023-06
            progress_bar.set_postfix({
                "Date": last["CreationDate"],      
                "Total rows": f"{row_count:,}",
            })

            # Display rows read as a progress bar,
            # but we do not know the end
            progress_bar.update(len(chunk))


result_df = pd.concat(matched_chunks)

0it [00:00, ?it/s]

  matched_chunk["CreationDate"] = pd.to_datetime(chunk["CreationDate"]).dt.strftime('%Y-%m-%dT%H:%M:%SZ')


In [None]:
# Write output
result_df.to_parquet("all-creation-dates.parquet")

# Count posts by month for a smaller dataset
# https://stackoverflow.com/a/55726226/315168
post_counts_month_df = pd.DataFrame()
post_counts_month_df["post_counts_month"] = result_df.groupby([pd.Grouper(key='CreationDate', freq='M')])['CreationDate'].transform('size').astype(int)
post_counts_month_df.to_parquet("post_counts_month.parquet")