# Comparison dataset creation

- Create a comparison dataset from common programming languages and frameworks such as JavaScript, Python, Svelte, and React.
- [See details in create-reduced-dataset.ipynb](./create-reduced-dataset.ipynb)
- We tested both Brotli (default level) and Zstd (level 19) compression. Because the dataset is English-language text, the two perform on par: Brotli 1.3GB vs. Zstd 1.3GB.

In [None]:
import re

import pandas as pd

# Read tags
tags_df = pd.read_parquet("other-tags.parquet")

#: In SO export a post's tags are one concatenated string that looks like this
#: <winapi><visual-c++><mfc>
#: Wrapping every tag name in angle brackets makes the filter match whole tags only.
formatted_tags = [f"<{t}>" for t in tags_df["TagName"]]
print("Formatted tags look like:", formatted_tags[0:5])

#: Tag names can contain regex metacharacters (see "visual-c++" above), so each
#: alternative is escaped to be matched literally. Unescaped, "<c++>" is either
#: rejected by the regex engine or silently fails to match the real tag.
tags_regex = "|".join(re.escape(tag) for tag in formatted_tags)

In [None]:
from tqdm.auto import tqdm
from pandas.io.parsers.readers import TextFileReader

chunk_size = 2**16  # 64k rows at a time

# Cleared up front: if the scan below is interrupted, result_df is None
# rather than a frame left over from an earlier run of this cell.
result_df = None
matched_chunks: list[pd.DataFrame] = []
match_count = row_count = 0

# Stream Posts.csv in fixed-size chunks and keep only the rows whose Tags
# value matches the tag filter built earlier (tags_regex).
with tqdm() as progress_bar:

    reader: TextFileReader

    with pd.read_csv("csv/Posts.csv", chunksize=chunk_size) as reader:
        chunk: pd.DataFrame
        for chunk in reader:

            # Make Tags column regex friendly: missing values become ""
            # so that str.contains yields a plain boolean mask
            chunk["Tags"] = chunk["Tags"].fillna("")

            # Find posts in this chunk that match our tag filter
            tag_mask = chunk["Tags"].str.contains(tags_regex, case=False, regex=True)
            matched_chunk = chunk.loc[tag_mask]

            matched_chunks.append(matched_chunk)

            match_count += len(matched_chunk)
            row_count += len(chunk)

            last = chunk.iloc[-1]

            # Show the date the filter progress has reached.
            # We are finished when reaching 2023-06
            progress_bar.set_postfix({
                "Date": last["CreationDate"],
                "Matches": match_count,
                "Total rows": f"{row_count:,}",
            })

            # Display rows read as a progress bar,
            # but we do not know the end
            progress_bar.update(len(chunk))


# Stitch the per-chunk matches into a single frame
result_df = pd.concat(matched_chunks)

# Parse dates
result_df["CreationDate"] = pd.to_datetime(result_df["CreationDate"], format='ISO8601')


In [None]:
# Persist the filtered posts as a Brotli-compressed Parquet file.
# Resulting file size: 1.3GB
result_df.to_parquet(
    path="other-questions.parquet",
    compression="brotli",
)

# Zstd alternative, also 1.3GB on disk.
# NOTE(review): the notebook intro says Zstd level 19, but this snippet
# passes compression_level=15 — confirm which level was actually measured.
# result_df.to_parquet("other-questions.parquet", engine='pyarrow', compression='zstd', compression_level=15)

# Report how many rows ended up in the dataset
print(f"We have total {len(result_df)} other questions and answers")