# Tagged dataset creation

- Because the full StackOverflow dataset is too large to fit into RAM,
  we create a smaller dataset that contains only the questions
  tagged with a tag on our tag list
- We do this by filtering the CSV data chunk by chunk
- We dump the resulting dataset to a Parquet file
- Any further analysis is done on this reduced dataset  
- Display interactive progress information during the filtering using `tqdm`
- This notebook seems to process about 120,000 rows/sec
- First match is around 2010
- Total 24,000 

## Blockchains not represented and lack-of-data issues

- `radix-dlt` tag was created, but there are a couple of posts under `scrypto`
- `tezos` tag was created
- `chia` had zero posts, so no tag was created  


In [None]:
import re

import pandas as pd

# Load the list of tags we want to keep posts for
tags_df = pd.read_parquet("tags.parquet")

#: In the SO export, the tags of a post are one concatenated string
#: that looks like this:
#: <winapi><visual-c++><mfc>
#: Each tag name is wrapped in angle brackets so that it is searched for
#: in exactly that form.
formatted_tags = [f"<{t}>" for t in tags_df["TagName"]]
print("Formatted tags look like:", formatted_tags[0:5])

# Tag names can contain regex metacharacters (e.g. "+" or "."), which would
# either break the pattern or match unintended text. Escape every alternative
# so that it is matched literally.
tags_regex = "|".join(re.escape(t) for t in formatted_tags)

In [None]:
from tqdm.auto import tqdm
from pandas.io.parsers.readers import TextFileReader

# Stream csv/Posts.csv in fixed-size chunks, keep only the rows whose Tags
# column matches `tags_regex`, and assemble the matches into `result_df`.
chunk_size = 2**16  # 64k rows at a time
result_df = None  # Assigned once every chunk has been filtered
matched_chunks: list[pd.DataFrame] = []
match_count = row_count = 0

# Zero-row frame remembered from the first chunk without matches. It is only
# used to preserve the column layout if no post matches at all.
empty_matches = None

with tqdm() as progress_bar:

    reader: TextFileReader

    with pd.read_csv("csv/Posts.csv", chunksize=chunk_size) as reader:
        chunk: pd.DataFrame
        for chunk in reader:

            # Make Tags column regex friendly: missing tags become
            # empty strings
            chunk["Tags"] = chunk["Tags"].fillna("")

            # Find posts in this chunk that match our tag filter
            matched_chunk = chunk.loc[chunk["Tags"].str.contains(tags_regex, case=False, regex=True)]

            # Only keep chunks that actually contain matches, so we do not
            # hold on to (and later concatenate) lots of empty frames.
            if matched_chunk.empty:
                if empty_matches is None:
                    empty_matches = matched_chunk
            else:
                matched_chunks.append(matched_chunk)

            match_count += len(matched_chunk)
            row_count += len(chunk)

            last = chunk.iloc[-1]

            # Show the date where the filter progress is going.
            # We are finished when reaching 2023-06
            progress_bar.set_postfix({
                "Date": last["CreationDate"],
                "Matches": match_count,
                "Total rows": f"{row_count:,}",
            })

            # Display rows read as a progress bar,
            # but we do not know the end
            progress_bar.update(len(chunk))


# Fall back to the remembered empty frame when nothing matched, so the result
# still has the CSV's columns. If the reader produced no chunks at all,
# pd.concat raises ValueError.
result_df = pd.concat(matched_chunks or [empty_matches])

# Parse dates
result_df["CreationDate"] = pd.to_datetime(result_df["CreationDate"], format='ISO8601')


In [None]:
# Persist the filtered posts as a Parquet file, then report how many rows
# were written.
result_df.to_parquet(path="blockchain-questions.parquet")
print(
    f"We have total {len(result_df)} tagged questions and answers"
)