# Tagged dataset creation

- Because the full StackOverflow dataset is too large to fit into the RAM,
  we create a smaller dataset that contains data for questions 
  only on our tag list
- We do this by filtering out CSV data chunk by chunk
- We dump the resulting dataset to a Parquest file
- Any further analysis is done on this reduced dataset  

In [14]:
import pandas as pd

# Read tags
tags_df = pd.read_parquet("tags.parquet")

#: In SO export tags are concatenaed string that looks like this
#: <winapi><visual-c++><mfc>
formatted_tags = [f"<{t}>" for t in tags_df["TagName"]]
print("Formatted tags look like:", formatted_tags[0:5])

tags_regex = "|".join(formatted_tags)

Formatted tags look like: ['<diem>', '<bitcoin>', '<cosmos>', '<blockchain>', '<ethereum>']


In [15]:
from tqdm.auto import tqdm
from pandas.io.parsers.readers import TextFileReader

# Operate 64k rows at a time
chunk_size = 2**16

result_df: pd.DataFrame = None


with tqdm() as progress_bar:

    reader: TextFileReader

    rows_read = 0

    with pd.read_csv("csv/Posts.csv", chunksize=chunk_size) as reader:
        chunk: pd.DataFrame
        for chunk in reader:
            
            # Make Tags column regex friendly
            chunk["Tags"] = chunk["Tags"].fillna("")
            
            # Find posts in this chunk that match our tag filter
            matched_chunk = chunk.loc[chunk["Tags"].str.contains(tags_regex, case=False, regex=True)]
            if result_df is None:
                result_df = matched_chunk
            else:
                result_df = pd.concat((result_df, matched_chunk,))

            last = chunk.iloc[-1]

            # Show the date where the filter progres is going.
            # We are finished when reaching 2023-06
            progress_bar.set_postfix({
                "Date": last["CreationDate"],      
                "Matches": len(result_df)         
            })

            # Display rows read as a progress bar,
            # but we do not know the end
            progress_bar.update(len(chunk))



0it [00:00, ?it/s]

3211264it [00:26, 116077.02it/s, Date=2010-10-24T14:10:43.843]

In [None]:
# Write out to disk
result_df.write_parquet("blockchain-questions.parquet")
print(f"We have total {len(result_df)} tagged questions and answers")

AttributeError: 'DataFrame' object has no attribute 'write_parquet'