In [17]:
import pandas as pd
import os
import re
from bs4 import BeautifulSoup
pd.set_option('display.max_rows', 50)

df = pd.DataFrame()

# category = "groupbuys"
category = "hardware"
# category = "miners"
# category = "mining"
# category = "mining_speculation"
# category = "mining_support"
# category = "pools"

# folder2 = "sorted-preprocessed-data"
folder2 = "raw-data"

#load every csv in the folder and append them
csv = os.listdir(folder2+"/"+category)
for file in csv:
    df = pd.concat([df, pd.read_csv(folder2+"/"+category+"/"+file)], axis=0)

df["original_info"] = df["original_info"].astype(str)
df = df.reset_index(drop=True)

#shorten df to test code
# df = df[:300]

def clean_row(row):
    html = row["original_info"]
    soup = BeautifulSoup(html, 'html.parser')
    #the date is in <div class="smalltext">
    date = soup.find("div", {"class": "smalltext"}).text
    #if the date contains "Last edit: ", use that instead
    if "Last edit:" in date:
        date = date.split("Last edit: ")[1]
    if " by " in date:
        date = date.split(" by ")[0]
    #delete <div class="quote">
    for quote in soup.find_all("div", {"class": "quote"}):
        quote.decompose()
    for quote in soup.find_all("div", {"class": "quoteheader"}):
        quote.decompose()
    #post is in <div class="post">
    post = soup.find("div", {"class": "post"}).text
    #remove all links
    post = re.sub(r'http\S+', '<link>', post)
    #remove duplicate spaces
    post = re.sub(' +', ' ', post)
    #remove all non-ascii characters
    post = post.encode("ascii", errors="ignore").decode()
    #remove all words that have a length above 20 (most likely a btc address or something useless)
    post = " ".join([(word if len(word) < 20 else "<truncated>") for word in post.split()])

    return post, date


In [18]:
df["post"], df["date"] = zip(*df.apply(clean_row, axis=1))

In [19]:
df = df[['topic', 'post', 'date']]
# replace "Today at " with "October 1, 2023, "
df["date"] = df["date"].str.replace("Today at ", "September 21, 2023, ")
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values(by=["date"])
#remove "Re: " from topic
df["topic"] = df["topic"].str.replace("Re: ", "")
df

Unnamed: 0,topic,post,date
328027,Bounty: development of efficient open-source F...,20 more BTC here.,2011-05-17 22:00:08
328029,Bounty: development of efficient open-source F...,I too will pledge 20 BTC for the satisfaction ...,2011-05-17 22:05:39
328028,Bounty: development of efficient open-source F...,-----BEGIN PGP SIGNED MESSAGE-----Hash: SHA1Gr...,2011-05-17 22:06:22
328030,Bounty: development of efficient open-source F...,I hate to be the one to be the destroyer of ho...,2011-05-18 06:19:31
328033,Bounty: development of efficient open-source F...,The numbers in the bounty requirements that I ...,2011-05-18 11:10:57
...,...,...,...
281498,Antminer S19 95TH setup,"Thank you Dave,I am letting it run for the tim...",2023-10-29 18:10:34
281502,Antminer S19 95TH setup,try to restart it maybe it will go back to 3 b...,2023-10-29 18:14:31
281499,Antminer S19 95TH setup,"Hi philipma1957,I already did that 2 times, ho...",2023-10-29 18:19:10
281503,Antminer S19 95TH setup,Can you share your miner logs I just want to c...,2023-10-29 22:23:47


In [20]:
# group by topic and append all the strings from the post column
df["date"] = df["date"].astype(str)
df = df.groupby('topic').agg({'post': '<sep>'.join, 'date': '<sep>'.join}).reset_index()

In [21]:
df["dates"] = df["date"]
df['date'] = df['date'].apply(lambda x: x.split("<sep>")[-1])
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values(by=["date"])

In [22]:
df

Unnamed: 0,topic,post,date,dates
2094,Bounty: development of efficient open-source F...,20 more BTC here.<sep>I too will pledge 20 BTC...,2011-05-20 20:49:21,2011-05-17 22:00:08<sep>2011-05-17 22:05:39<se...
2095,Bounty: development of efficient open-source F...,I'm splitting this off from the FPGA mining fo...,2011-05-31 16:32:19,2011-05-31 16:32:19
4398,Official Open Source FPGA Bitcoin Miner (Just ...,"Awesome!<sep>Dude, that's awesome! If I buy on...",2011-06-05 21:29:51,2011-05-20 02:40:58<sep>2011-05-20 02:56:47<se...
2736,FPGA Inflection Point,I'd like to know where I can get a PSU that su...,2011-06-21 05:39:02,2011-06-02 05:53:57<sep>2011-06-02 05:58:58<se...
6794,[help]what is needed to make a fpga/asic,what software is need to design one?whats chip...,2011-06-23 16:55:22,2011-06-22 19:38:06<sep>2011-06-23 16:55:22
...,...,...,...,...
2079,Board Bitmain A113D Amlogic (S19 90TH) not wor...,"I have an Amlogic A113D Control Board, which e...",2023-10-24 14:18:31,2023-10-23 13:27:37<sep>2023-10-23 18:25:40<se...
7191,s19k pro dashboard minerlog,Is there a way to view share difficulty in rea...,2023-10-25 01:52:45,2023-10-24 21:03:06<sep>2023-10-24 22:40:40<se...
7259,testing epic controller. on antminer s19xp,power setting is at 120thmeter set at 12:06 am...,2023-10-29 07:11:42,2023-10-20 13:18:29<sep>2023-10-20 13:22:56<se...
761,Antminer S19 95TH setup,"Hi BitMaxz,Thank you for the offer. The only t...",2023-10-29 22:23:47,2023-09-21 02:42:19<sep>2023-09-21 11:50:04<se...


In [23]:
# import matplotlib.pyplot as plt
# plt.hist(df["start_edit"], bins=10, edgecolor='black')

# # Add labels and title
# plt.xlabel('date')
# plt.ylabel('Value')
# plt.title('Bar Plot of Integers')

# # Show the plot
# plt.show()

# print(df["start_edit"].max())
# print(df["start_edit"].min())

In [24]:
import gzip
import pickle
#save to gzip
with gzip.open('cleaned-data/'+category+'.pkl.gz', 'wb') as f:
    pickle.dump(df, f)