In [49]:
import pandas as pd
import os
import re
from bs4 import BeautifulSoup
pd.set_option('display.max_rows', 50)

# Category (forum board) to process; enable exactly one line. The same name
# selects the input sub-folder here and is reused when the result is saved.
# category = "groupbuys"
# category = "hardware"
# category = "miners"
# category = "mining"
# category = "mining_speculation"
# category = "mining_support"
category = "pools"


# Root folder that holds one sub-folder of CSV files per category.
folder2 = "raw-data"

# Load every csv in the category folder.
# - entries are sorted so the row order is reproducible between runs
#   (os.listdir returns names in arbitrary order);
# - anything that is not a .csv (e.g. .ipynb_checkpoints, .DS_Store) is skipped.
category_dir = os.path.join(folder2, category)
csv = sorted(name for name in os.listdir(category_dir) if name.lower().endswith(".csv"))
if not csv:
    raise FileNotFoundError("no .csv files found in " + category_dir)

# Read all files first and concatenate once: calling pd.concat inside the loop
# copies the accumulated frame again for every file. ignore_index=True gives
# the same fresh 0..n-1 index that reset_index(drop=True) produced before.
df = pd.concat(
    [pd.read_csv(os.path.join(category_dir, file)) for file in csv],
    axis=0,
    ignore_index=True,
)

# The raw HTML is handed to BeautifulSoup later, which needs text input;
# casting turns missing values into the string "nan" instead of a float.
df["original_info"] = df["original_info"].astype(str)

#shorten df to test code
# df = df[:300]

def clean_row(row):
    """Extract the cleaned post text and the post date from one scraped row.

    Args:
        row: mapping-like object (e.g. a DataFrame row) whose "original_info"
            entry holds the HTML of a single post as a string.

    Returns:
        A ``(post, date)`` tuple of strings.

        * ``date`` is the text of the first ``<div class="smalltext">``. When
          it contains "Last edit:", only the text after that marker is kept,
          and anything from " by " onwards is dropped.
        * ``post`` is the text of the first ``<div class="post">`` after all
          ``quote`` / ``quoteheader`` divs were removed, with links replaced
          by "<link>", runs of spaces collapsed, non-ASCII characters dropped
          and every word of 20 or more characters replaced by "<truncated>".

        Either value is "" when the corresponding div is not present in the
        HTML (for example when the cell was empty in the CSV).
    """
    html = row["original_info"]
    soup = BeautifulSoup(html, 'html.parser')

    #the date is in <div class="smalltext">
    # find() returns None when nothing matches, so guard before reading .text
    date_div = soup.find("div", {"class": "smalltext"})
    date = date_div.text if date_div is not None else ""
    #if the date contains "Last edit:", use the edit timestamp instead
    if "Last edit:" in date:
        # split on exactly the marker that was tested above, so a missing
        # space after the colon cannot cause an IndexError
        date = date.split("Last edit:", 1)[1].lstrip()
    if " by " in date:
        date = date.split(" by ")[0]

    #delete <div class="quote"> and <div class="quoteheader">
    for quote in soup.find_all("div", {"class": "quote"}):
        quote.decompose()
    for quote in soup.find_all("div", {"class": "quoteheader"}):
        quote.decompose()

    #post is in <div class="post">
    post_div = soup.find("div", {"class": "post"})
    post = post_div.text if post_div is not None else ""
    #replace all links with a placeholder
    post = re.sub(r'http\S+', '<link>', post)
    #remove duplicate spaces
    post = re.sub(' +', ' ', post)
    #remove all non-ascii characters
    post = post.encode("ascii", errors="ignore").decode()
    #replace all words of 20 or more characters (most likely a btc address or something useless)
    post = " ".join([(word if len(word) < 20 else "<truncated>") for word in post.split()])

    return post, date


In [50]:
# Run clean_row on every row, then split the resulting (post, date) pairs
# into the two new columns.
cleaned_pairs = df.apply(clean_row, axis=1)
df["post"] = [pair[0] for pair in cleaned_pairs]
df["date"] = [pair[1] for pair in cleaned_pairs]

In [51]:
# Keep only the columns used from here on. The explicit .copy() makes this an
# independent frame, so the column assignments below do not raise
# SettingWithCopyWarning.
df = df[['topic', 'post', 'date']].copy()
# Replace the relative "Today at " prefix with an absolute calendar date so
# these rows parse like every other timestamp.
# NOTE(review): the date is hard-coded - confirm it is the day this category was scraped.
df["date"] = df["date"].str.replace("Today at ", "September 21, 2023, ", regex=False)
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values(by=["date"])
# Remove the reply prefix "Re: " (possibly repeated) from the start of the
# topic only; a "Re: " that appears inside a title is left untouched.
df["topic"] = df["topic"].str.replace(r"^(?:Re: )+", "", regex=True)
df

Unnamed: 0,topic,post,date
47128,Cooperative mining,"Actually, they are equal. The low-power machin...",2010-11-27 14:35:56
47138,Cooperative mining,I will offer a Six-Way AMD Processor to the Co...,2010-11-27 14:48:03
47137,Cooperative mining,"i always liked the idea of distributed mining,...",2010-11-27 14:48:49
47136,Cooperative mining,Sure seems fair its better then waiting 1 year...,2010-11-27 16:12:24
47135,Cooperative mining,You are true for long term. But I think many p...,2010-11-27 16:20:26
...,...,...,...
122306,[∞ YH] solo.ckpool.org 2% fee solo mining 277...,I would love to add another 0 to get 20 th. Un...,2023-10-25 04:39:10
122303,[∞ YH] solo.ckpool.org 2% fee solo mining 277...,Even if you add a zero go from 2 to 20 th you ...,2023-10-25 13:25:21
122308,[∞ YH] solo.ckpool.org 2% fee solo mining 277...,You still mining on that small pool? is it sti...,2023-10-25 17:50:03
122304,[∞ YH] solo.ckpool.org 2% fee solo mining 277...,"Hello all,The Blockparty 47 is over, no block ...",2023-10-25 19:33:12


In [52]:
# group by topic and append all the strings from the post column
# Collapse the frame to one row per topic: the posts of a topic, and their
# dates rendered as text, are each concatenated with "<sep>" between entries.
df = df.assign(date=df["date"].astype(str))
df = df.groupby("topic", as_index=False).agg(
    post=("post", "<sep>".join),
    date=("date", "<sep>".join),
)

In [53]:
# Keep the full "<sep>"-joined date string in "dates"; "date" becomes the
# final entry of that string, parsed as a timestamp and used as the sort key.
df["dates"] = df["date"]
final_entry = df["dates"].map(lambda joined: joined.rsplit("<sep>", 1)[-1])
df["date"] = pd.to_datetime(final_entry)
df = df.sort_values(by=["date"])

In [54]:
# Display the grouped frame: one row per topic, sorted by the "date" column.
df

Unnamed: 0,topic,post,date,dates
757,"Cooperative mining (>900Mhash/s already, join ...",You might like to code this feature:When we fi...,2010-12-16 08:55:56,2010-12-16 08:11:14<sep>2010-12-16 08:23:31<se...
748,"Cooperative mining (>1100Mhash/s already, join...","yay, seems like we already solved our 2nd bloc...",2010-12-16 18:13:08,2010-12-16 15:46:03<sep>2010-12-16 16:07:10<se...
749,"Cooperative mining (>1300Mhash/s already, join...",It is explained on homepage - by my extremely ...,2010-12-17 01:55:12,2010-12-16 18:27:35<sep>2010-12-16 18:35:23<se...
750,"Cooperative mining (>1500Mhash/s already, join...",2 transactions received.12.08btc contributing ...,2010-12-18 02:08:13,2010-12-17 02:24:00<sep>2010-12-17 03:02:15<se...
751,"Cooperative mining (>1700Mhash/s, join us!)","Well done, fifth block is arriving!",2010-12-18 02:18:39,2010-12-18 02:18:39
...,...,...,...,...
5233,pool.vkbit.com - solo mining pool [stats @ vkb...,"Almost free service, from now on our fee is 0....",2023-09-11 13:10:05,2023-09-11 13:01:02<sep>2023-09-11 13:03:03<se...
5234,pool.vkbit.com - the ultimate Bitcoin solo min...,I hope this is useful for low hash rate miners...,2023-09-15 06:22:09,2023-09-15 06:22:09
5049,deleted,Thanks<sep>thanks<sep>deleted<sep>.,2023-09-19 18:37:58,2014-03-04 08:53:58<sep>2014-03-04 08:54:42<se...
1246,KanoPool since 2014 🐈 - PPLNS and Solo 0.5% fe...,<link> payout help and information: <link> mos...,2023-10-18 12:27:23,2023-08-05 16:20:00<sep>2023-08-05 22:53:30<se...


In [55]:
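# NOTE: disabled exploratory histogram. It reads a "start_edit" column that is
# not created in any of the cells shown above, so it will not run as-is if
# it is uncommented.
# import matplotlib.pyplot as plt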
# plt.hist(df["start_edit"], bins=10, edgecolor='black')

# # Add labels and title
# plt.xlabel('date')
# plt.ylabel('Value')
# plt.title('Bar Plot of Integers')

# # Show the plot
# plt.show()

# print(df["start_edit"].max())
# print(df["start_edit"].min())

In [56]:
import gzip
import pickle
# Save the grouped frame as a gzip-compressed pickle named after the category.
output_dir = 'cleaned-data'
# gzip.open does not create missing folders; make sure the target exists so
# the save does not fail with FileNotFoundError on a fresh checkout.
os.makedirs(output_dir, exist_ok=True)
#save to gzip
with gzip.open(output_dir+'/'+category+'.pkl.gz', 'wb') as f:
    pickle.dump(df, f)