In [76]:
import pandas as pd
import os
import re
from bs4 import BeautifulSoup
pd.set_option('display.max_rows', 50)

df = pd.DataFrame()

category = "groupbuys"
# folder2 = "sorted-preprocessed-data"
folder2 = "raw-data"
#load every csv in the folder and append them
csv = os.listdir(folder2+"/"+category)
for file in csv:
    df = pd.concat([df, pd.read_csv(folder2+"/"+category+"/"+file)], axis=0)

df["original_info"] = df["original_info"].astype(str)
df = df.reset_index(drop=True)

#shorten df to test code
# df = df[:300]

def clean_row(row):
    html = row["original_info"]
    soup = BeautifulSoup(html, 'html.parser')
    #the date is in <div class="smalltext">
    date = soup.find("div", {"class": "smalltext"}).text
    #if the date contains "Last edit: ", use that instead
    if "Last edit:" in date:
        date = date.split("Last edit: ")[1]
    if " by " in date:
        date = date.split(" by ")[0]
    #delete <div class="quote">
    for quote in soup.find_all("div", {"class": "quote"}):
        quote.decompose()
    for quote in soup.find_all("div", {"class": "quoteheader"}):
        quote.decompose()
    #post is in <div class="post">
    post = soup.find("div", {"class": "post"}).text
    #remove all links
    post = re.sub(r'http\S+', '<link>', post)
    #remove duplicate spaces
    post = re.sub(' +', ' ', post)
    #remove all non-ascii characters
    post = post.encode("ascii", errors="ignore").decode()
    #remove all words that have a length above 20 (most likely a btc address or something useless)
    post = " ".join([(word if len(word) < 20 else "<truncated>") for word in post.split()])

    return post, date


In [77]:
df["post"], df["date"] = zip(*df.apply(clean_row, axis=1))

In [78]:
df = df[['topic', 'post', 'date']]
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values(by=["date"])
#remove "Re: " from topic
df["topic"] = df["topic"].str.replace("Re: ", "")
df

  df['date'] = pd.to_datetime(df['date'])


Unnamed: 0,topic,post,date
2977,[Group Buy in China] Avalon batch #3,List of <truncated> Forum Name - Bitcoin Addre...,2013-03-25 10:16:23
2973,[Group Buy in China] Avalon batch #3,Interesting!I am in China too.I will consider ...,2013-03-25 10:42:45
2978,[Group Buy in China] Avalon batch #3,"Thanks for trusting me,225BTC received! <trunc...",2013-03-25 11:06:33
2970,CANCELLED - [Group Buy for Europe] Pooling for...,I am willing to organize a community purchase ...,2013-03-25 11:39:13
2986,Avalon Group buy - Germany,"Hi, I'm in time to invest?",2013-03-25 12:47:34
...,...,...,...
66119,"Antminer T19 Hydro for 2175$, any interest?",Googling a bit i found this: <link> the T vers...,2022-08-15 10:26:46
66120,"Antminer T19 Hydro for 2175$, any interest?","As i said, T versions have fewer chips at high...",2022-08-15 14:33:30
66122,"Antminer T19 Hydro for 2175$, any interest?",i asked the same thing from Bitmain support an...,2022-08-16 09:16:14
66121,"Antminer T19 Hydro for 2175$, any interest?",hi Did you see the pictures in those links i s...,2022-08-16 09:17:43


In [79]:
# group by topic and append all the strings from the post column
df["date"] = df["date"].astype(str)
df = df.groupby('topic').agg({'post': '<sep>'.join, 'date': '<sep>'.join}).reset_index()

In [80]:
df["dates"] = df["date"]
df['date'] = df['date'].apply(lambda x: x.split("<sep>")[-1])
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values(by=["date"])

In [81]:
df

Unnamed: 0,topic,post,date,dates
14,[Group Buy in China] Avalon batch #3,List of <truncated> Forum Name - Bitcoin Addre...,2013-03-25 11:06:33,2013-03-25 10:16:23<sep>2013-03-25 10:42:45<se...
224,Avalon Group buy - Germany,"Hi, I'm in time to invest?<sep>too late, sorry",2013-03-25 13:13:44,2013-03-25 12:47:34<sep>2013-03-25 13:13:44
225,Avalon Group buy - Germany - closed,I am willing to organize a community purchase ...,2013-03-25 13:14:29,2013-03-25 13:14:29
488,I'm about to buy Avalon asic 85,"it sold out, its gone.<sep>People just order b...",2013-03-25 14:26:29,2013-03-25 13:46:12<sep>2013-03-25 14:26:29
281,CANCELLED [Group Buy in China] Avalon batch #3,Do you not have enough BTC to buy a batch 3 Av...,2013-03-25 16:42:09,2013-03-25 16:42:09
...,...,...,...,...
485,I am placing an order for bitmain s17 pro psu's,So I am ordering 2 of these.I can order more i...,2020-09-18 17:02:19,2020-09-17 03:12:35<sep>2020-09-18 17:02:19
275,Bulk order splitting - Looking for a few peopl...,So after digging around and finding a reputabl...,2022-01-26 23:53:46,2021-10-28 23:45:11<sep>2021-10-29 17:49:47<se...
1269,"[CLOSED]#KN-R001 3TH/s Neptune KnCMiner, ord. ...",Awesome! Thanks James!<sep>Mail sent Thanks Ja...,2022-02-08 16:55:48,2013-12-13 19:00:20<sep>2013-12-13 20:09:11<se...
197,"Antminer T19 Hydro for 2175$, any interest?",Here's the link: <link> quite interested in th...,2022-08-16 09:17:43,2022-07-17 17:31:18<sep>2022-07-23 00:53:04<se...


In [82]:
# import matplotlib.pyplot as plt
# plt.hist(df["start_edit"], bins=10, edgecolor='black')

# # Add labels and title
# plt.xlabel('date')
# plt.ylabel('Value')
# plt.title('Bar Plot of Integers')

# # Show the plot
# plt.show()

# print(df["start_edit"].max())
# print(df["start_edit"].min())

In [83]:
import gzip
import pickle
#save to gzip
with gzip.open('cleaned-data/'+category+'.pkl.gz', 'wb') as f:
    pickle.dump(df, f)