In [1]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from langdetect import detect

  from .autonotebook import tqdm as notebook_tqdm


### Bitcoin tweets dataset processing

In [None]:
dtypes = {
    "id": 'string',
    "user": 'string',
    "fullname": 'string',
    "url": 'string',
    "timestamp": 'string',
    "replies": 'Int64',
    "likes": 'Int64',
    "retweets": 'Int64',
    "text": 'string',
}

raw_data = pd.read_csv('./raw_data/tweets.csv', dtype=dtypes, sep=';', on_bad_lines='skip')

data = raw_data.drop(columns=['id', 'fullname', 'url'])
data['timestamp'] = pd.to_datetime(data['timestamp'])
data = data.dropna(subset=['timestamp'])
start = '01-Jan-2018'
end = '01-Jan-2023'
data = data.loc[(data['timestamp'] >= start) & (data['timestamp'] <= end)]
data = data.set_index('timestamp')
data = data.loc[data['replies'] + data["likes"] + data["retweets"] >= 1]
data.info()

data.head(2)

### Scraped tweets processing

In [3]:
dtypes = {
    "created_at": 'string',
    "username": 'string',
    "replies_count": 'Int64',
    "retweets_count": 'Int64',
    "likes_count": 'Int64',
    "retweet": 'bool',
    "tweet": 'string',
}

raw_data = pd.read_csv('./raw_data/bitcoin_scraped.csv', dtype=dtypes, on_bad_lines='skip')
raw_data.rename(columns = {
    'created_at':'timestamp',
    'username': 'user',
    'replies_count': 'replies',
    'retweets_count': 'retweets',
    'likes_count': 'likes',
    'tweet': 'text',
    }, inplace = True)

data = raw_data.drop(columns=['retweet'])
data['timestamp'] = data['timestamp'].apply(lambda x: x.replace(' Central Europe Summer Time', '+00:00'))
data['timestamp'] = pd.to_datetime(data['timestamp'])
data = data.dropna(subset=['timestamp'])
start = '01-Jan-2018'
end = '01-Jan-2023'
data = data.loc[(data['timestamp'] >= start) & (data['timestamp'] <= end)]
data = data.set_index('timestamp')
data = data.loc[data['replies'] + data["likes"] + data["retweets"] >= 1]
data.info()

data.head(2)

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 14101 entries, 2022-05-20 22:08:42+00:00 to 2022-05-20 14:36:55+00:00
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   user      14101 non-null  string
 1   replies   14101 non-null  Int64 
 2   retweets  14101 non-null  Int64 
 3   likes     14101 non-null  Int64 
 4   text      14101 non-null  string
dtypes: Int64(3), string(2)
memory usage: 702.3 KB


Unnamed: 0_level_0,user,replies,retweets,likes,text
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2022-05-20 22:08:42+00:00,lumberhawk,1,0,0,I need to get better at slow-pilling. I feel l...
2022-05-20 22:08:23+00:00,takethatcdc,0,0,1,Elon Musk dazzles world with plan to manufactu...


### Detect tweet language

In [4]:
langs = []

loop_obj = tqdm(data.iterrows(), total=data.shape[0], position=0, leave=False)
for i, s in loop_obj:
    try:
        langs.append(detect(s["text"]))
    except:
        text = str(s["text"]).replace("\n", "")
        loop_obj.set_postfix_str(f"\t|\tCan't determine language: %s" % text) 
        langs.append(np.nan)
            

data["language"] = langs
data.to_csv('./data/tweets_raw_2020-2022.csv')

data = data[data.language.eq('en')]
data.head(2)

                                                                                                               

Unnamed: 0_level_0,user,replies,retweets,likes,text,language
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-05-20 22:08:42+00:00,lumberhawk,1,0,0,I need to get better at slow-pilling. I feel l...,en
2022-05-20 22:08:23+00:00,takethatcdc,0,0,1,Elon Musk dazzles world with plan to manufactu...,en


### Combine datasets

In [5]:
dtypes = {
    "timestamp": 'string',
    "user": 'string',
    "replies": 'Int64',
    "retweets": 'Int64',
    "likes": 'Int64',
    "text": 'string',
    "language": 'string',
}

data_2018_2019 = pd.read_csv('./raw_data/tweets_raw_2018-2019.csv', dtype=dtypes, on_bad_lines='skip')
data_2020_2022 = pd.read_csv('./raw_data/tweets_raw_2020-2022.csv', dtype=dtypes, on_bad_lines='skip')
data_2018_2019.info()
data_2020_2022.info()

data_2018_2022 = pd.concat([data_2018_2019, data_2020_2022])
data_2018_2022 = data_2018_2022.drop(columns=['language'])
data_2018_2022.info()
data_2018_2022.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2195331 entries, 0 to 2195330
Data columns (total 7 columns):
 #   Column     Dtype 
---  ------     ----- 
 0   timestamp  string
 1   user       string
 2   replies    Int64 
 3   likes      Int64 
 4   retweets   Int64 
 5   text       string
 6   language   string
dtypes: Int64(3), string(4)
memory usage: 123.5 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14101 entries, 0 to 14100
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   timestamp  14101 non-null  string
 1   user       14101 non-null  string
 2   replies    14101 non-null  Int64 
 3   retweets   14101 non-null  Int64 
 4   likes      14101 non-null  Int64 
 5   text       14101 non-null  string
 6   language   14087 non-null  string
dtypes: Int64(3), string(4)
memory usage: 812.6 KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2209432 entries, 0 to 14100
Data columns (total 6 columns):
 #   Colum

Unnamed: 0,timestamp,user,replies,likes,retweets,text
0,2019-05-27 11:49:06+00:00,3eyedbran,0,2,1,Another Test tweet that wasn't caught in the s...
1,2019-05-27 11:49:19+00:00,ltonews,0,14,2,One of the useful articles of Stefan; here is ...
2,2019-05-21 16:49:45+00:00,giving_airdrops,47,81,84,"BTC IS STILL GOING STRONG!! Thus, we are givi..."
3,2019-05-22 12:42:16+00:00,Cybintelligence,3,2,7,BestMixer has been seized by the Dutch Police ...
4,2019-05-27 11:49:30+00:00,optbus_hw45,1,1,1,Invested my Life Savings into Bitcoin and Ethe...


In [6]:
data_2018_2022 = data_2018_2022.dropna(subset=['text'])
data_2018_2022['text'] = data_2018_2022['text'].apply(lambda x: x.replace('\n', ' '))
data_2018_2022['text'] = data_2018_2022['text'].apply(lambda x: x.replace('\r', ' '))


data_2018_2022 = data_2018_2022.sort_values(by=['timestamp'], ascending=False)
data_2018_2022.set_index('timestamp', inplace=True)
data_2018_2022.to_csv('./data/btc_tweets_2018-2022.csv')
data_2018_2022.head()

Unnamed: 0_level_0,user,replies,likes,retweets,text
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2022-05-20 22:08:42+00:00,lumberhawk,1,0,0,I need to get better at slow-pilling. I feel l...
2022-05-20 22:08:23+00:00,takethatcdc,0,1,0,Elon Musk dazzles world with plan to manufactu...
2022-05-20 22:08:21+00:00,dbonatoliv,0,1,0,"@BitcoinMagazine Its fine, we don't need more ..."
2022-05-20 22:08:20+00:00,theincomeblog,0,1,0,Bitmain Antminer APW7 PSU 1800W Power Supply f...
2022-05-20 22:07:55+00:00,doctoryev,0,1,0,"""Web3"" is uptrending the last 2 months. Other..."
