# Twitter Data Cleaning

In [3]:
from time import sleep
import pandas as pd
import re
from tqdm import tnrange, tqdm_notebook, tqdm

PRICE_FOLDER    = "data/price/" 
TWITTER_FOLDER  = "data/twitter/"
tweets_raw_file = './data/twitter/bitcoin_tweets_raw.csv'
tweets_clean_file = './data/twitter/bitcoin_tweets_clean.csv'

In [42]:
# Read in data
d = pd.read_csv(tweets_raw_file)
# Drop duplicate rows to manage dulpicate headers
d = d[d.ID !='ID']
d = d.reset_index(drop=True)

In [45]:
for i, s in enumerate(tqdm(d['Tweets'])):
    text = d.loc[i, 'Tweets']
    text = text.replace("#", "")
    text = text.replace("@", "")
    text = re.sub('https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+', '', text, flags=re.MULTILINE)
    text = re.sub('@\\w+ *', '', text, flags=re.MULTILINE)
    d.loc[i, 'Tweets'] = text

# Remove keywords from dataframe
to_remove = ["Airdrop", "airdrop", "freebitcoin", "freebtc"]
d = d[~d.Tweets.str.contains("|".join(to_remove))]
d = d.reset_index(drop=True)

# Manage dataframe times
d['CreatedAt'] = pd.to_datetime(d['CreatedAt'])
twitter_hr = []
twitter_next_hr = []
for i, s in enumerate(tqdm(d['CreatedAt'])):
    t = s.floor(freq='H')
    n = s.ceil(freq='H')
    twitter_hr.append(t)
    twitter_next_hr.append(n)
d['Hour'] = twitter_hr
d['NextHour'] = twitter_next_hr

100%|██████████| 4500/4500 [00:04<00:00, 917.42it/s] 
100%|██████████| 4051/4051 [00:02<00:00, 1990.89it/s]


In [52]:
# Read clean file
try:
    clean_df = pd.read_csv(tweets_clean_file)
except pd.io.common.EmptyDataError:
    clean_df = pd.DataFrame()
# Merge raw and clean
clean_df_updated = pd.concat([clean_df, d])
# Drop duplicates
clean_df_updated = clean_df_updated.drop_duplicates(subset=['Tweets'])

In [12]:
# Write clean csv
f = open(tweets_clean_file, 'a+', encoding='utf-8')
clean_df_updated.to_csv(f, header=True, encoding='utf-8',index=False)
f.close()

In [19]:
# Update and clear raw csv
r = open(tweets_raw_file, 'w+', encoding='utf-8')
r.close()

In [17]:
print('Twitter data cleaned sucessfully')

Twitter data cleaned sucessfully


## Write script

In [21]:
!jupyter nbconvert --to script --no-prompt 02_TwitterDataCleaning.ipynb

[NbConvertApp] Converting notebook 02_TwitterDataCleaning.ipynb to script
[NbConvertApp] Writing 2488 bytes to 02_TwitterDataCleaning.py
