In [1]:
import pandas as pd
import sqlite3
import numpy as np
import matplotlib.pyplot as plt
from wrappers import data_processing as dp
import nltk

%matplotlib inline

In [2]:
def remove_links_mentions(_str):
    """Strip links, @-mentions and the retweet marker from a tweet.

    The text is split on whitespace and a token is dropped when it contains
    any of the link/mention fragments ('@', 't.co', 'http', '.com', '.net',
    '.org', '.io') or when it is exactly the retweet marker 'rt'.  Matching is
    case-sensitive, so callers should lowercase the text first.

    The retweet marker is compared against the whole token instead of being
    searched for as a substring: a substring test also throws away ordinary
    words such as 'short', 'start' or 'support'.

    Args:
        _str: Tweet text.

    Returns:
        The surviving tokens joined by single spaces ('' when none survive).
    """
    link_fragments = ('@', 't.co', 'http', '.com', '.net', '.org', '.io')
    kept = [
        token for token in _str.split()
        if token != 'rt' and not any(frag in token for frag in link_fragments)
    ]
    return ' '.join(kept)

def replace_punctuation(_str, punctuation_list):
    """Return ``_str`` with every entry of ``punctuation_list`` turned into a space.

    Entries are substituted one after another, in list order; each occurrence
    of an entry becomes a single ' '.
    """
    spaced = _str
    for mark in punctuation_list:
        spaced = spaced.replace(mark, ' ')
    return spaced

def remove_words_by_str(word_list, char_list):
    """Drop every word that contains any of the given substrings.

    ``char_list`` is walked in order and, for each entry, the words containing
    it are filtered out of what is left.  When ``char_list`` is empty the
    input is handed back untouched.
    """
    survivors = word_list
    for fragment in char_list:
        survivors = list(filter(lambda word: fragment not in word, survivors))
    return survivors

# take the cleaned token list and concat it so it is one string
def concat_cleaned_tweet(token_list):
    """Glue a list of tokens back into one space-separated string.

    After joining, the space in front of '.', ',', '?' and '!' is removed and
    a '.' that directly follows '?' or '!' is dropped.  Surrounding whitespace
    is stripped from the result.
    """
    # applied strictly in this order
    spacing_fixes = (
        (' .', '.'), (' ,', ','),
        (' ?', '?'), (' !', '!'),
        ('?.', '?'), ('!.', '!'),
    )
    joined = ' '.join(token_list)
    for pattern, replacement in spacing_fixes:
        joined = joined.replace(pattern, replacement)
    return joined.strip()

# the cleaning above is targeted but not strict enough for Amazon's MTurk -- keep only purely alphabetic tokens
def clean_with_nltk(_str):
    """Keep only the purely alphabetic tokens of ``_str``.

    The text is tokenized with ``nltk.word_tokenize``; tokens for which
    ``isalpha()`` is false (e.g. numbers or punctuation) are discarded and the
    remaining ones are joined with single spaces.
    """
    alpha_tokens = filter(str.isalpha, nltk.word_tokenize(_str))
    return ' '.join(alpha_tokens).strip()

# Load Data

In [3]:
conn = sqlite3.connect('db/preprocessed.db')

get_cols_query = '''
SELECT id, tweets, tweets_bull_count, tweets_bear_count
FROM twitter
WHERE tweets_bull_count > 0
OR tweets_bear_count > 0
LIMIT 10000000;
'''

# make sure the connection is released even when the query fails
try:
    df = pd.read_sql(get_cols_query, conn)
finally:
    conn.close()

# use the tweet id as the index (this also removes the `id` column) and sort by it;
# `set_index` replaces `df.drop('id', 1)`, whose positional `axis` argument is
# rejected by pandas >= 2.0
df = df.set_index('id').sort_index(ascending = True)

In [3]:
# NOTE(review): '$BTC' is listed twice; the last entry was presumably meant to
# be '$ETH'. Left unchanged because how `dp` uses this list is not visible
# here -- confirm before editing.
twitter_queries = ['bitcoin', '#BTC', '$BTC', 'ethereum', '#ETH', '$BTC']
n_tweets = 8000000

# load the most recent preprocessed tweets
df = dp.get_recent_preprocessed_twitter_data(twitter_queries, n_tweets)

# use the tweet id as the index (this also removes the `id` column) and sort by it;
# `set_index` replaces `df.drop('id', 1)`, whose positional `axis` argument is
# rejected by pandas >= 2.0
df = df.set_index('id').sort_index(ascending = True)

loading data from preprocessed db


In [4]:
# per-class sample size and a fixed seed so the sample can be reproduced
SAMPLE_SIZE = 50000
SAMPLE_SEED = 42

bull_tweets = df[df.tweets_bull_count > 0]
bear_tweets = df[df.tweets_bear_count > 0]

# get a sample of both bull- and bear-likely tweets; the size is capped at the
# number of available rows because `sample` raises when asked for more
bull_sample = bull_tweets.sample(min(SAMPLE_SIZE, len(bull_tweets)), random_state = SAMPLE_SEED)
bear_sample = bear_tweets.sample(min(SAMPLE_SIZE, len(bear_tweets)), random_state = SAMPLE_SEED)

# combine them into one (`DataFrame.append` was removed in pandas 2.0),
# drop the duplicates, and sort the index
sample = pd.concat([bull_sample, bear_sample])
df = sample.drop_duplicates()
df = df.sort_index()

df = df[['tweets', 'tweets_bull_count', 'tweets_bear_count']]

In [8]:
# preview the first five rows of the sampled frame
df.head(5)

Unnamed: 0_level_0,tweets,tweets_bull_count,tweets_bear_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
907864088298082311,Strong words from JPM's Dimon but I think he's...,1,0
907864146615599104,"RT @yicaichina: BitKan Suspends OTC Trading, V...",0,1
907864148964282368,Bring on the $BTC sale day!!! Let's get the co...,0,1
907864192618655744,"RT @yicaichina: BitKan Suspends OTC Trading, V...",0,1
907864395853783040,Bitcoin has now broken the channel. $3150 seem...,0,2


# Show Sample

In [9]:
# print the first ten tweets, each followed by a blank line
for raw_tweet in df['tweets'][:10]:
    print(raw_tweet, end = '\n\n')

Strong words from JPM's Dimon but I think he's right. https://t.co/Kt0D4qNYK4

RT @yicaichina: BitKan Suspends OTC Trading, Via BTC Won't Shut Down: Chinese Bitcoin Exchanges Feel Regulatory Heat https://t.co/1cBVLcsUr…

Bring on the $BTC sale day!!! Let's get the correction out of the way already so we can continue the uptrend… https://t.co/uylL6mlaRe

RT @yicaichina: BitKan Suspends OTC Trading, Via BTC Won't Shut Down: Chinese Bitcoin Exchanges Feel Regulatory Heat https://t.co/1cBVLcsUr…

Bitcoin has now broken the channel. $3150 seems a valid target, but it tends to break down overnight and squeeze ba… https://t.co/9iwLfLYE2W

just wait for the tipping point of fiat or for fiat failure.. https://t.co/zVlOqc85lK

RT @carterthomas: Nothing like the fresh smell of fear from a $BTC free fall

How Low Could Bitcoin Go on Negative Coverage? https://t.co/27nmt1oN8V

#Bitcoin and the Jamie Dimon Bear: #Cryptocurrency Prices Approach 20% Slump https://t.co/WHdPA4UprX https://t.co/85VN5uLdW

# Steps:

 - make everything lowercase
 - remove all links and mentions
 - replace punctuation with a space (so that splitting yields only words, and words typed without a space after punctuation are still separated)
 - remove common words and elements

In [10]:
# 1) lowercase the text
df['tweets'] = df['tweets'].str.lower()

# 2) drop the tokens that look like links or mentions
df['tweets'] = df['tweets'].apply(remove_links_mentions)

# 3) turn the punctuation listed below into spaces
punctuation_to_replace = [
    '~', '`', '^', '*', '(', ')', '-', '_',
    '+', '=', '{', '}', '[', ']', '|', '\\',
    ':', ';', '"', "'", '<', '>', '/', ',',
]
df['tweets'] = df['tweets'].apply(replace_punctuation, args = (punctuation_to_replace,))

# 4) whitespace-split every tweet into a list of tokens
df['tokens'] = df['tweets'].apply(str.split)

# 5) rebuild one string per tweet from its tokens
df['cleaned_tweets'] = df['tokens'].apply(concat_cleaned_tweet)

In [11]:
# EXTRA CLEAN (ONLY ALPHA, VIA NLTK)
df['cleaned_tweets'] = df['cleaned_tweets'].apply(clean_with_nltk)

In [12]:
# preview the first five rows, now including the `tokens` and `cleaned_tweets` columns
df.head(5)

Unnamed: 0_level_0,tweets,tweets_bull_count,tweets_bear_count,tokens,cleaned_tweets
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
907864088298082311,strong words from jpm s dimon but i think he s...,1,0,"[strong, words, from, jpm, s, dimon, but, i, t...",strong words from jpm s dimon but i think he s...
907864146615599104,bitkan suspends otc trading via btc won t shu...,0,1,"[bitkan, suspends, otc, trading, via, btc, won...",bitkan suspends otc trading via btc won t shut...
907864148964282368,bring on the $btc sale day!!! let s get the co...,0,1,"[bring, on, the, $btc, sale, day!!!, let, s, g...",bring on the btc sale day let s get the correc...
907864192618655744,bitkan suspends otc trading via btc won t shu...,0,1,"[bitkan, suspends, otc, trading, via, btc, won...",bitkan suspends otc trading via btc won t shut...
907864395853783040,bitcoin has now broken the channel. $3150 seem...,0,2,"[bitcoin, has, now, broken, the, channel., $31...",bitcoin has now broken the channel seems a val...


# EXPORT

In [13]:
# keep just the cleaned text column for the export
only_cleaned_tweets = df['cleaned_tweets']

In [14]:
# write each distinct cleaned tweet once to the classification CSV
unique_cleaned_tweets = only_cleaned_tweets.drop_duplicates()
unique_cleaned_tweets.to_csv('tweets_for_classification.csv', encoding = 'utf-8')