# Data Analysis
## Tweet data

In [2]:
import sqlite3
import pandas as pd
from afinn import Afinn
import json

# Path of the SQLite database file, relative to the notebook's working directory.
DB_PATH = '../tweets.db'

# Open the database once; the cursor `c` is reused by the query cells below.
db = sqlite3.connect(DB_PATH)
c = db.cursor()

In [3]:
# Column labels applied, in order, to the fields returned by SELECT *.
tweet_columns = ['id', 'date', 'json', 'filter']

# Cursor.execute() returns the cursor itself, so the fetch can be chained.
rows = c.execute("SELECT * FROM tweets").fetchall()

tweets = pd.DataFrame(rows, columns=tweet_columns)
# Parse the stored date strings into datetime64 values.
tweets['date'] = pd.to_datetime(tweets['date'], format='%Y-%m-%d %H:%M:%S')

In [4]:
# Single AFINN scorer built with the library's default arguments;
# sentiment_score() below reads this module-level name.
afinn = Afinn()

def tweet_text(data):
    """Return the text of a tweet given its raw JSON string.

    When the decoded payload has an ``extended_tweet`` entry, its
    ``full_text`` value is returned; otherwise the top-level ``text``
    field is returned.
    """
    payload = json.loads(data)
    if 'extended_tweet' not in payload:
        return payload['text']
    return payload['extended_tweet']['full_text']

def sentiment_score(data):
    """Return ``afinn.score`` of the text extracted from a raw tweet JSON string."""
    text = tweet_text(data)
    return afinn.score(text)

def is_retweet(data):
    """Return True when the decoded tweet JSON has a ``retweeted_status`` key."""
    return 'retweeted_status' in json.loads(data)

def tweet_symbols(data):
    """Return the ``entities.symbols`` value of a raw tweet JSON string."""
    entities = json.loads(data)['entities']
    return entities['symbols']

def is_multiple_cashtag(tweet):
    """Return True when the tweet's ``entities.symbols`` list has two or more entries."""
    symbols = json.loads(tweet)['entities']['symbols']
    return len(symbols) >= 2

In [5]:
# New column name -> function that derives its value from a raw tweet JSON string.
# Insertion order matches the order in which the columns are added.
derived_columns = {
    'sentiment_score': sentiment_score,
    'text': tweet_text,
    'retweet': is_retweet,
    'symbols': tweet_symbols,
}

for column_name, extractor in derived_columns.items():
    tweets[column_name] = tweets['json'].apply(extractor)

In [6]:
# Widen the column display so long tweet texts are shown in full.
pd.set_option('display.max_colwidth', 3000)

# Select rows and columns in one .loc call instead of chained indexing.
positive_tweets = tweets.loc[
    tweets['sentiment_score'] > 0,
    ['sentiment_score', 'text', 'retweet'],
]

# Fixed seed so the displayed rows are the same on every Restart & Run All;
# the row count is capped so the cell does not raise when fewer than five
# tweets have a positive score.
positive_tweets.sample(n=min(5, len(positive_tweets)), random_state=42)


Unnamed: 0,sentiment_score,text,retweet
511,4.0,RT @iiblockchain: #Giveaway #14\n\nWhich one will be the future of payments? $XRP or $XLM or $LTC\n\nWe will giveaway $100 of the winning #cryp…,True
571,3.0,super https://t.co/zEJuHKm6ut,False
296,10.0,"Top 100 avg 1h return: -1.1±1.8%; 11 up, 89 down\n$BTC -1.3% $ETH -0.7%\nBest:\n12.1% $ZCL @ZclassicCoin\n3.8% $WAVES @wavesplatform\n1.7% $ICX @helloiconworld\nTop 101-200 avg 1h return: -1.8±1.9%; 6 up, 94 down\nBest:\n5.1% $WGR @wagerrx\n1.9% $LUN @LunyrInc\n0.4% $BCO @CryptoBridge",False
245,5.0,RT @TheK1ng33k: Feeling generous. Like and Retweet this Tweet and whenever #Bitcoin $BTC hit $20K I will give 3 random people $1k in Bitcoi…,True
505,2.0,"RT @Aruwba: 💥FLASH SALE!💥\n Come join my uncensored snap, with fresh daily posts! \n🚨Get it now for ONLY $25 FOR LIFE🚨\n\n⬇️Click here⬇️ \nhttps…",True


In [7]:
# Preview a few tweets together with their retweet flag and cashtag symbols.
tweet_preview = tweets[['retweet', 'text', 'symbols']]

# Fixed seed keeps the preview reproducible across runs; the row count is
# capped so the cell also works when the table holds fewer than five tweets.
tweet_preview.sample(n=min(5, len(tweet_preview)), random_state=42)

Unnamed: 0,retweet,text,symbols
444,True,"RT @Aruwba: 💥FLASH SALE!💥\n Come join my uncensored snap, with fresh daily posts! \n🚨Get it now for ONLY $25 FOR LIFE🚨\n\n⬇️Click here⬇️ \nhttps…",[]
102,False,"Watch this, very informative, summary everything in 1 video https://t.co/Y7kDw2Q3gE",[]
515,True,"RT @zloadr: #50Cent Not a #Bitcoin #Millionaire After All\n\n#Americas #rapartist, #actor and #entrepreneur, Curtis 50 Cent Jackson, filed…",[]
202,False,sorte https://t.co/O5P2A4pZMP,[]
230,False,$BTC $ETH $ADA $XRP $ICX $NEO $NCASH $ZIL $WPR $ELA $BCH $ETC\n\nBTC Bearish. Exit. https://t.co/7rGjWDLhwh,"[{'text': 'BTC', 'indices': [0, 4]}, {'text': 'ETH', 'indices': [5, 9]}, {'text': 'ADA', 'indices': [10, 14]}, {'text': 'XRP', 'indices': [15, 19]}, {'text': 'ICX', 'indices': [20, 24]}, {'text': 'NEO', 'indices': [25, 29]}, {'text': 'NCASH', 'indices': [30, 36]}, {'text': 'ZIL', 'indices': [37, 41]}, {'text': 'WPR', 'indices': [42, 46]}, {'text': 'ELA', 'indices': [47, 51]}, {'text': 'BCH', 'indices': [52, 56]}, {'text': 'ETC', 'indices': [57, 61]}]"


## Processed data at different time resolutions

In [8]:
# Column labels: 'date' followed by f1 ... f13, applied in order to SELECT * output.
vart_columns = ['date'] + ['f{}'.format(i) for i in range(1, 14)]

# Cursor.execute() returns the cursor itself, so the fetch can be chained.
rows = c.execute("SELECT * FROM vart_10min").fetchall()
varT = pd.DataFrame(rows, columns=vart_columns)

In [9]:
# Show the last five rows of the 10-minute table.
varT.tail(n=5)

Unnamed: 0,date,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13
84,2018-02-28 08:30:00,0,0,0,0,0,0,0,0,0,0,0,0,0
85,2018-02-28 08:40:00,108,15,5,58,49,12,2,45,28,11,69,17,41
86,2018-02-28 08:50:00,167,18,18,67,55,17,4,82,46,16,105,27,50
87,2018-02-28 09:00:00,197,31,16,65,48,4,1,112,56,28,113,30,48
88,2018-02-28 09:10:00,111,17,16,53,35,18,8,57,44,10,57,26,27
