## Cryptocurrency sentiment analysis and its correlation with coin prices

### Import libraries

In [144]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import requests

%matplotlib inline
# The bare 'seaborn' style sheet was renamed to 'seaborn-v0_8' in Matplotlib 3.6
# and the old name is gone from newer releases. Prefer the new name when this
# Matplotlib offers it and fall back to the legacy name on older installs.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

import warnings
warnings.filterwarnings('ignore')

In [459]:
import csv
import datetime
import json
import os
from datetime import datetime, timedelta

import tweepy

### Import data

**Twitter developer account.** To access the Twitter API through Tweepy, you must first create a Twitter developer account and have it approved by Twitter.

In [409]:
#Import key and tokens from config.py
from config import *

In [348]:
# Build the Tweepy client used by every Twitter request below.
# The credential names are not defined in this notebook; presumably they all
# come from the star-import of config.py (including the 'acces_token' spelling).
client = tweepy.Client(
    bearer_token=bearer_token,
    consumer_key=consumer_key,
    consumer_secret=consumer_secret,
    access_token=acces_token,
    access_token_secret=token_secret,
    wait_on_rate_limit=True,
)

In [381]:
# Search query: tweets matching #btc, #bitcoin or the word "bitcoin", narrowed
# with the is:verified, -has:media, -is:retweet and lang:en operators.
query="(#btc OR #bitcoin OR bitcoin) is:verified -has:media -is:retweet lang:en"

In [374]:


# Build the search window as ISO 8601 / RFC 3339 strings (YYYY-MM-DDTHH:mm:ssZ).
# today ('end_time') must be a minimum of 10 seconds prior to the request time.
# We step back a full hour to skip the newest tweets, whose public metrics
# data may still be incomplete.
# The trailing 'Z' in the format marks the value as UTC, so start from the UTC
# clock instead of local time plus a hand-written offset (which is only right
# on a machine in one particular timezone).
today = datetime.utcnow() - timedelta(hours=1)
seven_days_back = today - timedelta(days=6, hours=22)

today = today.strftime('%Y-%m-%dT%H:%M:%SZ')
seven_days_back = seven_days_back.strftime('%Y-%m-%dT%H:%M:%SZ')

In [375]:
today

'2022-01-28T22:09:51Z'

In [376]:
seven_days_back

'2022-01-22T00:09:51Z'

In [377]:
# Append every tweet matching `query` in the search window to a CSV file,
# one row per tweet.
# - `with` closes the file even if the API loop raises part-way through.
# - newline='' is what the csv module asks for when opening files, and the
#   explicit UTF-8 encoding keeps tweet text portable across platforms.
with open('api_csv/tweets_results.csv', 'a', newline='', encoding='utf-8') as csvFile:
    csvWriter = csv.writer(csvFile)

    # Append mode starts at end-of-file, so position 0 means the file is new
    # or empty: write the header only then, not once per run.
    if csvFile.tell() == 0:
        # The fourth metric column used to repeat like_count; it now holds
        # quote_count so no public metric is written twice.
        csvWriter.writerow(['item.id',
                            'item.author_id',
                            'item.created_at',
                            'item.source',
                            'item.public_metrics["retweet_count"]',
                            'item.public_metrics["reply_count"]',
                            'item.public_metrics["like_count"]',
                            'item.public_metrics["quote_count"]',
                            'item.text'])

    for response in tweepy.Paginator(client.search_recent_tweets, query=query,
                                     start_time=str(seven_days_back),
                                     end_time=str(today),
                                     tweet_fields='id,author_id,created_at,geo,public_metrics,source,text',
                                     user_fields='id,name,username,public_metrics',
                                     place_fields='full_name,country,country_code,geo',
                                     expansions='author_id,geo.place_id',
                                     max_results=100, limit=1200):

        # A page without matches carries no data; treat it as an empty page
        # instead of failing on iteration.
        for item in response.data or []:
            csvWriter.writerow([item.id,
                                item.author_id,
                                item.created_at,
                                item.source,
                                item.public_metrics["retweet_count"],
                                item.public_metrics["reply_count"],
                                item.public_metrics["like_count"],
                                item.public_metrics["quote_count"],
                                item.text])

In [382]:
# Append the author profiles returned alongside the matching tweets to a CSV
# file, one row per user object in each page's `includes`.
# - `with` closes the file even if the API loop raises part-way through.
# - newline='' is what the csv module asks for when opening files, and the
#   explicit UTF-8 encoding keeps user names portable across platforms.
with open('api_csv/author_results.csv', 'a', newline='', encoding='utf-8') as csvFile:
    csvWriter = csv.writer(csvFile)

    # Append mode starts at end-of-file, so position 0 means the file is new
    # or empty: write the header only then, not once per run.
    if csvFile.tell() == 0:
        csvWriter.writerow(['item.name',
                            'item.id',
                            'item.username',
                            'item.public_metrics["followers_count"]',
                            'item.public_metrics["following_count"]',
                            'item.public_metrics["tweet_count"]',
                            'item.public_metrics["listed_count"]'])

    for response in tweepy.Paginator(client.search_recent_tweets, query=query,
                                     start_time=str(seven_days_back),
                                     end_time=str(today),
                                     tweet_fields='id,author_id,created_at,geo,public_metrics,source,text',
                                     user_fields='id,name,username,public_metrics',
                                     place_fields='full_name,country,country_code,geo',
                                     expansions='author_id,geo.place_id',
                                     max_results=100, limit=1200):

        # A page may come back without a 'users' expansion (e.g. no matches);
        # skip it instead of raising KeyError.
        for item in (response.includes or {}).get('users', []):
            csvWriter.writerow([item.name,
                                item.id,
                                item.username,
                                item.public_metrics["followers_count"],
                                item.public_metrics["following_count"],
                                item.public_metrics["tweet_count"],
                                item.public_metrics["listed_count"]])

In [379]:
# Broader query (no is:verified / media / retweet filters) used only for the
# hourly tweet-volume counts. Note that this rebinds the notebook-level `query`.
query="(#btc OR #bitcoin OR bitcoin) lang:en"

response = client.get_recent_tweets_count(query=query,
                                          start_time=str(seven_days_back),
                                          end_time=str(today),
                                          granularity='hour')

# Append one row per count bucket (start, end, tweet_count).
# - `with` closes the file even if a write fails.
# - newline='' is what the csv module asks for when opening files.
with open('api_csv/tweets_counts.csv', 'a', newline='', encoding='utf-8') as csvFile:
    csvWriter = csv.writer(csvFile)

    # Append mode starts at end-of-file, so position 0 means the file is new
    # or empty: write the header only then, not once per run.
    if csvFile.tell() == 0:
        csvWriter.writerow(['item.start',
                            'item.end',
                            'item.tweet_count'])

    # Guard against a response that carries no data.
    for item in response.data or []:
        csvWriter.writerow([item['start'],
                            item['end'],
                            item['tweet_count']])

### BTC price

In [449]:
# Read the CryptoCompare key from the environment instead of committing it to
# the notebook.
# NOTE(review): the key that used to be hardcoded in this cell should be
# treated as leaked and rotated.
apiKey = os.environ.get('CRYPTOCOMPARE_API_KEY')
if not apiKey:
    raise RuntimeError('Set the CRYPTOCOMPARE_API_KEY environment variable before running this cell')

url = "https://min-api.cryptocompare.com/data/v2/histohour"

# Hourly BTC/USD candles; with limit=250 the recorded run returned 251 rows.
payload = {
    "api_key": apiKey,
    "fsym": "BTC",
    "tsym": "USD",
    "limit": 250
}

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors instead of parsing an error page.
http_response = requests.get(url, params=payload, timeout=30)
http_response.raise_for_status()
result = http_response.json()

# Later cells index result['Data']['Data']; fail here with a readable message
# if the payload does not have that shape.
if 'Data' not in (result.get('Data') or {}):
    raise RuntimeError(f"Unexpected CryptoCompare response: {result.get('Message', result)}")

In [450]:
result['Data']['Data'][0]

{'time': 1642748400,
 'high': 39173.58,
 'low': 38575.8,
 'open': 38697.49,
 'volumefrom': 1758.17,
 'volumeto': 68334552.69,
 'close': 39142.31,
 'conversionType': 'direct',
 'conversionSymbol': ''}

In [451]:
# Load the hourly price records (a list of dicts, one per candle) into a DataFrame
df = pd.DataFrame(result['Data']['Data'])

# Preview the first rows
df.head()

Unnamed: 0,time,high,low,open,volumefrom,volumeto,close,conversionType,conversionSymbol
0,1642748400,39173.58,38575.8,38697.49,1758.17,68334552.69,39142.31,direct,
1,1642752000,39294.17,39030.88,39142.31,1288.39,50434328.22,39170.84,direct,
2,1642755600,39191.28,38953.29,39170.84,1549.57,60508276.18,38980.45,direct,
3,1642759200,39090.91,38878.86,38980.45,1272.89,49626815.02,38981.85,direct,
4,1642762800,38986.5,38694.25,38981.85,1454.38,56453857.47,38901.98,direct,


In [452]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 251 entries, 0 to 250
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   time              251 non-null    int64  
 1   high              251 non-null    float64
 2   low               251 non-null    float64
 3   open              251 non-null    float64
 4   volumefrom        251 non-null    float64
 5   volumeto          251 non-null    float64
 6   close             251 non-null    float64
 7   conversionType    251 non-null    object 
 8   conversionSymbol  251 non-null    object 
dtypes: float64(6), int64(1), object(2)
memory usage: 17.8+ KB


In [453]:
def unix_to_utc(x):
    """Render a UNIX timestamp as a UTC date-time string.

    Parameters
    ----------
    x : int or float
        Seconds since the UNIX epoch.

    Returns
    -------
    str
        The corresponding UTC time formatted as 'YYYY-MM-DD HH:MM:SS'.
    """
    moment = datetime.utcfromtimestamp(x)
    return format(moment, '%Y-%m-%d %H:%M:%S')

In [454]:
# Convert the UNIX timestamps (seconds) to pandas datetimes in one vectorized
# call instead of formatting each value to a string and parsing it back.
# The dtype guard makes the cell safe to re-run: once the column is no longer
# integer epoch seconds, the conversion is skipped.
if df['time'].dtype.kind in 'iu':
    df['time'] = pd.to_datetime(df['time'], unit='s')

# Drop redundant columns; errors='ignore' keeps a second run of this cell
# from raising once the columns are already gone.
df = df.drop(columns=['conversionType', 'conversionSymbol'], errors='ignore')

In [455]:
df.head()

Unnamed: 0,time,high,low,open,volumefrom,volumeto,close
0,2022-01-21 07:00:00,39173.58,38575.8,38697.49,1758.17,68334552.69,39142.31
1,2022-01-21 08:00:00,39294.17,39030.88,39142.31,1288.39,50434328.22,39170.84
2,2022-01-21 09:00:00,39191.28,38953.29,39170.84,1549.57,60508276.18,38980.45
3,2022-01-21 10:00:00,39090.91,38878.86,38980.45,1272.89,49626815.02,38981.85
4,2022-01-21 11:00:00,38986.5,38694.25,38981.85,1454.38,56453857.47,38901.98


In [456]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 251 entries, 0 to 250
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   time        251 non-null    datetime64[ns]
 1   high        251 non-null    float64       
 2   low         251 non-null    float64       
 3   open        251 non-null    float64       
 4   volumefrom  251 non-null    float64       
 5   volumeto    251 non-null    float64       
 6   close       251 non-null    float64       
dtypes: datetime64[ns](1), float64(6)
memory usage: 13.9 KB


In [458]:
# Persist the hourly BTC/USD frame; index=False leaves the row index out of the file
df.to_csv('data/btc_usd_hourly.csv', index=False)