In [1]:
import pandas as pd
import numpy as np
import re
import time
import schedule
import datetime
import gc

In [2]:
# Sentiment label -> numeric code: Negative=1, Neutral=2, Positive=3
def senti_number(txt):
    """Convert a sentiment label into its numeric code.

    Args:
        txt: Sentiment label; 'Negative', 'Neutral' and 'Positive' are
            the recognised values.

    Returns:
        1 for 'Negative', 2 for 'Neutral', 3 for 'Positive'. Any other
        value is returned unchanged.
    """
    # A plain lookup replaces the former re.sub() calls, which passed an
    # int as the replacement argument and therefore raised TypeError for
    # every input (re.sub requires a str or a callable there).
    codes = {'Negative': 1, 'Neutral': 2, 'Positive': 3}
    return codes.get(txt, txt)

In [3]:
# Numeric encoding of the sentiment column
# Argument: slist
def assign_section(df,timespan):
    """Replace the 'Sentiment' column of ``df`` with its numeric codes.

    Args:
        df: DataFrame with a 'Sentiment' column; it is modified in place.
        timespan: Accepted but not used by this function.

    Returns:
        The same ``df`` object, with 'Sentiment' passed element-wise
        through ``senti_number``.
    """
    encoded = df['Sentiment'].apply(senti_number)
    df['Sentiment'] = encoded
    return df

In [4]:
# UTC timestamp string -> epoch seconds on the pipeline's shifted scale
def utc_to_unix(txt):
    """Parse a 'YYYY-MM-DDTHH:MM:SS.000Z' string into epoch seconds.

    The wall-clock fields are deliberately interpreted as UTC+09:00, so
    the returned value is the true UTC epoch minus 9 hours.
    culc_section_s compensates by using 1609426800 (the true
    2021-01-01T00:00:00Z epoch minus 9 hours) as its origin.

    Previously the naive datetime was converted with .timestamp(), which
    uses the machine's local timezone; that only matched the origin above
    on a host set to Japan time. Pinning the offset keeps the result
    identical on such a host and makes it correct everywhere else.

    Args:
        txt: Timestamp string; the fractional part must be exactly '.000'.

    Returns:
        float: Epoch seconds of the parsed wall-clock time read as UTC+09:00.

    Raises:
        ValueError: If ``txt`` does not match the expected format.
    """
    parsed = datetime.datetime.strptime(txt, '%Y-%m-%dT%H:%M:%S.000Z')
    jst = datetime.timezone(datetime.timedelta(hours=9))

    return parsed.replace(tzinfo=jst).timestamp()

In [5]:
# Section number for tweet timestamps (origin 1609426800 is section 0)
def culc_section_s(unix,timespan):
    """Return the index of the ``timespan``-second section containing ``unix``.

    The origin 1609426800 is 2021-01-01T00:00:00Z minus 9 hours, matching
    the shifted scale of the values produced by utc_to_unix.

    Args:
        unix: Timestamp in seconds on the same scale as the origin.
        timespan: Section length in seconds.

    Returns:
        int: 0 for the section starting at the origin, increasing by one
        per ``timespan`` seconds; negative for timestamps before the origin.
    """
    # Floor division instead of int(.../...): int() truncates toward zero,
    # which folded timestamps just before the origin into section 0.
    section = int((unix - 1609426800) // timespan)

    return section

In [6]:
# Section number for price timestamps (2021-01-01T00:00:00Z is section 0)
def culc_section_b(unix,timespan):
    """Return the index of the ``timespan``-second section containing ``unix``.

    The origin 1609459200 is the epoch of 2021-01-01T00:00:00Z.

    Args:
        unix: UNIX timestamp in seconds.
        timespan: Section length in seconds.

    Returns:
        int: 0 for the section starting at the origin, increasing by one
        per ``timespan`` seconds; negative for timestamps before the origin.
    """
    # Floor division instead of int(.../...): int() truncates toward zero,
    # which folded timestamps just before the origin into section 0.
    section = int((unix - 1609459200) // timespan)

    return section

In [7]:
# Section number calculation
# Argument: slist
def assign_section_s(dfs,timespan):
    """Replace 'created_at' timestamp strings with section numbers.

    Args:
        dfs: DataFrame with a 'created_at' column of timestamp strings in
            the format accepted by utc_to_unix; it is modified in place.
        timespan: One of '1d', '12h', '4h', '1h', '30m', '15m', '5m'.

    Returns:
        The same ``dfs`` object, with 'created_at' holding the integer
        section number computed by culc_section_s.

    Raises:
        KeyError: If ``timespan`` is not a supported key.
    """
    tlis = {'1d':24*60*60, '12h':12*60*60, '4h':4*60*60, '1h':60*60, '30m':30*60, '15m':15*60, '5m':5*60}

    # Fail before touching the data. The old code only printed "KeyError"
    # and then crashed later with UnboundLocalError on the undefined length.
    if timespan not in tlis:
        raise KeyError(f"unsupported timespan {timespan!r}; expected one of {list(tlis)}")
    s = tlis[timespan]

    dfs['created_at'] = dfs['created_at'].apply(utc_to_unix)
    dfs['created_at'] = dfs['created_at'].apply(culc_section_s,args=(s,))

    return dfs

In [8]:
# Section number calculation
# Argument: BTCUSDT
def assign_section_b(dfb,timespan):
    """Replace the 'unix' timestamps of ``dfb`` with section numbers.

    Args:
        dfb: DataFrame with a 'unix' column of timestamps in seconds; it
            is modified in place.
        timespan: One of '1d', '12h', '4h', '1h', '30m', '15m', '5m'.

    Returns:
        The same ``dfb`` object, with 'unix' holding the integer section
        number computed by culc_section_b.

    Raises:
        KeyError: If ``timespan`` is not a supported key.
    """
    tlis = {'1d':24*60*60, '12h':12*60*60, '4h':4*60*60, '1h':60*60, '30m':30*60, '15m':15*60, '5m':5*60}

    # Fail before touching the data. The old code only printed "KeyError"
    # and then crashed later with UnboundLocalError on the undefined length.
    if timespan not in tlis:
        raise KeyError(f"unsupported timespan {timespan!r}; expected one of {list(tlis)}")
    s = tlis[timespan]

    dfb['unix'] = dfb['unix'].apply(culc_section_b,args=(s,))

    return dfb

In [9]:
# Section numbers
# Arguments: slist_c, elist
def assign_section_e(dfs,dfe):
    """Copy the 'created_at' column of ``dfs`` into ``dfe`` as 'section'.

    Args:
        dfs: DataFrame providing the 'created_at' column.
        dfe: DataFrame that receives the 'section' column; it is modified
            in place. Values are matched by index label.

    Returns:
        The same ``dfe`` object.
    """
    section_numbers = dfs['created_at']
    dfe['section'] = section_numbers
    return dfe

In [10]:
# Average of the compound score per section
# Arguments: slist, elist
def com_counter(dfs,dfe):
    """Average the columns of ``dfs`` per 'created_at' value.

    Args:
        dfs: DataFrame with 'created_at' and 'Compound' columns.
        dfe: DataFrame that receives the per-group 'Compound' mean as
            'com_ave' (matched by index label); it is modified in place.

    Returns:
        tuple: ``(group_means, dfe_with_fresh_index)`` where
        ``group_means`` has one row per distinct 'created_at' value and
        the second item is ``dfe`` with its index reset to 0..n-1.
    """
    section_means = dfs.groupby("created_at", as_index=False).mean()
    dfe['com_ave'] = section_means['Compound']
    dfe_renumbered = dfe.reset_index(drop=True)

    return section_means, dfe_renumbered

In [11]:
# Count negative / neutral / positive tweets per section
# Arguments: slist_s, elist
def senti_counter(dfs,dfe):
    """Add per-section sentiment counts to ``dfe``.

    Rows of ``dfs`` are counted per ('created_at', 'Sentiment') pair and
    pivoted into one row per section, ordered by ascending 'created_at'
    and renumbered 0..n-1 before being assigned to ``dfe`` by index label.

    The previous implementation cut the flat value_counts() result into
    three equal slices. That silently misaligned the counts whenever a
    section lacked one of the sentiments, and raised when fewer than three
    (sentiment, section) pairs existed. A missing combination is now
    reported as a count of 0.

    Args:
        dfs: DataFrame with 'Sentiment' ('Negative' / 'Neutral' /
            'Positive') and 'created_at' (section number) columns.
        dfe: DataFrame that receives 'neg_count', 'neu_count' and
            'pos_count'; it is modified in place.

    Returns:
        The same ``dfe`` object.

    Raises:
        ValueError: If 'Sentiment' holds a label other than the three above.
    """
    labels = ['Negative', 'Neutral', 'Positive']

    # Fail loudly on unknown labels rather than silently dropping rows.
    unexpected = set(dfs['Sentiment'].dropna().unique()) - set(labels)
    if unexpected:
        raise ValueError(f"unexpected Sentiment labels: {sorted(map(str, unexpected))}")

    counts = (
        dfs.groupby(['created_at', 'Sentiment'])
        .size()
        .unstack(fill_value=0)                  # rows: section, columns: sentiment
        .reindex(columns=labels, fill_value=0)  # guarantee all three columns exist
        .reset_index(drop=True)
    )

    dfe['neg_count'] = counts['Negative']
    dfe['neu_count'] = counts['Neutral']
    dfe['pos_count'] = counts['Positive']

    return dfe

In [12]:
# Total tweet count
# Argument: elist
def tweet_counter(dfe):
    """Add 'tweet_count' as the sum of the three sentiment count columns.

    Args:
        dfe: DataFrame with 'neg_count', 'neu_count' and 'pos_count'
            columns; it is modified in place.

    Returns:
        The same ``dfe`` object.
    """
    total = dfe['neg_count'].add(dfe['neu_count']).add(dfe['pos_count'])
    dfe['tweet_count'] = total

    return dfe

In [13]:
# Attach the open price to each section
# Arguments: BTCUSDT, elist
def merge(dfb,dfe):
    """Outer-join ``dfe`` and ``dfb`` on the section number.

    ``dfb['unix']`` is renamed to 'section' and used as a shared join key.
    Before, the join used two differently named keys and then dropped
    'unix'; rows present only in ``dfb`` kept NaN in 'section' and so lost
    their section number.

    Args:
        dfb: DataFrame whose 'unix' column holds section numbers.
        dfe: DataFrame with a 'section' column.

    Returns:
        A new DataFrame with the columns of ``dfe`` followed by the
        non-key columns of ``dfb``; it has no 'unix' column.
    """
    price = dfb.rename(columns={'unix': 'section'})

    return pd.merge(dfe, price, on='section', how='outer')

In [14]:
# Label calculation
# 0: down, 1: stay, 2: up
def culc_label(rate,a):
    """Classify a rate of change against the symmetric threshold ``a``.

    Args:
        rate: Rate of change to classify.
        a: Threshold; both boundaries are inclusive.

    Returns:
        int: 0 if ``rate <= -a``, 2 if ``rate >= a``, otherwise 1. A NaN
        rate fails both comparisons and therefore yields 1.
    """
    if rate <= a*(-1):
        return 0
    if a <= rate:
        return 2
    return 1

In [15]:
# Labelling (header formula: trend = (P[n+1] - P[n]) / P[n])
# Argument: elist
def assign_label(dfe,timespan):
    """Add a 'trend' label (0: down, 1: stay, 2: up) to ``dfe``.

    The label is derived from the row-to-row percentage change of the
    'open' column, classified by culc_label with a threshold that depends
    on ``timespan``.

    Args:
        dfe: DataFrame with an 'open' column; it is modified in place.
        timespan: One of '1d', '12h', '4h', '1h', '30m', '15m', '5m'.

    Returns:
        The same ``dfe`` object with the 'trend' column added.

    Raises:
        KeyError: If ``timespan`` is not a supported key.
    """
    alis = {'1d':1.7*0.01, '12h':1.3*0.01, '4h':0.75*0.01, '1h':0.33*0.01, '30m':0.23*0.01, '15m':0.17*0.01, '5m':0.1*0.01}

    # Fail before touching the data. The old code only printed "KeyError"
    # and then crashed later with UnboundLocalError on the undefined threshold.
    if timespan not in alis:
        raise KeyError(f"unsupported timespan {timespan!r}; expected one of {list(alis)}")
    a = alis[timespan]

    # NOTE(review): pct_change() compares each row with the PREVIOUS row,
    # i.e. (P[n] - P[n-1]) / P[n-1], while the header formula refers to the
    # next row. Confirm whether a shift(-1) is intended. The first row has
    # no predecessor, so its rate is NaN and culc_label labels it 1.
    open_rate = dfe['open'].pct_change()
    dfe['trend'] = open_rate.apply(culc_label,args=(a,))

    return dfe

In [16]:
# main
# Builds one feature CSV per timespan for the chosen month of 2021:
# per-section sentiment counts, mean compound score, tweet count,
# open price and trend label.
month = int(input("monthを入力:"))
if not 1 <= month <= 12:
    raise ValueError(f"month must be between 1 and 12, got {month}")
print("month:{}".format(month))

# Zero-padded year-month tag. The former '2021-0{month}' pattern produced
# '2021-010' and similar for months 10-12.
ym = f'2021-{month:02d}'

tlist = ['1d','12h','4h','1h','30m','15m','5m']
for timespan in tlist:

    # Load the CSVs
    dfe  = pd.DataFrame(columns = ['section','neg_count','neu_count','pos_count','com_ave','tweet_count'])
    # The tweet file is read once and its timestamps are converted once.
    # It used to be read and converted twice, once per column pair.
    dfs = pd.read_csv(f'tweet-sentiment/{ym}_slist.csv',usecols=['Sentiment','Compound','created_at'])
    dfb = pd.read_csv(f'binance/BTCUSDT/{timespan}/BTCUSDT-{timespan}-{ym}.csv', usecols = ['unix','open'])
    print('read')

    # Section numbers
    dfs = assign_section_s(dfs,timespan)
    dfs_s = dfs[['Sentiment','created_at']]
    dfs_c = dfs[['Compound','created_at']]
    dfb = assign_section_b(dfb,timespan)
    print('kukan')

    # neg, neu, pos counts and compound average
    dfs_c, dfe = com_counter(dfs_c,dfe)
    dfe = senti_counter(dfs_s,dfe)
    dfe = tweet_counter(dfe)
    print('senti')

    # Section numbers for the feature table
    dfe = assign_section_e(dfs_c,dfe)
    print('kukan_e')

    # Merge with the price data
    dfe = merge(dfb,dfe)
    print('merge')

    # Trend label
    dfe = assign_label(dfe,timespan)
    print('label')

    # Rename column
    dfe = dfe.rename(columns={'open': 'open_price'})

    # Write the CSV
    dfe.to_csv(f'tweet-svm/{timespan}/{ym}.csv',index=False)
    print(dfe)

    # Release memory before the next timespan
    del dfs,dfs_s,dfs_c,dfb,dfe
    gc.collect()

monthを入力: 7


month:7
read
kukan
senti
kukan_e
merge
label
    section  neg_count  neu_count  pos_count   com_ave  tweet_count  \
0       181       7299      22014      34467  0.259812        63780   
1       182       7387      20512      28332  0.229091        56231   
2       183       6927      17250      26068  0.241058        50245   
3       184       5654      17047      24248  0.247010        46949   
4       185       7338      20084      28130  0.234580        55552   
5       186       8311      20271      28064  0.221385        56646   
6       187       6792      19951      30077  0.260614        56820   
7       188       8084      20956      33320  0.254200        62360   
8       189       7987      20284      29915  0.233556        58186   
9       190       6687      17283      26222  0.234853        50192   
10      191       5524      17375      25572  0.257871        48471   
11      192       7136      20985      36595  0.299885        64716   
12      193       7272      2178

In [16]:
# First-pass processing of the binance data
    # assign column names
    # convert the timestamp to UNIX seconds (the raw value is divided by 1000)
    # drop close_time
# NOTE: the code below is disabled by being wrapped in a string literal.
# If re-enabled it overwrites each input CSV in place, and the month is
# hard-coded to 2021-07.
'''
def div(unix):
    unix = int(unix/1000)
    return unix

tlist = ['1d','12h','4h','1h','30m','15m','5m']
for timespan in tlist:   
    df = pd.read_csv(f'binance/BTCUSDT/{timespan}/BTCUSDT-{timespan}-2021-07.csv', header=None)
    df.columns = ['unix',
                    'open',
                    'high',
                    'low',
                    'close',
                    'volume',
                    'close_time',
                    'quote_asset_volume',
                    'number_of_trades',
                    'taker_buy_base_asset_volume',
                    'taker_buy_quote_asset_volume',
                    'ignore']
    df['unix'] = df['unix'].apply(div)
    df.drop(columns=['close_time'], inplace=True)
    df.to_csv(f'binance/BTCUSDT/{timespan}/BTCUSDT-{timespan}-2021-07.csv', index=False)
    del df
    gc.collect()
'''


In [None]:
# For deciding the label thresholds
# NOTE: the code below is disabled by being wrapped in a string literal.
# It concatenates the 'open' column of six consecutive monthly files
# (starting at `month`), prints percentiles of the row-to-row percentage
# change and plots its histogram.

'''
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

month = 1
timespan = str(input("timespanを入力:"))

files = [f'binance/BTCUSDT/{timespan}/BTCUSDT-{timespan}-2021-0{month}.csv',
         f'binance/BTCUSDT/{timespan}/BTCUSDT-{timespan}-2021-0{month+1}.csv',
         f'binance/BTCUSDT/{timespan}/BTCUSDT-{timespan}-2021-0{month+2}.csv',
         f'binance/BTCUSDT/{timespan}/BTCUSDT-{timespan}-2021-0{month+3}.csv',
         f'binance/BTCUSDT/{timespan}/BTCUSDT-{timespan}-2021-0{month+4}.csv',
         f'binance/BTCUSDT/{timespan}/BTCUSDT-{timespan}-2021-0{month+5}.csv']
datas = []
df = pd.DataFrame(columns = ['open'])
for file in files:
    data = pd.read_csv(file, usecols = ['open'])
    datas.append(data)

df = pd.concat(datas).reset_index(drop=True)

dfr = pd.DataFrame(columns = ['open_rate'])
dfr['open_rate'] = df['open'].pct_change()

print(dfr.describe(percentiles=[0.1,0.2,0.3,0.333,0.4,0.6,0.667,0.7,0.8,0.9]))

plt.hist(dfr['open_rate'], bins=100000, ec='navy')
plt.title('open_rate1-6')
plt.xlabel('rate')
plt.ylabel('count')
plt.show()

'''