In [1]:
import pandas as pd
import numpy as np
import re
import time
import schedule
import datetime
import gc

In [2]:
# neg-1, neu-2, pos-3  
def senti_number(txt):
    if txt == 'Negative':
        txt = re.sub(r'Negative', 1, txt)
    elif txt == 'Neutral':
        txt = re.sub(r'Neutral', 2, txt)
    else:
        txt = re.sub(r'Positive', 3, txt)
    
    return txt

In [3]:
# 感情の数値化
# 引数：slist
def assign_section(df,timespan):
    df['Sentiment']  = df['Sentiment'].apply(senti_number)
    
    return df

In [4]:
# UTC -> UNIX
def utc_to_unix(txt):
    #pythonの挙動により、2021-01-01T00:00:00.000Zを日本時間で読んでしまうので、culcで9hずらす
    utc = datetime.datetime.strptime(txt, '%Y-%m-%dT%H:%M:%S.000Z')
    
    return utc.timestamp()  

In [47]:
# 区間ナンバー計算(2021-01-01T00:00を区間0とする)
def culc_section_s(unix,timespan):
    section = int((unix - 1609426800)/timespan)
    
    return section

In [46]:
# 区間ナンバー計算(2021-01-01T00:00を区間0とする)
def culc_section_b(unix,timespan):
    section = int((unix - 1609459200)/timespan)
    
    return section

In [6]:
# 区間ナンバー計算
# 引数：slist
def assign_section_s(dfs,timespan):
    tlis = {'1d':24*60*60, '12h':12*60*60, '4h':4*60*60, '1h':60*60, '30m':30*60, '15m':15*60, '5m':5*60}

    if timespan in tlis:
        s = tlis[timespan]
    else:
        print("KeyError")
        
    dfs['created_at'] = dfs['created_at'].apply(utc_to_unix)
    dfs['created_at'] = dfs['created_at'].apply(culc_section,args=(s,))
    
    return dfs

In [7]:
# 区間ナンバー計算
# 引数：BTCUSDT
def assign_section_b(dfb,timespan):
    tlis = {'1d':24*60*60, '12h':12*60*60, '4h':4*60*60, '1h':60*60, '30m':30*60, '15m':15*60, '5m':5*60}

    if timespan in tlis:
        s = tlis[timespan]
    else:
        print("KeyError")

    dfb['unix'] = dfb['unix'].apply(culc_section,args=(s,))
    
    return dfb

In [8]:
# 区間ナンバー
# 引数：slist_c,elist
def assign_section_e(dfs,dfe):
    dfe['section'] = dfs['created_at']
    
    return dfe

In [9]:
# 区間ごとにcomの平均をカウント
# 引数：slist,elist
def com_counter(dfs,dfe):
    df_groupby = dfs.groupby("created_at", as_index=False) 
    dfg = df_groupby.mean()
    dfe['com_ave'] = dfg['Compound']
    
    return dfg, dfe.reset_index(drop=True)

In [10]:
# pos,neg,newを区間ごとにカウント
# 引数：slist_s,elist
def senti_counter(dfs,dfe):    
    tmp_df = dfs.value_counts(sort=False).reset_index(name='count')
    
    n = tmp_df.shape[0]
    k = int(n/3)

    df3 = [tmp_df.loc[i:i+k-1, :] for i in range(0, n, k)]
    df0 = df3[0].reset_index(drop=True)
    df1 = df3[1].reset_index(drop=True)
    df2 = df3[2].reset_index(drop=True)
    
    dfe['neg_count'] = df0['count']
    dfe['neu_count'] = df1['count']
    dfe['pos_count'] = df2['count']
    
    return dfe

In [11]:
# Tweet数をカウント
# 引数：elist
def tweet_counter(dfe):
    dfe['tweet_count']=dfe['neg_count']+dfe['neu_count']+dfe['pos_count']
    
    return dfe

In [12]:
# 始値、ラベル
# 引数：BTCUSDT,elist
def merge(dfb,dfe):
    dfe = pd.merge(dfe, dfb, left_on = "section", right_on = "unix", how = 'outer')
    
    return dfe.drop(columns = 'unix')

In [13]:
# ラベル計算
# 0:down, 1:stay, 2:up
def culc_label(rate,a):
    if rate <= a*(-1):
        label = 0
    elif a <= rate:
        label = 2
    else:
        label = 1
        
    return label    

In [23]:
# ラベル付け(トレンド = (Pn+1-Pn)/pn )
# 引数：elist
def assign_label(dfe,timespan):
    alis = {'1d':5*0.01, '12h':3*0.01, '4h':1.5*0.01, '1h':1*0.01, '30m':0.5*0.01, '15m':0.3*0.01, '5m':0.1*0.01}

    if timespan in alis:
        a = alis[timespan]
    else:
        print("KeyError")
    
    dfr = pd.DataFrame(columns = ['open'])
    dfr['open_rate'] = dfe['open'].pct_change()
    dfr['trend'] = dfr['open_rate'].apply(culc_label,args=(a,))
    dfe['trend'] = dfr['trend']
    
    return dfe

In [41]:
# main
month = int(input("monthを入力:"))
timespan = str(input("timespanを入力:"))

# csv読み込み
dfe  = pd.DataFrame(columns = ['section','neg_count','neu_count','pos_count','com_ave','tweet_count'])
dfs_s = pd.read_csv(f'tweet-sentiment/2021-0{month}_slist.csv',usecols=['Sentiment','created_at'])
dfs_c = pd.read_csv(f'tweet-sentiment/2021-0{month}_slist.csv',usecols=['Compound','created_at'])
dfb = pd.read_csv(f'binance/BTCUSDT/{timespan}/BTCUSDT-{timespan}-2021-0{month}.csv', usecols = ['unix','open'])
print('read')

# 区間ナンバー
dfs_s = assign_section_s(dfs_s,timespan)
dfs_c = assign_section_s(dfs_c,timespan)
dfb = assign_section_b(dfb,timespan)
print('kukan')

# neg,neu,pos,com
dfs_c, dfe = com_counter(dfs_c,dfe)
dfe = senti_counter(dfs_s,dfe)
dfe = tweet_counter(dfe)
print('senti')

#区間ナンバー 
dfe = assign_section_e(dfs_c,dfe)
print('kukan_e')

# merge
dfe = merge(dfb,dfe)
print('merge')

# ラベル(トレンド)
dfe = assign_label(dfe,timespan)
print('label')
# csv出力
dfe.to_csv(f'tweet-sentiment/{timespan}/2021-0{month}.csv',index=False)
print(dfe)



monthを入力: 1
timespanを入力: 1h


read
kukan
senti
kukan_e
merge
label
     section  neg_count  neu_count  pos_count   com_ave  tweet_count  \
0        0.0      244.0      550.0      690.0  0.196351       1484.0   
1        1.0      254.0      794.0      714.0  0.155161       1762.0   
2        2.0      177.0      523.0      528.0  0.167083       1228.0   
3        3.0      155.0      428.0      441.0  0.164698       1024.0   
4        4.0      141.0      420.0      503.0  0.205031       1064.0   
..       ...        ...        ...        ...       ...          ...   
748      NaN        NaN        NaN        NaN       NaN          NaN   
749      NaN        NaN        NaN        NaN       NaN          NaN   
750      NaN        NaN        NaN        NaN       NaN          NaN   
751      NaN        NaN        NaN        NaN       NaN          NaN   
752      NaN        NaN        NaN        NaN       NaN          NaN   

         open  trend  
0         NaN      1  
1         NaN      1  
2         NaN      1  
3     

In [36]:
dfe = pd.merge(dfe, dfb, left_on = "section", right_on = "unix", how = 'outer')    
print(dfe)

     section  neg_count  neu_count  pos_count   com_ave  tweet_count  open_x  \
0     3624.0      336.0     1034.0     1077.0  0.174753       2447.0     NaN   
1     3625.0      242.0      770.0      910.0  0.200025       1922.0     NaN   
2     3626.0      229.0      686.0      963.0  0.230150       1878.0     NaN   
3     3627.0      242.0      763.0      997.0  0.220363       2002.0     NaN   
4     3628.0      225.0      683.0      894.0  0.218390       1802.0     NaN   
..       ...        ...        ...        ...       ...          ...     ...   
733      NaN        NaN        NaN        NaN       NaN          NaN     NaN   
734      NaN        NaN        NaN        NaN       NaN          NaN     NaN   
735      NaN        NaN        NaN        NaN       NaN          NaN     NaN   
736      NaN        NaN        NaN        NaN       NaN          NaN     NaN   
737      NaN        NaN        NaN        NaN       NaN          NaN     NaN   

     trend    unix    open_y  
0      1

In [42]:
print(dfs_c)

     created_at  Compound
0             0  0.196351
1             1  0.155161
2             2  0.167083
3             3  0.164698
4             4  0.205031
..          ...       ...
739         739  0.165453
740         740  0.165794
741         741  0.156660
742         742  0.165267
743         743  0.177261

[744 rows x 2 columns]


In [43]:
print(dfs_s)

        Sentiment  created_at
0         Neutral           0
1         Neutral           0
2         Neutral           0
3         Neutral           0
4        Positive           0
...           ...         ...
1571369  Negative         743
1571370  Negative         743
1571371   Neutral         743
1571372  Positive         743
1571373  Positive         743

[1571374 rows x 2 columns]


In [44]:
print(dfb)

     unix      open
0       9  28923.63
1      10  28995.13
2      11  29410.00
3      12  29195.25
4      13  29278.41
..    ...       ...
739   748  32449.53
740   749  32807.29
741   750  32853.73
742   751  32561.35
743   752  32974.10

[744 rows x 2 columns]


In [40]:
# メモリ開放
del dfs_s,dfs_c,dfb,dfe
gc.collect()

132

In [None]:
# binanceデータの第一処理
    #カラム付け
    #時間をUNIX秒に
    #close_timeを削除
'''
def div(unix):
    unix = int(unix/1000)
    return unix

df = pd.read_csv('binance/BTCUSDT/15m/BTCUSDT-15m-2021-01.csv', header=None)
df.columns = ['unix',
                'open',
                'high',
                'low',
                'close',
                'volume',
                'close_time',
                'quote_asset_volume',
                'number_of_trades',
                'taker_buy_base_asset_volume',
                'taker_buy_quote_asset_volume',
                'ignore']
df['unix'] = df['unix'].apply(div)
df.drop(columns=['close_time'], inplace=True)
df.to_csv('binance/BTCUSDT/15m/BTCUSDT-15m-2021-01.csv', index=False)
del df
gc.collect()

'''

In [45]:
print(utc_to_unix('2021-01-01T00:00:00.000Z'))

1609426800.0
