In [1]:
import pandas as pd
import numpy as np
import re
import time
import schedule
import datetime
import gc

In [2]:
# neg-1, neu-2, pos-3  
def senti_number(txt):
    """Convert a sentiment label into its numeric code.

    Mapping: 'Negative' -> 1, 'Neutral' -> 2, 'Positive' -> 3.

    Args:
        txt: Sentiment label.

    Returns:
        The integer code for a known label; any other value is returned
        unchanged.
    """
    # The previous implementation passed an int as the replacement argument of
    # re.sub(), which raises TypeError; a plain lookup yields the numeric code.
    codes = {'Negative': 1, 'Neutral': 2, 'Positive': 3}
    return codes.get(txt, txt)

In [3]:
# 感情の数値化
# 引数：slist
def assign_section(df, timespan):
    """Replace the 'Sentiment' column of ``df`` with the result of senti_number.

    Args:
        df: DataFrame with a 'Sentiment' column; it is modified in place.
        timespan: Not used by this function.

    Returns:
        The same DataFrame object that was passed in.
    """
    converted = df['Sentiment'].apply(senti_number)
    df['Sentiment'] = converted
    return df

In [4]:
# UTC -> UNIX
def utc_to_unix(txt):
    """Parse a 'YYYY-MM-DDTHH:MM:SS.000Z' string and return epoch seconds.

    The parsed datetime is naive, so ``timestamp()`` interprets it in the
    machine's local time zone rather than in UTC. The original author notes
    that on their (Japan time) machine this shifts the value by 9 hours and
    that the shift is compensated in the section calculation.

    Args:
        txt: Timestamp string; the fractional part must be exactly '.000'.

    Returns:
        float: Seconds since the epoch for the local-time reading of ``txt``.

    Raises:
        ValueError: If ``txt`` does not match the expected layout.
    """
    layout = '%Y-%m-%dT%H:%M:%S.000Z'
    return datetime.datetime.strptime(txt, layout).timestamp()

In [5]:
# 区間ナンバー計算(2021-01-01T00:00を区間0とする)
def culc_section_s(unix, timespan):
    """Return the index of the ``timespan``-second interval containing ``unix``.

    Section 0 starts at epoch second 1609426800, which is 2020-12-31T15:00:00Z,
    i.e. 2021-01-01T00:00 read on a UTC+9 clock.

    Args:
        unix: Epoch seconds (int or float).
        timespan: Interval length in seconds.

    Returns:
        int: Section number; negative for timestamps before the origin.
    """
    origin = 1609426800
    # Floor division instead of int(x / timespan): int() truncates toward zero,
    # which would put the interval just before the origin into section 0 too.
    return int((unix - origin) // timespan)

In [6]:
# 区間ナンバー計算(2021-01-01T00:00を区間0とする)
def culc_section_b(unix, timespan):
    """Return the index of the ``timespan``-second interval containing ``unix``.

    Section 0 starts at epoch second 1609459200 (2021-01-01T00:00:00Z).

    Args:
        unix: Epoch seconds (int or float).
        timespan: Interval length in seconds.

    Returns:
        int: Section number; negative for timestamps before the origin.
    """
    origin = 1609459200
    # Floor division instead of int(x / timespan): int() truncates toward zero,
    # which would put the interval just before the origin into section 0 too.
    return int((unix - origin) // timespan)

In [7]:
# 区間ナンバー計算
# 引数：slist
def assign_section_s(dfs, timespan):
    """Replace 'created_at' timestamps with their section numbers.

    Each 'created_at' value is converted with utc_to_unix and then mapped to a
    section number with culc_section_s.

    Args:
        dfs: DataFrame with a 'created_at' column of timestamp strings; it is
            modified in place.
        timespan: Interval name: '1d', '12h', '4h', '1h', '30m', '15m' or '5m'.

    Returns:
        The same DataFrame object, with 'created_at' holding section numbers.

    Raises:
        KeyError: If ``timespan`` is not one of the supported interval names.
    """
    tlis = {'1d':24*60*60, '12h':12*60*60, '4h':4*60*60, '1h':60*60, '30m':30*60, '15m':15*60, '5m':5*60}

    # Fail before touching the frame; previously this only printed "KeyError"
    # and then crashed later on the unbound interval length.
    if timespan not in tlis:
        raise KeyError(f"unsupported timespan {timespan!r}; expected one of {list(tlis)}")
    s = tlis[timespan]

    dfs['created_at'] = dfs['created_at'].apply(utc_to_unix)
    dfs['created_at'] = dfs['created_at'].apply(culc_section_s, args=(s,))

    return dfs

In [8]:
# 区間ナンバー計算
# 引数：BTCUSDT
def assign_section_b(dfb, timespan):
    """Replace 'unix' timestamps with their section numbers.

    Each 'unix' value is mapped to a section number with culc_section_b.

    Args:
        dfb: DataFrame with a 'unix' column of epoch seconds; it is modified
            in place.
        timespan: Interval name: '1d', '12h', '4h', '1h', '30m', '15m' or '5m'.

    Returns:
        The same DataFrame object, with 'unix' holding section numbers.

    Raises:
        KeyError: If ``timespan`` is not one of the supported interval names.
    """
    tlis = {'1d':24*60*60, '12h':12*60*60, '4h':4*60*60, '1h':60*60, '30m':30*60, '15m':15*60, '5m':5*60}

    # Fail before touching the frame; previously this only printed "KeyError"
    # and then crashed later on the unbound interval length.
    if timespan not in tlis:
        raise KeyError(f"unsupported timespan {timespan!r}; expected one of {list(tlis)}")
    s = tlis[timespan]

    dfb['unix'] = dfb['unix'].apply(culc_section_b, args=(s,))

    return dfb

In [9]:
# 区間ナンバー
# 引数：slist_c,elist
def assign_section_e(dfs, dfe):
    """Copy the 'created_at' column of ``dfs`` into ``dfe`` as 'section'.

    The assignment is index-aligned, and ``dfe`` is modified in place.

    Args:
        dfs: DataFrame whose 'created_at' column holds the section numbers.
        dfe: DataFrame that receives the 'section' column.

    Returns:
        The same ``dfe`` object.
    """
    section_numbers = dfs['created_at']
    dfe['section'] = section_numbers
    return dfe

In [10]:
# 区間ごとにcomの平均をカウント
# 引数：slist,elist
def com_counter(dfs, dfe):
    """Average the columns of ``dfs`` per 'created_at' group.

    The grouped mean of 'Compound' is stored in ``dfe['com_ave']``
    (index-aligned, ``dfe`` is modified in place).

    Args:
        dfs: DataFrame with 'created_at' and 'Compound' columns.
        dfe: DataFrame that receives the 'com_ave' column.

    Returns:
        tuple: (the grouped-mean DataFrame with 'created_at' kept as a column,
        ``dfe`` with its index reset to 0..n-1).
    """
    dfg = dfs.groupby("created_at", as_index=False).mean()
    dfe['com_ave'] = dfg['Compound']
    renumbered = dfe.reset_index(drop=True)
    return dfg, renumbered

In [11]:
# Count the neg, neu, and pos tweets in each section
# Args: slist_s, elist
def senti_counter(dfs, dfe):
    """Count 'Negative', 'Neutral' and 'Positive' rows per section.

    The counts are written to ``dfe`` as 'neg_count', 'neu_count' and
    'pos_count', one row per section in ascending section order. ``dfe`` is
    modified in place; the assignment is index-aligned against 0..n-1.

    The previous implementation cut the value_counts() result into three equal
    chunks, which shifted the counts onto the wrong sections whenever a section
    lacked one of the sentiment classes (and failed for fewer than three rows).
    Cross-tabulating per section fills such gaps with 0 instead.

    Args:
        dfs: DataFrame with 'Sentiment' and 'created_at' columns, where
            'created_at' already holds section numbers. Rows whose label is
            not one of the three names above are ignored.
        dfe: DataFrame that receives the three count columns.
            NOTE(review): assumes its rows are the sections in ascending
            order with a 0..n-1 index - confirm against the caller.

    Returns:
        The same ``dfe`` object.
    """
    per_section = pd.crosstab(dfs['created_at'], dfs['Sentiment'])
    # Guarantee all three classes exist as columns, even if never observed.
    per_section = per_section.reindex(
        columns=['Negative', 'Neutral', 'Positive'], fill_value=0
    )

    targets = (
        ('neg_count', 'Negative'),
        ('neu_count', 'Neutral'),
        ('pos_count', 'Positive'),
    )
    for column, label in targets:
        # Drop the section labels so the counts align positionally with dfe.
        dfe[column] = per_section[label].reset_index(drop=True)

    return dfe

In [12]:
# Tweet数をカウント
# 引数：elist
def tweet_counter(dfe):
    """Add a 'tweet_count' column: neg_count + neu_count + pos_count.

    Args:
        dfe: DataFrame with 'neg_count', 'neu_count' and 'pos_count' columns;
            it is modified in place.

    Returns:
        The same ``dfe`` object.
    """
    negative = dfe['neg_count']
    neutral = dfe['neu_count']
    positive = dfe['pos_count']
    dfe['tweet_count'] = negative + neutral + positive
    return dfe

In [13]:
# 始値、ラベル
# 引数：BTCUSDT,elist
def merge(dfb, dfe):
    """Outer-join ``dfe`` with ``dfb`` on section number.

    ``dfe['section']`` is matched against ``dfb['unix']``; the 'unix' column
    is dropped from the result.

    Args:
        dfb: DataFrame with a 'unix' column holding section numbers.
        dfe: DataFrame with a 'section' column.

    Returns:
        A new DataFrame with the columns of both inputs except 'unix'.
    """
    joined = pd.merge(
        left=dfe,
        right=dfb,
        how='outer',
        left_on="section",
        right_on="unix",
    )
    return joined.drop(columns='unix')

In [14]:
# ラベル計算
# 0:down, 1:stay, 2:up
def culc_label(rate, a):
    """Classify a rate of change against the symmetric threshold ``a``.

    Args:
        rate: Rate of change.
        a: Threshold.

    Returns:
        int: 0 (down) if ``rate <= -a``, 2 (up) if ``rate >= a``, else
        1 (stay). A NaN rate fails both comparisons and therefore yields 1.
    """
    if rate <= -a:
        return 0
    if a <= rate:
        return 2
    return 1

In [15]:
# ラベル付け(トレンド = (Pn+1-Pn)/pn )
# 引数：elist
def assign_label(dfe, timespan):
    """Add a 'trend' column labelling the change of the 'open' price.

    The row-to-row fractional change of 'open' is classified with culc_label
    using an interval-specific threshold.

    NOTE(review): pct_change() compares each row with the previous one, so the
    label on row n describes the move from row n-1 to row n; the header comment
    describes (P[n+1] - P[n]) / P[n]. Confirm which is intended.

    Args:
        dfe: DataFrame with an 'open' column; it is modified in place.
        timespan: Interval name: '1d', '12h', '4h', '1h', '30m', '15m' or '5m'.

    Returns:
        The same ``dfe`` object with the 'trend' column added.

    Raises:
        KeyError: If ``timespan`` is not one of the supported interval names.
    """
    alis = {'1d':1.7*0.01, '12h':1.3*0.01, '4h':0.75*0.01, '1h':0.33*0.01, '30m':0.23*0.01, '15m':0.17*0.01, '5m':0.1*0.01}

    # Fail before touching the frame; previously this only printed "KeyError"
    # and then crashed later on the unbound threshold.
    if timespan not in alis:
        raise KeyError(f"unsupported timespan {timespan!r}; expected one of {list(alis)}")
    a = alis[timespan]

    open_rate = dfe['open'].pct_change()
    dfe['trend'] = open_rate.apply(culc_label, args=(a,))

    return dfe

In [53]:
# main
# Ask for the month of 2021 to process and for the candle interval
# (one of 1d, 12h, 4h, 1h, 30m, 15m, 5m).
month = int(input("monthを入力:"))
timespan = input("timespanを入力:")
print("month:{}".format(month))

# Zero-padded "YYYY-MM" tag used in every file name; the former hard-coded
# "2021-0{month}" produced wrong names such as "2021-010" for months 10-12.
period = f'2021-{month:02d}'

# Load the CSV files
dfe  = pd.DataFrame(columns = ['section','neg_count','neu_count','pos_count','com_ave','tweet_count'])
dfs_s = pd.read_csv(f'tweet-sentiment/{period}_slist.csv',usecols=['Sentiment','created_at'])
dfs_c = pd.read_csv(f'tweet-sentiment/{period}_slist.csv',usecols=['Compound','created_at'])
dfb = pd.read_csv(f'binance/BTCUSDT/{timespan}/BTCUSDT-{timespan}-{period}.csv', usecols = ['unix','open'])
print('read')

# Convert timestamps to section numbers
dfs_s = assign_section_s(dfs_s,timespan)
dfs_c = assign_section_s(dfs_c,timespan)
dfb = assign_section_b(dfb,timespan)
print('kukan')

# Per-section neg / neu / pos counts and mean compound score
dfs_c, dfe = com_counter(dfs_c,dfe)
dfe = senti_counter(dfs_s,dfe)
dfe = tweet_counter(dfe)
print('senti')

# Attach the section numbers to the result frame
dfe = assign_section_e(dfs_c,dfe)
print('kukan_e')

# Join with the price data
dfe = merge(dfb,dfe)
print('merge')

# Trend label
dfe = assign_label(dfe,timespan)
print('label')

# Rename the price column
dfe = dfe.rename(columns={'open': 'open_price'})

# Write the result CSV
dfe.to_csv(f'tweet-sentiment/{timespan}/{period}.csv',index=False)
print(dfe)

# Release memory
del dfs_s,dfs_c,dfb,dfe
gc.collect()

monthを入力: 1
timespanを入力: 5m


month:1
read
kukan
senti
kukan_e
merge
label
      section  neg_count  neu_count  pos_count   com_ave  tweet_count  \
0           0         39         89         92  0.161018          220   
1           1         24         63         77  0.214618          164   
2           2         13         65         61  0.222688          139   
3           3         14         41         62  0.254302          117   
4           4         11         38         55  0.293508          104   
...       ...        ...        ...        ...       ...          ...   
8923     8923         20         60         65  0.203290          145   
8924     8924         19         52         60  0.190378          131   
8925     8925         22         75         72  0.183589          169   
8926     8926         15         50         69  0.240078          134   
8927     8927         12         48         62  0.205616          122   

      open_price  trend  
0       28923.63      1  
1       28975.65      2  


0

In [54]:
# main
# Advance to the next month.
# NOTE: this cell is not idempotent - re-running it advances `month` again.
month = month+1
print("month:{}".format(month))

# Zero-padded "YYYY-MM" tag used in every file name; the former hard-coded
# "2021-0{month}" produced wrong names such as "2021-010" for months 10-12.
period = f'2021-{month:02d}'

# Load the CSV files
dfe  = pd.DataFrame(columns = ['section','neg_count','neu_count','pos_count','com_ave','tweet_count'])
dfs_s = pd.read_csv(f'tweet-sentiment/{period}_slist.csv',usecols=['Sentiment','created_at'])
dfs_c = pd.read_csv(f'tweet-sentiment/{period}_slist.csv',usecols=['Compound','created_at'])
dfb = pd.read_csv(f'binance/BTCUSDT/{timespan}/BTCUSDT-{timespan}-{period}.csv', usecols = ['unix','open'])
print('read')

# Convert timestamps to section numbers
dfs_s = assign_section_s(dfs_s,timespan)
dfs_c = assign_section_s(dfs_c,timespan)
dfb = assign_section_b(dfb,timespan)
print('kukan')

# Per-section neg / neu / pos counts and mean compound score
dfs_c, dfe = com_counter(dfs_c,dfe)
dfe = senti_counter(dfs_s,dfe)
dfe = tweet_counter(dfe)
print('senti')

# Attach the section numbers to the result frame
dfe = assign_section_e(dfs_c,dfe)
print('kukan_e')

# Join with the price data
dfe = merge(dfb,dfe)
print('merge')

# Trend label
dfe = assign_label(dfe,timespan)
print('label')

# Rename the price column
dfe = dfe.rename(columns={'open': 'open_price'})

# Write the result CSV
dfe.to_csv(f'tweet-sentiment/{timespan}/{period}.csv',index=False)
print(dfe)

# Release memory
del dfs_s,dfs_c,dfb,dfe
gc.collect()

month:2
read
kukan
senti
kukan_e
merge
label
      section  neg_count  neu_count  pos_count   com_ave  tweet_count  \
0        8928         25        133        106  0.171522          264   
1        8929         21         62         71  0.172229          154   
2        8930          9         56         64  0.232106          129   
3        8931         20         61         69  0.208115          150   
4        8932         23         42         71  0.209735          136   
...       ...        ...        ...        ...       ...          ...   
8059    16987         15         76         80  0.242389          171   
8060    16988         21         67         61  0.162493          149   
8061    16989         20         72         61  0.164341          153   
8062    16990         17         65         81  0.215347          163   
8063    16991         29         66         78  0.190436          173   

      open_price  trend  
0       33092.97      1  
1       32866.41      0  


0

In [55]:
# main
# Advance to the next month.
# NOTE: this cell is not idempotent - re-running it advances `month` again.
month = month+1
print("month:{}".format(month))

# Zero-padded "YYYY-MM" tag used in every file name; the former hard-coded
# "2021-0{month}" produced wrong names such as "2021-010" for months 10-12.
period = f'2021-{month:02d}'

# Load the CSV files
dfe  = pd.DataFrame(columns = ['section','neg_count','neu_count','pos_count','com_ave','tweet_count'])
dfs_s = pd.read_csv(f'tweet-sentiment/{period}_slist.csv',usecols=['Sentiment','created_at'])
dfs_c = pd.read_csv(f'tweet-sentiment/{period}_slist.csv',usecols=['Compound','created_at'])
dfb = pd.read_csv(f'binance/BTCUSDT/{timespan}/BTCUSDT-{timespan}-{period}.csv', usecols = ['unix','open'])
print('read')

# Convert timestamps to section numbers
dfs_s = assign_section_s(dfs_s,timespan)
dfs_c = assign_section_s(dfs_c,timespan)
dfb = assign_section_b(dfb,timespan)
print('kukan')

# Per-section neg / neu / pos counts and mean compound score
dfs_c, dfe = com_counter(dfs_c,dfe)
dfe = senti_counter(dfs_s,dfe)
dfe = tweet_counter(dfe)
print('senti')

# Attach the section numbers to the result frame
dfe = assign_section_e(dfs_c,dfe)
print('kukan_e')

# Join with the price data
dfe = merge(dfb,dfe)
print('merge')

# Trend label
dfe = assign_label(dfe,timespan)
print('label')

# Rename the price column
dfe = dfe.rename(columns={'open': 'open_price'})

# Write the result CSV
dfe.to_csv(f'tweet-sentiment/{timespan}/{period}.csv',index=False)
print(dfe)

# Release memory
del dfs_s,dfs_c,dfb,dfe
gc.collect()

month:3
read
kukan
senti
kukan_e
merge
label
      section  neg_count  neu_count  pos_count   com_ave  tweet_count  \
0       16992         32        153        105  0.156191          290   
1       16993         30         49         80  0.222191          159   
2       16994         25         65         75  0.191036          165   
3       16995         22        172         91  0.141046          285   
4       16996         28         84         81  0.161038          193   
...       ...        ...        ...        ...       ...          ...   
8923    25915         11         54        107  0.354083          172   
8924    25916         21         59         99  0.283106          179   
8925    25917         31         55        105  0.236597          191   
8926    25918         18         55         95  0.294566          168   
8927    25919         17         51         90  0.305859          158   

      open_price  trend  
0       45134.11      1  
1       45032.48      0  


0

In [56]:
# main
# Advance to the next month.
# NOTE: this cell is not idempotent - re-running it advances `month` again.
month = month+1
print("month:{}".format(month))

# Zero-padded "YYYY-MM" tag used in every file name; the former hard-coded
# "2021-0{month}" produced wrong names such as "2021-010" for months 10-12.
period = f'2021-{month:02d}'

# Load the CSV files
dfe  = pd.DataFrame(columns = ['section','neg_count','neu_count','pos_count','com_ave','tweet_count'])
dfs_s = pd.read_csv(f'tweet-sentiment/{period}_slist.csv',usecols=['Sentiment','created_at'])
dfs_c = pd.read_csv(f'tweet-sentiment/{period}_slist.csv',usecols=['Compound','created_at'])
dfb = pd.read_csv(f'binance/BTCUSDT/{timespan}/BTCUSDT-{timespan}-{period}.csv', usecols = ['unix','open'])
print('read')

# Convert timestamps to section numbers
dfs_s = assign_section_s(dfs_s,timespan)
dfs_c = assign_section_s(dfs_c,timespan)
dfb = assign_section_b(dfb,timespan)
print('kukan')

# Per-section neg / neu / pos counts and mean compound score
dfs_c, dfe = com_counter(dfs_c,dfe)
dfe = senti_counter(dfs_s,dfe)
dfe = tweet_counter(dfe)
print('senti')

# Attach the section numbers to the result frame
dfe = assign_section_e(dfs_c,dfe)
print('kukan_e')

# Join with the price data
dfe = merge(dfb,dfe)
print('merge')

# Trend label
dfe = assign_label(dfe,timespan)
print('label')

# Rename the price column
dfe = dfe.rename(columns={'open': 'open_price'})

# Write the result CSV
dfe.to_csv(f'tweet-sentiment/{timespan}/{period}.csv',index=False)
print(dfe)

# Release memory
del dfs_s,dfs_c,dfb,dfe
gc.collect()

month:4
read
kukan
senti
kukan_e
merge
label
      section  neg_count  neu_count  pos_count   com_ave  tweet_count  \
0       25920         29        147        157  0.236977          333   
1       25921         27         67        135  0.304423          229   
2       25922         30         54        141  0.307527          225   
3       25923         20         64        126  0.302876          210   
4       25924         24         48        118  0.316482          190   
...       ...        ...        ...        ...       ...          ...   
8635    34555         23         71         83  0.187949          177   
8636    34556         23         74         69  0.182055          166   
8637    34557         23         63         95  0.222536          181   
8638    34558         24         71         78  0.169514          173   
8639    34559         18         63         56  0.172942          137   

      open_price  trend  
0       58739.46      1  
1       59071.84      2  


0

In [57]:
# main
# Advance to the next month.
# NOTE: this cell is not idempotent - re-running it advances `month` again.
month = month+1
print("month:{}".format(month))

# Zero-padded "YYYY-MM" tag used in every file name; the former hard-coded
# "2021-0{month}" produced wrong names such as "2021-010" for months 10-12.
period = f'2021-{month:02d}'

# Load the CSV files
dfe  = pd.DataFrame(columns = ['section','neg_count','neu_count','pos_count','com_ave','tweet_count'])
dfs_s = pd.read_csv(f'tweet-sentiment/{period}_slist.csv',usecols=['Sentiment','created_at'])
dfs_c = pd.read_csv(f'tweet-sentiment/{period}_slist.csv',usecols=['Compound','created_at'])
dfb = pd.read_csv(f'binance/BTCUSDT/{timespan}/BTCUSDT-{timespan}-{period}.csv', usecols = ['unix','open'])
print('read')

# Convert timestamps to section numbers
dfs_s = assign_section_s(dfs_s,timespan)
dfs_c = assign_section_s(dfs_c,timespan)
dfb = assign_section_b(dfb,timespan)
print('kukan')

# Per-section neg / neu / pos counts and mean compound score
dfs_c, dfe = com_counter(dfs_c,dfe)
dfe = senti_counter(dfs_s,dfe)
dfe = tweet_counter(dfe)
print('senti')

# Attach the section numbers to the result frame
dfe = assign_section_e(dfs_c,dfe)
print('kukan_e')

# Join with the price data
dfe = merge(dfb,dfe)
print('merge')

# Trend label
dfe = assign_label(dfe,timespan)
print('label')

# Rename the price column
dfe = dfe.rename(columns={'open': 'open_price'})

# Write the result CSV
dfe.to_csv(f'tweet-sentiment/{timespan}/{period}.csv',index=False)
print(dfe)

# Release memory
del dfs_s,dfs_c,dfb,dfe
gc.collect()

month:5
read
kukan
senti
kukan_e
merge
label
      section  neg_count  neu_count  pos_count   com_ave  tweet_count  \
0       34560         35        157        122  0.165805          314   
1       34561         29         75         76  0.143494          180   
2       34562         19         78         80  0.194917          177   
3       34563         20         64         83  0.222073          167   
4       34564         23         76         91  0.210076          190   
...       ...        ...        ...        ...       ...          ...   
8923    43483         15         47         86  0.270678          148   
8924    43484         22         62         60  0.155353          144   
8925    43485         20         52         81  0.229703          153   
8926    43486         20         46         64  0.198910          130   
8927    43487         17         48         75  0.239001          140   

      open_price  trend  
0       57697.25      1  
1       57777.02      2  


0

In [58]:
# main
# Advance to the next month.
# NOTE: this cell is not idempotent - re-running it advances `month` again.
month = month+1
print("month:{}".format(month))

# Zero-padded "YYYY-MM" tag used in every file name; the former hard-coded
# "2021-0{month}" produced wrong names such as "2021-010" for months 10-12.
period = f'2021-{month:02d}'

# Load the CSV files
dfe  = pd.DataFrame(columns = ['section','neg_count','neu_count','pos_count','com_ave','tweet_count'])
dfs_s = pd.read_csv(f'tweet-sentiment/{period}_slist.csv',usecols=['Sentiment','created_at'])
dfs_c = pd.read_csv(f'tweet-sentiment/{period}_slist.csv',usecols=['Compound','created_at'])
dfb = pd.read_csv(f'binance/BTCUSDT/{timespan}/BTCUSDT-{timespan}-{period}.csv', usecols = ['unix','open'])
print('read')

# Convert timestamps to section numbers
dfs_s = assign_section_s(dfs_s,timespan)
dfs_c = assign_section_s(dfs_c,timespan)
dfb = assign_section_b(dfb,timespan)
print('kukan')

# Per-section neg / neu / pos counts and mean compound score
dfs_c, dfe = com_counter(dfs_c,dfe)
dfe = senti_counter(dfs_s,dfe)
dfe = tweet_counter(dfe)
print('senti')

# Attach the section numbers to the result frame
dfe = assign_section_e(dfs_c,dfe)
print('kukan_e')

# Join with the price data
dfe = merge(dfb,dfe)
print('merge')

# Trend label
dfe = assign_label(dfe,timespan)
print('label')

# Rename the price column
dfe = dfe.rename(columns={'open': 'open_price'})

# Write the result CSV
dfe.to_csv(f'tweet-sentiment/{timespan}/{period}.csv',index=False)
print(dfe)

# Release memory
del dfs_s,dfs_c,dfb,dfe
gc.collect()

month:6
read
kukan
senti
kukan_e
merge
label
      section  neg_count  neu_count  pos_count   com_ave  tweet_count  \
0       43488         44        150        124  0.146325          318   
1       43489         39         79         99  0.163222          217   
2       43490         16         56         85  0.265532          157   
3       43491         22         79         81  0.195236          182   
4       43492         28         80         86  0.164121          194   
...       ...        ...        ...        ...       ...          ...   
8635    52123         19         81        103  0.255028          203   
8636    52124         19         60         96  0.269868          175   
8637    52125         21         87         86  0.199548          194   
8638    52126         20         68         74  0.205235          162   
8639    52127         20         76         81  0.192937          177   

      open_price  trend  
0       37253.82      1  
1       37599.99      2  


0

In [None]:
# First-pass preprocessing of the Binance data.
# The code below is disabled: it is kept inside a string literal and does not run.
    # assign column names
    # convert the time to UNIX seconds (the value is divided by 1000)
    # drop close_time
'''
def div(unix):
    unix = int(unix/1000)
    return unix

df = pd.read_csv('binance/BTCUSDT/15m/BTCUSDT-15m-2021-01.csv', header=None)
df.columns = ['unix',
                'open',
                'high',
                'low',
                'close',
                'volume',
                'close_time',
                'quote_asset_volume',
                'number_of_trades',
                'taker_buy_base_asset_volume',
                'taker_buy_quote_asset_volume',
                'ignore']
df['unix'] = df['unix'].apply(div)
df.drop(columns=['close_time'], inplace=True)
df.to_csv('binance/BTCUSDT/15m/BTCUSDT-15m-2021-01.csv', index=False)
del df
gc.collect()

'''

In [None]:
# For deciding the label thresholds.
# The code below is disabled: it is kept inside a string literal and does not run.
# It prints percentiles of the open-price rate of change over six monthly files
# and plots its histogram.

'''
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

month = 1
timespan = str(input("timespanを入力:"))

files = [f'binance/BTCUSDT/{timespan}/BTCUSDT-{timespan}-2021-0{month}.csv',
         f'binance/BTCUSDT/{timespan}/BTCUSDT-{timespan}-2021-0{month+1}.csv',
         f'binance/BTCUSDT/{timespan}/BTCUSDT-{timespan}-2021-0{month+2}.csv',
         f'binance/BTCUSDT/{timespan}/BTCUSDT-{timespan}-2021-0{month+3}.csv',
         f'binance/BTCUSDT/{timespan}/BTCUSDT-{timespan}-2021-0{month+4}.csv',
         f'binance/BTCUSDT/{timespan}/BTCUSDT-{timespan}-2021-0{month+5}.csv']
datas = []
df = pd.DataFrame(columns = ['open'])
for file in files:
    data = pd.read_csv(file, usecols = ['open'])
    datas.append(data)

df = pd.concat(datas).reset_index(drop=True)

dfr = pd.DataFrame(columns = ['open_rate'])
dfr['open_rate'] = df['open'].pct_change()

print(dfr.describe(percentiles=[0.1,0.2,0.3,0.333,0.4,0.6,0.667,0.7,0.8,0.9]))

plt.hist(dfr['open_rate'], bins=100000, ec='navy')
plt.title('open_rate1-6')
plt.xlabel('rate')
plt.ylabel('count')
plt.show()

'''