In [1]:
import pandas as pd
import numpy as np
import nltk
import re
import time
import schedule
import datetime
import gc

In [17]:
# Text preprocessing
nltk.download('stopwords')
from nltk.corpus import stopwords
# Keep the English stop words in a set: clean_up_tweet tests every token of
# every tweet against this collection, and a set answers `in` in constant
# time whereas the list returned by stopwords.words() is scanned linearly.
sw = set(stopwords.words('english'))

def clean_up_tweet(txt):
    """Normalise a raw tweet into lower-case text without stop words.

    Steps, in order: strip @mentions, http(s) URLs, the '#' marker (the tag
    word itself is kept) and $ticker symbols; turn full-width spaces, tabs
    and line breaks into ordinary spaces; lower-case; drop every token that
    is found in the module-level stop-word collection ``sw``.

    Args:
        txt: Tweet text as a ``str``.

    Returns:
        The cleaned text, tokens separated by single spaces.
    """
    # Remove mentions
    txt = re.sub(r'@[A-Za-z0-9_]+', '', txt)
    # Remove URLs
    txt = re.sub(r'https?:\/\/[A-Za-z0-9\.\/]+', '', txt)
    # Remove the hashtag marker
    txt = re.sub(r'#', '', txt)
    # Remove ticker symbols
    txt = re.sub(r'\$[A-Za-z0-9]*', '', txt)
    # Full-width spaces, tabs and line breaks become a plain space. Deleting
    # them outright would glue the words on either side together
    # ("bitcoin\ncurrent" -> "bitcoincurrent"); the split/join below collapses
    # any run of spaces this leaves behind.
    txt = re.sub(r'[\u3000\t\n]', ' ', txt)
    # Lower-case everything
    txt = txt.lower()
    # Replace digits with 0 (disabled)
    #txt = re.sub(r'[0-9]', '0', txt)
    # Drop stop words
    return ' '.join(word for word in txt.split() if word not in sw)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\zxxxs\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
# UTC -> UNIX
# For the January-June exports
def utc_to_unix(txt):
    """Convert a 'YYYY-MM-DDTHH:MM:SS.000Z' string into epoch seconds.

    The wall-clock value in ``txt`` is deliberately read as UTC+9 (Japan
    time), not as UTC: culc_section_t compensates for that by using
    1609426800 (2021-01-01T00:00 at UTC+9) as its origin. The offset is
    attached explicitly because ``timestamp()`` on a naive datetime uses the
    local timezone of whatever machine runs the notebook, so the result used
    to be correct only on a host configured for Japan time.

    Args:
        txt: Timestamp string in exactly the format
            '%Y-%m-%dT%H:%M:%S.000Z'.

    Returns:
        float: epoch seconds of that wall-clock time taken at UTC+9, i.e.
        32400 s less than the true Unix time of the UTC instant.

    Raises:
        ValueError: if ``txt`` does not match the expected format.
    """
    naive = datetime.datetime.strptime(txt, '%Y-%m-%dT%H:%M:%S.000Z')
    jst = datetime.timezone(datetime.timedelta(hours=9))

    return naive.replace(tzinfo=jst).timestamp()

In [28]:
# UTC -> UNIX
# For the July export
def utc_to_unix7(txt):
    """Convert a 'YYYY-MM-DD HH:MM:SS+00:00' string into epoch seconds.

    The '+00:00' suffix is matched as literal text, so the parsed datetime
    is naive. Its wall-clock value is deliberately read as UTC+9 (Japan
    time): culc_section_t compensates by using 1609426800
    (2021-01-01T00:00 at UTC+9) as its origin. The offset is attached
    explicitly because ``timestamp()`` on a naive datetime uses the local
    timezone of the machine running the notebook, so the result used to be
    correct only on a host configured for Japan time.

    Args:
        txt: Timestamp string in exactly the format
            '%Y-%m-%d %H:%M:%S+00:00'.

    Returns:
        float: epoch seconds of that wall-clock time taken at UTC+9, i.e.
        32400 s less than the true Unix time of the UTC instant.

    Raises:
        ValueError: if ``txt`` does not match the expected format.
    """
    naive = datetime.datetime.strptime(txt, '%Y-%m-%d %H:%M:%S+00:00')
    jst = datetime.timezone(datetime.timedelta(hours=9))

    return naive.replace(tzinfo=jst).timestamp()

In [20]:
# Section-number calculation for tweets (section 0 starts at 2021-01-01T00:00)
def culc_section_t(unix,timespan):
    """Return the index of the ``timespan``-second section containing ``unix``.

    The origin is 1609426800, which is 2021-01-01T00:00 at UTC+9
    (2020-12-31T15:00 UTC). The quotient is truncated towards zero by
    ``int()``, so values less than one section before the origin also map
    to section 0.

    Args:
        unix: Timestamp in seconds.
        timespan: Section length in seconds.

    Returns:
        int: the section number.
    """
    origin = 1609426800
    elapsed = unix - origin
    return int(elapsed / timespan)

In [21]:
# Section-number calculation for candles (section 0 starts at 2021-01-01T00:00)
def culc_section_b(unix,timespan):
    """Return the index of the ``timespan``-second section containing ``unix``.

    The origin is 1609459200, i.e. 2021-01-01T00:00:00 UTC. The quotient is
    truncated towards zero by ``int()``, so values less than one section
    before the origin also map to section 0.

    Args:
        unix: Timestamp in seconds.
        timespan: Section length in seconds.

    Returns:
        int: the section number.
    """
    origin = 1609459200
    elapsed = unix - origin
    return int(elapsed / timespan)

In [22]:
# Section-number assignment
# Argument: tlist (tweet table)
def assign_section_t(dft,timespan,month):
    """Replace each tweet's 'created_at' timestamp with its section number.

    Section 0 starts at 2021-01-01T00:00:00 UTC and every section lasts
    ``timespan``. The timestamps are parsed as UTC with pandas, so the
    result does not depend on the timezone of the machine running the
    notebook, and the whole column is converted in one vectorised pass.

    Args:
        dft: DataFrame with a 'created_at' column of timestamp strings.
            The column is overwritten in place.
        timespan: One of '1d', '12h', '4h', '1h', '30m', '15m', '5m'.
        month: Month number of the export. Month 7 uses the
            'YYYY-MM-DD HH:MM:SS+00:00' layout; every other month uses
            'YYYY-MM-DDTHH:MM:SS.000Z'.

    Returns:
        The same DataFrame, with 'created_at' holding int64 section numbers.

    Raises:
        KeyError: if ``timespan`` is not one of the supported intervals.
        ValueError: if a timestamp does not match the month's layout.
    """
    tlis = {'1d':24*60*60, '12h':12*60*60, '4h':4*60*60, '1h':60*60, '30m':30*60, '15m':15*60, '5m':5*60}

    # Fail with a real exception instead of printing and then crashing on an
    # unbound local further down.
    if timespan not in tlis:
        raise KeyError(f"unsupported timespan {timespan!r}; expected one of {list(tlis)}")
    s = tlis[timespan]

    # The zone suffix is matched as literal text; utc=True then marks the
    # parsed wall-clock values as UTC.
    if month == 7:
        fmt = '%Y-%m-%d %H:%M:%S+00:00'
    else:
        fmt = '%Y-%m-%dT%H:%M:%S.000Z'
    stamps = pd.to_datetime(dft['created_at'], format=fmt, utc=True)

    elapsed = (stamps - pd.Timestamp('2021-01-01', tz='UTC')).dt.total_seconds()
    # astype truncates towards zero, like the scalar int() conversion in
    # culc_section_t.
    dft['created_at'] = (elapsed / s).astype('int64')

    return dft

In [23]:
# Section-number assignment
# Argument: BTCUSDT (candle table)
def assign_section_b(dfb,timespan):
    """Replace each candle's 'unix' timestamp with its section number.

    Section 0 starts at 1609459200 (2021-01-01T00:00:00 UTC) and every
    section lasts ``timespan``. This is the same arithmetic as
    culc_section_b, applied to the whole column at once.

    Args:
        dfb: DataFrame with a numeric 'unix' column in seconds. The column
            is overwritten in place.
        timespan: One of '1d', '12h', '4h', '1h', '30m', '15m', '5m'.

    Returns:
        The same DataFrame, with 'unix' holding int64 section numbers.

    Raises:
        KeyError: if ``timespan`` is not one of the supported intervals.
    """
    tlis = {'1d':24*60*60, '12h':12*60*60, '4h':4*60*60, '1h':60*60, '30m':30*60, '15m':15*60, '5m':5*60}

    # Fail with a real exception instead of printing and then crashing on an
    # unbound local further down.
    if timespan not in tlis:
        raise KeyError(f"unsupported timespan {timespan!r}; expected one of {list(tlis)}")
    s = tlis[timespan]

    # astype truncates towards zero, like the scalar int() conversion.
    dfb['unix'] = ((dfb['unix'] - 1609459200) / s).astype('int64')

    return dfb

In [24]:
# Label calculation
# 0:down, 1:stay, 2:up
def culc_label(rate,a):
    """Classify a rate of change against the symmetric threshold ``a``.

    Args:
        rate: Relative price change.
        a: Threshold; both bounds are inclusive.

    Returns:
        0 when ``rate <= -a`` (down), 2 when ``rate >= a`` (up), otherwise 1
        (stay). A NaN rate fails both comparisons and is therefore labelled 1.
    """
    if rate <= -a:
        return 0
    if rate >= a:
        return 2
    return 1

In [25]:
# Labelling (trend = (P[n+1] - P[n]) / P[n])
# Argument: BTCUSDT (candle table)
def assign_label(dfb,timespan):
    """Add a 'trend' column classifying the change of 'open' between rows.

    The rate of change is ``dfb['open'].pct_change()``; it is compared with
    an interval-specific threshold ``a`` using the same rule as culc_label:
    0 (down) when rate <= -a, 2 (up) when rate >= a, otherwise 1 (stay).
    The first row has no predecessor, so its rate is NaN and it is
    labelled 1.

    Args:
        dfb: DataFrame with an 'open' price column. A 'trend' column is
            added in place.
        timespan: One of '1d', '12h', '4h', '1h', '30m', '15m', '5m'.

    Returns:
        The same DataFrame with the integer 'trend' column added.

    Raises:
        KeyError: if ``timespan`` is not one of the supported intervals.
    """
    alis = {'1d':1.7*0.01, '12h':1.3*0.01, '4h':0.75*0.01, '1h':0.33*0.01, '30m':0.23*0.01, '15m':0.17*0.01, '5m':0.1*0.01}

    # Fail with a real exception instead of printing and then crashing on an
    # unbound local further down.
    if timespan not in alis:
        raise KeyError(f"unsupported timespan {timespan!r}; expected one of {list(alis)}")
    a = alis[timespan]

    open_rate = dfb['open'].pct_change()
    # NaN satisfies neither condition and falls through to the default label.
    dfb['trend'] = np.select([open_rate <= -a, open_rate >= a], [0, 2], default=1)

    return dfb

In [26]:
# Join tweets with opening price / label
# Arguments: tlist (tweet table), BTCUSDT (candle table)
def merge(dft,dfb):
    """Outer-join tweets and candles on their section number.

    Rows are matched on ``dft['created_at']`` == ``dfb['unix']``. Because the
    join is an outer join, rows present on only one side are kept with NaN in
    the other side's columns. The right-hand key column 'unix' is removed
    from the result.

    Args:
        dft: Tweet table with a 'created_at' key column.
        dfb: Candle table with a 'unix' key column.

    Returns:
        The joined DataFrame without the 'unix' column.
    """
    joined = dft.merge(dfb, how='outer', left_on='created_at', right_on='unix')
    return joined.drop(columns='unix')

In [None]:
# Disabled cell: the code below is wrapped in a string literal, so none of it
# runs. For every timespan and month it reads the tweet and BTCUSDT CSVs,
# assigns section numbers, cleans the tweets, labels the candles, renames the
# columns and writes '<month>_b.csv' / '<month>_t.csv' under
# 'tweet-transformer/<timespan>/'. No merge is performed.
'''
# main
# テキスト前処理、区間ナンバー割り当て、ラベル付けのみ
# マージなし
tlist = ['1d','12h','4h','1h','30m','15m','5m']
mlist = [1,2,3,4,5,6,7]
for timespan in tlist: 
    for month in mlist: 
        print(timespan)
        print(month)

        # csv読み込み
        dft = pd.read_csv(f'tweet-of-btc/2021-0{month}_tlist.csv',usecols=['Tweet','created_at'])
        dfb = pd.read_csv(f'binance/BTCUSDT/{timespan}/BTCUSDT-{timespan}-2021-0{month}.csv', usecols = ['unix','open'])
        print('データ読み込み完了')
        
        # 区間ナンバー
        dft = assign_section_t(dft,timespan,month)
        dfb = assign_section_b(dfb,timespan)
        print('区間ナンバー割り当て完了')

        #ツイート前処理
        dft['Tweet'] = dft['Tweet'].apply(clean_up_tweet)
        print('tweet前処理')

        # ラベル(トレンド)
        dfb = assign_label(dfb,timespan)
        print('ラベル付け完了')

        # カラム名変更
        dft = dft.rename(columns={'created_at':'section',
                                  'Tweet':'tweet(n)'})
        dfb = dfb.rename(columns={'unix':'section',
                                  'open':'open_price(n)',
                                  'trend':'trend(n)'})
        
        # csv出力
        dfb.to_csv(f'tweet-transformer/{timespan}/2021-0{month}_b.csv',index=False)
        dft.to_csv(f'tweet-transformer/{timespan}/2021-0{month}_t.csv',index=False)
        print('出力完了')


        # メモリ開放
        del dft,dfb
        gc.collect()
        print('メモリ開放')
'''

In [None]:
# main
# Draft of the preprocessing for the Transformer model: for every candle
# interval, build one merged tweet/label table covering January-July 2021.
#
# The pipeline follows the single-month test cell further down (read ->
# section numbers -> tweet clean-up -> labels -> shift -> merge -> dropna),
# applied to the seven monthly files concatenated together.
#
# NOTE(review): the previous draft wrote to '2021-0{month}.csv' although no
# `month` was defined at that point. The combined file name used below
# ('2021-01_07.csv') is an assumption - confirm the intended name.
mlist = [1,2,3,4,5,6,7]
tlist = ['1d','12h','4h','1h','30m','15m','5m']

# The tweets do not depend on the candle interval, so read and clean them
# once, one frame per month.
tweets_by_month = {}
for month in mlist:
    tdata = pd.read_csv(f'tweet-of-btc/2021-0{month}_tlist.csv',usecols=['Tweet','created_at'])
    tdata['Tweet'] = tdata['Tweet'].apply(clean_up_tweet)
    tweets_by_month[month] = tdata

for timespan in tlist:
    print(timespan)

    tdatas = []
    bdatas = []
    for month in mlist:
        # Sections are assigned month by month because assign_section_t
        # chooses the timestamp format from `month`. A copy is passed because
        # it overwrites 'created_at' in place and the raw timestamps are
        # needed again for the next interval.
        tdatas.append(assign_section_t(tweets_by_month[month].copy(),timespan,month))
        # The candle files are specific to the interval, so they are read
        # inside the loop.
        bdatas.append(pd.read_csv(f'binance/BTCUSDT/{timespan}/BTCUSDT-{timespan}-2021-0{month}.csv', usecols = ['unix','open']))

    # Concatenate the monthly files
    dft = pd.concat(tdatas).reset_index(drop=True)
    dfb = pd.concat(bdatas).reset_index(drop=True)
    del tdatas,bdatas

    # Section numbers and labels for the candles
    dfb = assign_section_b(dfb,timespan)
    dfb = assign_label(dfb,timespan)
    dfb = dfb.rename(columns={'open':'open_price(n)',
                              'trend':'trend(n)'})

    # Arrange target and explanatory variables: the next section's opening
    # price is this section's closing price, and the next section's trend is
    # the prediction target.
    dfb['end_price(n)'] = dfb['open_price(n)'].shift(-1)
    dfb['trend(n+1)'] = dfb['trend(n)'].shift(-1)
    dfb = dfb.drop(columns=['open_price(n)'])
    print('変数整理完了')

    # merge
    dfm = merge(dft,dfb)
    print('merge完了')

    # Rename columns
    dfm = dfm.rename(columns={'created_at': 'section',
                              'Tweet':'tweet(n)'})

    # Drop rows containing missing values
    dfm = dfm.dropna(how='any')

    # Cast only after dropna: shift(-1) leaves NaN in the last row, and NaN
    # cannot be converted to int.
    dfm = dfm.astype({'trend(n+1)': int})

    # Write the CSV
    dfm.to_csv(f'tweet-transformer/{timespan}/2021-01_07.csv',index=False)
    print(dfm)

    # Free memory; only the frames built in this iteration are deleted so
    # that the next interval can still use the cleaned tweets.
    del dft,dfb,dfm
    gc.collect()

del tweets_by_month


In [None]:
# For Transformer testing
# Disabled cell: the code below is wrapped in a string literal, so none of it
# runs. It is the single-month (month = 7) version of the full pipeline:
# read, assign sections, clean tweets, label, shift, merge, drop NaN rows,
# cast 'trend(n+1)' to int and write one CSV per timespan.
# NOTE(review): inside the string, assign_section_t is called with two
# arguments, while the function defined in this notebook takes three
# (dft, timespan, month).
'''
# main
month = 7
#timespan = str(input("timespanを入力:"))
print("month:{}".format(month))

tlist = ['1d','12h','4h','1h','30m','15m','5m']
for timespan in tlist:   
    print(timespan)
    
    # csv読み込み
    dft = pd.read_csv(f'tweet-of-btc/2021-0{month}_tlist.csv',usecols=['Tweet','created_at'])
    dfb = pd.read_csv(f'binance/BTCUSDT/{timespan}/BTCUSDT-{timespan}-2021-0{month}.csv', usecols = ['unix','open'])
    print('データ読み込み完了')

    # 区間ナンバー
    dft = assign_section_t(dft,timespan)
    dfb = assign_section_b(dfb,timespan)
    print('区間ナンバー割り当て完了')

    #ツイート前処理
    dft['Tweet'] = dft['Tweet'].apply(clean_up_tweet)
    print('tweet前処理')
    
    # ラベル(トレンド)
    dfb = assign_label(dfb,timespan)
    print('ラベル付け完了')
    
    # カラム名変更
    dfb = dfb.rename(columns={'open':'open_price(n)',
                              'trend':'trend(n)'})
    
    #目的変数、説明変数の整理
    dfb['end_price(n)'] = dfb['open_price(n)'].shift(-1)
    dfb['trend(n+1)'] = dfb['trend(n)'].shift(-1)
    dfb = dfb.drop(columns=['open_price(n)'])
    print('変数整理完了')
    
    # merge
    dfm = merge(dft,dfb)
    print('merge完了')

    # カラム名変更
    dfm = dfm.rename(columns={'created_at': 'section',
                              'Tweet':'tweet(n)'})

    # 欠損値を含む行を削除
    dfm = dfm.dropna(how='any')
    
    # 型変換
    dfm['trend(n+1)'] = dfm['trend(n+1)'].astype(int)
    
    # csv出力
    dfm.to_csv(f'tweet-transformer/{timespan}/2021-0{month}.csv',index=False)
    print(dfm)

    # メモリ開放
    del dft,dfb,dfm
    gc.collect()
'''

In [16]:
# First-pass processing of the Binance data
    # add column names
    # convert the time to UNIX seconds
    # drop close_time
# Disabled cell: the code below is wrapped in a string literal, so none of it
# runs. For each timespan it rewrites the 2021-07 BTCUSDT CSV in place:
# assigns the twelve column names, divides 'unix' by 1000 and drops
# 'close_time'.
'''
def div(unix):
    unix = int(unix/1000)
    return unix

tlist = ['1d','12h','4h','1h','30m','15m','5m']
for timespan in tlist:   
    df = pd.read_csv(f'binance/BTCUSDT/{timespan}/BTCUSDT-{timespan}-2021-07.csv', header=None)
    df.columns = ['unix',
                    'open',
                    'high',
                    'low',
                    'close',
                    'volume',
                    'close_time',
                    'quote_asset_volume',
                    'number_of_trades',
                    'taker_buy_base_asset_volume',
                    'taker_buy_quote_asset_volume',
                    'ignore']
    df['unix'] = df['unix'].apply(div)
    df.drop(columns=['close_time'], inplace=True)
    df.to_csv(f'binance/BTCUSDT/{timespan}/BTCUSDT-{timespan}-2021-07.csv', index=False)
    del df
    gc.collect()
'''


In [None]:
# For choosing the label thresholds
# Disabled cell: the code below is wrapped in a string literal, so none of it
# runs. It concatenates the 'open' column of six consecutive monthly BTCUSDT
# files for a timespan typed in by the user, prints percentiles of the
# row-to-row rate of change and plots its histogram.

'''
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

month = 1
timespan = str(input("timespanを入力:"))

files = [f'binance/BTCUSDT/{timespan}/BTCUSDT-{timespan}-2021-0{month}.csv',
         f'binance/BTCUSDT/{timespan}/BTCUSDT-{timespan}-2021-0{month+1}.csv',
         f'binance/BTCUSDT/{timespan}/BTCUSDT-{timespan}-2021-0{month+2}.csv',
         f'binance/BTCUSDT/{timespan}/BTCUSDT-{timespan}-2021-0{month+3}.csv',
         f'binance/BTCUSDT/{timespan}/BTCUSDT-{timespan}-2021-0{month+4}.csv',
         f'binance/BTCUSDT/{timespan}/BTCUSDT-{timespan}-2021-0{month+5}.csv']
datas = []
df = pd.DataFrame(columns = ['open'])
for file in files:
    data = pd.read_csv(file, usecols = ['open'])
    datas.append(data)

df = pd.concat(datas).reset_index(drop=True)

dfr = pd.DataFrame(columns = ['open_rate'])
dfr['open_rate'] = df['open'].pct_change()

print(dfr.describe(percentiles=[0.1,0.2,0.3,0.333,0.4,0.6,0.667,0.7,0.8,0.9]))

plt.hist(dfr['open_rate'], bins=100000, ec='navy')
plt.title('open_rate1-6')
plt.xlabel('rate')
plt.ylabel('count')
plt.show()

'''

In [13]:
# Sort tweets by creation time, earliest first
# Disabled cell: the code below is wrapped in a string literal, so none of it
# runs. It parses 'created_at' of the 2021-07 tweet CSV as datetimes and
# sorts the frame ascending; the result is not written back to disk.
'''
df = pd.read_csv('tweet-of-btc/2021-07_tlist.csv')
df['created_at'] = pd.to_datetime(df['created_at'], infer_datetime_format= True)
df = df.sort_values(by = 'created_at', ascending = True) 
'''

In [52]:
# Manual memory clean-up. The `del` is left commented out, so only a garbage
# collection pass runs; its return value is shown as the cell output.
#del dft,dfb
gc.collect()

59

In [2]:
# Strip the helper columns from test7_v2.csv, rewriting the file in place.
df = pd.read_csv('tweet-transformer/1h/test7_v2.csv')
# errors='ignore' keeps this cell re-runnable: the file is overwritten below,
# so on a second run the columns are already gone and a plain drop() would
# raise KeyError.
df = df.drop(columns=['section', 'trend(n)','end_price(n)'], errors='ignore')
df.to_csv('tweet-transformer/1h/test7_v2.csv',index=False)
print(df)

# Free the frame; it is not used after this cell.
del df
gc.collect()

                                                   tweet_n  trend_next
0        top50 cryptocurrency in/out update last 12 hou...           0
1        bitcoin: .39💚 +159.43 last 1 hour (+0.46%)💚 +2...           0
2        bitcoincurrent price: 35060€ 29561.01cryptocur...           0
3                  gold | | gold waylong short btc trade8:           0
4        binance activity:🔵 bought worth 47.1m+ usdt🔴 s...           0
...                                                    ...         ...
1875228  bear market over? bitcoin could soar strong ho...           1
1875229  join waitlist blockfi credit card link we’ll s...           1
1875230  nobody make investments based someone’s twitte...           1
1875231  people expected bitcoin crash less two weeks a...           1
1875232                              bitcoin 25%this week.           1

[1875233 rows x 2 columns]


0