In [1]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import gc
import time
import datetime

In [2]:
# Load the monthly CSVs, concatenate them, and drop rows containing missing values.
# Feature set 1 : neg_count, neu_count, pos_count
def read_data1(timespan,n):
    """Build the lagged feature table for the sentiment-count feature set.

    Reads tweet-sentiment/{timespan}/2021-01.csv .. 2021-06.csv, keeps the
    count columns plus open_price and trend, and appends shifted copies so
    that one row holds the current interval, the following interval's
    trend/open_price, and up to n-1 preceding intervals.

    Parameters
    ----------
    timespan : str
        Sub-directory name under tweet-sentiment/ (e.g. '1d', '4h').
    n : int
        Number of consecutive intervals used as features. For n < 2 no lag
        columns are added and 'open_price(n)' is removed.

    Returns
    -------
    pandas.DataFrame
        Rows without any NaN; includes the target column 'trend(n+1)'.
    """
    print("[read開始]\n")

    wanted = ['section','neg_count','neu_count','pos_count','open_price','trend']
    monthly = [
        pd.read_csv(f'tweet-sentiment/{timespan}/2021-{month}.csv', usecols = wanted)
        for month in ('01', '02', '03', '04', '05', '06')
    ]

    # Concatenate the months, discard the section column, and tag every
    # remaining column with the interval label "(n)".
    frame = (
        pd.concat(monthly)
        .reset_index(drop=True)
        .drop(columns=['section'])
        .rename(columns={"neg_count":"neg_count(n)",
                         "neu_count":"neu_count(n)",
                         "pos_count":"pos_count(n)",
                         "open_price":"open_price(n)",
                         "trend":"trend(n)"})
    )

    print('欠損値削除前の総データ数：{}'.format(len(frame)))

    # Target variable: the trend of the following interval.
    frame['trend(n+1)'] = frame['trend(n)'].shift(-1)

    # Explanatory variables.
    frame['open_price(n+1)'] = frame['open_price(n)'].shift(-1)
    if n < 2:
        frame = frame.drop(columns=['open_price(n)'])
    else:
        oldest = n - 1
        for lag in range(1, n):
            for base in ('neg_count', 'neu_count', 'pos_count', 'trend'):
                frame[f'{base}(n-{lag})'] = frame[f'{base}(n)'].shift(lag)
            # The oldest lag carries no open_price column.
            if lag != oldest:
                frame[f'open_price(n-{lag})'] = frame['open_price(n)'].shift(lag)

    # The shifts leave NaN at both ends of the table; drop those rows.
    frame = frame.dropna(how='any')
    print('欠損値削除後の総データ数：{}\n'.format(len(frame)))

    column_names = frame.columns.values
    print(column_names)
    print(f'カラム数：{len(column_names)}\n')

    print("[read終了]")

    return frame

In [3]:
# Load the monthly CSVs, concatenate them, and drop rows containing missing values.
# Feature set 2 : com_ave, tweet_count
def read_data2(timespan,n):
    """Build the lagged feature table for the com_ave / tweet_count feature set.

    Reads tweet-sentiment/{timespan}/2021-01.csv .. 2021-06.csv, keeps
    com_ave, tweet_count, open_price and trend, and appends shifted copies
    so that one row holds the current interval, the following interval's
    trend/open_price, and up to n-1 preceding intervals.

    Parameters
    ----------
    timespan : str
        Sub-directory name under tweet-sentiment/ (e.g. '1d', '4h').
    n : int
        Number of consecutive intervals used as features. For n < 2 no lag
        columns are added and 'open_price(n)' is removed.

    Returns
    -------
    pandas.DataFrame
        Rows without any NaN; includes the target column 'trend(n+1)'.
    """
    print("[read開始]\n")

    wanted = ['section','com_ave','tweet_count','open_price','trend']
    monthly = [
        pd.read_csv(f'tweet-sentiment/{timespan}/2021-{month}.csv', usecols = wanted)
        for month in ('01', '02', '03', '04', '05', '06')
    ]

    # Concatenate the months, discard the section column, and tag every
    # remaining column with the interval label "(n)".
    frame = (
        pd.concat(monthly)
        .reset_index(drop=True)
        .drop(columns=['section'])
        .rename(columns={"com_ave":"com_ave(n)",
                         "tweet_count":"tweet_count(n)",
                         "open_price":"open_price(n)",
                         "trend":"trend(n)"})
    )

    print('欠損値削除前の総データ数：{}'.format(len(frame)))

    # Target variable: the trend of the following interval.
    frame['trend(n+1)'] = frame['trend(n)'].shift(-1)

    # Explanatory variables.
    frame['open_price(n+1)'] = frame['open_price(n)'].shift(-1)
    if n < 2:
        frame = frame.drop(columns=['open_price(n)'])
    else:
        oldest = n - 1
        for lag in range(1, n):
            for base in ('com_ave', 'tweet_count', 'trend'):
                frame[f'{base}(n-{lag})'] = frame[f'{base}(n)'].shift(lag)
            # The oldest lag carries no open_price column.
            if lag != oldest:
                frame[f'open_price(n-{lag})'] = frame['open_price(n)'].shift(lag)

    # The shifts leave NaN at both ends of the table; drop those rows.
    frame = frame.dropna(how='any')
    print('欠損値削除後の総データ数：{}\n'.format(len(frame)))

    column_names = frame.columns.values
    print(column_names)
    print(f'カラム数：{len(column_names)}\n')

    print("[read終了]")

    return frame

In [4]:
# Load the monthly CSVs, concatenate them, and drop rows containing missing values.
# Feature set 3 : all columns
def read_data3(timespan,n):
    """Build the lagged feature table using every column of the CSVs.

    Reads tweet-sentiment/{timespan}/2021-01.csv .. 2021-06.csv in full,
    drops the section column, and appends shifted copies so that one row
    holds the current interval, the following interval's trend/open_price,
    and up to n-1 preceding intervals.

    Parameters
    ----------
    timespan : str
        Sub-directory name under tweet-sentiment/ (e.g. '1d', '4h').
    n : int
        Number of consecutive intervals used as features. For n < 2 no lag
        columns are added and 'open_price(n)' is removed.

    Returns
    -------
    pandas.DataFrame
        Rows without any NaN; includes the target column 'trend(n+1)'.
    """
    print("[read開始]\n")

    # No usecols here: every column present in the files is kept.
    monthly = [
        pd.read_csv(f'tweet-sentiment/{timespan}/2021-{month}.csv')
        for month in ('01', '02', '03', '04', '05', '06')
    ]

    # Concatenate the months, discard the section column, and tag the known
    # columns with the interval label "(n)".
    frame = (
        pd.concat(monthly)
        .reset_index(drop=True)
        .drop(columns=['section'])
        .rename(columns={"neg_count":"neg_count(n)",
                         "neu_count":"neu_count(n)",
                         "pos_count":"pos_count(n)",
                         "com_ave":"com_ave(n)",
                         "tweet_count":"tweet_count(n)",
                         "open_price":"open_price(n)",
                         "trend":"trend(n)"})
    )

    print('欠損値削除前の総データ数：{}'.format(len(frame)))

    # Target variable: the trend of the following interval.
    frame['trend(n+1)'] = frame['trend(n)'].shift(-1)

    # Explanatory variables.
    frame['open_price(n+1)'] = frame['open_price(n)'].shift(-1)
    if n < 2:
        frame = frame.drop(columns=['open_price(n)'])
    else:
        oldest = n - 1
        lagged_bases = ('neg_count', 'neu_count', 'pos_count',
                        'com_ave', 'tweet_count', 'trend')
        for lag in range(1, n):
            for base in lagged_bases:
                frame[f'{base}(n-{lag})'] = frame[f'{base}(n)'].shift(lag)
            # The oldest lag carries no open_price column.
            if lag != oldest:
                frame[f'open_price(n-{lag})'] = frame['open_price(n)'].shift(lag)

    # The shifts leave NaN at both ends of the table; drop those rows.
    frame = frame.dropna(how='any')
    print('欠損値削除後の総データ数：{}\n'.format(len(frame)))

    column_names = frame.columns.values
    print(column_names)
    print(f'カラム数：{len(column_names)}\n')

    print("[read終了]")

    return frame

In [5]:
# Min-max scaling: x' = (x - xmin) / (xmax - xmin)
def scaling(x_train,x_test):
    """Min-max scale both feature sets using statistics from x_train only.

    The scaler is fitted on the training features and then applied to both
    inputs, so the test features never influence the fitted min/max.

    Returns
    -------
    tuple
        (scaled training features, scaled test features), as returned by
        MinMaxScaler.transform.
    """
    print("[scaling開始]")

    fitted = MinMaxScaler()
    fitted.fit(x_train)
    scaled_pair = (fitted.transform(x_train), fitted.transform(x_test))

    # The closing message keeps its original spelling.
    print("[scalig終了]")

    return scaled_pair

In [6]:
# Training: one-versus-the-rest
# Default variant: SVC() with library defaults (C=1.0; gamma defaults to
# 'scale' in scikit-learn >= 0.22, 'auto' in older releases).
def default_svm(x_train, y_train, x_test):
    """Fit a one-vs-rest SVC with default hyperparameters and predict x_test.

    Parameters
    ----------
    x_train, y_train
        Training features and labels passed to OneVsRestClassifier.fit.
    x_test
        Features to predict labels for.

    Returns
    -------
    The predictions returned by OneVsRestClassifier.predict for x_test.
    """
    print("[学習開始(Default)]")

    classifier = OneVsRestClassifier(SVC()).fit(x_train, y_train)
    predictions = classifier.predict(x_test)

    print("[学習終了(Default)]\n")

    return predictions

In [7]:
# Variant that grid-searches the best C and gamma.
def best_svm(x_train, y_train, x_test):
    """Tune a one-vs-rest SVC by grid search and predict labels for x_test.

    C and gamma are each searched over five log-spaced values
    (1e-4, 1e-2, 1, 1e2, 1e4) with GridSearchCV. The best parameter
    combination is printed, and the predictions come from the tuned search
    object.

    Parameters
    ----------
    x_train, y_train
        Training features and labels used for the cross-validated search.
    x_test
        Features to predict labels for.

    Returns
    -------
    The predictions returned by GridSearchCV.predict for x_test.
    """
    model = OneVsRestClassifier(SVC())
    C_params = np.logspace(-4, 4, 5)      # 10^-4 .. 10^4, five log-spaced values
    gamma_params = np.logspace(-4, 4, 5)

    # The "estimator__" prefix routes each parameter to the SVC wrapped by
    # OneVsRestClassifier.
    parameters = {'estimator__C': C_params,
                  'estimator__gamma': gamma_params}

    model_tuning = GridSearchCV(estimator = model,
                                param_grid = parameters,
                                n_jobs = -1,
                                verbose = 3
    )
    model_tuning.fit(x_train, y_train)
    # Predict on the x_test argument; the previous code referenced an
    # undefined name (test_x) and raised NameError.
    pred_y2 = model_tuning.predict(x_test)

    # Report the best parameters; a bare expression inside a function would
    # display nothing.
    print('best params : {}'.format(model_tuning.best_params_))

    return pred_y2

In [8]:
# main
# Ask which sentiment feature set to train on (1, 2 or 3) and how many
# consecutive intervals (n) to feed as features.
print("学習に用いる感情データの種類\n1 : neg_count, neu_count, pos_count\n2 : com_ave, tweet_cont\n3 : all")
a = int(input())
print("学習に用いるデータの範囲\n(例1) n = 1 → 区間[n]\n(例2) n = 2 → 区間[n-1, n]")
n = int(input())

# Map each menu choice to its loader. Previously every choice other than 1
# fell through to read_data2, so option 3 ("all") never used read_data3.
readers = {1: read_data1, 2: read_data2, 3: read_data3}
if a not in readers:
    raise ValueError(f"feature set must be 1, 2 or 3 (got {a})")
read_data = readers[a]

# Timestamps bracketing the whole run (assigned here, not printed).
dt_start = datetime.datetime.now()
tlist = ['1d','12h','4h','1h','30m','15m','5m']
aculist = []

for timespan in tlist:
    print("********************{}：開始********************\n".format(timespan))

    # Load the feature table for this interval length.
    data = read_data(timespan,n)

    # Split into explanatory variables and the target variable.
    x = data.drop(columns=['trend(n+1)'])
    y = data.loc[:, 'trend(n+1)']

    # Split into training and test sets.
    # NOTE(review): train_test_split shuffles rows by default, so lagged
    # features of test rows can also appear in training rows; consider
    # shuffle=False if a chronological split is intended.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 1/6, random_state=0)

    # Scaling (fitted on the training split only).
    x_train_scaled, x_test_scaled = scaling(x_train, x_test)

    # Training and prediction.
    y_pred = default_svm(x_train_scaled, y_train, x_test_scaled)

    # Show the results; compute accuracy once and reuse it below.
    accuracy = accuracy_score(y_test, y_pred)
    print ('accuracy(Default) : {:.5f}\n'.format(accuracy))

    # Confusion matrix
    print(confusion_matrix(y_test, y_pred))

    # precision, recall, f1-score, support
    print(classification_report(y_test, y_pred))

    # Store the accuracy for this interval length.
    aculist.append(accuracy)

    # Release memory before the next (possibly larger) dataset.
    del data,x,y,x_train,x_test,y_train,y_test,x_train_scaled,x_test_scaled,y_pred
    gc.collect()

    print("********************{}：終了********************\n".format(timespan))

dt_end = datetime.datetime.now()
print(aculist)

学習に用いる感情データの種類
1 : neg_count, neu_count, pos_count
2 : com_ave, tweet_cont
3 : all


 1


学習に用いるデータの範囲
(例1) n = 1 → 区間[n]
(例2) n = 2 → 区間[n-1, n]


 10


********************1d：開始********************

[read開始]

欠損値削除前の総データ数：181
欠損値削除後の総データ数：171

['neg_count(n)' 'neu_count(n)' 'pos_count(n)' 'open_price(n)' 'trend(n)'
 'trend(n+1)' 'open_price(n+1)' 'neg_count(n-1)' 'neu_count(n-1)'
 'pos_count(n-1)' 'trend(n-1)' 'open_price(n-1)' 'neg_count(n-2)'
 'neu_count(n-2)' 'pos_count(n-2)' 'trend(n-2)' 'open_price(n-2)'
 'neg_count(n-3)' 'neu_count(n-3)' 'pos_count(n-3)' 'trend(n-3)'
 'open_price(n-3)' 'neg_count(n-4)' 'neu_count(n-4)' 'pos_count(n-4)'
 'trend(n-4)' 'open_price(n-4)' 'neg_count(n-5)' 'neu_count(n-5)'
 'pos_count(n-5)' 'trend(n-5)' 'open_price(n-5)' 'neg_count(n-6)'
 'neu_count(n-6)' 'pos_count(n-6)' 'trend(n-6)' 'open_price(n-6)'
 'neg_count(n-7)' 'neu_count(n-7)' 'pos_count(n-7)' 'trend(n-7)'
 'open_price(n-7)' 'neg_count(n-8)' 'neu_count(n-8)' 'pos_count(n-8)'
 'trend(n-8)' 'open_price(n-8)' 'neg_count(n-9)' 'neu_count(n-9)'
 'pos_count(n-9)' 'trend(n-9)']
カラム数：51

[read終了]
[scaling開始]
[scalig終了]
[学習開始(Default)]
[学習終了(Default)