In [1]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import gc
import time
import datetime

In [2]:
# データ読み込み、各月の結合、欠損値を含む行の削除
# 1 : neg_count, neu_count, pos_count
def read_data1(timespan,n):
    """Load six monthly CSVs and build the lagged table of sentiment counts.

    Reads ``tweet-svm/{timespan}/2021-01.csv`` .. ``2021-06.csv`` (columns
    section, neg_count, neu_count, pos_count, open_price, trend), stacks them
    in month order and discards ``section``.

    Columns of the returned frame:
      * ``<col>(n)``        -- value of the current row.
      * ``trend(n+1)``      -- ``trend`` of the following row (the target).
      * ``open_price(n+1)`` -- ``open_price`` of the following row.
      * ``<col>(n-i)``      -- value ``i`` rows earlier, for ``i = 1 .. n-1``;
        ``open_price(n-i)`` is not created for the oldest lag ``i = n-1``.
      When ``n < 2`` no lag columns are added and ``open_price(n)`` is dropped.

    Every row that contains a NaN is removed, and the row counts before the
    shifted columns are added and after the removal are printed.

    Args:
        timespan: name of the sub-directory under ``tweet-svm/``.
        n: number of consecutive sections used as explanatory variables.

    Returns:
        pandas.DataFrame laid out as described above.
    """
    print("[read開始]\n")

    wanted = ['section','neg_count','neu_count','pos_count','open_price','trend']
    monthly = [
        pd.read_csv(f'tweet-svm/{timespan}/{month}.csv', usecols = wanted)
        for month in ('2021-01', '2021-02', '2021-03', '2021-04', '2021-05', '2021-06')
    ]

    # Stack the months, then tag every kept column with the section marker "(n)".
    current_names = {
        base: f'{base}(n)'
        for base in ('neg_count', 'neu_count', 'pos_count', 'open_price', 'trend')
    }
    merged = (
        pd.concat(monthly)
        .reset_index(drop=True)
        .drop(columns=['section'])
        .rename(columns=current_names)
    )

    print('欠損値削除前の総データ数：{}'.format(len(merged)))

    # Target first, then the next section's opening price.
    merged['trend(n+1)'] = merged['trend(n)'].shift(-1)
    merged['open_price(n+1)'] = merged['open_price(n)'].shift(-1)

    if n < 2:
        merged = merged.drop(columns=['open_price(n)'])
    else:
        oldest_lag = n - 1
        for lag in range(1, n):
            for base in ('neg_count', 'neu_count', 'pos_count', 'trend'):
                merged[f'{base}(n-{lag})'] = merged[f'{base}(n)'].shift(lag)
            # The oldest lag carries no open_price column.
            if lag != oldest_lag:
                merged[f'open_price(n-{lag})'] = merged['open_price(n)'].shift(lag)

    # Shifting leaves NaN at both ends; drop every incomplete row.
    merged = merged.dropna(how='any')
    print('欠損値削除後の総データ数：{}\n'.format(len(merged)))

    print("[read終了]")

    return merged

In [3]:
# データ読み込み、各月の結合、欠損値を含む行の削除
# 2 : com_ave, tweet_count
def read_data2(timespan,n):
    """Load six monthly CSVs and build the lagged table of com_ave / tweet_count.

    Reads ``tweet-svm/{timespan}/2021-01.csv`` .. ``2021-06.csv`` (columns
    section, com_ave, tweet_count, open_price, trend), stacks them in month
    order and discards ``section``.

    Columns of the returned frame:
      * ``<col>(n)``        -- value of the current row.
      * ``trend(n+1)``      -- ``trend`` of the following row (the target).
      * ``open_price(n+1)`` -- ``open_price`` of the following row.
      * ``<col>(n-i)``      -- value ``i`` rows earlier, for ``i = 1 .. n-1``;
        ``open_price(n-i)`` is not created for the oldest lag ``i = n-1``.
      When ``n < 2`` no lag columns are added and ``open_price(n)`` is dropped.

    Every row that contains a NaN is removed, and the row counts before the
    shifted columns are added and after the removal are printed.

    Args:
        timespan: name of the sub-directory under ``tweet-svm/``.
        n: number of consecutive sections used as explanatory variables.

    Returns:
        pandas.DataFrame laid out as described above.
    """
    print("[read開始]\n")

    wanted = ['section','com_ave','tweet_count','open_price','trend']
    monthly = [
        pd.read_csv(f'tweet-svm/{timespan}/{month}.csv', usecols = wanted)
        for month in ('2021-01', '2021-02', '2021-03', '2021-04', '2021-05', '2021-06')
    ]

    # Stack the months, then tag every kept column with the section marker "(n)".
    current_names = {
        base: f'{base}(n)'
        for base in ('com_ave', 'tweet_count', 'open_price', 'trend')
    }
    merged = (
        pd.concat(monthly)
        .reset_index(drop=True)
        .drop(columns=['section'])
        .rename(columns=current_names)
    )

    print('欠損値削除前の総データ数：{}'.format(len(merged)))

    # Target first, then the next section's opening price.
    merged['trend(n+1)'] = merged['trend(n)'].shift(-1)
    merged['open_price(n+1)'] = merged['open_price(n)'].shift(-1)

    if n < 2:
        merged = merged.drop(columns=['open_price(n)'])
    else:
        oldest_lag = n - 1
        for lag in range(1, n):
            for base in ('com_ave', 'tweet_count', 'trend'):
                merged[f'{base}(n-{lag})'] = merged[f'{base}(n)'].shift(lag)
            # The oldest lag carries no open_price column.
            if lag != oldest_lag:
                merged[f'open_price(n-{lag})'] = merged['open_price(n)'].shift(lag)

    # Shifting leaves NaN at both ends; drop every incomplete row.
    merged = merged.dropna(how='any')
    print('欠損値削除後の総データ数：{}\n'.format(len(merged)))

    print("[read終了]")

    return merged

In [4]:
# データ読み込み、各月の結合、欠損値を含む行の削除
# 3 : all
def read_data3(timespan,n):
    """Load six monthly CSVs and build the lagged table using all sentiment columns.

    Reads every column of ``tweet-svm/{timespan}/2021-01.csv`` .. ``2021-06.csv``,
    stacks the files in month order and discards ``section``. The columns
    neg_count, neu_count, pos_count, com_ave, tweet_count, open_price and
    trend are renamed to ``<col>(n)``; any other column is kept unchanged.

    Columns added to the frame:
      * ``trend(n+1)``      -- ``trend`` of the following row (the target).
      * ``open_price(n+1)`` -- ``open_price`` of the following row.
      * ``<col>(n-i)``      -- value ``i`` rows earlier, for ``i = 1 .. n-1``;
        ``open_price(n-i)`` is not created for the oldest lag ``i = n-1``.
      When ``n < 2`` no lag columns are added and ``open_price(n)`` is dropped.

    Every row that contains a NaN is removed, and the row counts before the
    shifted columns are added and after the removal are printed.

    Args:
        timespan: name of the sub-directory under ``tweet-svm/``.
        n: number of consecutive sections used as explanatory variables.

    Returns:
        pandas.DataFrame laid out as described above.
    """
    print("[read開始]\n")

    monthly = [
        pd.read_csv(f'tweet-svm/{timespan}/{month}.csv')
        for month in ('2021-01', '2021-02', '2021-03', '2021-04', '2021-05', '2021-06')
    ]

    # Stack the months, then tag the known columns with the section marker "(n)".
    current_names = {
        base: f'{base}(n)'
        for base in ('neg_count', 'neu_count', 'pos_count',
                     'com_ave', 'tweet_count', 'open_price', 'trend')
    }
    merged = (
        pd.concat(monthly)
        .reset_index(drop=True)
        .drop(columns=['section'])
        .rename(columns=current_names)
    )

    print('欠損値削除前の総データ数：{}'.format(len(merged)))

    # Target first, then the next section's opening price.
    merged['trend(n+1)'] = merged['trend(n)'].shift(-1)
    merged['open_price(n+1)'] = merged['open_price(n)'].shift(-1)

    if n < 2:
        merged = merged.drop(columns=['open_price(n)'])
    else:
        oldest_lag = n - 1
        lagged_bases = ('neg_count', 'neu_count', 'pos_count',
                        'com_ave', 'tweet_count', 'trend')
        for lag in range(1, n):
            for base in lagged_bases:
                merged[f'{base}(n-{lag})'] = merged[f'{base}(n)'].shift(lag)
            # The oldest lag carries no open_price column.
            if lag != oldest_lag:
                merged[f'open_price(n-{lag})'] = merged['open_price(n)'].shift(lag)

    # Shifting leaves NaN at both ends; drop every incomplete row.
    merged = merged.dropna(how='any')
    print('欠損値削除後の総データ数：{}\n'.format(len(merged)))

    print("[read終了]")

    return merged

In [5]:
# スケーリング x' = (x - xmin)/(xmax - xmin)
def scaling(x_train,x_test):
    """Min-max scale the features: x' = (x - xmin) / (xmax - xmin).

    The scaler is fitted on ``x_train`` only and the same fitted transform is
    then applied to ``x_test``, so the test data does not influence the
    scaling parameters.

    Args:
        x_train: training feature matrix used to fit the scaler.
        x_test: test feature matrix, transformed with the fitted scaler.

    Returns:
        Tuple ``(x_train_scaled, x_test_scaled)`` as returned by
        ``MinMaxScaler.transform``.
    """
    print("[scaling開始]")

    scaler = MinMaxScaler().fit(x_train)

    x_train_scaled = scaler.transform(x_train)
    x_test_scaled  = scaler.transform(x_test)

    # Fixed: this message was previously misspelled as "[scalig終了]".
    print("[scaling終了]")

    return x_train_scaled, x_test_scaled

In [6]:
# 学習 One-versus-the-rest
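# Default ver.: SVC() with the library's default hyperparameters (C=1.0; note that gamma defaults to 'scale', not 'auto', in scikit-learn >= 0.22)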
def default_svm(x_train, y_train, x_test):
    """Fit a one-vs-rest SVM with default settings and predict the test labels.

    ``SVC()`` is created without arguments and wrapped in
    ``OneVsRestClassifier``. Start and end messages are printed around the
    fit/predict step.

    Args:
        x_train: training feature matrix.
        y_train: training labels.
        x_test: feature matrix to predict.

    Returns:
        The labels predicted for ``x_test``.
    """
    print("[学習開始(Default)]")

    classifier = OneVsRestClassifier(SVC()).fit(x_train, y_train)
    predictions = classifier.predict(x_test)

    print("[学習終了(Default)]\n")

    return predictions

In [7]:
# Grid-search version: searches for the best C and gamma.
def best_svm(x_train, y_train, x_test):
    """Tune C and gamma of a one-vs-rest SVM by grid search, then predict.

    Five log-spaced candidates between 10^-4 and 10^4 are tried for each of
    ``C`` and ``gamma`` (25 combinations) with ``GridSearchCV`` using all
    available cores. The best parameter combination is printed.

    Args:
        x_train: training feature matrix.
        y_train: training labels.
        x_test: feature matrix to predict.

    Returns:
        The labels predicted for ``x_test`` by the tuned search object.
    """
    model = OneVsRestClassifier(SVC())
    C_params = np.logspace(-4, 4, 5)      # 1e-4, 1e-2, 1, 1e2, 1e4
    gamma_params = np.logspace(-4, 4, 5)

    # The "estimator__" prefix routes the parameters to the wrapped SVC.
    parameters = {'estimator__C': C_params,
                  'estimator__gamma': gamma_params}

    model_tuning = GridSearchCV(estimator = model,
                                param_grid = parameters,
                                n_jobs = -1,
                                verbose = 3
    )
    model_tuning.fit(x_train, y_train)

    # Fixed: the original evaluated `best_params_` as a bare expression, which
    # shows nothing when executed inside a function.
    print(f'Best parameters: {model_tuning.best_params_}')

    # Fixed: the original predicted on the undefined name `test_x` instead of
    # the `x_test` argument.
    pred_y2 = model_tuning.predict(x_test)

    return pred_y2

In [None]:
# main
# Partial run: the data kind `a` and the window length `n` are read from
# standard input, then a default SVM is trained and evaluated once for every
# timespan in `tlist`. Each run appends one row to tweet-svm/svm_log.csv,
# which must already exist with its header row.
alist = {1:'neg_count, neu_count, pos_count',
         2:'com_ave, tweet_cont',
         3:'all'}
# Loader for each data kind. Fixed: kind 3 ("all") previously fell through to
# read_data2, so read_data3 was never used.
readers = {1: read_data1, 2: read_data2, 3: read_data3}

print("学習に用いる感情データの種類\n1 : neg_count, neu_count, pos_count\n2 : com_ave, tweet_cont\n3 : all")
a = int(input())
if a not in alist:
    # Fail before any training instead of at the log-writing step.
    raise ValueError(f'a must be one of {sorted(alist)}, got {a}')
print("学習に用いるデータの範囲\n(例1) n = 1 → 区間[n]\n(例2) n = 2 → 区間[n-1, n]")
n = int(input())
if n < 1:
    raise ValueError(f'n must be >= 1, got {n}')

tlist = ['1d','12h','4h','1h','30m','15m','5m']
aculist = {}

for timespan in tlist:
    print("********************{}：開始********************\n".format(timespan))
    dt_start = datetime.datetime.now()
    print(f'開始時刻：{dt_start}\n')

    # Load the data for the selected kind
    data = readers[a](timespan,n)

    # Explanatory variables: every column except the target 'trend(n+1)'
    x = data.iloc[:, data.columns!='trend(n+1)']
    y = data.loc[:, 'trend(n+1)']

    # Split into training and test sets (1/6 held out, fixed seed)
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 1/6, random_state=0)

    # Scaling
    x_train_scaled, x_test_scaled = scaling(x_train, x_test)

    # Training and prediction
    y_pred = default_svm(x_train_scaled, y_train, x_test_scaled)

    # Metrics are computed once and reused for display and logging
    accuracy = accuracy_score(y_test, y_pred)
    matrix = confusion_matrix(y_test, y_pred)

    # Show results
    print ('accuracy(Default) : {:.5f}\n'.format(accuracy))

    # Confusion matrix
    print(matrix)

    # precision, recall, f1-score, support
    # NOTE(review): target_names assumes exactly three classes ordered
    # neg, neu, pos - confirm against the 'trend' encoding.
    print(classification_report(y_test, y_pred, target_names = ['neg','neu','pos']))

    dt_end = datetime.datetime.now()
    print(f'終了時刻：{dt_end}\n')

    # Store the results
    aculist[f'{timespan}'] = accuracy
    df_log = pd.read_csv('tweet-svm/svm_log.csv')
    s = pd.Series([dt_start,
                   dt_end,
                   timespan,
                   alist[a],
                   n,
                   len(x.columns.values),
                   len(data),
                   len(y_train),
                   len(y_test),
                   accuracy,
                   matrix,
                   classification_report(y_test, y_pred,target_names=['neg','neu','pos'], output_dict=True)],
            index=['start_time',
                   'end_time',
                   'timespan',
                   'kind_of_data',
                   'range_of_data',
                   'attribute',
                   'all_data',
                   'training_data',
                   'test_data',
                   'accuracy',
                   'confusion_matrix',
                   'score_report'])
    # DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
    df_log = pd.concat([df_log, s.to_frame().T], ignore_index=True)
    df_log.to_csv('tweet-svm/svm_log.csv',index=False)

    # Release memory before the next (larger) timespan
    del data,x,y,x_train,x_test,y_train,y_test,x_train_scaled,x_test_scaled,y_pred,dt_start,dt_end,df_log
    gc.collect()

    print("********************{}：終了********************\n".format(timespan))

print(f'正答率\n{aculist}')

In [8]:
# main
# Full run: every (a, n) combination with a = 1..3 and n = 1..10
# (30 combinations), each trained and evaluated for every timespan in `tlist`.
# Each run appends one row to tweet-svm/svm_log.csv, which must already exist
# with its header row.
alist = {1:'neg_count, neu_count, pos_count',
         2:'com_ave, tweet_cont',
         3:'all'}
# Loader for each data kind. Fixed: kind 3 ("all") previously fell through to
# read_data2, so read_data3 was never used.
readers = {1: read_data1, 2: read_data2, 3: read_data3}
print("学習に用いる感情データの種類 a\n1 : neg_count, neu_count, pos_count\n2 : com_ave, tweet_cont\n3 : all")
print("学習に用いるデータの範囲 n\n(例1) n = 1 → 区間[n]\n(例2) n = 2 → 区間[n-1, n]")
alist_ = [1,2,3]
nlist = [1,2,3,4,5,6,7,8,9,10]
tlist = ['1d','12h','4h','1h','30m','15m','5m']
# Accuracy per run, keyed by (a, n, timespan). Fixed: keying by timespan alone
# made every (a, n) pass overwrite the previous one.
aculist = {}
for a in alist_:
    for n in nlist:
        for timespan in tlist:
            print("********************{}：開始********************\n".format(timespan))
            dt_start = datetime.datetime.now()
            print(f'開始時刻：{dt_start}\n')

            print(f'(a,n) = ({a},{n})')

            # Load the data for the current kind
            data = readers[a](timespan,n)

            # Explanatory variables: every column except the target 'trend(n+1)'
            x = data.iloc[:, data.columns!='trend(n+1)']
            y = data.loc[:, 'trend(n+1)']

            # Split into training and test sets (1/6 held out, fixed seed)
            x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 1/6, random_state=0)

            # Scaling
            x_train_scaled, x_test_scaled = scaling(x_train, x_test)

            # Training and prediction
            y_pred = default_svm(x_train_scaled, y_train, x_test_scaled)

            # Metrics are computed once and reused for display and logging
            accuracy = accuracy_score(y_test, y_pred)
            matrix = confusion_matrix(y_test, y_pred)

            # Show results
            print ('accuracy(Default) : {:.5f}\n'.format(accuracy))

            # Confusion matrix
            print(matrix)

            # precision, recall, f1-score, support
            # NOTE(review): target_names assumes exactly three classes ordered
            # neg, neu, pos - confirm against the 'trend' encoding.
            print(classification_report(y_test, y_pred, target_names = ['neg','neu','pos']))

            dt_end = datetime.datetime.now()
            print(f'終了時刻：{dt_end}\n')

            # Store the results
            aculist[(a, n, timespan)] = accuracy
            df_log = pd.read_csv('tweet-svm/svm_log.csv')
            s = pd.Series([dt_start,
                           dt_end,
                           timespan,
                           alist[a],
                           n,
                           len(x.columns.values),
                           len(data),
                           len(y_train),
                           len(y_test),
                           accuracy,
                           matrix,
                           classification_report(y_test, y_pred,target_names=['neg','neu','pos'], output_dict=True)],
                    index=['start_time',
                           'end_time',
                           'timespan',
                           'kind_of_data',
                           'range_of_data',
                           'attribute',
                           'all_data',
                           'training_data',
                           'test_data',
                           'accuracy',
                           'confusion_matrix',
                           'score_report'])
            # DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
            df_log = pd.concat([df_log, s.to_frame().T], ignore_index=True)
            df_log.to_csv('tweet-svm/svm_log.csv',index=False)

            # Release memory before the next run
            del data,x,y,x_train,x_test,y_train,y_test,x_train_scaled,x_test_scaled,y_pred,dt_start,dt_end,df_log
            gc.collect()

            print("********************{}：終了********************\n".format(timespan))

print(f'正答率\n{aculist}')

学習に用いる感情データの種類 a
1 : neg_count, neu_count, pos_count
2 : com_ave, tweet_cont
3 : all
学習に用いるデータの範囲 n
(例1) n = 1 → 区間[n]
(例2) n = 2 → 区間[n-1, n]
********************1d：開始********************

開始時刻：2021-09-21 22:40:52.954632

(a,n) = (1,1)
[read開始]

欠損値削除前の総データ数：181
欠損値削除後の総データ数：180

[read終了]
[scaling開始]
[scalig終了]
[学習開始(Default)]
[学習終了(Default)]

accuracy(Default) : 0.36667

[[2 4 5]
 [2 4 4]
 [0 4 5]]
              precision    recall  f1-score   support

         neg       0.50      0.18      0.27        11
         neu       0.33      0.40      0.36        10
         pos       0.36      0.56      0.43         9

    accuracy                           0.37        30
   macro avg       0.40      0.38      0.36        30
weighted avg       0.40      0.37      0.35        30

終了時刻：2021-09-21 22:40:53.081396

********************1d：終了********************

********************12h：開始********************

開始時刻：2021-09-21 22:40:53.164424

(a,n) = (1,1)
[read開始]

欠損値削除前の総データ数：362
欠損値削除後の総データ数：

In [30]:
# Log reset (disabled).
# The statement below is only a triple-quoted string literal, so none of the
# code inside it is executed. Removing the surrounding ''' lines would
# overwrite tweet-svm/svm_log.csv with an empty table that has just the header
# columns listed here (the same names the main cells use when logging a run),
# discarding any previously logged results.
'''
df = pd.DataFrame(columns=['start_time',
                   'end_time',
                   'timespan',
                   'kind_of_data',
                   'range_of_data',
                   'attribute',
                   'all_data',
                   'training_data',
                   'test_data',
                   'accuracy',
                   'confusion_matrix',
                   'score_report'])
df.to_csv('tweet-svm/svm_log.csv',index=False)
'''