In [1]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import gc
import time
import datetime

In [2]:
# Load the monthly CSVs for one timespan, concatenate them, and drop rows with missing values
def read_data(timespan):
    """Read the 2021-01 .. 2021-06 CSV files for `timespan` into one cleaned DataFrame.

    Parameters
    ----------
    timespan : str
        Name of the sub-directory under ``tweet-sentiment/`` to read from
        (it is interpolated directly into the file paths).

    Returns
    -------
    pandas.DataFrame
        The six monthly files concatenated in month order, with the
        ``section`` column removed and every row containing a missing value
        dropped. The index is reset after concatenation but not after the
        drop, so gaps remain where rows were removed.

    Notes
    -----
    Prints the per-column missing-value counts and the row counts before
    and after the drop.
    """
    print("[read開始]\n")

    # One file per month, January through June 2021
    monthly_paths = [f'tweet-sentiment/{timespan}/2021-{month:02d}.csv'
                     for month in range(1, 7)]
    frames = [pd.read_csv(path) for path in monthly_paths]

    merged = pd.concat(frames).reset_index(drop=True).drop(columns=['section'])

    print("【欠損値】")
    print(merged.isnull().sum())

    print(f'\n欠損値削除前の総データ数：{len(merged)}')
    cleaned = merged.dropna(how='any')
    print(f'欠損値削除後の総データ数：{len(cleaned)}\n')

    print("[read終了]")

    return cleaned

In [3]:
# Min-max scaling: x' = (x - xmin) / (xmax - xmin)
def scaling(x_train,x_test):
    """Min-max scale the train and test features with a scaler fitted on train only.

    Parameters
    ----------
    x_train : array-like
        Training features; the scaler's per-feature min and max come from here.
    x_test : array-like
        Test features, transformed with the scaler fitted on `x_train`
        (so values outside the training range can fall outside [0, 1]).

    Returns
    -------
    tuple
        ``(x_train_scaled, x_test_scaled)`` as returned by the scaler.
    """
    print("[scaling開始]")

    minmax = MinMaxScaler()

    # Fit on the training split only, then reuse those statistics for the test split
    scaled_train = minmax.fit_transform(x_train)
    scaled_test = minmax.transform(x_test)

    print("[scalig終了]")

    return scaled_train, scaled_test

In [4]:
# Training: one-versus-the-rest SVM with default hyperparameters
def default_svm(x_train, y_train, x_test):
    """Fit a one-vs-rest SVC with library defaults and predict labels for `x_test`.

    ``SVC()`` is constructed with no arguments, so whatever defaults the
    installed scikit-learn provides are used.
    NOTE(review): scikit-learn >= 0.22 documents the SVC defaults as C=1.0 and
    gamma='scale' (not 'auto') - confirm against the installed version.

    Parameters
    ----------
    x_train : array-like
        Training features.
    y_train : array-like
        Training labels.
    x_test : array-like
        Features to predict on.

    Returns
    -------
    Predicted labels for `x_test`, as returned by the fitted classifier.
    """
    print("[学習開始(Default)]")

    # fit() returns the fitted estimator, so construction and training can be chained
    classifier = OneVsRestClassifier(SVC()).fit(x_train, y_train)
    predictions = classifier.predict(x_test)

    print("[学習終了(Default)]\n")

    return predictions

In [5]:
# Variant that grid-searches C and gamma for the best values
def best_svm(x_train, y_train, x_test):
    """Grid-search C and gamma for a one-vs-rest SVC, then predict labels for `x_test`.

    Parameters
    ----------
    x_train : array-like
        Training features used for the cross-validated search.
    y_train : array-like
        Training labels.
    x_test : array-like
        Features to predict on after the search has finished.

    Returns
    -------
    Predicted labels for `x_test`, produced by the fitted GridSearchCV object.

    Notes
    -----
    Prints the best parameter combination found by the search. The search
    runs with ``n_jobs=-1`` and ``verbose=3``, so it also emits progress
    output of its own.
    """
    model = OneVsRestClassifier(SVC())

    # Five log-spaced candidates from 10^-4 to 10^4: 1e-4, 1e-2, 1, 1e2, 1e4
    C_params = np.logspace(-4, 4, 5)
    gamma_params = np.logspace(-4, 4, 5)

    # The 'estimator__' prefix routes each parameter to the SVC wrapped by
    # OneVsRestClassifier
    parameters = {'estimator__C': C_params,
                  'estimator__gamma': gamma_params}

    model_tuning = GridSearchCV(estimator = model,
                                param_grid = parameters,
                                n_jobs = -1,
                                verbose = 3
    )
    model_tuning.fit(x_train, y_train)

    # Predict on the x_test argument; the previous code referenced the
    # undefined name `test_x`, which raised NameError (or used a stale global)
    pred_y2 = model_tuning.predict(x_test)

    # A bare `model_tuning.best_params_` expression displays nothing inside a
    # function, so print the best parameters explicitly
    print("Best parameters: {}".format(model_tuning.best_params_))

    return pred_y2

In [6]:
# main: train and evaluate the default SVM once per timespan
dt_start = datetime.datetime.now()
tlist = ['1d', '12h', '4h', '1h', '30m', '15m', '5m']
aculist = []  # one accuracy score per entry of tlist, in the same order

for timespan in tlist:
    print(f"********************{timespan}：開始********************\n")

    # Load the data for this timespan
    data = read_data(timespan)

    # Features are every column except the last; the target is the last column
    x, y = data.iloc[:, :-1], data.iloc[:, -1]

    # Hold out one sixth of the rows for testing (fixed seed for repeatability)
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=1/6, random_state=0
    )

    # Scale the features
    x_train_scaled, x_test_scaled = scaling(x_train, x_test)

    # Train the default model and predict the held-out rows
    y_pred = default_svm(x_train_scaled, y_train, x_test_scaled)

    # Store the accuracy once, then report it from the stored value
    aculist.append(accuracy_score(y_test, y_pred))
    print(f'accuracy(Default) : {aculist[-1]:.5f}\n')
    # Only the default model is scored here; best_svm is not called in this cell

    # Confusion matrix
    print(confusion_matrix(y_test, y_pred))

    # precision, recall, f1-score, support
    print(classification_report(y_test, y_pred))

    # Release the per-iteration objects before the next timespan is loaded
    del data, x, y, x_train, x_test, y_train, y_test, x_train_scaled, x_test_scaled, y_pred
    gc.collect()

    print(f"********************{timespan}：終了********************\n")

dt_end = datetime.datetime.now()
print(aculist)

1d：開始

[read開始]

【欠損値】
neg_count      0
neu_count      0
pos_count      0
com_ave        0
tweet_count    0
open_price     0
trend          0
dtype: int64

欠損値削除前の総データ数：181
欠損値削除後の総データ数：181

[read終了]
[scaling開始]
[scalig終了]
[学習開始(Default)]
[学習終了(Default)]

accuracy(Default) : 0.25806

[[4 8 4]
 [2 2 0]
 [2 7 2]]
              precision    recall  f1-score   support

           0       0.50      0.25      0.33        16
           1       0.12      0.50      0.19         4
           2       0.33      0.18      0.24        11

    accuracy                           0.26        31
   macro avg       0.32      0.31      0.25        31
weighted avg       0.39      0.26      0.28        31

1d：終了

12h：開始

[read開始]

【欠損値】
neg_count      0
neu_count      0
pos_count      0
com_ave        0
tweet_count    0
open_price     0
trend          0
dtype: int64

欠損値削除前の総データ数：362
欠損値削除後の総データ数：362

[read終了]
[scaling開始]
[scalig終了]
[学習開始(Default)]
[学習終了(Default)]

accuracy(Default) : 0.40984

[[ 7 11  5]
 