In [16]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import gc

In [17]:
# Load the monthly CSV files, concatenate them, and drop rows with missing values.
def read_data(timespan, months=range(1, 7), year=2021, base_dir='tweet-sentiment'):
    """Load and merge the monthly tweet-sentiment CSV files for one timespan.

    One file per month is read from ``{base_dir}/{timespan}/{year}-{MM}.csv``
    and the frames are concatenated in the order given by ``months``.

    Parameters
    ----------
    timespan : str
        Name of the sub-directory under ``base_dir`` that holds the files.
    months : iterable of int, optional
        Month numbers to load. Defaults to 1 through 6, which reproduces the
        six files that used to be hard-coded.
    year : int, optional
        Year used in the file names. Defaults to 2021.
    base_dir : str, optional
        Root directory of the data set. Defaults to ``'tweet-sentiment'``.

    Returns
    -------
    pandas.DataFrame
        The concatenated data with the ``section`` column removed and every
        row that contains a missing value dropped.

    Notes
    -----
    Prints the per-column missing-value counts (computed before any column or
    row is removed) and a completion message.
    """
    # Zero-pad through the format spec so two-digit months resolve to
    # e.g. "2021-10.csv" instead of the malformed "2021-010.csv".
    files = [f'{base_dir}/{timespan}/{year}-{month:02d}.csv' for month in months]

    df = pd.concat([pd.read_csv(file) for file in files]).reset_index(drop=True)

    # Report missing values per column while all columns are still present.
    print("【欠損値】")
    print(df.isnull().sum())

    df = df.drop(columns=['section'])
    df = df.dropna(how='any')

    print("read終了\n")

    return df

In [18]:
# Min-max scaling: x' = (x - xmin) / (xmax - xmin)
def scaling(x_train, x_test):
    """Rescale the train and test features with one shared MinMaxScaler.

    The scaler is fitted on ``x_train`` only and then applied to both sets,
    so the test data does not influence the fitted scaling parameters.

    Parameters
    ----------
    x_train : array-like
        Training features used to fit the scaler.
    x_test : array-like
        Test features transformed with the scaler fitted on ``x_train``.

    Returns
    -------
    tuple
        ``(x_train_scaled, x_test_scaled)``.
    """
    minmax = MinMaxScaler()
    minmax.fit(x_train)

    # Transform train first, then test, using the same fitted scaler.
    scaled_parts = tuple(minmax.transform(part) for part in (x_train, x_test))

    print("scalig終了\n")

    return scaled_parts

In [19]:
# Training: one-versus-the-rest SVM with library-default hyperparameters.
# NOTE(review): the original note said "C=1, gamma=auto"; recent scikit-learn
# releases default gamma to 'scale' - confirm against the installed version.
def default_svm(x_train, y_train, x_test):
    """Fit a one-vs-rest SVC with default settings and predict the test set.

    Parameters
    ----------
    x_train : array-like
        Training features.
    y_train : array-like
        Training labels.
    x_test : array-like
        Features to predict.

    Returns
    -------
    Predicted labels for ``x_test``, as returned by the classifier's
    ``predict``.
    """
    base_estimator = SVC()
    classifier = OneVsRestClassifier(base_estimator)
    classifier.fit(x_train, y_train)

    predictions = classifier.predict(x_test)
    print("SVM(Default)学習終了\n")
    return predictions

In [20]:
# Variant that grid-searches the best C and gamma.
def best_svm(x_train, y_train, x_test):
    """Tune a one-vs-rest SVC over C and gamma, then predict the test set.

    A cross-validated grid search (library-default CV settings) is run over
    five log-spaced values each for ``C`` and ``gamma``. The selected
    parameters are printed and the tuned search object is used to predict.

    Parameters
    ----------
    x_train : array-like
        Training features.
    y_train : array-like
        Training labels.
    x_test : array-like
        Features to predict.

    Returns
    -------
    Predicted labels for ``x_test``, as returned by the fitted search
    object's ``predict``.
    """
    model = OneVsRestClassifier(SVC())

    # Five log-spaced values from 10^-4 to 10^4: 1e-4, 1e-2, 1, 1e2, 1e4.
    C_params = np.logspace(-4, 4, 5)
    gamma_params = np.logspace(-4, 4, 5)

    # The "estimator__" prefix routes each parameter to the SVC wrapped
    # inside the OneVsRestClassifier.
    parameters = {'estimator__C': C_params,
                  'estimator__gamma': gamma_params}

    model_tuning = GridSearchCV(estimator=model,
                                param_grid=parameters,
                                n_jobs=-1,
                                verbose=3)
    model_tuning.fit(x_train, y_train)

    # Use the function's own argument; the previous code referenced the
    # undefined name `test_x`, which raised NameError (or silently picked up
    # a stale notebook global).
    pred_y2 = model_tuning.predict(x_test)

    # A bare expression inside a function displays nothing, so print the
    # selected parameters explicitly.
    print("Best parameters:", model_tuning.best_params_)

    return pred_y2

In [None]:
# main

# Fixed seed so the train/test split (and therefore the reported accuracy)
# is reproducible across kernel restarts.
RANDOM_STATE = 42

# Load the data. input() already returns a str, so no extra conversion is needed.
timespan = input("timespanを入力：")
data = read_data(timespan)

# Set the explanatory variables and the target variable.
x = data.iloc[:, :-1]  # every column except the last one
y = data.iloc[:, -1]   # the last column

# Split into training and test sets; 1/6 of the rows are held out for testing.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=1/6, random_state=RANDOM_STATE)

# Scaling
x_train_scaled, x_test_scaled = scaling(x_train, x_test)

# Training
pred_y = default_svm(x_train_scaled, y_train, x_test_scaled)

# Show the result
print ('Default : {:.5f}'.format(accuracy_score(y_test, pred_y)))
# To also evaluate the grid-searched model, uncomment:
# pred_y2 = best_svm(x_train_scaled, y_train, x_test_scaled)
# print('Tuned   : {:.5f}'.format(accuracy_score(y_test, pred_y2)))

In [31]:
# Force a garbage-collection pass to release memory held by unreachable
# objects; as the cell's last expression, gc.collect()'s return value is shown
# as the cell output.
gc.collect()

52