## 参考
- [機械学習でイモの分類器を作ってみる](https://sewaashi.com/python-3/)
- [1つの画像が複数のクラスに属する場合（Multi-label）の画像分類](https://qiita.com/koshian2/items/ab5e0c68a257585d7c6f)
- [Pythonのscikit-learnによる分類まとめ](https://qiita.com/wawana12/items/fd0a1558cbf13158aed1)
- [今日からはじめるPython機械学習プログラミング基礎 教師あり学習その1](https://qiita.com/holy015/items/d3057f711cd251e7a2bc)

# 2値分類: アクション作品か?

## データの準備

### 1. DBの準備

In [548]:
import pandas as pd

from lib.db import *

# Database file name handed to connect_db.
DB_NAME = "soundtrack.sqlite"

# Open the connection once; the later cells reuse `conn`.
conn = connect_db(DB_NAME)

# Run the three table-setup helpers in the same order as before:
# albums, then tracks, then genres.
for _ensure_table in (
    create_albums_table_if_not_exists,
    create_tracks_table_if_not_exists,
    create_genres_table_if_not_exists,
):
    _ensure_table(conn)

### 番外. ジャンル付けされたアルバム用の型の準備

In [549]:
from dataclasses import dataclass
from typing import List


@dataclass
class AlbumWithGenre:
    """Album-level audio features paired with the album's genre tags.

    Each audio-feature field holds one number summarising that feature over
    the album's tracks. In this notebook those numbers are produced by a PCA
    projection, so they are real-valued even for features that are integers
    on a single track (``duration_ms``, ``key``, ``mode``,
    ``time_signature``). All feature fields are therefore annotated ``float``.
    """

    id: str  # album identifier (the notebook stores the Spotify album ID here)
    name: str  # album name
    acousticness: float
    danceability: float
    duration_ms: float
    energy: float
    instrumentalness: float
    key: float
    liveness: float
    loudness: float
    mode: float
    speechiness: float
    tempo: float
    time_signature: float
    valence: float
    genre: List[str]  # genre tags attached to the album

### 2. (楽曲数)次元の13個のパラメータをPCAで圧縮して1次元のデータに

In [550]:
from sklearn.decomposition import PCA

# Names of the per-track audio-feature attributes, in the row order used for
# the PCA input matrix (13 features).
_AUDIO_FEATURES = (
    "acousticness",
    "danceability",
    "duration_ms",
    "energy",
    "instrumentalness",
    "key",
    "liveness",
    "loudness",
    "mode",
    "speechiness",
    "tempo",
    "time_signature",
    "valence",
)

# Final result of this cell: one AlbumWithGenre per album that was kept.
albumWithGenres: List[AlbumWithGenre] = []

# Fetch the albums that have genre data.
# (200, 0) are presumably limit/offset -- confirm against lib.db.
album_genres = get_genre_list(conn, 200, 0)

for genre in album_genres:
    album_id = genre.id  # SPOTIFY_ALBUM_ID
    album = get_album(conn, album_id)
    album_name = album.name
    album_genre_tags = genre.tags
    album_tracks = get_tracks(conn, album_id, 100, 0)

    # Skip albums with extremely few tracks.
    if len(album_tracks) < 3:
        continue

    # Build a 13 x N matrix: one row per audio feature, one column per track.
    album_matrix = [
        [getattr(track, feature) for track in album_tracks]
        for feature in _AUDIO_FEATURES
    ]

    # Reduce every feature row from N (track count) dimensions down to 1.
    # NOTE(review): the rows are not scaled before PCA although they mix
    # different units (e.g. duration_ms next to 0-1 ratios) -- confirm that
    # this is intended.
    pca = PCA(n_components=1)
    album_pca = pca.fit_transform(album_matrix)

    # album_pca holds one single-element row per feature, in the same order
    # as _AUDIO_FEATURES; pair each value with its field name.
    reduced = {
        feature: row[0] for feature, row in zip(_AUDIO_FEATURES, album_pca)
    }

    albumWithGenres.append(
        AlbumWithGenre(
            id=album_id,
            name=album_name,
            genre=album_genre_tags,
            **reduced,
        )
    )

### 3. 「アクション作品か否か」のラベルを持つデータに作り直して使いやすくする

In [551]:
@dataclass
class AlbumIsAction:
    """Album-level audio features with a binary "is action" label.

    The feature values are copied from the PCA-reduced album records built
    earlier in this notebook, so they are real-valued even for features that
    are integers on a single track (``duration_ms``, ``key``, ``mode``,
    ``time_signature``). All feature fields are therefore annotated ``float``.
    """

    id: str  # album identifier
    name: str  # album name
    acousticness: float
    danceability: float
    duration_ms: float
    energy: float
    instrumentalness: float
    key: float
    liveness: float
    loudness: float
    mode: float
    speechiness: float
    tempo: float
    time_signature: float
    valence: float
    isAction: int  # 1 when the album carries the "action" genre tag, else 0


# Turn every AlbumWithGenre into a plain dict shaped like AlbumIsAction:
# all fields except `genre` are carried over unchanged, and `genre` is
# replaced by isAction (1 if the tag list contains "action", otherwise 0).
albumIsActions: List[dict] = [
    vars(
        AlbumIsAction(
            **{
                field: value
                for field, value in vars(albumWithGenre).items()
                if field != "genre"
            },
            isAction=int("action" in albumWithGenre.genre),
        )
    )
    for albumWithGenre in albumWithGenres
]

### 番外. CSVに出力してすぐ使えるように

In [552]:
import csv
import pandas

# Dump the album dicts to data.csv so the later cells can reload them.
# No header row and no index column are written, and every field is quoted.
df = pandas.json_normalize(albumIsActions)
df.to_csv(
    'data.csv',
    header=False,
    index=False,
    quoting=csv.QUOTE_ALL,
    encoding='utf-8',
)


### 4. データの用意

In [553]:
from scipy import stats

# Training split: the first 150 rows of data.csv. The file has no header row,
# so the column names are supplied here.
df_isAction_learn = pd.read_csv(
    "data.csv",
    names=[
        "id", "name",
        "acousticness", "danceability", "duration_ms", "energy",
        "instrumentalness", "key", "liveness", "loudness", "mode",
        "speechiness", "tempo", "time_signature", "valence",
        "isAction",
    ],
    nrows=150,
)
# Keep the labels aside as a plain list before the column is removed.
df_isAction_learn_answers = df_isAction_learn["isAction"].tolist()
# Remove the two string columns and the label so only the feature columns stay.
df_isAction_learn = df_isAction_learn.drop(columns=["id", "name", "isAction"])
# Standardisation (currently disabled)
# df_isAction_learn = df_isAction_learn.apply(stats.zscore, axis=0)

In [554]:
from scipy import stats

# Test split: everything in data.csv after the first 150 rows (those rows are
# the training split). The file has no header row, so names are supplied here.
df_isAction_test = pd.read_csv(
    "data.csv",
    names=[
        "id", "name",
        "acousticness", "danceability", "duration_ms", "energy",
        "instrumentalness", "key", "liveness", "loudness", "mode",
        "speechiness", "tempo", "time_signature", "valence",
        "isAction",
    ],
    skiprows=150,
)
# Keep the labels aside as a plain list before the column is removed.
df_isAction_test_answers = df_isAction_test["isAction"].tolist()
# Remove the same columns as in the training frame.
df_isAction_test = df_isAction_test.drop(columns=["id", "name", "isAction"])

# Standardisation (currently disabled)
# df_isAction_test = df_isAction_test.apply(stats.zscore, axis=0)

### 5. 学習

In [555]:
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC

# Supervised training: linear SVM fitted on the feature columns as loaded
# (this cell applies no scaling step). fit() returns the estimator itself.
model = LinearSVC(dual=True).fit(df_isAction_learn, df_isAction_learn_answers)
# Classify the held-out rows and print the fraction predicted correctly.
test_predict = model.predict(df_isAction_test)
print(accuracy_score(df_isAction_test_answers, test_predict))

0.38095238095238093




In [556]:
from sklearn.linear_model import RidgeClassifier

# Ridge classifier with default settings, trained on the training split.
model = RidgeClassifier().fit(df_isAction_learn, df_isAction_learn_answers)
# Classify the held-out rows and print the fraction predicted correctly.
test_predict = model.predict(df_isAction_test)
print(accuracy_score(df_isAction_test_answers, test_predict))

0.9047619047619048


In [557]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with the iteration cap set to 1000.
model = LogisticRegression(max_iter=1000).fit(
    df_isAction_learn, df_isAction_learn_answers
)
# Classify the held-out rows and print the fraction predicted correctly.
test_predict = model.predict(df_isAction_test)
print(accuracy_score(df_isAction_test_answers, test_predict))

0.8571428571428571


In [558]:
from sklearn.linear_model import SGDClassifier

# SGD-trained linear classifier with default settings (no random_state is
# passed in this cell).
model = SGDClassifier().fit(df_isAction_learn, df_isAction_learn_answers)
# Classify the held-out rows and print the fraction predicted correctly.
test_predict = model.predict(df_isAction_test)
print(accuracy_score(df_isAction_test_answers, test_predict))

0.6190476190476191


In [559]:
from sklearn.linear_model import Perceptron

# Perceptron with default settings.
model = Perceptron().fit(df_isAction_learn, df_isAction_learn_answers)
# Classify the held-out rows and print the fraction predicted correctly.
test_predict = model.predict(df_isAction_test)
print(accuracy_score(df_isAction_test_answers, test_predict))

0.6190476190476191


In [560]:
from sklearn.linear_model import Perceptron

# Same Perceptron experiment as the previous cell, but with random_state=3.
model = Perceptron(random_state=3).fit(
    df_isAction_learn, df_isAction_learn_answers
)
# Classify the held-out rows and print the fraction predicted correctly.
test_predict = model.predict(df_isAction_test)
print(accuracy_score(df_isAction_test_answers, test_predict))

0.38095238095238093


In [561]:
from sklearn.linear_model import PassiveAggressiveClassifier

# Passive-aggressive classifier with default settings (no random_state is
# passed in this cell).
model = PassiveAggressiveClassifier().fit(
    df_isAction_learn, df_isAction_learn_answers
)
# Classify the held-out rows and print the fraction predicted correctly.
test_predict = model.predict(df_isAction_test)
print(accuracy_score(df_isAction_test_answers, test_predict))

0.38095238095238093


In [562]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Pipeline: StandardScaler followed by an SVC with default settings.
# The scaler is fitted as part of the pipeline, on the training split.
model = make_pipeline(StandardScaler(), SVC()).fit(
    df_isAction_learn, df_isAction_learn_answers
)
# Last expression of the cell: score on the held-out split.
model.score(df_isAction_test, df_isAction_test_answers)

0.6190476190476191

In [563]:
from sklearn.svm import NuSVC

# Pipeline: StandardScaler followed by a NuSVC with default settings.
model = make_pipeline(StandardScaler(), NuSVC()).fit(
    df_isAction_learn, df_isAction_learn_answers
)
# Last expression of the cell: score on the held-out split.
model.score(df_isAction_test, df_isAction_test_answers)

0.47619047619047616

In [564]:
from sklearn.svm import LinearSVC

# Pipeline: StandardScaler followed by LinearSVC(dual=True), i.e. the linear
# SVM from the earlier cell but with a scaling step in front.
model = make_pipeline(StandardScaler(), LinearSVC(dual=True)).fit(
    df_isAction_learn, df_isAction_learn_answers
)
# Last expression of the cell: score on the held-out split.
model.score(df_isAction_test, df_isAction_test_answers)



0.5238095238095238

In [565]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with default settings.
model = KNeighborsClassifier().fit(df_isAction_learn, df_isAction_learn_answers)
# Last expression of the cell: score on the held-out split.
model.score(df_isAction_test, df_isAction_test_answers)

0.6666666666666666

In [566]:
# from sklearn.neighbors import RadiusNeighborsClassifier
# 
# model = RadiusNeighborsClassifier()
# model.fit(df_isAction_learn, df_isAction_learn_answers)
# model.score(df_isAction_test, df_isAction_test_answers)

In [567]:
from sklearn.neighbors import NearestCentroid

# Nearest-centroid classifier with default settings.
model = NearestCentroid().fit(df_isAction_learn, df_isAction_learn_answers)
# Last expression of the cell: score on the held-out split.
model.score(df_isAction_test, df_isAction_test_answers)

0.5238095238095238

In [568]:
from sklearn.gaussian_process import GaussianProcessClassifier

# Gaussian-process classifier with default settings.
model = GaussianProcessClassifier().fit(
    df_isAction_learn, df_isAction_learn_answers
)
# Last expression of the cell: score on the held-out split.
model.score(df_isAction_test, df_isAction_test_answers)

0.6190476190476191

In [569]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings.
model = GaussianNB().fit(df_isAction_learn, df_isAction_learn_answers)
# Last expression of the cell: score on the held-out split.
model.score(df_isAction_test, df_isAction_test_answers)

0.5714285714285714

In [570]:
# from sklearn.naive_bayes import MultinomialNB
# 
# model = MultinomialNB()
# model.fit(df_isAction_learn, df_isAction_learn_answers)
# model.score(df_isAction_test, df_isAction_test_answers)

In [571]:
# from sklearn.naive_bayes import ComplementNB
# 
# model = ComplementNB()
# model.fit(df_isAction_learn, df_isAction_learn_answers)
# model.score(df_isAction_test, df_isAction_test_answers)

In [572]:
from sklearn.naive_bayes import BernoulliNB

# Bernoulli naive Bayes with default settings.
model = BernoulliNB().fit(df_isAction_learn, df_isAction_learn_answers)
# Last expression of the cell: score on the held-out split.
model.score(df_isAction_test, df_isAction_test_answers)

0.6190476190476191

In [573]:
# from sklearn.naive_bayes import CategoricalNB
# 
# model = CategoricalNB()
# model.fit(df_isAction_learn, df_isAction_learn_answers)
# model.score(df_isAction_test, df_isAction_test_answers)

In [574]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with default settings (no random_state is passed in this cell).
model = DecisionTreeClassifier().fit(
    df_isAction_learn, df_isAction_learn_answers
)
# Last expression of the cell: score on the held-out split.
model.score(df_isAction_test, df_isAction_test_answers)

0.6666666666666666