# TPS-Feb-2022

In [1]:
NB = '007'

## Import libralies

In [2]:
import pandas as pd
pd.set_option('display.max_rows', 500)

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set(style='white', context='notebook', palette='deep')

In [7]:
from math import factorial

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans

## Load and check data

### Load data

In [3]:
# Load data
##### Load train and Test set
train_df = pd.read_csv("../data/raw/train.csv")
test_df = pd.read_csv("../data/raw/test.csv")

In [4]:
FEATURES = [col for col in train_df.columns if col not in ['row_id', 'target']]

## Feature Engineering
- 平均、分散、最大値、最小値、中央値、第一四分位、第三四分位、尖度、歪度を追加

In [5]:
train_df["mean"] = train_df[FEATURES].mean(axis=1)
train_df["std"] = train_df[FEATURES].std(axis=1)
train_df["min"] = train_df[FEATURES].min(axis=1)
train_df["max"] = train_df[FEATURES].max(axis=1)
train_df["median"] = train_df[FEATURES].median(axis=1)
train_df["25%"] = train_df[FEATURES].quantile(q=0.25, axis=1)
train_df["75%"] = train_df[FEATURES].quantile(q=0.75, axis=1)
train_df["skew"] = train_df[FEATURES].skew(axis=1)
train_df["kurt"] = train_df[FEATURES].kurt(axis=1)

test_df["mean"] = test_df[FEATURES].mean(axis=1)
test_df["std"] = test_df[FEATURES].std(axis=1)
test_df["min"] = test_df[FEATURES].min(axis=1)
test_df["max"] = test_df[FEATURES].max(axis=1)
test_df["median"] = test_df[FEATURES].median(axis=1)
test_df["25%"] = test_df[FEATURES].quantile(q=0.25, axis=1)
test_df["75%"] = test_df[FEATURES].quantile(q=0.75, axis=1)
test_df["skew"] = test_df[FEATURES].skew(axis=1)
test_df["kurt"] = test_df[FEATURES].kurt(axis=1)

# FEATURES.extend(['mean', 'std', 'min', 'max', 'median', '25%', '75%', 'skew', 'kurt', 'gcd'])

## 　主成分分析

In [16]:
pca = PCA()
pca.fit(train_df[FEATURES])
train_pca = pca.transform(train_df[FEATURES])

train_pca_cols = ["PCA{}".format(x + 1) for x in range(len(train_df[FEATURES].columns))]
train_pca_df = pd.DataFrame(train_pca, columns=train_pca_cols)

train_pca_df.head()

Unnamed: 0,PCA1,PCA2,PCA3,PCA4,PCA5,PCA6,PCA7,PCA8,PCA9,PCA10,...,PCA277,PCA278,PCA279,PCA280,PCA281,PCA282,PCA283,PCA284,PCA285,PCA286
0,0.006646,0.069854,-0.023743,0.01847,-0.004203,0.009313,-0.011676,0.001996,0.015399,-0.004787,...,1.850972e-05,-8e-06,-5.585292e-06,1.196883e-05,6e-06,-1.944287e-06,2e-06,5.485936e-07,1.329793e-07,9.540979e-18
1,-0.025797,0.006773,-0.008334,-0.003302,-0.008506,-0.011347,0.004151,0.001914,0.000588,0.005052,...,-1.870797e-05,-1.7e-05,6.318468e-06,-4.761966e-06,1e-06,-6.246728e-07,-3e-06,3.177374e-07,-1.351602e-07,2.4286130000000003e-17
2,-0.02907,-0.005515,0.000657,-0.001592,0.002292,0.001942,-0.002271,-0.000323,-0.001389,0.000822,...,9.390363e-06,7e-06,-1.643514e-07,-6.017212e-07,5e-06,1.5739e-06,-4e-06,1.036072e-06,9.360844e-07,-8.153200000000001e-17
3,-0.032686,-0.006226,0.005941,0.003985,0.00215,0.000738,-0.001066,-0.0013,-0.000586,0.000742,...,-1.077665e-06,-5e-06,4.160533e-07,-3.928258e-06,-2e-06,-5.688576e-07,-5e-06,-8.48152e-08,-1.877031e-07,-1.00614e-16
4,0.042883,0.053586,0.00996,-0.011809,0.040478,-0.01533,0.005708,0.011352,-0.000899,-0.01567,...,-5.375752e-07,5e-06,6.200262e-05,1.986051e-06,8e-06,4.731725e-07,1e-06,-2.807576e-06,4.226935e-07,3.2959750000000005e-17


In [18]:
train_exp_df = pd.DataFrame(pca.explained_variance_ratio_, index=train_pca_cols)

In [19]:
train_exp_df.cumsum()

Unnamed: 0,0
PCA1,0.241967
PCA2,0.390881
PCA3,0.436054
PCA4,0.466916
PCA5,0.486691
PCA6,0.500958
PCA7,0.51285
PCA8,0.524118
PCA9,0.534561
PCA10,0.544481


In [15]:
pca = PCA()
pca.fit(test_df[FEATURES])
test_pca = pca.transform(test_df[FEATURES])

test_pca_cols = ["PCA{}".format(x + 1) for x in range(len(test_df[FEATURES].columns))]
test_pca_df = pd.DataFrame(test_pca, columns=test_pca_cols)

test_pca_df.head()

Unnamed: 0,PCA1,PCA2,PCA3,PCA4,PCA5,PCA6,PCA7,PCA8,PCA9,PCA10,...,PCA277,PCA278,PCA279,PCA280,PCA281,PCA282,PCA283,PCA284,PCA285,PCA286
0,-0.026061,-0.005676,-0.002087,-0.004364,0.00197,-0.003346,0.001415,-4.3e-05,0.000773,0.000364,...,3e-06,-3.079238e-07,6.099493e-07,1.1e-05,6.532714e-06,8.414694e-08,-5.418227e-07,4.001424e-08,-4.094528e-07,-1.101549e-16
1,-0.02997,-0.000829,0.003157,-0.005445,-0.0077,0.006919,-0.004859,-0.001958,0.000154,0.001523,...,-8.9e-05,-5.040642e-05,5.575077e-06,-1.6e-05,-1.167959e-06,-1.464624e-06,1.073612e-07,4.598498e-09,-4.829779e-09,-3.2959750000000005e-17
2,-0.020501,-0.00596,-0.004886,-0.007022,0.002234,-0.003713,0.002252,0.000387,0.001689,0.00057,...,1.1e-05,2.294285e-06,7.029154e-07,1.7e-05,8.773039e-06,-2.242007e-07,-4.700031e-07,1.585401e-07,4.145179e-07,-8.239937000000001e-18
3,-0.032448,-0.006448,0.005718,0.003875,0.002457,-0.000752,0.001256,-0.001193,0.000631,0.000712,...,-1.2e-05,-9.835746e-07,-2.447849e-07,-9e-06,-3.755113e-07,-2.461793e-07,-1.023788e-06,7.705193e-07,-1.224953e-07,-1.110223e-16
4,0.030404,-0.004855,0.002239,0.004519,-0.004469,0.009123,-0.00026,-0.000754,0.003835,-0.002049,...,-1.9e-05,5.582952e-06,-3.521718e-06,-1.4e-05,-4.237364e-07,-1.709658e-06,6.245489e-07,-2.008912e-07,-8.452278e-08,-2.775558e-17


In [None]:
pd.DataFrame(pca.explained_variance_ratio_, index=test_pca_cols)

### マージ

In [24]:
train_df = pd.concat(objs=[train_df, train_pca_df], axis=1)
test_df = pd.concat(objs=[test_df, test_pca_df], axis=1)

## Save dataset

In [25]:
pd.to_pickle(train_df, f"../data/processed/nb{NB}_train.pkl", compression='zip')
pd.to_pickle(test_df, f"../data/processed/nb{NB}_test.pkl", compression='zip')

# 検証メモ

In [None]:
dataset[dataset['target'] == 'Streptococcus_pyogenes'].describe()

In [None]:
train_df.columns.get_loc('A4T4G1C1')

In [None]:
train_df.columns.get_loc(train_df[FEATURES].idxmax(axis=1))

In [13]:
train_pca_df

array([[ 6.64632144e-03,  6.98539913e-02, -2.37427587e-02, ...,
         5.48593563e-07,  1.32979319e-07,  9.54097912e-18],
       [-2.57972780e-02,  6.77285417e-03, -8.33373794e-03, ...,
         3.17737442e-07, -1.35160166e-07,  2.42861287e-17],
       [-2.90703790e-02, -5.51456053e-03,  6.56628899e-04, ...,
         1.03607184e-06,  9.36084444e-07, -8.15320034e-17],
       ...,
       [-1.48741811e-02, -6.85031862e-03, -5.92601717e-03, ...,
         7.10393828e-07,  4.33723747e-08, -5.81132364e-17],
       [-1.77393351e-02,  4.05500766e-03, -1.26390295e-02, ...,
         1.33825779e-07, -1.31453992e-07,  1.08420217e-17],
       [-5.03014223e-02, -6.21804184e-03,  1.96070266e-02, ...,
        -6.37139310e-07, -4.23215429e-07, -4.33680869e-17]])

In [None]:
train.head()

In [None]:
print(train.shape)
print(test.shape)

In [None]:
train['target'].unique()

In [None]:
train.columns

### Targetのバランス
- 結論：ほぼ均等

In [None]:
dataset['target'].value_counts()

In [None]:
target_df = pd.DataFrame(train['target'].value_counts()).reset_index()
target_df.columns = ['target', 'count']
target_df['percentage'] = target_df['count'] / len(train) * 100

target_df

In [None]:
g = sns.catplot(x="count", y="target", data=target_df, kind="bar")
g = g.set_ylabels("Num of target")