# TPS-Feb-2022

In [1]:
# Notebook identifier; embedded in the names of the pickle files saved below.
NB = '003'

## Import libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Apply seaborn's global plot settings: white style, notebook context, deep palette.
sns.set(style='white', context='notebook', palette='deep')

## Load and check data

### Load data

In [3]:
# Read the raw train and test splits from CSV.
train_csv_path = "../data/raw/train.csv"
test_csv_path = "../data/raw/test.csv"

train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)

In [4]:
# Remember how many rows belong to the training split.
train_len = len(train_df)

# Stack train on top of test. ignore_index=True renumbers the rows 0..n-1
# and does not keep the old index as an extra "index" column.
dataset = pd.concat([train_df, test_df], axis=0, ignore_index=True)

# Note: 'row_id' is kept in the combined frame.
dataset.head()

Unnamed: 0,row_id,A0T0G0C10,A0T0G1C9,A0T0G2C8,A0T0G3C7,A0T0G4C6,A0T0G5C5,A0T0G6C4,A0T0G7C3,A0T0G8C2,...,A8T0G1C1,A8T0G2C0,A8T1G0C1,A8T1G1C0,A8T2G0C0,A9T0G0C1,A9T0G1C0,A9T1G0C0,A10T0G0C0,target
0,0,-9.536743e-07,-1e-05,-4.3e-05,-0.000114,-0.0002,-0.00024,-0.0002,-0.000114,-4.3e-05,...,-8.6e-05,-4.3e-05,-8.6e-05,-8.6e-05,-4.3e-05,-1e-05,-1e-05,-1e-05,-9.536743e-07,Streptococcus_pyogenes
1,1,-9.536743e-07,-1e-05,-4.3e-05,0.000886,-0.0002,0.00076,-0.0002,-0.000114,-4.3e-05,...,-8.6e-05,-4.3e-05,0.000914,0.000914,-4.3e-05,-1e-05,-1e-05,-1e-05,-9.536743e-07,Salmonella_enterica
2,2,-9.536743e-07,-2e-06,7e-06,0.000129,0.000268,0.00027,0.000243,0.000125,1e-06,...,8.4e-05,4.8e-05,8.1e-05,0.000106,7.2e-05,1e-05,8e-06,1.9e-05,1.046326e-06,Salmonella_enterica
3,3,4.632568e-08,-6e-06,1.2e-05,0.000245,0.000492,0.000522,0.000396,0.000197,-3e-06,...,0.000151,0.0001,0.00018,0.000202,0.000153,2.1e-05,1.5e-05,4.6e-05,-9.536743e-07,Salmonella_enterica
4,4,-9.536743e-07,-1e-05,-4.3e-05,-0.000114,-0.0002,-0.00024,-0.0002,-0.000114,-4.3e-05,...,-8.6e-05,-4.3e-05,-8.6e-05,-8.6e-05,-4.3e-05,-1e-05,-1e-05,-1e-05,-9.536743e-07,Enterococcus_hirae


In [5]:
# Every training column except the row identifier and the label is a model feature.
NON_FEATURE_COLUMNS = {'row_id', 'target'}
FEATURES = [name for name in train_df.columns if name not in NON_FEATURE_COLUMNS]

## Feature Engineering
- 今回は行ごとの統計量（平均、標準偏差、最小値、最大値、中央値、25%/75%分位点、歪度、尖度）を追加するだけ

In [6]:
# Names of the row-wise summary statistics added to both splits.
STAT_COLUMNS = ['mean', 'std', 'min', 'max', 'median', '25%', '75%', 'skew', 'kurt']


def add_row_stats(df, feature_cols):
    """Add row-wise summary statistics of ``feature_cols`` to ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains every column listed in ``feature_cols``.
    feature_cols : list of str
        Columns the statistics are computed over (one value per row).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with one new column per name in ``STAT_COLUMNS``.
    """
    # Take the feature slice once, so columns added below never feed into
    # the statistics that are computed after them.
    values = df[feature_cols]
    df["mean"] = values.mean(axis=1)
    df["std"] = values.std(axis=1)
    df["min"] = values.min(axis=1)
    df["max"] = values.max(axis=1)
    df["median"] = values.median(axis=1)
    df["25%"] = values.quantile(q=0.25, axis=1)
    df["75%"] = values.quantile(q=0.75, axis=1)
    df["skew"] = values.skew(axis=1)
    df["kurt"] = values.kurt(axis=1)
    return df


# Use only the original feature columns as input. Filtering out the stat names
# keeps this cell idempotent: re-running it neither computes statistics over
# previously added stat columns nor duplicates names in FEATURES.
base_features = [col for col in FEATURES if col not in STAT_COLUMNS]

for split_df in (train_df, test_df):
    add_row_stats(split_df, base_features)

FEATURES = base_features + STAT_COLUMNS

In [7]:
# Preview the first rows of the training frame, including the added statistic columns.
train_df.head()

Unnamed: 0,row_id,A0T0G0C10,A0T0G1C9,A0T0G2C8,A0T0G3C7,A0T0G4C6,A0T0G5C5,A0T0G6C4,A0T0G7C3,A0T0G8C2,...,target,mean,std,min,max,median,25%,75%,skew,kurt
0,0,-9.536743e-07,-1e-05,-4.3e-05,-0.000114,-0.0002,-0.00024,-0.0002,-0.000114,-4.3e-05,...,Streptococcus_pyogenes,2.7670280000000003e-17,0.005643,-0.014033,0.023992,-0.000687,-0.002403,-4.3e-05,1.307963,4.111766
1,1,-9.536743e-07,-1e-05,-4.3e-05,0.000886,-0.0002,0.00076,-0.0002,-0.000114,-4.3e-05,...,Salmonella_enterica,1.1299960000000001e-17,0.001751,-0.005016,0.008984,-8.6e-05,-0.000801,0.000796,0.696087,4.551135
2,2,-9.536743e-07,-2e-06,7e-06,0.000129,0.000268,0.00027,0.000243,0.000125,1e-06,...,Salmonella_enterica,-1.785171e-18,0.000601,-0.002587,0.002327,1.5e-05,-0.000124,0.000201,-0.415096,4.17459
3,3,4.632568e-08,-6e-06,1.2e-05,0.000245,0.000492,0.000522,0.000396,0.000197,-3e-06,...,Salmonella_enterica,-4.829865e-19,0.00116,-0.005403,0.004602,1.9e-05,-0.00023,0.000394,-0.395986,4.501727
4,4,-9.536743e-07,-1e-05,-4.3e-05,-0.000114,-0.0002,-0.00024,-0.0002,-0.000114,-4.3e-05,...,Enterococcus_hirae,2.8724910000000005e-17,0.007117,-0.024033,0.037984,-0.000343,-0.002403,-4.3e-05,1.250485,6.388081


In [8]:
# Preview the first rows of the test frame, including the added statistic columns.
test_df.head()

Unnamed: 0,row_id,A0T0G0C10,A0T0G1C9,A0T0G2C8,A0T0G3C7,A0T0G4C6,A0T0G5C5,A0T0G6C4,A0T0G7C3,A0T0G8C2,...,A10T0G0C0,mean,std,min,max,median,25%,75%,skew,kurt
0,200000,-9.536743e-07,-2e-06,-9.153442e-07,2.4e-05,3.4e-05,-2e-06,2.1e-05,2.4e-05,-9e-06,...,4.632568e-08,-4.669097e-18,0.000366,-0.001885,0.00094,3.090652e-05,-8.7e-05,0.000191,-1.583268,4.690538
1,200001,-9.536743e-07,-1e-05,-4.291534e-05,-0.000114,0.0018,-0.00024,0.0018,-0.000114,0.000957,...,-9.536743e-07,8.94391e-18,0.001869,-0.007022,0.006984,-8.583069e-05,-0.000801,0.000798,0.066551,2.381128
2,200002,4.632568e-08,3e-06,8.465576e-08,-1.4e-05,7e-06,-5e-06,-4e-06,3e-06,4e-06,...,4.632568e-08,4.0326839999999995e-19,0.000337,-0.001519,0.000889,-4.536743e-07,-0.000109,0.00014,-0.817271,3.098878
3,200003,-9.536743e-07,-8e-06,8.084656e-06,0.000216,0.00042,0.000514,0.000452,0.000187,-5e-06,...,-9.536743e-07,-1.567283e-18,0.001129,-0.005106,0.004536,2.391681e-05,-0.000257,0.000391,-0.374804,4.401019
4,200004,-9.536743e-07,-1e-05,-4.291534e-05,-0.000114,-0.0002,-0.00024,-0.0002,-0.000114,-4.3e-05,...,-9.536743e-07,1.465796e-17,0.003872,-0.014033,0.019992,-0.0002002716,-0.001202,0.000864,0.438406,5.211107


## Save dataset

In [9]:
# Persist both engineered frames as zip-compressed pickles, tagged with the notebook id.
train_df.to_pickle(f"../data/processed/nb{NB}_train.pkl", compression='zip')
test_df.to_pickle(f"../data/processed/nb{NB}_test.pkl", compression='zip')

# 検証メモ

In [None]:
# Summary statistics for the rows labelled Streptococcus_pyogenes.
is_pyogenes = dataset['target'].eq('Streptococcus_pyogenes')
dataset.loc[is_pyogenes].describe()

In [None]:
# Shapes (rows, columns) of the train and test frames.
# The frames in this notebook are named train_df / test_df; `train` / `test` are never defined.
print(train_df.shape)
print(test_df.shape)

In [None]:
# Distinct class labels in the training set (uses train_df; `train` is never defined).
train_df['target'].unique()

In [None]:
# Column names of the training frame (uses train_df; `train` is never defined).
train_df.columns

### Targetのバランス
- 結論：ほぼ均等

In [None]:
# Row count per target class in the combined frame.
# NOTE(review): test rows appear to have no target, so they should be skipped here — confirm.
dataset['target'].value_counts()

In [None]:
# Per-class row counts in the training set and each class's share in percent.
# Uses train_df (the original referenced an undefined `train`).
target_df = (
    train_df['target']
    .value_counts()
    .rename_axis('target')       # name the index so it becomes the 'target' column
    .reset_index(name='count')   # the counts become the 'count' column
)
target_df['percentage'] = target_df['count'] / len(train_df) * 100

target_df

In [None]:
# Horizontal bar chart of the number of rows per target class.
g = sns.catplot(x="count", y="target", data=target_df, kind="bar")
# The counts are on the x axis (y holds the class names), so the count label belongs on x.
g = g.set_xlabels("Num of target")