# Objective

* 自分用のNotebookの雛形をまとめる
    * コンペ概要
        * 簡単な背景
        * 何を予測するのか
        * 評価指標
        * 期間
        * 類似コンペ 等
    * 提供データセットの概要
        * カラム
        * サイズ
        * レコード数 等
    * 探索的データ分析
        * 欠損値
        * 分布
        * データセットの結合 等

* 作成の過程で取り組むこと
    * Kaggle Notebookの機能の活用
        * kaggle APIの利用
    * ビジュアライズに有効な手法やライブラリの活用
        * マークダウン
        * 複数のライブラリで視覚化を比較
        * 動きを出すことを意識
    * 英語での表現の習得
        * 日本語と併記しても冗長にならないよう工夫

# IMPORT

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import missingno as msno
import matplotlib.pyplot as plt
# import japanize_matplotlib

from pathlib import Path

In [None]:
# Relative path to the input directory.
INPUT = Path('..') / 'input'
# Directory for the Walmart store sales forecasting competition files.
COMPETITIONS = INPUT.joinpath('walmart-recruiting-store-sales-forecasting')

In [None]:
# FEATURES holds the table exactly as read from disk; `features` is an
# independent copy of it.
FEATURES = pd.read_csv(COMPETITIONS.joinpath('features.csv.zip'))
features = FEATURES.copy(deep=True)
# Preview the last rows.
features.tail(n=5)

In [None]:
# Read the stores table and preview its last rows.
stores = pd.read_csv(COMPETITIONS.joinpath('stores.csv'))
stores.tail(n=5)

In [None]:
# Read the sample submission file and preview its last rows.
sample = pd.read_csv(COMPETITIONS.joinpath('sampleSubmission.csv.zip'))
sample.tail(n=5)

# ADD FEATURES

In [None]:
# Convert Date to datetime64[ns], then add one column per calendar part
# taken from the converted values.
parsed_dates = features['Date'].astype('datetime64[ns]')
features['Date'] = parsed_dates
for part in ('year', 'month', 'dayofweek'):
    features[part] = getattr(parsed_dates.dt, part)

# EDA

In [None]:
# Number of rows for each distinct year.
features.loc[:, 'year'].value_counts()

In [None]:
# Mean Fuel_Price per Store; show the last rows of the grouped result.
features.groupby('Store')[['Fuel_Price']].mean().tail(n=5)

In [None]:
# Print the sorted distinct values of each derived date column.
for col in ('year', 'month', 'dayofweek'):
    distinct_sorted = np.sort(features[col].unique())
    print(col, distinct_sorted, sep=' :  ')

In [None]:
# Mean Fuel_Price per year, restricted to rows where Store == 45.
features.loc[features['Store'].eq(45)].groupby('year')[['Fuel_Price']].mean()

In [None]:
# Mean Fuel_Price per month, restricted to rows where Store == 45.
features.loc[features['Store'].eq(45)].groupby('month')[['Fuel_Price']].mean()

In [None]:
# Mean Fuel_Price per dayofweek, restricted to rows where Store == 45.
features.loc[features['Store'].eq(45)].groupby('dayofweek')[['Fuel_Price']].mean()

# VISUALIZE

In [None]:
# Correlation heatmap of the numeric weather / fuel / markdown columns.
cols = ['Temperature', 'Fuel_Price', 'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']
sns.set_context('paper')
# plt.subplots returns a (Figure, Axes) pair; unpack it so `fig` really is
# the figure, and draw on `ax` explicitly instead of relying on the implicit
# "current axes" state.
fig, ax = plt.subplots(figsize=(10, 10))
ax.set_title("Heatmap")
sns.heatmap(features[cols].corr(), annot=True, fmt='.2f', cmap='Blues', square=True, ax=ax)
plt.show()

In [None]:
# Six scatter plots on a 2x3 grid: four columns against Fuel_Price,
# then Temperature vs MarkDown2 and MarkDown1 vs MarkDown4.
xy_pairs = [
    ('year', 'Fuel_Price'),
    ('month', 'Fuel_Price'),
    ('Temperature', 'Fuel_Price'),
    ('MarkDown1', 'Fuel_Price'),
    ('Temperature', 'MarkDown2'),
    ('MarkDown1', 'MarkDown4'),
]
_, axes = plt.subplots(nrows=2, ncols=3, figsize=(18, 12))
axes = axes.flatten()
for ax, (x_col, y_col) in zip(axes, xy_pairs):
    # Low alpha so overlapping points remain distinguishable.
    sns.scatterplot(data=features, x=x_col, y=y_col, alpha=0.05, ax=ax)
    ax.set_title(f"Scatterplot: {x_col} & {y_col}")
plt.show()

In [None]:
def msno_matrix(df):
    """Draw a missingno matrix with each column's missing-value rate in its label.

    Every column label becomes ``"<name>\\n<pct>%"``, where ``pct`` is the
    percentage of null entries in that column.

    Args:
        df: DataFrame to visualize. A copy is relabelled, so the caller's
            column names are not modified.
    """
    labelled = df.copy()
    # Percentage of null entries per column. This used to be computed from
    # notnull(), which produced the share of present values despite the
    # "missing" name.
    missing_ratios = labelled.isnull().mean() * 100
    labelled.columns = [
        f"{col}\n{ratio:.1f}%"
        for col, ratio in zip(labelled.columns, missing_ratios)
    ]
    msno.matrix(labelled)
    plt.show()

In [None]:
# Plot the missing-value matrix for the features table as it was read from disk.
msno_matrix(df=FEATURES)