# Digit Recognizer

チュートリアル用コンペ

https://www.kaggle.com/competitions/digit-recognizer

## import

In [None]:
import datetime
from zoneinfo import ZoneInfo

# ====================================================
# Library
# ====================================================
import os
import gc
import warnings

warnings.filterwarnings("ignore")
import scipy as sp
import numpy as np
import pandas as pd
from pathlib import Path
# import joblib
import pickle

# import torch
from sklearn.model_selection import StratifiedKFold

# models
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
from keras.layers import Dense, Dropout, Flatten, Activation
#from keras.layers import Conv2D, MaxPooling2D
from keras.models import Sequential
from keras.utils import to_categorical


## Config

In [None]:
# ====================================================
# Configurations
# ====================================================
class Config:
    """Notebook-wide settings: paths, training hyper-parameters and column names.

    All values are class attributes and are read as ``Config.<name>``; the class
    is never instantiated by the code in this notebook.
    """

    # Run identification (used in model and submission file names).
    VER = 1
    AUTHOR = "virtual-hippo"
    COMPETITION = "Digit Recognizer"

    # Directories, relative to the working directory.
    DATA_PATH = Path("./input")
    OOF_DATA_PATH = Path("./oof")
    MODEL_DATA_PATH = Path("./model")
    SUB_DATA_PATH = Path("./submit")

    # Training settings passed to the Keras ``fit`` call and the CV splitter.
    epochs = 50
    batch_size = 128
    seed = 42
    verbose = 25
    n_folds = 5
    target_col = "label"

    # NOTE(review): the settings below are not referenced by the code visible
    # in this notebook; they look like leftovers from a LightGBM pipeline.
    metric = "f1_score"
    early_stopping_round = 200
    classification_lgb_params = {
        'objective': 'multiclass',
        'num_class': 10,
        "learning_rate": 0.05,
        "seed": seed,
    }
    model_weight_dict = {"lightgbm": 0.50}

    @staticmethod
    def features():
        """Return the 784 pixel column names, ``"pixel0"`` .. ``"pixel783"``.

        Declared as a static method so it works both as ``Config.features()``
        and on an instance.
        """
        return [f"pixel{i}" for i in range(784)]
    

## データの読み込み関数

In [None]:
def read_train_data():
    """Load ``train.csv`` from ``Config.DATA_PATH`` into a DataFrame."""
    train_csv = Config.DATA_PATH / 'train.csv'
    return pd.read_csv(train_csv)

def read_test_data():
    """Load ``test.csv`` from ``Config.DATA_PATH`` into a DataFrame."""
    test_csv = Config.DATA_PATH / 'test.csv'
    return pd.read_csv(test_csv)


## 前処理

In [None]:


def preprocessing(input_df: pd.DataFrame, preprocessors) -> pd.DataFrame:
    """Return a normalized copy of ``input_df`` with extra feature steps applied.

    The pixel columns listed by ``Config.features()`` are cast to ``float32``
    and divided by 255. Each callable in ``preprocessors`` is then applied in
    order, each receiving the DataFrame returned by the previous step.

    Args:
        input_df: Frame containing every column named by ``Config.features()``.
            It is not modified; all work happens on a copy.
        preprocessors: Iterable of callables taking a DataFrame and returning a
            DataFrame. May be empty.

    Returns:
        The processed DataFrame.
    """
    feature_cols = Config.features()

    # Work on a copy so the caller's frame keeps its raw pixel values.
    output_df = input_df.copy()

    # One vectorized division over the whole pixel block instead of a
    # per-column ``apply`` with a Python lambda.
    output_df[feature_cols] = output_df[feature_cols].astype('float32') / 255

    # Additional feature-engineering steps are plugged in here.
    for preprocessor in preprocessors:
        output_df = preprocessor(output_df)

    return output_df

# Load and normalize the training set once; learn() reads train_df as a global.
train_df = preprocessing(read_train_data(), [])
# Bare expression: shown as the cell output when run in a notebook.
train_df

## モデル

In [None]:
class Model:
    """Thin wrapper around a compiled Keras ``Sequential`` classifier.

    The network ends in a 128-unit dense layer followed by a 10-way softmax.
    The convolution / pooling layers are commented out, so the leading ReLU
    activations and the first dropout act directly on the input.
    """

    def __init__(self):
        """Build and compile the network; the result is stored in ``self.model``."""
        network = Sequential(
            [
                # Conv2D(filters=32, kernel_size=(3, 3), input_shape=(28, 28, 1)),
                Activation('relu'),
                # Conv2D(filters=64, kernel_size=(3, 3)),
                Activation('relu'),
                # MaxPooling2D(pool_size=(2, 2)),
                Dropout(0.25),
                Flatten(),
                Dense(128),
                Activation('relu'),
                Dropout(0.5),
                Dense(10),
                Activation('softmax'),
            ]
        )
        network.compile(
            loss='categorical_crossentropy',
            optimizer='adadelta',
            metrics=['accuracy'],
        )
        self.model = network

    def fit(self, x_train, y_train, x_valid, y_valid):
        """Train on the training split and predict the validation split.

        Batch size, epoch count and verbosity are taken from ``Config``.

        Args:
            x_train: Training inputs.
            y_train: Training targets.
            x_valid: Validation inputs, also used as Keras ``validation_data``.
            y_valid: Validation targets.

        Returns:
            Tuple ``(keras_model, valid_pred)``: the fitted underlying Keras
            model and its predictions for ``x_valid``.
        """
        network = self.model
        network.fit(
            x_train,
            y_train,
            validation_data=(x_valid, y_valid),
            epochs=Config.epochs,
            batch_size=Config.batch_size,
            verbose=Config.verbose,
        )

        # Predictions on the held-out part of this fold.
        return network, network.predict(x_valid)
        

## 学習

In [None]:
def learn():
    """Train one model per stratified CV fold and pickle each fitted model.

    Reads the module-level ``train_df``. For each of ``Config.n_folds`` folds a
    fresh ``Model`` is trained on the fold's training rows, and the fitted
    Keras model is written to
    ``Config.MODEL_DATA_PATH / fold{k}_seed{seed}_ver{VER}.pkl`` (``k`` is
    1-based). Nothing is returned.
    """
    # Slice the feature / target columns once instead of once per fold.
    features = train_df[Config.features()]
    labels = train_df[Config.target_col]

    # Pin the one-hot width so every fold encodes to the same number of
    # columns, even if a split happened to miss the highest class.
    num_classes = int(labels.max()) + 1

    # Create the output directory up front so the first dump cannot fail on it.
    Config.MODEL_DATA_PATH.mkdir(parents=True, exist_ok=True)

    kfold = StratifiedKFold(n_splits=Config.n_folds, shuffle=True, random_state=Config.seed)

    for fold, (train_index, valid_index) in enumerate(kfold.split(features, labels)):
        print("-" * 50)
        print(f"training fold {fold+1}")

        x_train = features.iloc[train_index].to_numpy()
        x_valid = features.iloc[valid_index].to_numpy()

        # x_train = x_train.reshape(-1, 28, 28, 1)
        # x_valid = x_valid.reshape(-1, 28, 28, 1)

        y_train = to_categorical(labels.iloc[train_index].to_numpy(), num_classes=num_classes)
        y_valid = to_categorical(labels.iloc[valid_index].to_numpy(), num_classes=num_classes)

        # Model.fit returns the underlying Keras model plus validation
        # predictions; the predictions are not used here.
        fitted_model, valid_pred = Model().fit(x_train, y_train, x_valid, y_valid)

        model_path = Config.MODEL_DATA_PATH / f"fold{fold + 1}_seed{Config.seed}_ver{Config.VER}.pkl"
        # Context manager guarantees the file is flushed and closed.
        with open(model_path, "wb") as model_file:
            pickle.dump(fitted_model, model_file)

        # Drop the per-fold arrays and model before the next fold allocates.
        del x_train, x_valid, y_train, y_valid, fitted_model, valid_pred
        gc.collect()

# Run the cross-validated training; one pickled model is written per fold.
learn()


## 予測

In [None]:
def predict():
    """Predict test-set labels with every saved fold model.

    Loads and preprocesses the test data, then for each of ``Config.n_folds``
    folds unpickles
    ``Config.MODEL_DATA_PATH / fold{k}_seed{seed}_ver{VER}.pkl`` (``k`` is
    1-based) and stores the arg-max class of its predictions.

    Returns:
        DataFrame with an ``ImageId`` column (1..number of test rows) followed
        by one ``fold_{i}`` column (``i`` is 0-based) of predicted labels per
        fold.
    """
    test_df = preprocessing(read_test_data(), [])
    x_test = test_df[Config.features()].to_numpy()

    print(x_test.shape)
    print(x_test)

    df = pd.DataFrame({'ImageId': range(1, len(x_test) + 1)})

    for fold in range(Config.n_folds):
        model_path = Config.MODEL_DATA_PATH / f"fold{fold + 1}_seed{Config.seed}_ver{Config.VER}.pkl"
        # NOTE: unpickling executes arbitrary code; only load model files that
        # were produced by this notebook's own training step.
        with open(model_path, "rb") as model_file:
            model = pickle.load(model_file)

        # Collapse the per-class scores to the most likely class per row.
        pred = np.argmax(model.predict(x_test), axis=1)
        df[f"fold_{fold}"] = pred

        print(pred.shape)
        print(pred)

    return df

# Per-fold test predictions: an ImageId column plus one label column per fold.
pred_df = predict()

In [None]:
def output_path():
    """Build a timestamped submission file path under ``Config.SUB_DATA_PATH``.

    The file name has the form
    ``{YYYYmmdd_HHMMSS}_seed{seed}_ver{VER}_{AUTHOR}_submission.csv``, with the
    timestamp taken from the current time in the Asia/Tokyo time zone.
    """
    timestamp = datetime.datetime.now(tz=ZoneInfo("Asia/Tokyo"))
    filename = (
        f"{timestamp:%Y%m%d_%H%M%S}"
        f"_seed{Config.seed}_ver{Config.VER}_{Config.AUTHOR}_submission.csv"
    )
    return Config.SUB_DATA_PATH / filename

# Ensemble the fold models by majority vote and write the submission file.
out_df = pred_df.copy()

# Select the per-fold columns by name rather than by a fixed position range,
# so the vote still covers every fold when Config.n_folds is changed.
fold_cols = [col for col in out_df.columns if str(col).startswith("fold_")]

# One vectorized mode over all rows; ties resolve to the smallest label,
# the same as a row-by-row scipy.stats.mode.
fold_votes = out_df[fold_cols].to_numpy()
out_df["Label"] = sp.stats.mode(fold_votes, axis=1, keepdims=True).mode[:, 0]

submission_path = output_path()
# Create the submission directory if this is the first run.
submission_path.parent.mkdir(parents=True, exist_ok=True)
out_df[["ImageId", "Label"]].to_csv(
    submission_path,
    header=True,
    index=False,
)