In [15]:
# stratified-kfold for regression
import numpy as np 
import pandas as pd
from sklearn import datasets
from sklearn import model_selection

def create_folds(data):
    # we create a new column called kfold and fill it with -1 
    data["kfold"] = -1

    # the next step is to randomize the rows of the data
    data = data.sample(frac=1).reset_index(drop=True)

    # calculate the number of bins by Sturge's rule 
    # I take the floor of the value, you can also
    # just round it
    num_bins = int(np.floor(1 + np.log2(len(data))))

    # bin targets
    data.loc[:, "bins"] = pd.cut(
        data["target"], bins=num_bins, labels=False
    )

    # initiate the kfold class from model_selection module
    kf = model_selection.StratifiedKFold(n_splits=5)
    
    # fill the new kfold column
    # note that, instead of targets, we use bins!
    for f, (t_, v_) in enumerate(kf.split(X=data, y=data.bins.values)):
        data.loc[v_, 'kfold'] = f
    
    # drop the bins column
    data = data.drop("bins", axis=1) 
    # return dataframe with folds 
    return data

In [16]:
__name__

'__main__'

In [17]:
if __name__ == "__main__":
    # we create a sample dataset with 15000 samples 
    # and 100 features and 1 target
    X, y = datasets.make_regression(
        n_samples=15000, n_features=100, n_targets=1
    )

    # create a dataframe out of our numpy arrays
    df = pd.DataFrame( 
        X,
        columns=[f"f_{i}" for i in range(X.shape[1])]
    )
    df.loc[:, "target"] = y
    # create folds
    df = create_folds(df)

In [18]:
### confirm
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Columns: 102 entries, f_0 to kfold
dtypes: float64(101), int64(1)
memory usage: 11.7 MB


pandasで圧縮ファイルを読み込む

zipファイルをmacで作成するときの注意点  
・GUIから作成すると、.zipファイル内に、__MACOSX/ というフォルダが自動で作成されてしまう。  
http://shirura.jugem.jp/?eid=134  

なので、pandasで読み込もうとすると、複数ファイルありますエラーが発生してしまう。  

作成したzipファイルから、__MACOSX/　フォルダやファイルを削除すればよい。  
https://qiita.com/seyself/items/ae47f1c22a7375a736f3  

$ zip --delete FILENAME.zip "*.DS_Store" "*__MACOSX*"  

これでpandasで読み込むことができた。  

In [19]:
import pandas as pd

In [26]:
df = pd.read_csv('../input/mnist_train.csv.zip', compression='zip')

In [30]:
df.shape

(60000, 785)

In [32]:
df.head

<bound method NDFrame.head of        label  1x1  1x2  1x3  1x4  1x5  1x6  1x7  1x8  1x9  ...  28x19  28x20  \
0          5    0    0    0    0    0    0    0    0    0  ...      0      0   
1          0    0    0    0    0    0    0    0    0    0  ...      0      0   
2          4    0    0    0    0    0    0    0    0    0  ...      0      0   
3          1    0    0    0    0    0    0    0    0    0  ...      0      0   
4          9    0    0    0    0    0    0    0    0    0  ...      0      0   
...      ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...    ...    ...   
59995      8    0    0    0    0    0    0    0    0    0  ...      0      0   
59996      3    0    0    0    0    0    0    0    0    0  ...      0      0   
59997      5    0    0    0    0    0    0    0    0    0  ...      0      0   
59998      6    0    0    0    0    0    0    0    0    0  ...      0      0   
59999      8    0    0    0    0    0    0    0    0    0  ...      0      0   

       28