In [1]:
import pandas as pd

In [2]:
# マッピング用の事象
mapping = {
    "Freezing": 0,
    "Warm":1,
    "Cold":2,
    "Boiling Hot":3,
    "Hot":4,
    "Lava Hot":5
}

In [3]:
# CSVファイルの読み込み
df = pd.read_csv("./train.csv")

FileNotFoundError: [Errno 2] No such file or directory: './train.csv'

In [4]:
# マッピング辞書による置換
df.loc[:, "ord_2"] = df.ord_2.map(mapping)

In [5]:
df.ord_2.value_counts()

0.0    142726
1.0    124239
2.0     97822
3.0     84790
4.0     67508
5.0     64840
Name: ord_2, dtype: int64

In [3]:
from sklearn import preprocessing

In [7]:
# CSVファイルの読み込み
df = pd.read_csv("./train.csv")

# 欠損値をNONEの文字列に置換
df.loc[:, "ord_2"] = df.ord_2.fillna("NONE")


# 初期化
lbl_enc = preprocessing.LabelEncoder()

# 値を変換
df.loc[:, "ord_2"] = lbl_enc.fit_transform(df.ord_2.values)


In [4]:
# ord_2は名義変数であり、数値に順序性や大小関係はないので、「0-5」などの値をそのままモデルにインプットすることはできない
# そこで、二値変数に変換する
import numpy as np

# 列の特徴量を作成
example = np.array(
    [
        [0, 0, 1],
        [1, 0, 0],
        [1, 0, 1]
    ]
)

# バイト数の表示
print(example.nbytes)

72


In [5]:
# 二値変数をCSR方式の疎行列で表現
import numpy as np
from scipy import sparse

# 列の特徴量を作成
example = np.array(
    [
        [0, 0, 1],
        [1, 0, 0],
        [1, 0, 1]
    ]
)

# numpyの配列を疎行列に変換
sparse_example = sparse.csc_matrix(example)

# バイト数の表示
print(sparse_example.data.nbytes)

32


In [6]:
import numpy as np
from scipy import sparse

n_rows = 1000
n_cols = 10000

example = np.random.binomial(1, p=0.05, size=(n_rows, n_cols))

print(example.nbytes)

# numpyの配列を疎行列に変換
sparse_example = sparse.csc_matrix(example)

print(sparse_example.data.nbytes)


80000000
3998616


In [4]:
import numpy as np
from sklearn import preprocessing

example = np.random.randint(1000, size=10000)

# 初期化
ohe = preprocessing.OneHotEncoder(sparse=False)

# one hotベクトルに変換
ohe_example = ohe.fit_transform(example.reshape(-1, 1))

# 密結合のサイズを表示
print(ohe_example.nbytes)

# 再初期化
ohe_2 = preprocessing.OneHotEncoder(sparse=True)

# one hotベクトルに変換
ohe_example = ohe_2.fit_transform(example.reshape(-1, 1))

# 密結合のサイズを表示
print(ohe_example.data.nbytes)



80000000
80000


In [12]:
df["new_feature"]= (
    df.ord_1.astype(str)
    + "_"
    + df.ord_2.astype(str)
)


In [38]:
df.ord_2.value_counts()

Freezing       142726
Warm           124239
Cold            97822
Boiling Hot     84790
Hot             67508
Lava Hot        64840
Name: ord_2, dtype: int64

In [40]:
df.loc[:, "ord_2"] = df.ord_2.fillna("NONE")
df.ord_2.value_counts()

Freezing       142726
Warm           124239
Cold            97822
Boiling Hot     84790
Hot             67508
Lava Hot        64840
NONE            18075
Name: ord_2, dtype: int64

In [13]:
# 以下のように置換して補完を行う
# 欠損値："NONE"
# 数値：文字列
import numpy as np
from sklearn import preprocessing

# CSVファイルの読み込み
train = pd.read_csv("./train.csv")

# CSVファイルの読み込み
test = pd.read_csv("./test.csv")

# 評価用データセットに擬似列を追加
test.loc[:, "target"] = -1

#train-testの結合
data = pd.concat([train, test]).reset_index(drop=True)

# 特徴量として扱う列をリストに格納
# インデックスと目的変数の列は処理対象外
features = [x for x in train.columns if x not in ["id", "target"]]


# 個々の特徴量についてのループ
for feat in features:
    # 初期化
    lbl_enc = preprocessing.LabelEncoder()
    
    # 欠損値を文字列で補完し、全ての値を文字列に置換
    # 置換後の文字列をラベル＝ラベルIDとして使用する
    temp_col = data[feat].fillna("NONE").astype(str).values
    
    # サンプルデータの列をラベルに変換する
    data.loc[:, feat] = lbl_enc.fit_transform(temp_col)
    
# train/valに分割
train = data[data.target != -1].reset_index(drop=True)
test = data[data.target == -1].reset_index(drop=True)

In [14]:
# create_folds.py
# StratifiedKFoldを使ってデータセットを分割する
import pandas as pd
from sklearn import model_selection

if __name__ == "__main__":
    
    # サンプルのCSVファイルをpandasに読み込む
    df = pd.read_csv("./train.csv")
    
    # kfold列を追加して初期化
    df["kfold"] = -1

    # サンプルをシャッフル
    df = df.sample(frac=1).reset_index(drop=True)

    # 目的変数の取り出し
    y = df.target.values
    
    # Startified K Foldクラスの初期化
    kf = model_selection.StratifiedKFold(n_splits=5)


    # kfold列を埋める
    for fold, (t_, v_) in enumerate(kf.split(X=df, y=y)):
        df.loc[v_, 'kfold'] = fold
    
    # データセットを新しい名前で保存
    df.to_csv("./train_folds.csv", index=False)

In [6]:
df = pd.read_csv("./train_folds.csv")

In [9]:
df.kfold.value_counts()

0    120000
1    120000
2    120000
3    120000
4    120000
Name: kfold, dtype: int64

In [7]:
# ohe_logres.py
from sklearn import metrics
from sklearn import linear_model
from sklearn import preprocessing

def run(fold: int):
    # サンプルのCSVファイルをpandasに読み込む
    df = pd.read_csv("./train_folds.csv")
    
    # 
    features = [
        f for f in df.columns if f not in ("id", "target", "kfold")
    
    ]
    
    # 個々の特徴量についてのループ
    for col in features:
        # 欠損値を文字列で補完し、全ての値を文字列に置換
        # 置換後の文字列をラベル＝ラベルIDとして使用する
        df.loc[:, col] = df[col].fillna("NONE").astype(str).values
        
        # 引数と一致しないkfoldのデータを学習
        df_train = df[df.kfold != fold].reset_index(drop=True)
        
        # 引数と一致するkfoldのデータを検証
        df_valid = df[df.kfold == fold].reset_index(drop=True)
        
        # 初期化
        ohe = preprocessing.OneHotEncoder()
        
        
    #train-testの結合
    full_data = pd.concat([df_train[features], df_valid[features]],
                          axis=0
                         )
    #oheを学習
    ohe.fit(full_data[features])
    
    # 学習データセットを変換
    x_train = ohe.transform(df_train[features])
    
    # 検証データセットを変換
    x_valid = ohe.transform(df_valid[features])
        
    # モデルを初期化して学習
    model = linear_model.LogisticRegression()
    model.fit(x_train, df_train.target.values)
    
    #  検証データセットを予測
    valid_preds = model.predict_proba(x_valid)[:, 1]
    
    # AUCを算出
    auc = metrics.roc_auc_score(df_valid.target.values, valid_preds)
    
    print(auc)
    
if __name__== "__main__":
    for fold_ in range(5):
        run(fold_)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.7845043854397131


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.7873640539576067


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.788386046790077


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.7831034515610602


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.7872046311030512


In [8]:
!python3 -W ignore ohe_logres.py

0.7845043854397131
0.7873640539576067
0.788386046790077
0.7831034515610602
0.7872046311030512


In [12]:
# lbl_rf.py
from sklearn import ensemble
from sklearn import linear_model
from sklearn import preprocessing
import pandas as pd

def run(fold: int):
    # サンプルのCSVファイルをpandasに読み込む
    df = pd.read_csv("./train_folds.csv")
    
    # 
    features = [
        f for f in df.columns if f not in ("id", "target", "kfold")
    
    ]
    
    # 個々の特徴量についてのループ
    for col in features:
        # 欠損値を文字列で補完し、全ての値を文字列に置換
        # 置換後の文字列をラベル＝ラベルIDとして使用する
        df.loc[:, col] = df[col].fillna("NONE").astype(str).values
        
    # 特徴量のラベルエンコーディング
    for col in features:
        # 初期化
        lbl = preprocessing.LabelEncoder()
        
        # ラベルエンコーダの学習
        lbl.fit(df[col])
        
        # データセットの変換
        df.loc[:, col] = lbl.transform(df[col])
        
    # 引数と一致しないkfoldのデータを学習
    df_train = df[df.kfold != fold].reset_index(drop=True)

    # 引数と一致するkfoldのデータを検証
    df_valid = df[df.kfold == fold].reset_index(drop=True)
    
    # 学習データセットを準備
    x_train = df_train[features].values
    
    # 検証データセットを変換
    x_valid = df_valid[features].values
        
    # モデルの初期化と学習
    model = ensemble.RandomForestClassifier(n_jobs=1)
    model.fit(x_train, df_train.target.values)
    
    
    #  検証データセットを予測
    valid_preds = model.predict_proba(x_valid)[:, 1]
    
    # AUCを算出
    auc = metrics.roc_auc_score(df_valid.target.values, valid_preds)
    
    print(auc)
    
if __name__== "__main__":
    for fold_ in range(5):
        run(fold_)

0.7174618974147643


KeyboardInterrupt: 

In [5]:
!pip install xgboost



In [31]:
# lbl_xgb.py
import xgboost as xgb
from sklearn import preprocessing
import pandas as pd


def run(fold: int):
    # サンプルのCSVファイルをpandasに読み込む
    df = pd.read_csv("./train_folds.csv")
    
    # 
    features = [
        f for f in df.columns if f not in ("id", "target", "kfold")
    
    ]
    
    # 個々の特徴量についてのループ
    for col in features:
        # 欠損値を文字列で補完し、全ての値を文字列に置換
        # 置換後の文字列をラベル＝ラベルIDとして使用する
        df.loc[:, col] = df[col].fillna("NONE").astype(str).values
        
    # 特徴量のラベルエンコーディング
    for col in features:
        # 初期化
        lbl = preprocessing.LabelEncoder()
        
        # ラベルエンコーダの学習
        lbl.fit(df[col])
        
        # データセットの変換
        df.loc[:, col] = lbl.transform(df[col])
        
    # 引数と一致しないkfoldのデータを学習
    df_train = df[df.kfold != fold].reset_index(drop=True)

    # 引数と一致するkfoldのデータを検証
    df_valid = df[df.kfold == fold].reset_index(drop=True)
    
    # 学習データセットを準備
    x_train = df_train[features].values
    
    # 検証データセットを変換
    x_valid = df_valid[features].values
        
    # モデルの初期化と学習
    model = xgb.XGBClassifier(
        n_jobs=1,
        max_depth=7,
        n_estimators=200
        )
        
    model.fit(x_train, df_train.target.values)
    
    
    #  検証データセットを予測
    valid_preds = model.predict_proba(x_valid)[:, 1]
    
    # AUCを算出
    auc = metrics.roc_auc_score(df_valid.target.values, valid_preds)
    
    print(auc)
    
if __name__== "__main__":
    for fold_ in range(5):
        run(fold_)





KeyboardInterrupt: 

In [49]:
num_cols = [
    "age",
    "fnlwgt",
    "capital-gain",
    "capital-loss",
    "hours-per-week"
]
df = df.drop(num_cols, axis=1)


In [53]:
# 目的変数を0と1に置換
target_mapping = {
    "<=50K": 0,
    ">50K": 1
}
df["income"] = df.income.map(target_mapping)

In [64]:
df

Unnamed: 0,workclass,education,educational-num,marital-status,occupation,relationship,race,gender,native-country,income,kfold
0,Private,Some-college,10,Married-civ-spouse,Sales,Husband,White,Male,United-States,0,0
1,Private,HS-grad,9,Divorced,Exec-managerial,Unmarried,White,Female,United-States,0,0
2,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Prof-specialty,Other-relative,White,Female,United-States,1,0
3,Private,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,?,0,0
4,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Sales,Husband,White,Male,United-States,1,0
...,...,...,...,...,...,...,...,...,...,...,...
48837,Private,Some-college,10,Never-married,Handlers-cleaners,Own-child,White,Male,United-States,0,4
48838,Private,HS-grad,9,Widowed,Machine-op-inspct,Unmarried,White,Female,United-States,0,4
48839,Private,Some-college,10,Married-civ-spouse,Craft-repair,Husband,White,Male,United-States,0,4
48840,Private,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States,0,4


In [12]:
# ohe_logres.py
from sklearn import metrics
from sklearn import linear_model
from sklearn import preprocessing
import pandas as pd

def run(fold: int):
    # サンプルのCSVファイルをpandasに読み込む
    df = pd.read_csv("./adult_folds.csv")
    
    # 数値を含む列の削除
    num_cols = [
        "age",
        "fnlwgt",
        #"educational-num",
        "capital-gain",
        "capital-loss",
        "hours-per-week"
    ]
    df = df.drop(num_cols, axis=1)
    
    # 目的変数を0と1に置換
    target_mapping = {
        "<=50K": 0,
        ">50K": 1
    }
    df.loc[:, "income"] = df.income.map(target_mapping)
    
    features = [
        f for f in df.columns if f not in ("income", "kfold")
    
    ]
    
    # 個々の特徴量についてのループ
    for col in features:
        # 欠損値を文字列で補完し、全ての値を文字列に置換
        # 置換後の文字列をラベル＝ラベルIDとして使用する
        df.loc[:, col] = df[col].astype(str).fillna("NONE")
        
    # 引数と一致しないkfoldのデータを学習データにする
    df_train = df[df.kfold != fold].reset_index(drop=True)

    # 引数と一致するkfoldのデータを検証データにする
    df_valid = df[df.kfold == fold].reset_index(drop=True)
    
    # 初期化
    # 全ての項目をカテゴリ変数に変換する必要あり
    ohe = preprocessing.OneHotEncoder()
    
    #train-testの結合
    full_data = pd.concat([df_train[features], df_valid[features]],
                          axis=0
                         )

    #oheを学習
    ohe.fit(full_data[features])
    
    # 学習データセットを準備
    x_train = ohe.transform(df_train[features])
    
    # 検証データセットを変換
    x_valid = ohe.transform(df_valid[features])
        
    # モデルの初期化と学習
    model = linear_model.LogisticRegression()
    model.fit(x_train, df_train.income.values)
    
    
    #  検証データセットを予測
    valid_preds = model.predict_proba(x_valid)[:, 1]
    
    # AUCを算出
    auc = metrics.roc_auc_score(df_valid.income.values, valid_preds)
    
    print(auc)
    
if __name__== "__main__":
    for fold_ in range(5):
        run(fold_)

  (0, 2)	1.0
  (0, 21)	1.0
  (0, 30)	1.0
  (0, 43)	1.0
  (0, 58)	1.0
  (0, 63)	1.0
  (0, 73)	1.0
  (0, 75)	1.0
  (0, 115)	1.0
  (1, 4)	1.0
  (1, 18)	1.0
  (1, 29)	1.0
  (1, 43)	1.0
  (1, 52)	1.0
  (1, 68)	1.0
  (1, 71)	1.0
  (1, 74)	1.0
  (1, 115)	1.0
  (2, 2)	1.0
  (2, 21)	1.0
  (2, 30)	1.0
  (2, 43)	1.0
  (2, 52)	1.0
  (2, 63)	1.0
  (2, 73)	1.0
  :	:
  (39070, 26)	1.0
  (39070, 43)	1.0
  (39070, 51)	1.0
  (39070, 63)	1.0
  (39070, 73)	1.0
  (39070, 75)	1.0
  (39070, 115)	1.0
  (39071, 4)	1.0
  (39071, 18)	1.0
  (39071, 29)	1.0
  (39071, 43)	1.0
  (39071, 52)	1.0
  (39071, 63)	1.0
  (39071, 73)	1.0
  (39071, 75)	1.0
  (39071, 115)	1.0
  (39072, 6)	1.0
  (39072, 20)	1.0
  (39072, 40)	1.0
  (39072, 41)	1.0
  (39072, 60)	1.0
  (39072, 64)	1.0
  (39072, 73)	1.0
  (39072, 74)	1.0
  (39072, 115)	1.0


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.8810573385785094
  (0, 4)	1.0
  (0, 24)	1.0
  (0, 26)	1.0
  (0, 43)	1.0
  (0, 60)	1.0
  (0, 63)	1.0
  (0, 73)	1.0
  (0, 75)	1.0
  (0, 115)	1.0
  (1, 4)	1.0
  (1, 20)	1.0
  (1, 40)	1.0
  (1, 41)	1.0
  (1, 52)	1.0
  (1, 67)	1.0
  (1, 73)	1.0
  (1, 74)	1.0
  (1, 115)	1.0
  (2, 6)	1.0
  (2, 18)	1.0
  (2, 29)	1.0
  (2, 43)	1.0
  (2, 58)	1.0
  (2, 65)	1.0
  (2, 73)	1.0
  :	:
  (39070, 26)	1.0
  (39070, 43)	1.0
  (39070, 51)	1.0
  (39070, 63)	1.0
  (39070, 73)	1.0
  (39070, 75)	1.0
  (39070, 115)	1.0
  (39071, 4)	1.0
  (39071, 18)	1.0
  (39071, 29)	1.0
  (39071, 43)	1.0
  (39071, 52)	1.0
  (39071, 63)	1.0
  (39071, 73)	1.0
  (39071, 75)	1.0
  (39071, 115)	1.0
  (39072, 6)	1.0
  (39072, 20)	1.0
  (39072, 40)	1.0
  (39072, 41)	1.0
  (39072, 60)	1.0
  (39072, 64)	1.0
  (39072, 73)	1.0
  (39072, 74)	1.0
  (39072, 115)	1.0


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.8787897415849424
  (0, 4)	1.0
  (0, 24)	1.0
  (0, 26)	1.0
  (0, 43)	1.0
  (0, 60)	1.0
  (0, 63)	1.0
  (0, 73)	1.0
  (0, 75)	1.0
  (0, 115)	1.0
  (1, 4)	1.0
  (1, 20)	1.0
  (1, 40)	1.0
  (1, 41)	1.0
  (1, 52)	1.0
  (1, 67)	1.0
  (1, 73)	1.0
  (1, 74)	1.0
  (1, 115)	1.0
  (2, 6)	1.0
  (2, 18)	1.0
  (2, 29)	1.0
  (2, 43)	1.0
  (2, 58)	1.0
  (2, 65)	1.0
  (2, 73)	1.0
  :	:
  (39071, 26)	1.0
  (39071, 43)	1.0
  (39071, 51)	1.0
  (39071, 63)	1.0
  (39071, 73)	1.0
  (39071, 75)	1.0
  (39071, 115)	1.0
  (39072, 4)	1.0
  (39072, 18)	1.0
  (39072, 29)	1.0
  (39072, 43)	1.0
  (39072, 52)	1.0
  (39072, 63)	1.0
  (39072, 73)	1.0
  (39072, 75)	1.0
  (39072, 115)	1.0
  (39073, 6)	1.0
  (39073, 20)	1.0
  (39073, 40)	1.0
  (39073, 41)	1.0
  (39073, 60)	1.0
  (39073, 64)	1.0
  (39073, 73)	1.0
  (39073, 74)	1.0
  (39073, 115)	1.0


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.8751725401579281
  (0, 4)	1.0
  (0, 24)	1.0
  (0, 26)	1.0
  (0, 43)	1.0
  (0, 60)	1.0
  (0, 63)	1.0
  (0, 73)	1.0
  (0, 75)	1.0
  (0, 115)	1.0
  (1, 4)	1.0
  (1, 20)	1.0
  (1, 40)	1.0
  (1, 41)	1.0
  (1, 52)	1.0
  (1, 67)	1.0
  (1, 73)	1.0
  (1, 74)	1.0
  (1, 115)	1.0
  (2, 6)	1.0
  (2, 18)	1.0
  (2, 29)	1.0
  (2, 43)	1.0
  (2, 58)	1.0
  (2, 65)	1.0
  (2, 73)	1.0
  :	:
  (39071, 26)	1.0
  (39071, 43)	1.0
  (39071, 51)	1.0
  (39071, 63)	1.0
  (39071, 73)	1.0
  (39071, 75)	1.0
  (39071, 115)	1.0
  (39072, 4)	1.0
  (39072, 18)	1.0
  (39072, 29)	1.0
  (39072, 43)	1.0
  (39072, 52)	1.0
  (39072, 63)	1.0
  (39072, 73)	1.0
  (39072, 75)	1.0
  (39072, 115)	1.0
  (39073, 6)	1.0
  (39073, 20)	1.0
  (39073, 40)	1.0
  (39073, 41)	1.0
  (39073, 60)	1.0
  (39073, 64)	1.0
  (39073, 73)	1.0
  (39073, 74)	1.0
  (39073, 115)	1.0


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.8789720657549095
  (0, 4)	1.0
  (0, 24)	1.0
  (0, 26)	1.0
  (0, 43)	1.0
  (0, 60)	1.0
  (0, 63)	1.0
  (0, 73)	1.0
  (0, 75)	1.0
  (0, 115)	1.0
  (1, 4)	1.0
  (1, 20)	1.0
  (1, 40)	1.0
  (1, 41)	1.0
  (1, 52)	1.0
  (1, 67)	1.0
  (1, 73)	1.0
  (1, 74)	1.0
  (1, 115)	1.0
  (2, 6)	1.0
  (2, 18)	1.0
  (2, 29)	1.0
  (2, 43)	1.0
  (2, 58)	1.0
  (2, 65)	1.0
  (2, 73)	1.0
  :	:
  (39071, 40)	1.0
  (39071, 46)	1.0
  (39071, 56)	1.0
  (39071, 67)	1.0
  (39071, 73)	1.0
  (39071, 74)	1.0
  (39071, 115)	1.0
  (39072, 4)	1.0
  (39072, 18)	1.0
  (39072, 29)	1.0
  (39072, 43)	1.0
  (39072, 60)	1.0
  (39072, 63)	1.0
  (39072, 73)	1.0
  (39072, 75)	1.0
  (39072, 115)	1.0
  (39073, 4)	1.0
  (39073, 9)	1.0
  (39073, 37)	1.0
  (39073, 41)	1.0
  (39073, 51)	1.0
  (39073, 66)	1.0
  (39073, 73)	1.0
  (39073, 75)	1.0
  (39073, 115)	1.0
0.8817458947808354


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [15]:
# lbl_xgb.py
from sklearn import metrics
from sklearn import preprocessing
import pandas as pd
import xgboost as xgb

def run(fold: int):
    # サンプルのCSVファイルをpandasに読み込む
    df = pd.read_csv("./adult_folds.csv")
    
    # 数値を含む列の削除
    num_cols = [
        "age",
        "fnlwgt",
        #"educational-num",
        "capital-gain",
        "capital-loss",
        "hours-per-week"
    ]
    df = df.drop(num_cols, axis=1)
    
    # 目的変数を0と1に置換
    target_mapping = {
        "<=50K": 0,
        ">50K": 1
    }
    df.loc[:, "income"] = df.income.map(target_mapping)
    
    features = [
        f for f in df.columns if f not in ("income", "kfold")
    
    ]
    
    # 特徴量のエンコーディング
    for col in features:
        # 欠損値を文字列で補完し、全ての値を文字列に置換
        # 置換後の文字列をラベル＝ラベルIDとして使用する
        df.loc[:, col] = df[col].astype(str).fillna("NONE")

    # 特徴量のラベルエンコーディング
    for col in features:
        # 初期化
        lbl = preprocessing.LabelEncoder()
        
        # ラベルエンコーダの学習
        lbl.fit(df[col])
        
        # データセットの変換
        df.loc[:, col] = lbl.transform(df[col])
        
        
    # 引数と一致しないkfoldのデータを学習データにする
    df_train = df[df.kfold != fold].reset_index(drop=True)

    # 引数と一致するkfoldのデータを検証データにする
    df_valid = df[df.kfold == fold].reset_index(drop=True)
    
    
    # 学習データセットを準備
    x_train = df_train[features].values
    
    # 検証データセットを変換
    x_valid = df_valid[features].values
        
    # モデルの初期化と学習
    model = xgb.XGBClassifier(
        n_jobs=-1
    )
    model.fit(x_train, df_train.income.values)
    
    
    #  検証データセットを予測
    valid_preds = model.predict_proba(x_valid)[:, 1]
    
    # AUCを算出
    auc = metrics.roc_auc_score(df_valid.income.values, valid_preds)
    
    print(auc)
    
if __name__== "__main__":
    for fold_ in range(5):
        run(fold_)



0.8773120751978942




0.8743154155383793




0.8732755269460349




0.8767377603232294




0.8788226955426812


In [16]:
# lbl_xgb_num.py
from sklearn import metrics
from sklearn import preprocessing
import pandas as pd
import xgboost as xgb

def run(fold: int):
    # サンプルのCSVファイルをpandasに読み込む
    df = pd.read_csv("./adult_folds.csv")
    
    # 数値を含む列の削除
    num_cols = [
        "age",
        "fnlwgt",
        #"educational-num",
        "capital-gain",
        "capital-loss",
        "hours-per-week"
    ]
    df = df.drop(num_cols, axis=1)
    
    # 目的変数を0と1に置換
    target_mapping = {
        "<=50K": 0,
        ">50K": 1
    }
    df.loc[:, "income"] = df.income.map(target_mapping)
    
    features = [
        f for f in df.columns if f not in ("income", "kfold")
    
    ]
    
    # 特徴量のエンコーディング
    for col in features:
        # 欠損値を文字列で補完し、全ての値を文字列に置換
        # 置換後の文字列をラベル＝ラベルIDとして使用する
        if col not in num_cols:
            df.loc[:, col] = df[col].astype(str).fillna("NONE")

    # 特徴量のラベルエンコーディング
    for col in features:
        if col not in num_cols:
            # 初期化
            lbl = preprocessing.LabelEncoder()

            # ラベルエンコーダの学習
            lbl.fit(df[col])

            # データセットの変換
            df.loc[:, col] = lbl.transform(df[col])
        
        
    # 引数と一致しないkfoldのデータを学習データにする
    df_train = df[df.kfold != fold].reset_index(drop=True)

    # 引数と一致するkfoldのデータを検証データにする
    df_valid = df[df.kfold == fold].reset_index(drop=True)
    
    
    # 学習データセットを準備
    x_train = df_train[features].values
    
    # 検証データセットを変換
    x_valid = df_valid[features].values
        
    # モデルの初期化と学習
    model = xgb.XGBClassifier(
        n_jobs=-1
    )
    model.fit(x_train, df_train.income.values)
    
    
    #  検証データセットを予測
    valid_preds = model.predict_proba(x_valid)[:, 1]
    
    # AUCを算出
    auc = metrics.roc_auc_score(df_valid.income.values, valid_preds)
    
    print(auc)
    
if __name__== "__main__":
    for fold_ in range(5):
        run(fold_)



0.8773120751978942




0.8743154155383793




0.8732755269460349




0.8767377603232294




0.8788226955426812


In [17]:
# lbl_xgb_num_feat.py

from sklearn import metrics
from sklearn import preprocessing
import pandas as pd
import xgboost as xgb
import itertools

def feature_engineering(df, cat_cols):
    # リスト内の全ての2値の組みを生成
    combi = list(itertools.combinations(cat_cols, 2))
    for c1, c2 in combi:
        df.loc[:, c1 + "_" + c2] = df[c1].astype(str) + "_" + df[c2].astype(str)
        
    return df
        

def run(fold: int):
    # サンプルのCSVファイルをpandasに読み込む
    df = pd.read_csv("./adult_folds.csv")
    
    # 数値を含む列の削除
    num_cols = [
        "age",
        "fnlwgt",
        #"educational-num",
        "capital-gain",
        "capital-loss",
        "hours-per-week"
    ]
    df = df.drop(num_cols, axis=1)
    
    # 目的変数を0と1に置換
    target_mapping = {
        "<=50K": 0,
        ">50K": 1
    }
    df.loc[:, "income"] = df.income.map(target_mapping)
    
    # 質的変数の列
    cat_cols = [
        c for c in df.columns if c not in num_cols
        and c not in ("income", "kfold")
    ]
    
    # 新しい質的変数を特徴量に追加
    df = feature_engineering(df, cat_cols)
    
    features = [
        f for f in df.columns if f not in ("income", "kfold")
    ]
    
    
    # 特徴量のエンコーディング
    for col in features:
        # 欠損値を文字列で補完し、全ての値を文字列に置換
        # 置換後の文字列をラベル＝ラベルIDとして使用する
        if col not in num_cols:
            df.loc[:, col] = df[col].astype(str).fillna("NONE")

    # 特徴量のラベルエンコーディング
    for col in features:
        if col not in num_cols:
            # 初期化
            lbl = preprocessing.LabelEncoder()

            # ラベルエンコーダの学習
            lbl.fit(df[col])

            # データセットの変換
            df.loc[:, col] = lbl.transform(df[col])
        
        
    # 引数と一致しないkfoldのデータを学習データにする
    df_train = df[df.kfold != fold].reset_index(drop=True)

    # 引数と一致するkfoldのデータを検証データにする
    df_valid = df[df.kfold == fold].reset_index(drop=True)
    
    
    # 学習データセットを準備
    x_train = df_train[features].values
    
    # 検証データセットを変換
    x_valid = df_valid[features].values
        
    # モデルの初期化と学習
    model = xgb.XGBClassifier(
        n_jobs=-1
    )
    model.fit(x_train, df_train.income.values)
    
    
    #  検証データセットを予測
    valid_preds = model.predict_proba(x_valid)[:, 1]
    
    # AUCを算出
    auc = metrics.roc_auc_score(df_valid.income.values, valid_preds)
    
    print(auc)
    
if __name__== "__main__":
    for fold_ in range(5):
        run(fold_)



0.8766071006956615




0.8712445344042867




0.8722561645011729




0.876179147975956




0.8776732819704799


In [11]:
# target_encoding.py

from sklearn import metrics
from sklearn import preprocessing
import pandas as pd
import xgboost as xgb
import itertools
import copy 


def mean_target_encoding(data):
    
    # データセットのコピー
    df = copy.deepcopy(data)
    
    # 数値を含む列
    num_cols = [
        "age",
        "fnlwgt",
        #"educational-num",
        "capital-gain",
        "capital-loss",
        "hours-per-week"
    ]  

    # 目的変数を0と1に置換
    target_mapping = {
        "<=50K": 0,
        ">50K": 1
    }
    df.loc[:, "income"] = df.income.map(target_mapping)  
    
    # 目的変数とkfold列を除いて特徴量を作成
    features = [
        f for f in df.columns if f not in ("income", "kfold")
        and f not in num_cols
    ]
    
    # 特徴量のエンコーディング
    for col in features:
        # 欠損値を文字列で補完し、全ての値を文字列に置換
        # 置換後の文字列をラベル＝ラベルIDとして使用する
        if col not in num_cols:
            df.loc[:, col] = df[col].astype(str).fillna("NONE")

    # 特徴量のラベルエンコーディング
    for col in features:
        if col not in num_cols:
            # 初期化
            lbl = preprocessing.LabelEncoder()

            # ラベルエンコーダの学習
            lbl.fit(df[col])

            # データセットの変換
            df.loc[:, col] = lbl.transform(df[col]) 
    
    # 検証データセット格納リスト
    encoded_dfs = []

    
    
    # 全ての分割についてループ
    for fold in range(5):
        # データセットの準備
        df_train = df[df.kfold != fold].reset_index(drop=True)
        df_valid = df[df.kfold == fold].reset_index(drop=True)
        
        # 全ての特徴量についてループ
        for column in features:
            # カテゴリごとの目的変数の平均についての辞書を作成
            mapping_dict = dict(
                df_train.groupby(column)["income"].mean()
            )
            # 列の末尾にencをつけた名前で新列を作成
            df_valid.loc[
                :, column + "_enc"
            ] = df_valid[column].map(mapping_dict)
        
        # リストに格納
        encoded_dfs.append(df_valid)
        
    # 結合したデータセットを返す
    encoded_dfs = pd.concat(encoded_dfs, axis=0)
    return encoded_dfs
    
    

def run(df, fold):
    # 引数と一致しないkfoldのデータを学習データにする
    df_train = df[df.kfold != fold].reset_index(drop=True)

    # 引数と一致するkfoldのデータを検証データにする
    df_valid = df[df.kfold == fold].reset_index(drop=True)
    
    # 目的変数とkfold列を除いて特徴量を作成
    features = [
        f for f in df.columns if f not in ("income", "kfold")
    ]
    
    
    # 学習データセットを準備
    x_train = df_train[features].values
    
    # 検証データセットを変換
    x_valid = df_valid[features].values
        
    # モデルの初期化と学習
    model = xgb.XGBClassifier(
        n_jobs=-1
        ,max_depth=7
    )
    model.fit(x_train, df_train.income.values)
    
    #  検証データセットを予測
    valid_preds = model.predict_proba(x_valid)[:, 1]
    
    # AUCを算出
    auc = metrics.roc_auc_score(df_valid.income.values, valid_preds)
    
    print(auc)
    
if __name__== "__main__":
    # データの読み込み
    df = pd.read_csv("./adult_folds.csv")
    
    #エンコーディングの実行
    df = mean_target_encoding(df)
    
    # 各分割でモデル実行
    for fold_ in range(5):
        run(df, fold_)



0.9304215261730993




0.9247271360733174




0.9232791633102996




0.9238410866780831




0.929210237537218


In [10]:
type(df)

list