In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tpot import TPOTClassifier
from sklearn.metrics import f1_score

# データの読み込み
train = pd.read_csv('./source/train.csv', index_col=0)
test = pd.read_csv('./source/test.csv', index_col=0)
sample_submit = pd.read_csv('./source/sample_submission.csv', index_col=0, header=None)

from sklearn.preprocessing import LabelEncoder

# カテゴリ変数を数値化する関数
def encode_categorical_variables(df, categorical_variables):
    le = LabelEncoder()
    for col in categorical_variables:
        df[col] = le.fit_transform(df[col].astype(str))
    return df

# カテゴリ変数の要素一覧を表示
categorical_variables = ['curb_loc', 'steward', 'guards', 'sidewalk', 'user_type', 'boroname']

for col in categorical_variables:
    unique_elements = train[col].unique()
    print(f'{col}の要素: {unique_elements}')

# カテゴリ変数を数値化
train_encoded = encode_categorical_variables(train, categorical_variables)
test_encoded = encode_categorical_variables(test, categorical_variables)

print(train_encoded)
print(test_encoded)


curb_locの要素: ['OnCurb' 'OffsetFromCurb']
stewardの要素: [nan '3or4' '1or2' '4orMore']
guardsの要素: [nan 'Helpful' 'Harmful' 'Unsure']
sidewalkの要素: ['Damage' 'NoDamage']
user_typeの要素: ['Volunteer' 'NYC Parks Staff' 'TreesCount Staff']
boronameの要素: ['Queens' 'Bronx' 'Staten Island' 'Manhattan' 'Brooklyn']
       created_at  tree_dbh  curb_loc  health  steward  guards  sidewalk  \
0      2015-06-29        14         1       1        3       3         0   
1      2016-09-21         5         1       1        1       1         1   
2      2015-09-13        26         1       2        3       3         1   
3      2016-05-09        15         1       0        3       3         0   
4      2016-06-24        23         1       1        3       3         1   
...           ...       ...       ...     ...      ...     ...       ...   
19979  2016-07-15        19         1       2        3       3         0   
19980  2016-07-08         5         1       1        3       3         1   
19981  2015-08-2

In [2]:
# カテゴリカル変数の要素一覧を表示
categorical_variables = ['curb_loc', 'steward', 'guards', 'sidewalk', 'user_type',
                          'problems', 'spc_common', 'spc_latin', 'nta', 'nta_name',
                          'boroname', 'zip_city']

for col in categorical_variables:
    unique_elements = train[col].unique()
    print(f'{col}の要素: {unique_elements}')

curb_locの要素: ['OnCurb' 'OffsetFromCurb']
stewardの要素: [nan '3or4' '1or2' '4orMore']
guardsの要素: [nan 'Helpful' 'Harmful' 'Unsure']
sidewalkの要素: ['Damage' 'NoDamage']
user_typeの要素: ['Volunteer' 'NYC Parks Staff' 'TreesCount Staff']
problemsの要素: [nan 'StonesBranchLights' 'Stones' 'BranchLights' 'StonesTrunkOther'
 'StonesWiresRopeBranchLights' 'StonesWiresRope' 'WiresRope' 'MetalGrates'
 'RootOtherTrunkOtherBranchOther' 'StonesTrunkOtherBranchOther'
 'RootOther' 'TrunkOtherBranchLightsBranchOther'
 'MetalGratesRootOtherBranchOther'
 'StonesWiresRopeTrunkLightsBranchLights' 'MetalGratesRootOtherTrunkOther'
 'StonesMetalGrates' 'StonesRootOtherBranchOther' 'StonesRootOther'
 'BranchOther' 'TrunkOther' 'TrunkOtherBranchOther'
 'StonesTrunkOtherBranchLightsBranchOther'
 'StonesRootOtherWiresRopeTrunkOtherBranchLightsBranchOther'
 'StonesBranchOther' 'WiresRopeBranchLights'
 'RootOtherWiresRopeBranchOther' 'TrunkOtherBranchLights'
 'RootOtherWiresRopeTrunkOtherBranchOther' 'StonesRootOtherTrunk

In [4]:
from sklearn.preprocessing import LabelEncoder

# カテゴリ変数を数値化する関数
def encode_categorical_variables(df, categorical_variables):
    le = LabelEncoder()
    for col in categorical_variables:
        df[col] = le.fit_transform(df[col].astype(str))
    return df

# カテゴリ変数の要素一覧を表示
categorical_variables = ['curb_loc', 'steward', 'guards', 'sidewalk', 'user_type', 'boroname']

for col in categorical_variables:
    unique_elements = train[col].unique()
    print(f'{col}の要素: {unique_elements}')

# カテゴリ変数を数値化
train_encoded = encode_categorical_variables(train, categorical_variables)
test_encoded = encode_categorical_variables(test, categorical_variables)

print(train_encoded)
print(test_encoded)


curb_locの要素: [1 0]
stewardの要素: [3 1 0 2]
guardsの要素: [3 1 0 2]
sidewalkの要素: [0 1]
user_typeの要素: [2 0 1]
boronameの要素: [3 0 4 2 1]
       created_at  tree_dbh  curb_loc  health  steward  guards  sidewalk  \
0      2015-06-29        14         1       1        3       3         0   
1      2016-09-21         5         1       1        1       1         1   
2      2015-09-13        26         1       2        3       3         1   
3      2016-05-09        15         1       0        3       3         0   
4      2016-06-24        23         1       1        3       3         1   
...           ...       ...       ...     ...      ...     ...       ...   
19979  2016-07-15        19         1       2        3       3         0   
19980  2016-07-08         5         1       1        3       3         1   
19981  2015-08-20        21         1       0        3       3         0   
19982  2016-06-20         4         1       1        0       3         1   
19983  2015-08-19        31         

In [None]:
# 欠損値の処理
train.fillna('NULL', inplace=True)
test.fillna('NULL', inplace=True)

# problems列の処理
train['bool_problems'] = train['problems'].apply(lambda x: 0 if x=='NULL' else 1)
test['bool_problems'] = test['problems'].apply(lambda x: 0 if x=='NULL' else 1)

# 学習用データと検証用データの分割
train, valid = train_test_split(train, test_size=0.2, stratify=train['health'], random_state=82)

# 使用する特徴量の選択
select_cols = ['tree_dbh', 'curb_loc', 'sidewalk', 'steward', 'guards', 'user_type', 'bool_problems']

# 目的変数とそれ以外に学習用データを分割
x_train = train[select_cols]
y_train = train['health']
x_valid = valid[select_cols]
y_valid = valid['health']

# カテゴリのままでは学習できないのでワンホットエンコーディングで数値化
x_train = pd.get_dummies(x_train)
x_valid = pd.get_dummies(x_valid)
test = pd.get_dummies(test[select_cols])

# AutoML（TPOT）を使用
tpot = TPOTClassifier(generations=5, population_size=20, random_state=42, verbosity=2, n_jobs=-1)
tpot.fit(x_train, y_train)

# 検証データでの評価
valid_predictions = tpot.predict(x_valid)
valid_f1 = f1_score(y_valid, valid_predictions, average='macro')
print(f"Validation F1 Score (Macro): {valid_f1}")

# 予測
pred = tpot.predict(test)

# 予測結果の保存
sample_submit[1] = pred
sample_submit.to_csv('./submit/submit_automl_v1.csv', header=None)


In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tpot import TPOTClassifier
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder

# データの読み込み
train = pd.read_csv('./source/train.csv', index_col=0)
test = pd.read_csv('./source/test.csv', index_col=0)
sample_submit = pd.read_csv('./source/sample_submission.csv', index_col=0, header=None)

# 欠損値の処理
train.fillna('NULL', inplace=True)
test.fillna('NULL', inplace=True)

# problems列の処理
train['bool_problems'] = train['problems'].apply(lambda x: 0 if x == 'NULL' else 1)
test['bool_problems'] = test['problems'].apply(lambda x: 0 if x == 'NULL' else 1)

# カテゴリ変数を数値化する関数
def encode_categorical_variables(df, categorical_variables):
    le = LabelEncoder()
    for col in categorical_variables:
        df[col] = le.fit_transform(df[col].astype(str))
    return df

# カテゴリ変数の要素一覧を表示
categorical_variables = ['curb_loc', 'steward', 'guards', 'sidewalk', 'user_type', 'boroname']

for col in categorical_variables:
    unique_elements = train[col].unique()
    print(f'{col}の要素: {unique_elements}')

# カテゴリ変数を数値化
train_encoded = encode_categorical_variables(train, categorical_variables)
test_encoded = encode_categorical_variables(test, categorical_variables)

# 学習用データと検証用データの分割
train_encoded, valid = train_test_split(train_encoded, test_size=0.2, stratify=train_encoded['health'], random_state=82)

# 使用する特徴量の選択
select_cols = ['tree_dbh', 'curb_loc', 'sidewalk', 'steward', 'guards', 'user_type', 'bool_problems']

# 目的変数とそれ以外に学習用データを分割
x_train = train_encoded[select_cols]
y_train = train_encoded['health']
x_valid = valid[select_cols]
y_valid = valid['health']

# カテゴリのままでは学習できないのでワンホットエンコーディングで数値化
x_train = pd.get_dummies(x_train)
x_valid = pd.get_dummies(x_valid)
test = pd.get_dummies(test_encoded[select_cols])

# AutoML（TPOT）を使用
tpot = TPOTClassifier(generations=5, population_size=20, random_state=42, verbosity=2, n_jobs=-1)
tpot.fit(x_train, y_train)

# 検証データでの評価
valid_predictions = tpot.predict(x_valid)
valid_f1 = f1_score(y_valid, valid_predictions, average='macro')
print(f"Validation F1 Score (Macro): {valid_f1}")

# 予測
pred = tpot.predict(test)

# 予測結果の保存
sample_submit[1] = pred
sample_submit.to_csv('./submit/submit_automl_v2.csv', header=None)


curb_locの要素: ['OnCurb' 'OffsetFromCurb']
stewardの要素: ['NULL' '3or4' '1or2' '4orMore']
guardsの要素: ['NULL' 'Helpful' 'Harmful' 'Unsure']
sidewalkの要素: ['Damage' 'NoDamage']
user_typeの要素: ['Volunteer' 'NYC Parks Staff' 'TreesCount Staff']
boronameの要素: ['Queens' 'Bronx' 'Staten Island' 'Manhattan' 'Brooklyn']
                                                                              
Generation 1 - Current best internal CV score: 0.7882029607572608
                                                                             
Generation 2 - Current best internal CV score: 0.7882029607572608
                                                                             
Generation 3 - Current best internal CV score: 0.7882029607572608
                                                                             
Generation 4 - Current best internal CV score: 0.7882654802823865
                                                                              
Generation 5 - Current best internal C