In [1]:
import os
import pandas as pd
from pandas.api.types import is_numeric_dtype
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split


INPUT_DIR = "../data"

In [2]:
def write_submission(id_list, pred, output_file):
    """予測結果を出力します."""
    df = pd.DataFrame({'id': id_list, 'default': pred})
    df.to_csv(output_file, index=False)

In [None]:
train_file = os.path.join(INPUT_DIR, 'train/train.csv')
target_name = 'default'

# 目的変数、説明変数を抽出します.
train_df = pd.read_csv(train_file)
targets = train_df[target_name]
train_df = train_df.drop(target_name, axis=1)
num_cols = [is_numeric_dtype(dtype) for dtype in train_df.dtypes]
train_features = train_df.loc[:, num_cols]
train_features = train_features.fillna(train_features.mean())
mms = MinMaxScaler()
train_features = mms.fit_transform(train_features)

In [6]:
print(train_df.shape)

(10000, 14)


In [6]:
print(train_df.head(30))

    id  loan_amnt       term  int_rate  installment grade  \
0    0    12000.0  36 months     11.53       395.89     B   
1    1    16000.0  60 months     14.65       377.71     F   
2    2     4000.0  36 months     10.99       130.94     A   
3    3     3125.0  36 months      9.93       100.74     F   
4    4    15000.0  60 months     15.99       364.70     F   
5    5    16000.0  60 months     16.29       391.56     C   
6    6    19750.0  36 months     11.99       655.89     B   
7    7    20000.0  60 months     15.61       482.23     B   
8    8     6000.0  36 months     11.22       197.06     C   
9    9    18000.0  36 months      7.89       563.15     B   
10  10    20000.0  36 months     10.15       646.76     C   
11  11     5000.0  36 months     13.11       168.74     B   
12  12    21000.0  36 months      6.62       644.78     A   
13  13     5000.0  36 months     11.44       164.74     C   
14  14    25000.0  60 months     11.49       549.69     D   
15  15    12000.0  36 mo

In [7]:
print(targets.head(30))

0     0
1     0
2     0
3     0
4     1
5     1
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    1
16    0
17    0
18    0
19    1
20    0
21    0
22    0
23    1
24    0
25    1
26    0
27    1
28    0
29    0
Name: default, dtype: int64


In [8]:
test_file = os.path.join(INPUT_DIR, 'test/test.csv')
test_df = pd.read_csv(test_file)

In [10]:
print(test_df.head(35))

       id  loan_amnt       term  int_rate  installment grade  \
0   10000    21000.0  36 months     10.91       686.62     B   
1   10001     3500.0  36 months     10.99       114.57     B   
2   10002    31575.0  36 months     13.35      1069.22     B   
3   10003     8000.0  36 months     12.99       269.52     B   
4   10004     6225.0  36 months     14.99       215.77     B   
5   10005    12000.0  60 months     14.98       285.36     F   
6   10006     3000.0  36 months      8.39        94.55     C   
7   10007     4800.0  36 months     14.08       164.24     B   
8   10008    25000.0  36 months      7.26       774.91     F   
9   10009    15000.0  36 months     11.99       498.15     D   
10  10010    10000.0  36 months     13.99       341.73     E   
11  10011    25200.0  60 months     19.47       660.24     B   
12  10012    32000.0  36 months     12.99      1078.06     F   
13  10013    30400.0  36 months     17.27      1087.94     D   
14  10014    14100.0  60 months     19.0

In [43]:
train_file = os.path.join(INPUT_DIR, 'train/train.csv')
target_name = 'default'

# 目的変数、説明変数を抽出します.
train_df = pd.read_csv(train_file)
targets = train_df[target_name]
train_df = train_df.drop(target_name, axis=1)
num_cols = [is_numeric_dtype(dtype) for dtype in train_df.dtypes]
train_features = train_df.loc[:, num_cols]
train_features = train_features.fillna(train_features.mean())
mms = MinMaxScaler()
train_features = mms.fit_transform(train_features)

# モデルの学習を行います.
X_train, X_test, y_train, y_test = train_test_split(
train_features, targets, test_size=0.2, random_state=42
)
clf = LogisticRegression()
scoring = 'accuracy'
params = {
'penalty': ['l1', 'l2'],
'C': [.01, 1.],
'solver': ['saga', 'liblinear'],
}
model = GridSearchCV(
clf,
params,
scoring=scoring,
cv=5,
)
model.fit(X=X_train, y=y_train)

# 予測を行います.
test_file = os.path.join(INPUT_DIR, 'test/test.csv')
test_df = pd.read_csv(test_file)
num_cols = [is_numeric_dtype(dtype) for dtype in test_df.dtypes]
test_features = test_df.loc[:, num_cols]
test_features = test_features.fillna(test_features.mean())
test_features = mms.transform(test_features)
pred = model.predict_proba(test_features)[:, 1]
# pred.sort()

# 結果を出力します.
output_file = os.path.join(INPUT_DIR, 'submission.csv')
write_submission(test_df['id'], pred, output_file)



In [44]:
train_file = os.path.join(INPUT_DIR, 'train/train.csv')
target_name = 'default'

# 目的変数、説明変数を抽出します.
train_df = pd.read_csv(train_file)
targets = train_df[target_name]
train_df = train_df.drop(target_name, axis=1)
num_cols = [is_numeric_dtype(dtype) for dtype in train_df.dtypes]
train_features = train_df.loc[:, num_cols]
train_features = train_features.fillna(train_features.mean())
mms = MinMaxScaler()
train_features = mms.fit_transform(train_features)

# モデルの学習を行います.
X_train, X_test, y_train, y_test = train_test_split(
train_features, targets, test_size=0.2, random_state=42
)
clf = LogisticRegression()
scoring = 'accuracy'
params = {
'penalty': ['l1', 'l2'],
'C': [.01, 1.],
'solver': ['saga', 'liblinear'],
}
model = GridSearchCV(
clf,
params,
scoring=scoring,
cv=5,
)
model.fit(X=X_train, y=y_train)

# 予測を行います.
test_file = os.path.join(INPUT_DIR, 'test/test.csv')
test_df = pd.read_csv(test_file)
num_cols = [is_numeric_dtype(dtype) for dtype in test_df.dtypes]
test_features = test_df.loc[:, num_cols]
test_features = test_features.fillna(test_features.mean())
test_features = mms.transform(test_features)
pred = model.predict_proba(test_features)[:, 1]
# pred.sort()

# 結果を出力します.
output_file = os.path.join(INPUT_DIR, 'submission.csv')
write_submission(test_df['id'], pred, output_file)



In [45]:
# %pip install lightgbm

In [46]:
# import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [47]:
train_file = os.path.join(INPUT_DIR, 'train/train.csv')
target_name = 'default'

# 目的変数、説明変数を抽出します.
train_df = pd.read_csv(train_file)
targets = train_df[target_name]
train_df = train_df.drop(target_name, axis=1)
num_cols = [is_numeric_dtype(dtype) for dtype in train_df.dtypes]
train_features = train_df.loc[:, num_cols]
train_features = train_features.fillna(train_features.mean())
mms = MinMaxScaler()
train_features = mms.fit_transform(train_features)

# モデルの学習を行います.
X_train, X_test, y_train, y_test = train_test_split(
train_features, targets, test_size=0.2, random_state=42
)
clf = LogisticRegression()
scoring = 'accuracy'
params = {
'penalty': ['l1', 'l2'],
'C': [.01, 1.],
'solver': ['saga', 'liblinear'],
}
model = GridSearchCV(
clf,
params,
scoring=scoring,
cv=5,
)
model.fit(X=X_train, y=y_train)

# 予測を行います.
test_file = os.path.join(INPUT_DIR, 'test/test.csv')
test_df = pd.read_csv(test_file)
num_cols = [is_numeric_dtype(dtype) for dtype in test_df.dtypes]
test_features = test_df.loc[:, num_cols]
test_features = test_features.fillna(test_features.mean())
test_features = mms.transform(test_features)
pred = model.predict_proba(test_features)[:, 1]
# pred.sort()

# 結果を出力します.
output_file = os.path.join(INPUT_DIR, 'submission.csv')
write_submission(test_df['id'], pred, output_file)



In [48]:
# %pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [49]:
from xgboost import XGBClassifier

train_file = os.path.join(INPUT_DIR, 'train/train.csv')
target_name = 'default'

# 目的変数、説明変数を抽出します.
train_df = pd.read_csv(train_file)
targets = train_df[target_name]
train_df = train_df.drop(target_name, axis=1)
num_cols = [is_numeric_dtype(dtype) for dtype in train_df.dtypes]
train_features = train_df.loc[:, num_cols]
train_features = train_features.fillna(train_features.mean())
mms = MinMaxScaler()
train_features = mms.fit_transform(train_features)

# モデルの学習を行います.
X_train, X_test, y_train, y_test = train_test_split(
train_features, targets, test_size=0.2, random_state=42
)
clf = LogisticRegression()
scoring = 'accuracy'
params = {
'penalty': ['l1', 'l2'],
'C': [.01, 1.],
'solver': ['saga', 'liblinear'],
}
# model = GridSearchCV(
# clf,
# params,
# scoring=scoring,
# cv=5,
# )
# model.fit(X=X_train, y=y_train)



# モデルの学習

# バリデーションデータでの予測
# pred = model.predict(X)

# 予測を行います.
test_file = os.path.join(INPUT_DIR, 'test/test.csv')
test_df = pd.read_csv(test_file)
num_cols = [is_numeric_dtype(dtype) for dtype in test_df.dtypes]
test_features = test_df.loc[:, num_cols]
test_features = test_features.fillna(test_features.mean())
test_features = mms.transform(test_features)
pred = model.predict_proba(test_features)[:, 1]
# pred.sort()

model = XGBClassifier()

# モデルの学習
model.fit(X_train, y_train)

# バリデーションデータでの予測
# pred = model.predict(X_test)

pred = model.predict_proba(test_features)[:, 1]

# 結果を出力します.
output_file = os.path.join(INPUT_DIR, 'submission.csv')
write_submission(test_df['id'], pred, output_file)

In [50]:
from xgboost import XGBClassifier

train_file = os.path.join(INPUT_DIR, 'train/train.csv')
target_name = 'default'

# 目的変数、説明変数を抽出します.
train_df = pd.read_csv(train_file)
print(train_df.head(1))
targets = train_df[target_name]
train_df = train_df.drop(target_name, axis=1)
num_cols = [is_numeric_dtype(dtype) for dtype in train_df.dtypes]
train_features = train_df.loc[:, num_cols]
train_features = train_features.fillna(train_features.mean())
mms = MinMaxScaler()
train_features = mms.fit_transform(train_features)

# モデルの学習を行います.
X_train, X_test, y_train, y_test = train_test_split(
train_features, targets, test_size=0.2, random_state=42
)
clf = XGBClassifier()
scoring = 'accuracy'
params = {
    # 'booster' : 'gbtree',
    # 'objective' : 'binary:logistic',
    # 'eta' : '0.1'
}
model = GridSearchCV(
clf,
params,
scoring=scoring,
cv=5,
)
model.fit(X=X_train, y=y_train)



# モデルの学習

# バリデーションデータでの予測
# pred = model.predict(X)

# 予測を行います.
test_file = os.path.join(INPUT_DIR, 'test/test.csv')
test_df = pd.read_csv(test_file)
num_cols = [is_numeric_dtype(dtype) for dtype in test_df.dtypes]
test_features = test_df.loc[:, num_cols]
test_features = test_features.fillna(test_features.mean())
test_features = mms.transform(test_features)
pred = model.predict_proba(test_features)[:, 1]
# pred.sort()

# model = XGBClassifier()

# # モデルの学習
# model.fit(X_train, y_train)

# バリデーションデータでの予測
# pred = model.predict(X_test)

# pred = model.predict_proba(test_features)[:, 1]

# 結果を出力します.
output_file = os.path.join(INPUT_DIR, 'submission.csv')
write_submission(test_df['id'], pred, output_file)

   id  loan_amnt       term  int_rate  installment grade  \
0   0    12000.0  36 months     11.53       395.89     B   

                  emp_title emp_length home_ownership  annual_inc  \
0  Sales Operations Manager   < 1 year           RENT     85000.0   

  verification_status  default             purpose               title  \
0     Source Verified        0  debt_consolidation  Debt consolidation   

  addr_state  
0         CO  


In [51]:
from xgboost import XGBClassifier

train_file = os.path.join(INPUT_DIR, 'train/train.csv')
target_name = 'default'

# 目的変数、説明変数を抽出します.
train_df = pd.read_csv(train_file)
print(train_df.head(1))
targets = train_df[target_name]
train_df = train_df.drop(target_name, axis=1)
num_cols = [is_numeric_dtype(dtype) for dtype in train_df.dtypes]
train_features = train_df.loc[:, num_cols]
train_features = train_features.fillna(train_features.mean())
mms = MinMaxScaler()
train_features = mms.fit_transform(train_features)

# モデルの学習を行います.
X_train, X_test, y_train, y_test = train_test_split(
train_features, targets, test_size=0.2, random_state=42
)
clf = XGBClassifier()
scoring = 'accuracy'
params = {
    # 'booster' : 'gbtree',
    # 'objective' : 'binary:logistic',
    # 'eta' : '0.1'
}
model = GridSearchCV(
clf,
params,
scoring=scoring,
cv=5,
)
model.fit(X=X_train, y=y_train)



# モデルの学習

# バリデーションデータでの予測
# pred = model.predict(X)

# 予測を行います.
test_file = os.path.join(INPUT_DIR, 'test/test.csv')
test_df = pd.read_csv(test_file)
num_cols = [is_numeric_dtype(dtype) for dtype in test_df.dtypes]
test_features = test_df.loc[:, num_cols]
test_features = test_features.fillna(test_features.mean())
test_features = mms.transform(test_features)
pred = model.predict_proba(test_features)[:, 1]

# 結果を出力します.
output_file = os.path.join(INPUT_DIR, 'submission.csv')
write_submission(test_df['id'], pred, output_file)

   id  loan_amnt       term  int_rate  installment grade  \
0   0    12000.0  36 months     11.53       395.89     B   

                  emp_title emp_length home_ownership  annual_inc  \
0  Sales Operations Manager   < 1 year           RENT     85000.0   

  verification_status  default             purpose               title  \
0     Source Verified        0  debt_consolidation  Debt consolidation   

  addr_state  
0         CO  


In [52]:
# import os
# import pandas as pd
# from pandas.api.types import is_numeric_dtype
# from sklearn.model_selection import train_test_split, GridSearchCV
# from sklearn.preprocessing import MinMaxScaler
# from xgboost import XGBClassifier

# INPUT_DIR = "../data"

# def write_submission(id_list, pred, output_file):
#     """予測結果を出力します."""
#     df = pd.DataFrame({'id': id_list, 'default': pred})
#     df.to_csv(output_file, index=False)

# # 学習データの読み込み
# train_file = os.path.join(INPUT_DIR, 'train/train.csv')
# target_name = 'default'

# train_df = pd.read_csv(train_file)
# targets = train_df[target_name]
# train_df = train_df.drop(columns=[target_name])

# train_df = pd.get_dummies(train_df)

# # 数値カラムのみ抽出
# num_cols = [col for col in train_df.columns if is_numeric_dtype(train_df[col])]
# train_features = train_df[num_cols].fillna(train_df[num_cols].mean())

# train_features = pd.get_dummies(train_features)

# # スケーリング
# mms = MinMaxScaler()
# train_features = mms.fit_transform(train_features)

# # 学習データと検証データに分割
# X_train, X_test, y_train, y_test = train_test_split(
#     train_features, targets, test_size=0.2, random_state=42
# )

# # XGBoostモデルの構築
# clf = XGBClassifier(
#     use_label_encoder=False,
#     eval_metric='logloss',
#     random_state=42
# )

# # ハイパーパラメータチューニング（必要なら）
# params = {
#     'max_depth': [3, 5],
#     'learning_rate': [0.05, 0.1],
#     'n_estimators': [100, 200],
#     'subsample': [0.8, 1.0],
#     'colsample_bytree': [0.8, 1.0]
# }

# model = GridSearchCV(
#     clf,
#     params,
#     scoring='roc_auc',
#     cv=5,
#     verbose=1,
#     n_jobs=-1
# )

# # モデルの学習
# model.fit(X_train, y_train)

# # テストデータの読み込みと前処理
# test_file = os.path.join(INPUT_DIR, 'test/test.csv')
# test_df = pd.read_csv(test_file)
# test_features = test_df[num_cols].fillna(test_df[num_cols].mean())
# test_features = mms.transform(test_features)

# # 予測（デフォルト確率）
# pred = model.predict_proba(test_features)[:, 1]

# # 提出ファイルの作成
# output_file = os.path.join(INPUT_DIR, 'submission.csv')
# write_submission(test_df['id'], pred, output_file)


In [53]:
print(test_df.head(2))

      id  loan_amnt       term  int_rate  installment grade  \
0  10000    21000.0  36 months     10.91       686.62     B   
1  10001     3500.0  36 months     10.99       114.57     B   

            emp_title emp_length home_ownership  annual_inc  \
0  Accounting Manager    8 years       MORTGAGE     64000.0   
1   Walt Disney World    5 years       MORTGAGE     50000.0   

  verification_status           purpose             title addr_state  
0        Not Verified  home_improvement  Home improvement         FL  
1            Verified  home_improvement      Home repairs         FL  


In [54]:
print(train_df.head(1))

   id  loan_amnt       term  int_rate  installment grade  \
0   0    12000.0  36 months     11.53       395.89     B   

                  emp_title emp_length home_ownership  annual_inc  \
0  Sales Operations Manager   < 1 year           RENT     85000.0   

  verification_status             purpose               title addr_state  
0     Source Verified  debt_consolidation  Debt consolidation         CO  


In [62]:
print(train_df['title'])

0            Debt consolidation
1            Debt consolidation
2                      GoodtoGo
3            Debt consolidation
4            Debt consolidation
                 ...           
9995         Debt consolidation
9996    Credit card refinancing
9997           Home improvement
9998         Debt consolidation
9999                      Other
Name: title, Length: 10000, dtype: object


In [63]:
print(train_df['emp_length'])

0        < 1 year
1          1 year
2       10+ years
3         3 years
4        < 1 year
          ...    
9995    10+ years
9996      2 years
9997          NaN
9998     < 1 year
9999      5 years
Name: emp_length, Length: 10000, dtype: object


In [64]:
print(train_df['grade'])

0       B
1       F
2       A
3       F
4       F
       ..
9995    D
9996    F
9997    F
9998    D
9999    A
Name: grade, Length: 10000, dtype: object


In [56]:
train_df = pd.read_csv(train_file)
print(train_df.head(1))

   id  loan_amnt       term  int_rate  installment grade  \
0   0    12000.0  36 months     11.53       395.89     B   

                  emp_title emp_length home_ownership  annual_inc  \
0  Sales Operations Manager   < 1 year           RENT     85000.0   

  verification_status  default             purpose               title  \
0     Source Verified        0  debt_consolidation  Debt consolidation   

  addr_state  
0         CO  


In [57]:
print(pred)

[0.02268165 0.00900773 0.22015148 ... 0.07095399 0.32646158 0.04816739]
