In [17]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.linear_model import RidgeClassifier, LogisticRegression, LinearRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score

In [18]:
# 讀檔
submission = pd.read_csv("sample_submission.csv")
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# 把 id, product_code, failure等, training用不到的column去掉
train_x = train.drop(columns=["id","product_code", "failure"])
test_x = test.drop(columns=["id","product_code"])

# 把 failure從字串轉成數字
train_y = train.iloc[:, -1]

# 把 attribute的 "material_{number}" 轉成 "{number}"
for i in range(len(train_x)):
    train_x.iat[i, 1] = train_x.iat[i, 1].split('_')[-1]
    train_x.iat[i, 2] = train_x.iat[i, 2].split('_')[-1]

for i in range(len(test_x)):
    test_x.iat[i, 1] = test_x.iat[i, 1].split('_')[-1]
    test_x.iat[i, 2] = test_x.iat[i, 2].split('_')[-1]


# 因為資料有10%的空缺，空缺的部分用 median的方式填補
imp = SimpleImputer(missing_values=np.nan, strategy='median')


train_x = train_x.astype({'attribute_0':'float', 'attribute_1':'float', 'attribute_2':'float', 'attribute_3':'float', 'measurement_0':'float', 'measurement_1':'float', 'measurement_2':'float'})
train_imp = imp.fit(train_x)
train_x = train_imp.transform(train_x)

test_x = test_x.astype({'attribute_0':'float', 'attribute_1':'float', 'attribute_2':'float', 'attribute_3':'float', 'measurement_0':'float', 'measurement_1':'float', 'measurement_2':'float'})
test_imp = imp.fit(test_x)
test_x = test_imp.transform(test_x)

train_y = train_y.astype({'failure':'float'})

In [19]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(train_x, train_y, test_size=0.2, random_state=42)


測試sklearn中不同的Classifier

In [20]:
RandomForest = RandomForestClassifier()
Linear = LinearRegression()
Logistic = LogisticRegression()
ExtraTrees = ExtraTreesClassifier()
GradientBoosting = GradientBoostingClassifier()
RidgeClassifier = RidgeClassifier()
KNeighbors = KNeighborsClassifier()

Logistic.fit(X_train, y_train)
Linear.fit(X_train, y_train)
ExtraTrees.fit(X_train, y_train)
RandomForest.fit(X_train, y_train)
GradientBoosting.fit(X_train, y_train)
RidgeClassifier.fit(X_train, y_train)
KNeighbors.fit(X_train, y_train)

pred_Logistic_train = Logistic.predict_proba(X_train)[:, 1]
pred_Linear_train = Linear.predict(X_train)
pred_ExtraTrees_train = ExtraTrees.predict_proba(X_train)[:, 1]
pred_RandomForest_train = RandomForest.predict_proba(X_train)[:, 1]
pred_GradientBoosting_train = GradientBoosting.predict_proba(X_train)[:, 1]
pred_RidgeClassifier_train = RidgeClassifier.predict(X_train)
pred_KNeighbors_train = KNeighbors.predict_proba(X_train)[:, 1]

pred_Logistic_val = Logistic.predict_proba(X_val)[:,1]
pred_Linear_val = Linear.predict(X_val)
pred_ExtraTrees_val = ExtraTrees.predict_proba(X_val)[:,1]
pred_RandomForest_val = RandomForest.predict_proba(X_val)[:,1]
pred_GradientBoosting_val = GradientBoosting.predict_proba(X_val)[:,1]
pred_RidgeClassifier_val = RidgeClassifier.predict(X_val)
pred_KNeighbors_val = KNeighbors.predict_proba(X_val)[:, 1]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



training data 切成 train data -> 0.8, val data -> 0.2
下面輸出的結果有從高到低排序，效果最好的為LogisticRegression，但也僅到baseline


In [21]:
print("1.%25s, train/val = %.5f/%.5f" % ("LogisticRegression", roc_auc_score(y_train, pred_Logistic_train), roc_auc_score(y_val, pred_Logistic_val)))
print("2.%25s, train/val = %.5f/%.5f" % ("LinearRegression", roc_auc_score(y_train, pred_Linear_train), roc_auc_score(y_val, pred_Linear_val)))
print("3.%25s, train/val = %.5f/%.5f" % ("GradientBoosting", roc_auc_score(y_train, pred_GradientBoosting_train), roc_auc_score(y_val, pred_GradientBoosting_val)))
print("4.%25s, train/val = %.5f/%.5f" % ("ExtraTreesClassifier", roc_auc_score(y_train, pred_ExtraTrees_train), roc_auc_score(y_val, pred_ExtraTrees_val)))
print("5.%25s, train/val = %.5f/%.5f" % ("RandomForestClassifier", roc_auc_score(y_train, pred_RandomForest_train), roc_auc_score(y_val, pred_RandomForest_val)))
print("6.%25s, train/val = %.5f/%.5f" % ("KNeighbors", roc_auc_score(y_train, pred_KNeighbors_train), roc_auc_score(y_val, pred_KNeighbors_val)))
print("7.%25s, train/val = %.5f/%.5f" % ("RidgeClassifier", roc_auc_score(y_train, pred_RidgeClassifier_train), roc_auc_score(y_val, pred_RidgeClassifier_val)))

1.       LogisticRegression, train/val = 0.59230/0.59399
2.         LinearRegression, train/val = 0.59386/0.59279
3.         GradientBoosting, train/val = 0.65984/0.59001
4.     ExtraTreesClassifier, train/val = 1.00000/0.55225
5.   RandomForestClassifier, train/val = 1.00000/0.54981
6.               KNeighbors, train/val = 0.80556/0.52849
7.          RidgeClassifier, train/val = 0.50015/0.50012
