In [1]:
import baseline

In [2]:
X, y = baseline.baselineXy()

In [3]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.preprocessing import QuantileTransformer, KBinsDiscretizer, Normalizer, PowerTransformer, RobustScaler, SplineTransformer, StandardScaler, MinMaxScaler
td = {
 'sdoif_mean_id': QuantileTransformer,
 'sdoif_mean_sq': RobustScaler,
 'elevation_mean_sqrt': QuantileTransformer,
 'procurv_mean_id': StandardScaler,
 'placurv_mean_id': Normalizer,
 'lsfactor_mean_id': Normalizer,
 'slope_mean_id': RobustScaler,
 'twi_mean_id': Normalizer,
 'twi_mean_sqrt': QuantileTransformer,
 'aspect_mean_id': QuantileTransformer
}

In [5]:
for key, t in td.items():
    X[key] = t().fit_transform(X[[key]])
X = X.to_numpy()

In [4]:
groups = [
    [1, 2, 6, 7],
    [3, 8],
    [4, 5, 9, 10],
    [11, 12],
    [13],
    [14, 15],
    [16, 17, 21, 22],
    [18, 23],
    [19, 20, 24, 25],
    list(range(1, 26))
]

def get_group_mean(df, col, group):
    cols = [f"{i}_{col}" for i in group]
    return df[cols].to_numpy().mean(axis=1)

In [5]:
import baseline
X, y = baseline.originalXy()
get_group_mean(X, "slope", groups[0])

array([37.73571  , 31.7261425, 49.1132025, ..., 39.9281625, 35.24104  ,
       20.02481  ])

In [25]:
import pandas as pd
import numpy as np

def preprocess(X):
    def normalize(x):
        return (x - x.min()) / (x.max() - x.min())

    def special_normalize(x):
        return ((x - x.min()) / (x.max() - x.min()) + 0.01) / 1.01

    fns = [(np.array, 'id'), (np.sqrt, 'sqrt'), (np.square, 'sq'), (np.log, 'log')]
    print(len(baseline.continuous) * len(groups) * len(fns))
    cols = []
    col_names = []
    for col in baseline.continuous:
        for i, group in enumerate(groups):
            for fn, name in fns:
                this = f"{col}_g{i}_{name}"
                if name == 'log':
                    cols.append(pd.Series(fn(special_normalize(get_group_mean(X, col, group))), name=this, dtype=float))
                else:
                    cols.append(pd.Series(fn(normalize(get_group_mean(X, col, group))), name=this, dtype=float))
                col_names.append(this)
    
    for col in X.columns:
        cols.append(X[col])

    df = pd.concat(cols, axis=1)

    import dfcols
    def most_freq(df, col):
        matrix = df[dfcols.all_square_cols(col)].to_numpy()
        return np.array(list(map(np.argmax, map(np.bincount, matrix))))

    cat = "geology"
    # get most freq category
    df[cat] = most_freq(X, cat)

    for cat_val in df[cat].unique():
        df[f"{cat}_{cat_val}"] = np.array(df[cat] == cat_val, dtype=int)

    return df
grid_groups = preprocess(X)

320


In [12]:
from finn.feat_trans import test_feats
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier
from catboost import CatBoostClassifier
from sklearn.svm import SVC
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
from imblearn.over_sampling import SMOTE
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

In [13]:
import lightgbm as lgb

In [14]:
def lgb_test(x, y, **kwargs):
    kf = KFold(n_splits=3)
    f1m = 0
    for train_ind, test_ind in kf.split(x):
        X_train, X_test = x[train_ind], x[test_ind]
        y_train, y_test = y[train_ind], y[test_ind]
        X_train, y_train = SMOTE().fit_resample(X_train, y_train)
        X_train, X_test = StandardScaler().fit_transform(X_train), StandardScaler().fit_transform(X_test)
        ds = lgb.Dataset(X_train, label=y_train)
        bst = lgb.train(kwargs, ds)
        y_pred = bst.predict(X_test)
        y_pred = np.array(y_pred > 0.5, dtype=int)
        f1 = f1_score(y_test, y_pred)
        f1m += f1 / 3
    return f1m

In [15]:
import featuretools as ft

In [27]:
X, y = baseline.originalXy()
X = preprocess(X)
X, y = SMOTE().fit_resample(X, y)
X['geology'] = X['geology'].map({
        1: "Weathered Cretaceous granitic rocks",
        2: "Weathered Jurassic granite rocks",
        3: "Weathered Jurassic tuff and lava",
        4: "Weathered Cretaceous tuff and lava",
        5: "Quaternary deposits",
        6: "Fill",
        7: "Weathered Jurassic sandstone, siltstone and mudstone"
    })
X['id'] = X.index
es = ft.EntitySet(id = 'gridgroups')
es = es.add_dataframe(dataframe_name='gg', dataframe = X, index='id')
es.normalize_dataframe(base_dataframe_name='gg', new_dataframe_name='2nd', index='geology')

320


  X['id'] = X.index


Entityset: gridgroups
  DataFrames:
    gg [Rows: 16296, Columns: 554]
    2nd [Rows: 7, Columns: 1]
  Relationships:
    gg.geology -> 2nd.geology

In [28]:
X

Unnamed: 0,sdoif_g0_id,sdoif_g0_sqrt,sdoif_g0_sq,sdoif_g0_log,sdoif_g1_id,sdoif_g1_sqrt,sdoif_g1_sq,sdoif_g1_log,sdoif_g2_id,sdoif_g2_sqrt,...,25_sdoif,geology,geology_3,geology_2,geology_5,geology_1,geology_7,geology_4,geology_6,id
0,0.680410,0.824870,0.462958,-0.380419,0.680235,0.824763,0.462719,-0.380674,0.680087,0.824674,...,1.281693,Weathered Jurassic tuff and lava,1,0,0,0,0,0,0,0
1,0.960227,0.979912,0.922036,-0.040175,0.960051,0.979822,0.921698,-0.040357,0.959901,0.979745,...,1.359579,Weathered Jurassic tuff and lava,1,0,0,0,0,0,0,1
2,0.979967,0.989933,0.960336,-0.020034,0.980065,0.989982,0.960528,-0.019935,0.980189,0.990045,...,1.365038,Weathered Jurassic granite rocks,0,1,0,0,0,0,0,2
3,0.029984,0.173158,0.000899,-3.229237,0.030168,0.173691,0.000910,-3.224625,0.030344,0.174196,...,1.100731,Weathered Jurassic granite rocks,0,1,0,0,0,0,0,3
4,0.690175,0.830768,0.476342,-0.366375,0.689855,0.830575,0.475900,-0.366832,0.689496,0.830359,...,1.283876,Quaternary deposits,0,0,1,0,0,0,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16291,0.731971,0.855529,0.535904,-0.308505,0.731884,0.855478,0.535777,-0.308622,0.731806,0.855432,...,1.296128,Weathered Jurassic tuff and lava,1,0,0,0,0,0,0,16291
16292,0.734917,0.857246,0.540241,-0.304555,0.734831,0.857196,0.540113,-0.304670,0.734747,0.857147,...,1.296958,Weathered Jurassic tuff and lava,1,0,0,0,0,0,0,16292
16293,0.794210,0.891185,0.630770,-0.227845,0.794008,0.891071,0.630449,-0.228096,0.793834,0.890974,...,1.313354,Weathered Jurassic tuff and lava,1,0,0,0,0,0,0,16293
16294,0.770762,0.876427,0.602098,-0.264147,0.770575,0.876323,0.601797,-0.264379,0.770386,0.876218,...,1.306828,Weathered Jurassic tuff and lava,1,0,0,0,0,0,0,16294


In [29]:
feature_matrix, feature_names = ft.dfs(
    entityset=es, 
    target_dataframe_name = 'gg',
    max_depth=10
)

len(feature_names)

3866

In [31]:
feats = StandardScaler().fit_transform(feature_matrix.drop(["geology"], axis=1, inplace=False))
feats.shape

(16296, 3865)

In [32]:
from boruta import BorutaPy
feat_selector = BorutaPy(
    verbose=2,
    estimator=ExtraTreesClassifier(),
    n_estimators='auto',
    max_iter=8,  # number of iterations to perform
)
feat_selector.fit(feats, y)

Iteration: 	1 / 8
Confirmed: 	0
Tentative: 	3865
Rejected: 	0
Iteration: 	2 / 8
Confirmed: 	0
Tentative: 	3865
Rejected: 	0
Iteration: 	3 / 8
Confirmed: 	0
Tentative: 	3865
Rejected: 	0
Iteration: 	4 / 8
Confirmed: 	0
Tentative: 	3865
Rejected: 	0
Iteration: 	5 / 8
Confirmed: 	0
Tentative: 	3865
Rejected: 	0
Iteration: 	6 / 8
Confirmed: 	0
Tentative: 	3865
Rejected: 	0
Iteration: 	7 / 8
Confirmed: 	0
Tentative: 	3865
Rejected: 	0


BorutaPy finished running.

Iteration: 	8 / 8
Confirmed: 	0
Tentative: 	786
Rejected: 	0


BorutaPy(estimator=ExtraTreesClassifier(n_estimators=879,
                                        random_state=RandomState(MT19937) at 0x2B70CD7D0B40),
         max_iter=8, n_estimators='auto',
         random_state=RandomState(MT19937) at 0x2B70CD7D0B40, verbose=2)

In [39]:
feat_selector.fit_transform(feats, y)

ValueError: Found input variables with inconsistent numbers of samples: [16296, 10864]

In [36]:
def complete(X):
    gg = preprocess(X)
    gg['id'] = gg.index
    es = ft.EntitySet(id = 'gridgroups')
    es = es.add_dataframe(dataframe_name='gg', dataframe =gg, index='id')
    es.normalize_dataframe(base_dataframe_name='gg', new_dataframe_name='2nd', index='geology')
    feature_matrix, feature_names = ft.dfs(
        entityset=es, 
        target_dataframe_name = 'gg',
        max_depth=10
    )
    feats = StandardScaler().fit_transform(feature_matrix.drop(["geology"], axis=1, inplace=False))
    short_feats = feat_selector.transform(feats)
    return short_feats
#y_pred = cb.predict(short_feats)

In [17]:
cb = CatBoostClassifier(iterations=400, learning_rate=0.2, depth=6)
X, y = baseline.originalXy()
feats = complete(X)
print(feats.shape)
feats, y = SMOTE().fit_resample(feats, y)
cb.fit(feats, y)
print(f1_score(y, cb.predict(feats)))
y_pred = cb.predict(complete(baseline.origtestX()))

320
(10864, 252)
0:	learn: 0.5943333	total: 95.1ms	remaining: 37.9s
1:	learn: 0.5343578	total: 130ms	remaining: 25.9s
2:	learn: 0.4949056	total: 165ms	remaining: 21.8s
3:	learn: 0.4708808	total: 199ms	remaining: 19.7s
4:	learn: 0.4509781	total: 235ms	remaining: 18.6s
5:	learn: 0.4381107	total: 270ms	remaining: 17.7s
6:	learn: 0.4291794	total: 306ms	remaining: 17.2s
7:	learn: 0.4202852	total: 341ms	remaining: 16.7s
8:	learn: 0.4096361	total: 404ms	remaining: 17.5s
9:	learn: 0.3996324	total: 438ms	remaining: 17.1s
10:	learn: 0.3941120	total: 472ms	remaining: 16.7s
11:	learn: 0.3886123	total: 507ms	remaining: 16.4s
12:	learn: 0.3849474	total: 542ms	remaining: 16.1s
13:	learn: 0.3807273	total: 577ms	remaining: 15.9s
14:	learn: 0.3763270	total: 612ms	remaining: 15.7s
15:	learn: 0.3723740	total: 647ms	remaining: 15.5s
16:	learn: 0.3694734	total: 680ms	remaining: 15.3s
17:	learn: 0.3662116	total: 715ms	remaining: 15.2s
18:	learn: 0.3631344	total: 753ms	remaining: 15.1s
19:	learn: 0.3589783	to

In [18]:
test = pd.read_csv("data/Test.csv")
sub_file = pd.DataFrame({'Sample_ID': test.Sample_ID, 'Label': y_pred})
sub_file.to_csv('finn/cat_fs.csv', index = False)
sub_file.head()

Unnamed: 0,Sample_ID,Label
0,10865,0
1,10866,0
2,10867,0
3,10868,1
4,10869,1


In [85]:
from sklearn.model_selection import train_test_split
from itertools import product
X, y = baseline.originalXy()
feats = complete(X)

for i, x in enumerate([feats]):
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
    X_train, y_train = SMOTE().fit_resample(X_train, y_train)

    params = {'objective': 'binary'}
    ds = lgb.Dataset(X_train, label=y_train)
    boosts = ['gbdt', 'goss']
    lrs = [0.1]
    ns = [31, 47, 63]
    iterations = [100]
    for b, lr, n, it in product(boosts, lrs, ns, iterations):
        params['boosting'] = b
        params['learning_rate'] = lr
        params['num_leaves'] = n
        params['num_iterationsn'] = it
        bst = lgb.train(params, ds)
        y_prob = bst.predict(X_test)
        for thresh in [0.4, 0.42, 0.45, 0.5, 0.55]:
            y_pred = np.array(y_prob > thresh, dtype=int)

            f1 = f1_score(y_test, y_pred)

            print(f"X#{i}, {b: >5} {lr:.2f} {n} {it}, t={thresh:.3f}: {f1:.6f}")


320
[LightGBM] [Info] Number of positive: 6531, number of negative: 6531
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 69101
[LightGBM] [Info] Number of data points in the train set: 13062, number of used features: 271
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
X#0,  gbdt 0.10 31 100, t=0.4: 0.702746
X#0,  gbdt 0.10 31 100, t=0.4: 0.704433
X#0,  gbdt 0.10 31 100, t=0.5: 0.708054
X#0,  gbdt 0.10 31 100, t=0.5: 0.706701
X#0,  gbdt 0.10 31 100, t=0.6: 0.694319
[LightGBM] [Info] Number of positive: 6531, number of negative: 6531
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 69101
[LightGBM] [Info] Number of data points in the train set: 13062, number of used features: 271
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
X#0,  gbdt 0.10 47 100, t=0.4: 0.709196
X#0,  gbdt 0.10 47 100, t=0.4: 0.709137
X#0,  gbdt 0.10 47 100, t=0.5: 0.708191
X#0,  

In [86]:
X_train, X_test, y_train, y_test = train_test_split(feats, y, test_size=0.2, random_state=42)
X_train, y_train = SMOTE().fit_resample(X_train, y_train)

kernels = ['linear', 'poly', 'rbf']
gamma = [0.25, 0.5, 0.75]
for k, g in product(kernels, gamma):
    svc = SVC(kernel=k, gamma=g).fit(X_train, y_train)
    y_pred = svc.predict(X_test)
    f1 = f1_score(y_test, y_pred)

    print(f"{k} {g}, {f1:.6f}")

linear 0.25, 0.671698
linear 0.5, 0.671698
linear 0.75, 0.671698
poly 0.25, 0.571429
poly 0.5, 0.571429
poly 0.75, 0.571429
rbf 0.25, 0.076125
rbf 0.5, 0.055944
rbf 0.75, 0.055944


In [88]:
penalties = ['elasticnet', 'l1', 'l2', 'none']
for pen in penalties:
    lr = LogisticRegression(penalty=pen, solver='saga', l1_ratio=0.5).fit(X_train, y_train)
    y_pred = lr.predict(X_test)
    f1 = f1_score(y_test, y_pred)

    print(f"{pen}, {f1:.6f}")




elasticnet, 0.672218




l1, 0.670704




l2, 0.672218
none, 0.672714




In [90]:
bs = [True, False]
crits = ['gini', 'entropy']
for b, c in product(bs, crits):
    rf = RandomForestClassifier(criterion=c, bootstrap=b)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    f1 = f1_score(y_test, y_pred)
    print(f"{b} {c}: {f1:.6f}")

True gini: 0.694097
True entropy: 0.700348
False gini: 0.679210
False entropy: 0.695574


In [92]:
for b, c in product(bs, crits):
    et = ExtraTreesClassifier(criterion=c, bootstrap=b)
    et.fit(X_train, y_train)
    y_pred = et.predict(X_test)
    f1 = f1_score(y_test, y_pred)
    print(f"{b} {c}: {f1:.6f}")

True gini: 0.708949
True entropy: 0.703259
False gini: 0.709502
False entropy: 0.701880


In [95]:
from sklearn.neural_network import MLPClassifier

acts = ['identity', 'relu', 'logistic', 'tanh']
sizes = [(320, 150, 75), (150, 150, 25), (320, 50, 100, 50), (100, 200, 75, 50)]
for act, s in product(acts, sizes):
    clf = MLPClassifier(hidden_layer_sizes=s, random_state=1, activation=act)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    f1 = f1_score(y_test, y_pred)
    print(f"{act}, {s}: {f1:.6f}")
# (150,100,75,50)

identity, (320, 150, 75): 0.659409
identity, (150, 150, 25): 0.663324
identity, (320, 50, 100, 50): 0.654275
identity, (100, 200, 75, 50): 0.658247
relu, (320, 150, 75): 0.648250
relu, (150, 150, 25): 0.648250
relu, (320, 50, 100, 50): 0.631678
relu, (100, 200, 75, 50): 0.644222
logistic, (320, 150, 75): 0.607434
logistic, (150, 150, 25): 0.598738
logistic, (320, 50, 100, 50): 0.630837
logistic, (100, 200, 75, 50): 0.605455
tanh, (320, 150, 75): 0.650866
tanh, (150, 150, 25): 0.626244
tanh, (320, 50, 100, 50): 0.638079
tanh, (100, 200, 75, 50): 0.619437


In [101]:
acts = ['identity', 'relu', 'logistic', 'tanh']
sizes = [(150, 100, 75, 50, 10)]
for act, s in product(acts, sizes):
    clf = MLPClassifier(hidden_layer_sizes=s, random_state=1, activation=act)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    f1 = f1_score(y_test, y_pred)
    print(f"{act}, {s}: {f1:.6f}")

identity, (150, 100, 75, 50, 10): 0.661107
relu, (150, 100, 75, 50, 10): 0.661710
logistic, (150, 100, 75, 50, 10): 0.650435
tanh, (150, 100, 75, 50, 10): 0.641003


In [108]:
ds = lgb.Dataset(X_train, label=y_train)
bst = lgb.train({'learning_rate': 0.1, 'boosting': 'goss', 'num_leaves': 63, 'num_iterations': 100}, ds)

cb = CatBoostClassifier(iterations=400, learning_rate=0.2, depth=6).fit(X_train, y_train)

svc = SVC(kernel='linear').fit(X_train, y_train)

lr = LogisticRegression(penalty='none', solver='newton-cg').fit(X_train, y_train)

rf = RandomForestClassifier(criterion='entropy', bootstrap=True).fit(X_train, y_train)

et = ExtraTreesClassifier(criterion='gini', bootstrap=False).fit(X_train, y_train)

nn = MLPClassifier(hidden_layer_sizes=(150, 100, 75, 50, 10)).fit(X_train, y_train)

In [109]:
bst_pred = bst.predict(X_train)
cb_pred = cb.predict_proba(X_train)[:, 1]
svc_pred = svc.predict(X_train)
lr_pred = lr.predict_proba(X_train)
rf_pred = rf.predict_proba(X_train)
et_pred = et.predict_proba(X_train)
nn_pred = nn.predict_proba(X_train)

all_preds = np.c_[bst_pred, cb_pred, svc_pred, lr_pred, rf_pred, et_pred, nn_pred]
all_preds

array([[1.42411367e-01, 1.06707716e-02, 1.00000000e+00, ...,
        0.00000000e+00, 9.99999799e-01, 2.00930038e-07],
       [6.95488322e-01, 9.31561895e-01, 1.00000000e+00, ...,
        1.00000000e+00, 4.00316036e-06, 9.99995997e-01],
       [5.20182152e-01, 3.38847266e-01, 1.00000000e+00, ...,
        0.00000000e+00, 9.67939055e-01, 3.20609446e-02],
       ...,
       [9.79092660e-01, 9.69322922e-01, 1.00000000e+00, ...,
        1.00000000e+00, 2.11617825e-06, 9.99997884e-01],
       [9.89017515e-01, 9.98216615e-01, 1.00000000e+00, ...,
        1.00000000e+00, 3.88452357e-08, 9.99999961e-01],
       [1.03691389e+00, 9.96556033e-01, 1.00000000e+00, ...,
        1.00000000e+00, 6.89674967e-07, 9.99999310e-01]])

In [110]:
final_lr = LogisticRegression().fit(all_preds, y_train)

In [111]:
bst_pred = bst.predict(X_test)
cb_pred = cb.predict_proba(X_test)[:, 1]
svc_pred = svc.predict(X_test)
lr_pred = lr.predict_proba(X_test)
rf_pred = rf.predict_proba(X_test)
et_pred = et.predict_proba(X_test)
nn_pred = nn.predict_proba(X_test)

all_preds = np.c_[bst_pred, cb_pred, svc_pred, lr_pred, rf_pred, et_pred, nn_pred]

y_pred = final_lr.predict(all_preds)
f1_score(y_test, y_pred)

0.7002700270027004

In [37]:
X, y = baseline.originalXy()
X = complete(X)
X, y = SMOTE().fit_resample(X, y)
X.shape

320


ValueError: Found array with 0 feature(s) (shape=(10864, 0)) while a minimum of 1 is required.

In [23]:
ds = lgb.Dataset(X, label=y)
bst = lgb.train({'learning_rate': 0.1, 'boosting': 'goss', 'num_leaves': 63, 'num_iterations': 100}, ds)

cb = CatBoostClassifier(iterations=400, learning_rate=0.2, depth=6).fit(X, y)

svc = SVC(kernel='linear').fit(X, y)

lr = LogisticRegression(penalty='none', solver='newton-cg').fit(X, y)

rf = RandomForestClassifier(criterion='entropy', bootstrap=True).fit(X, y)

et = ExtraTreesClassifier(criterion='gini', bootstrap=False).fit(X, y)

nn = MLPClassifier(hidden_layer_sizes=(150, 100, 75, 50, 10)).fit(X, y)

bst_pred = bst.predict(X)
cb_pred = cb.predict_proba(X)[:, 1]
svc_pred = svc.predict(X)
lr_pred = lr.predict_proba(X)
rf_pred = rf.predict_proba(X)
et_pred = et.predict_proba(X)
nn_pred = nn.predict_proba(X)

all_preds = np.c_[bst_pred, cb_pred, svc_pred, lr_pred, rf_pred, et_pred, nn_pred]
#all_preds = np.c_[bst_pred, cb_pred, lr_pred, rf_pred, et_pred, nn_pred]
final_lr = LogisticRegression().fit(all_preds, y)

320


NameError: name 'feat_selector' is not defined

In [19]:
final_lr = LogisticRegression().fit(all_preds, y)

X_test = complete(baseline.origtestX())
#X_test = baseline.origtestX()

bst_pred = bst.predict(X_test)
cb_pred = cb.predict_proba(X_test)[:, 1]
svc_pred = svc.predict(X_test)
lr_pred = lr.predict_proba(X_test)
rf_pred = rf.predict_proba(X_test)
et_pred = et.predict_proba(X_test)
nn_pred = nn.predict_proba(X_test)

all_preds = np.c_[bst_pred, cb_pred, svc_pred, lr_pred, rf_pred, et_pred, nn_pred]
#all_preds = np.c_[bst_pred, cb_pred, lr_pred, rf_pred, et_pred, nn_pred]

y_pred = final_lr.predict(all_preds)
y_pred

array([1, 0, 0, ..., 0, 0, 1])

In [20]:
test = pd.read_csv("data/Test.csv")
sub_file = pd.DataFrame({'Sample_ID': test.Sample_ID, 'Label': y_pred})
sub_file.to_csv('finn/finalLR.csv', index = False)
sub_file.head()

Unnamed: 0,Sample_ID,Label
0,10865,1
1,10866,0
2,10867,0
3,10868,1
4,10869,1
