In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

['test.csv', 'train.csv', 'sample_submission.csv']


In [2]:
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')

In [13]:
from sklearn.svm import SVC, NuSVC
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from tqdm import tqdm_notebook as tqdm
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
# from sklearn.ensemble import AdaBoostClassifier, VotingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold, learning_curve, ShuffleSplit
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import RFECV
from sklearn import model_selection
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import random
import optuna

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


def evaluate(y_train, X_train, title = 'Cross Validation Scores'):
    # Y_train = df["target"]
    # X_train = df.drop(labels = ["target"],axis = 1)
    
    cv = model_selection.ShuffleSplit(n_splits = 10, test_size = .3, train_size = .6, random_state = 46 )
    
    algorithms = [
        "SV",
        "NuSVC",
        "RandomForest", "GradientBoosting", "ExtraTrees",
        "LinearDiscriminant", "QuadraticDIscriminant",
        "DecisionTree", "KNeighbors", "LogisticRegression", "MLPClassifier", "GaussianNB",
        "XGBClassifier"
    ]
    
    classifiers = [
        SVC(),
        NuSVC(probability=True, kernel='poly', degree=4, gamma='auto', random_state=4, nu=0.59, coef0=0.053),

        RandomForestClassifier(),
        GradientBoostingClassifier(),
        ExtraTreesClassifier(),

        LinearDiscriminantAnalysis(),
        QuadraticDiscriminantAnalysis(),

        DecisionTreeClassifier(),
        KNeighborsClassifier(),
        LogisticRegression(),
        MLPClassifier(),
        GaussianNB(),
        
        xgb.XGBClassifier(),
    ]
    classifiers = [
        KNeighborsClassifier()
    ]
    
    cv_results = []
    for classifier in tqdm(classifiers):
        cv_results.append(cross_val_score(classifier, X_train, y = y_train, scoring = "accuracy", cv = cv, n_jobs=4))

    cv_means = []
    cv_std = []
    for cv_result in cv_results:
        cv_means.append(cv_result.mean())
        cv_std.append(cv_result.std())

    cv_res = pd.DataFrame({
        "CrossValMeans":cv_means,
        "CrossValerrors": cv_std,
        "Algorithm":algorithms})

    plt.figure()
    g = sns.barplot("CrossValMeans","Algorithm",data = cv_res, palette="Set3",orient = "h",**{'xerr':cv_std})
    g.set_xlabel("Mean Accuracy")
    g = g.set_title(title)

    r = list(zip(algorithms, cv_means))
    r.sort(key=lambda x: x[1])
    max_result = r[-1]
    print("Max Accuracy: ", max_result)

In [4]:
def preprocess_features(df_):
    df = df_.copy()

    # PCA
    # df = PCA(n_components=40, random_state=4).fit_transform(df)
    df = PCA(svd_solver='full',n_components='mle').fit_transform(df)
    df = StandardScaler().fit_transform(df)

    # variance
    # df = VarianceThreshold(threshold=1.5).fit_transform(df)
    # df = StandardScaler().fit_transform(df)

    # TODO: LGB tokade feature importance

    return df

In [6]:
feature_cols = [c for c in train.columns if c not in ['id', 'wheezy-copper-turtle-magic', 'target']]

def target_xy(weezy):
    train2 = train[train['wheezy-copper-turtle-magic']==weezy]
    train2.reset_index(drop=True,inplace=True)

    y_train = train2['target']
    X_train = train2[feature_cols]
    X_train = preprocess_features(X_train)
    return [X_train, y_train]

In [9]:
X_train, y_train = target_xy(0)

In [20]:
def objective(trial):
    n_neighbors=trial.suggest_int('n_neighbors', 2, 8)
    weights='uniform'
    algorithm='auto'
    leaf_size=trial.suggest_int('leaf_size', 25, 35)
    p=trial.suggest_int('p', 1, 3)
    metric='minkowski'
    metric_params=None
    n_jobs=1
    
    cv = model_selection.ShuffleSplit(n_splits = 10, test_size = .3, train_size = .6, random_state = 46 )
    
    classifier = KNeighborsClassifier(
        n_neighbors=n_neighbors, weights=weights, algorithm=algorithm,
        leaf_size=leaf_size, p=p, metric=metric)
    return cross_val_score(classifier, X_train, y = y_train, scoring = "accuracy", cv = cv, n_jobs=4).mean()

In [21]:
study = optuna.create_study()
study.optimize(objective, n_trials=100)

[I 2019-06-03 23:34:39,040] Finished trial#0 resulted in value: 0.7850931677018633. Current best value is 0.7850931677018633 with parameters: {'n_neighbors': 8, 'leaf_size': 26, 'p': 1}.
[I 2019-06-03 23:34:39,251] Finished trial#1 resulted in value: 0.7583850931677019. Current best value is 0.7583850931677019 with parameters: {'n_neighbors': 4, 'leaf_size': 26, 'p': 1}.
[I 2019-06-03 23:34:39,686] Finished trial#2 resulted in value: 0.8186335403726709. Current best value is 0.7583850931677019 with parameters: {'n_neighbors': 4, 'leaf_size': 26, 'p': 1}.
[I 2019-06-03 23:34:39,892] Finished trial#3 resulted in value: 0.768944099378882. Current best value is 0.7583850931677019 with parameters: {'n_neighbors': 4, 'leaf_size': 26, 'p': 1}.
[I 2019-06-03 23:34:40,072] Finished trial#4 resulted in value: 0.768944099378882. Current best value is 0.7583850931677019 with parameters: {'n_neighbors': 4, 'leaf_size': 26, 'p': 1}.
[I 2019-06-03 23:34:40,503] Finished trial#5 resulted in value: 0.7

[I 2019-06-03 23:34:49,746] Finished trial#43 resulted in value: 0.7453416149068323. Current best value is 0.7229813664596273 with parameters: {'n_neighbors': 2, 'leaf_size': 33, 'p': 1}.
[I 2019-06-03 23:34:50,021] Finished trial#44 resulted in value: 0.7751552795031056. Current best value is 0.7229813664596273 with parameters: {'n_neighbors': 2, 'leaf_size': 33, 'p': 1}.
[I 2019-06-03 23:34:50,507] Finished trial#45 resulted in value: 0.7838509316770186. Current best value is 0.7229813664596273 with parameters: {'n_neighbors': 2, 'leaf_size': 33, 'p': 1}.
[I 2019-06-03 23:34:50,681] Finished trial#46 resulted in value: 0.7962732919254658. Current best value is 0.7229813664596273 with parameters: {'n_neighbors': 2, 'leaf_size': 33, 'p': 1}.
[I 2019-06-03 23:34:50,870] Finished trial#47 resulted in value: 0.7453416149068323. Current best value is 0.7229813664596273 with parameters: {'n_neighbors': 2, 'leaf_size': 33, 'p': 1}.
[I 2019-06-03 23:34:51,091] Finished trial#48 resulted in va

[I 2019-06-03 23:35:00,319] Finished trial#86 resulted in value: 0.768944099378882. Current best value is 0.7229813664596273 with parameters: {'n_neighbors': 2, 'leaf_size': 33, 'p': 1}.
[I 2019-06-03 23:35:00,509] Finished trial#87 resulted in value: 0.768944099378882. Current best value is 0.7229813664596273 with parameters: {'n_neighbors': 2, 'leaf_size': 33, 'p': 1}.
[I 2019-06-03 23:35:00,704] Finished trial#88 resulted in value: 0.7229813664596273. Current best value is 0.7229813664596273 with parameters: {'n_neighbors': 2, 'leaf_size': 33, 'p': 1}.
[I 2019-06-03 23:35:00,895] Finished trial#89 resulted in value: 0.7453416149068323. Current best value is 0.7229813664596273 with parameters: {'n_neighbors': 2, 'leaf_size': 33, 'p': 1}.
[I 2019-06-03 23:35:01,486] Finished trial#90 resulted in value: 0.8124223602484474. Current best value is 0.7229813664596273 with parameters: {'n_neighbors': 2, 'leaf_size': 33, 'p': 1}.
[I 2019-06-03 23:35:01,691] Finished trial#91 resulted in valu

In [22]:
study.best_params

{'leaf_size': 33, 'n_neighbors': 2, 'p': 1}

In [23]:
study.best_value

0.7229813664596273

In [25]:
# 全試行の確認
# study.trials

In [31]:
%%time
# INITIALIZE VARIABLES
oof = np.zeros(len(train))
preds = np.zeros(len(test))
cols = [c for c in train.columns if c not in ['id', 'target', 'wheezy-copper-turtle-magic']]

# BUILD 512 SEPARATE NON-LINEAR MODELS
for i in tqdm(range(512)):
    
    # EXTRACT SUBSET OF DATASET WHERE WHEEZY-MAGIC EQUALS I
    train2 = train[train['wheezy-copper-turtle-magic']==i]
    test2 = test[test['wheezy-copper-turtle-magic']==i]
    idx1 = train2.index; idx2 = test2.index
    train2.reset_index(drop=True,inplace=True)
    
    # FEATURE SELECTION (USE APPROX 40 OF 255 FEATURES)
    sel = VarianceThreshold(threshold=1.5).fit(train2[cols])
    train3 = sel.transform(train2[cols])
    test3 = sel.transform(test2[cols])
        
    # STRATIFIED K FOLD (Using splits=25 scores 0.002 better but is slower)
    skf = StratifiedKFold(n_splits=11, random_state=42)
    for train_index, test_index in skf.split(train3, train2['target']):
        
        # MODEL WITH SUPPORT VECTOR MACHINE
        clf = KNeighborsClassifier(
            n_neighbors=study.best_params["n_neighbors"], leaf_size=study.best_params["leaf_size"], p=study.best_params["p"])
        clf.fit(train3[train_index,:],train2.loc[train_index]['target'])
        oof[idx1[test_index]] = clf.predict_proba(train3[test_index,:])[:,1]
        preds[idx2] += clf.predict_proba(test3)[:,1] / skf.n_splits
        
    #if i%10==0: print(i)
    
auc = roc_auc_score(train['target'],oof)
print('CV score =',round(auc,5))

HBox(children=(IntProgress(value=0, max=512), HTML(value='')))


CV score = 0.84731
CPU times: user 3min 33s, sys: 2.16 s, total: 3min 35s
Wall time: 3min 50s


In [32]:
sub = pd.read_csv('../input/sample_submission.csv')
sub['target'] = preds
sub.to_csv('submission.csv',index=False)