In [1]:
import sys
import logging
import time
import numpy as np
import pandas as pd
import os.path

from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestNeighbors
from sklearn.ensemble import BaggingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfTransformer

from xgboost import XGBClassifier

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from keras.regularizers import l1_l2
import pickle

Using TensorFlow backend.


In [0]:
def load_data(train_data_path='data/train.csv', test_data_path = 'data/test.csv'):
    """Read the train and test CSV files and encode the training label as an int.

    Both files are read with their first column as the index and their first
    row as the header. The training file must contain a string column named
    ``target`` whose values end in the class number (for example ``Class_3``);
    it is replaced by that number minus one, giving zero-based labels.

    Parameters
    ----------
    train_data_path : str
        Path of the training CSV.
    test_data_path : str
        Path of the test CSV.

    Returns
    -------
    (train_df, test_df) : tuple of pandas.DataFrame

    Raises
    ------
    ValueError
        If a ``target`` value does not end in digits.
    """
    train_df = pd.read_csv(train_data_path, sep=',', index_col=0, header=0)
    test_df = pd.read_csv(test_data_path, sep=',', index_col=0, header=0)

    # Take the whole trailing number, not just the last character, so labels
    # with two or more digits (e.g. "Class_10") are decoded correctly.
    class_number = train_df['target'].str.extract(r'(\d+)$', expand=False)
    train_df['target'] = class_number.astype(int) - 1

    return train_df, test_df

In [0]:
def _tsne_embed(X, X_submission, n_components=3):
    """Embed training and submission rows with one joint t-SNE fit.

    scikit-learn's TSNE has no ``transform`` for unseen rows, so both matrices
    are stacked, embedded together with ``fit_transform`` and split again.
    """
    n_train = X.shape[0]
    embedded = TSNE(n_components=n_components).fit_transform(np.vstack((X, X_submission)))
    return embedded[:n_train], embedded[n_train:]


def process_data(train_df, test_df, ylabel='target', standarization=False, discretization=False, transform=None):
    """Turn the train/test frames into model-ready numpy arrays.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training rows; must contain the label column ``ylabel``.
    test_df : pandas.DataFrame
        Rows to predict; every column is used as a feature.
    ylabel : str
        Name of the label column in ``train_df``.
    standarization, discretization : bool
        When true, ``standarize_feature`` / ``discretize_feature`` are called
        on both frames for every feature column. Their return values are
        ignored, so they are presumably expected to modify the frames in
        place -- confirm against their definitions.
    transform : str or None
        ``'log'``   -> ``log1p`` of every value
        ``'sqrt'``  -> ``sqrt(x + 3/8)`` of every value
        ``'pca'``   -> replace the features by 3 PCA components (fit on train)
        ``'tsne'``  -> replace the features by a 3-component t-SNE embedding
        ``'pca+'``  -> append the 3 PCA components to the features
        ``'tsne+'`` -> append the 3 t-SNE components to the features
        Any other value (including ``None``) leaves the features untouched.

    Returns
    -------
    (X, y, X_submission) : tuple of numpy.ndarray
        Training features, training labels and submission features.
    """
    # Feature columns only: the label does not exist in test_df and must not
    # be rescaled or binned along with the features.
    numerical_features = train_df.columns[train_df.columns != ylabel]

    if standarization:
        standarize_feature(train_df, test_df, numerical_features)

    if discretization:
        discretize_feature(train_df, test_df, numerical_features, num_bins=10, how='equal_freq')

    X = train_df.drop(ylabel, axis=1).values
    y = train_df[ylabel].values
    X_submission = test_df.values

    if transform == 'log':
        X = np.log1p(X)
        X_submission = np.log1p(X_submission)
    elif transform == 'sqrt':
        X = np.sqrt(X + 3.0 / 8)
        X_submission = np.sqrt(X_submission + 3.0 / 8)
    elif transform in ('pca', 'pca+'):
        pca = PCA(n_components=3).fit(X)
        # Project both matrices before X is reassigned, so the submission
        # projection is computed from the submission rows themselves.
        X_pca = pca.transform(X)
        X_submission_pca = pca.transform(X_submission)
        if transform == 'pca':
            X, X_submission = X_pca, X_submission_pca
        else:
            X = np.hstack((X, X_pca))
            X_submission = np.hstack((X_submission, X_submission_pca))
    elif transform in ('tsne', 'tsne+'):
        X_tsne, X_submission_tsne = _tsne_embed(X, X_submission, n_components=3)
        if transform == 'tsne':
            X, X_submission = X_tsne, X_submission_tsne
        else:
            X = np.hstack((X, X_tsne))
            X_submission = np.hstack((X_submission, X_submission_tsne))
    return X, y, X_submission

In [4]:

# Timestamp taken before any data is loaded.
start_time = time.time()

# Send timestamped records of every level to stdout so they appear in the
# cell output. `filemode` is only used together with `filename`, so it is
# not passed here.
logging.basicConfig(level=logging.DEBUG,
                    format='[%(asctime)s]: %(message)s ',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    stream=sys.stdout,
                    )
# load data
logging.info('Load data')
train_df, test_df = load_data(train_data_path='applied/otto/train.csv', test_data_path='applied/otto/test.csv')

[2020-03-04 05:08:31]: Load data 
[2020-03-04 05:08:32]: NumExpr defaulting to 8 threads. 


# Feature 1: summed distances from a 2-nearest-neighbour query on the raw features

In [0]:
# Untransformed features: pass None rather than the string 'None', which only
# worked because process_data ignores transform names it does not recognise.
X, y, X_submission = process_data(train_df, test_df, transform=None)
# Index the training rows for 2-nearest-neighbour queries.
neigh = NearestNeighbors(n_neighbors=2, algorithm='kd_tree', radius=1000.0)
neigh.fit(X)

NearestNeighbors(algorithm='kd_tree', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=2, p=2,
                 radius=1000.0)

In [0]:
# Distances to, and row indices of, the neighbours found for every row of X.
# NOTE(review): X is also the data `neigh` was fitted on, so each row's first
# neighbour is presumably the row itself at distance 0 -- confirm this is intended.
nearest_distance, nearest_id = neigh.kneighbors(
    X,
    return_distance=True,
)

In [0]:
# Feature 1: a single column holding the per-row sum of the neighbour distances.
np.savetxt(
    "applied/otto/feature1.csv",
    nearest_distance.sum(axis=1, keepdims=True),
    delimiter=",",
)

# Feature 2: summed distances from a 3-nearest-neighbour query on the raw features

In [0]:
# Untransformed features: pass None rather than the string 'None', which only
# worked because process_data ignores transform names it does not recognise.
X, y, X_submission = process_data(train_df, test_df, transform=None)
# Index the training rows for 3-nearest-neighbour queries.
neigh2 = NearestNeighbors(n_neighbors=3, algorithm='kd_tree', radius=1000.0)
neigh2.fit(X)

NearestNeighbors(algorithm='kd_tree', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                 radius=1000.0)

In [0]:
# Distances to, and row indices of, the 3 neighbours found for every row of X.
nearest_distance2, nearest_id2 = neigh2.kneighbors(
    X,
    return_distance=True,
)

# Feature 3: summed distances from a 5-nearest-neighbour query on the raw features

In [0]:
# Index the rows of X for 5-nearest-neighbour queries; fit() returns the
# estimator itself, so the trailing bare name still displays its repr.
neigh4 = NearestNeighbors(
    n_neighbors=5,
    algorithm='kd_tree',
    radius=1000.0,
).fit(X)
neigh4

NearestNeighbors(algorithm='kd_tree', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1000.0)

In [0]:
# Distances to, and row indices of, the 5 neighbours found for every row of X.
nearest_distance4, nearest_id4 = neigh4.kneighbors(
    X,
    return_distance=True,
)

In [0]:
# Features 2 and 3: per-row sums of the 3- and 5-neighbour distance matrices,
# each written as a single-column CSV.
np.savetxt(
    "applied/otto/feature2.csv",
    nearest_distance2.sum(axis=1, keepdims=True),
    delimiter=",",
)
np.savetxt(
    "applied/otto/feature3.csv",
    nearest_distance4.sum(axis=1, keepdims=True),
    delimiter=",",
)

## Feature 4: nearest-neighbour distances in TF-IDF space

In [0]:
# Untransformed features: pass None rather than the string 'None', which only
# worked because process_data ignores transform names it does not recognise.
X, y, X_submission = process_data(train_df, test_df, transform=None)
# Re-weight the training feature matrix with TF-IDF (IDF smoothing disabled).
transformer = TfidfTransformer(smooth_idf=False)
tfidf = transformer.fit_transform(X)

In [0]:
# Dense copy of the TF-IDF matrix, which is what the neighbour index is built on.
feature4 = tfidf.toarray()
# fit() returns the estimator, so construction and fitting are chained.
neigh_feature4 = NearestNeighbors(
    n_neighbors=2,
    algorithm='kd_tree',
    radius=1000.0,
).fit(feature4)
# Query the index with the same rows it was fitted on.
nearest_distance_feature4, nearest_id_feature4 = neigh_feature4.kneighbors(
    feature4,
    return_distance=True,
)

In [0]:
# Feature 4: the neighbour-distance matrix is written as returned by
# kneighbors (one column per neighbour), without summing across columns.
np.savetxt(
    "applied/otto/feature4.csv",
    nearest_distance_feature4,
    delimiter=",",
)

## Feature 5: nearest neighbours in a 3-component t-SNE embedding

In [0]:
# Untransformed features: pass None rather than the string 'None', which only
# worked because process_data ignores transform names it does not recognise.
X, y, X_submission = process_data(train_df, test_df, transform=None)
# t-SNE is stochastic; fix the seed (same value as the KMeans cells) so the
# 3-component embedding is reproducible from one run to the next.
X_embedded = TSNE(n_components=3, random_state=42).fit_transform(X)

In [0]:
# kmeans = KMeans(n_clusters=2, random_state=42).fit(X_embedded )

In [0]:
# feature5 = np.concatenate([X_embedded, kmeans.labels_.reshape(-1, 1)], axis = 1)
# Build a 2-nearest-neighbour index on the embedding; fit() returns the
# estimator, so construction and fitting are chained.
neigh_feature5 = NearestNeighbors(
    n_neighbors=2,
    algorithm='kd_tree',
    radius=1000.0,
).fit(X_embedded)
# Query the index with the same rows it was fitted on.
nearest_distance_feature5, nearest_id_feature5 = neigh_feature5.kneighbors(
    X_embedded,
    return_distance=True,
)

In [0]:
# `feature5` is only assigned in commented-out code, so referencing it raises
# NameError on a fresh kernel. Save the neighbour distances computed on the
# t-SNE embedding instead, mirroring how feature 4 is written.
np.savetxt("applied/otto/feature5.csv", nearest_distance_feature5, delimiter=",")

# Feature 6: k-means cluster label (9 clusters) on the raw features

In [0]:
# Untransformed features: pass None rather than the string 'None', which only
# worked because process_data ignores transform names it does not recognise.
X, y, X_submission = process_data(train_df, test_df, transform=None)
# Cluster the training rows into 9 groups with a fixed seed.
kmeans = KMeans(n_clusters=9, random_state=42).fit(X)

In [0]:
# Feature 6: the cluster label of every training row, written as one column.
np.savetxt(
    "applied/otto/feature6.csv",
    kmeans.labels_[:, np.newaxis],
    delimiter=",",
)

## Feature 7: per-row sparsity count

In [0]:
# Reload the frames from disk before building this feature.
train_df, test_df = load_data(train_data_path='applied/otto/train.csv', test_data_path='applied/otto/test.csv')
# Untransformed features: pass None rather than the string 'None', which only
# worked because process_data ignores transform names it does not recognise.
X, y, X_submission = process_data(train_df, test_df, transform=None)
# Number of non-zero entries in each row, as a single column. The previous
# `x == 0` test counted the zero entries, contradicting the variable name;
# the vectorised count also avoids a Python-level loop over every row.
non_zero = np.count_nonzero(X, axis=1).reshape(-1, 1)
print(non_zero.shape)

(61878, 1)


In [0]:
# Feature 7: write the per-row count column to disk.
np.savetxt(
    "applied/otto/feature7.csv",
    non_zero,
    delimiter=",",
)