In [0]:
import sys
import logging
import time
import numpy as np
import pandas as pd
import os.path

from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestNeighbors
from sklearn.ensemble import BaggingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfTransformer

from xgboost import XGBClassifier

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from keras.regularizers import l1_l2
import pickle

In [0]:
def load_data(train_data_path='data/train.csv', test_data_path = 'data/test.csv'):
    train_df = pd.read_csv(train_data_path, sep=',', index_col=0, header=0)
    test_df = pd.read_csv(test_data_path, sep=',', index_col=0, header=0)
    
    train_df['target'] = train_df['target'].str[-1].astype(int) - 1
        
    return train_df, test_df

In [0]:
def process_data(train_df, test_df, ylabel='target', standarization=False, discretization=False, transform=None):
    numerical_features = train_df.columns

    if standarization:
        standarized_features = numerical_features
        standarize_feature(train_df, test_df, standarized_features)
        
    if discretization:
        discretized_features = numerical_features
        discretize_feature(train_df, test_df, discretized_features, num_bins=10, how='equal_freq')
    
    X = train_df.drop(ylabel, axis=1).values
    y = train_df[ylabel].values
    X_submission = test_df.values
    
    if transform == 'log':
        X = np.log1p(X)
        X_submission = np.log1p(X_submission)
    elif transform == 'sqrt':
        X = np.sqrt(X + 3.0 / 8)
        X_submission = np.sqrt(X_submission + 3.0 / 8)
    elif transform == 'pca':
        pca = PCA(n_components=3).fit(X)
        X = pca.transform(X)
        X_submission = pca.transform(X_submission)
    elif transform == 'tsne':
        tsne = TSNE(n_components=3).fit(X)
        X = tsne.transform(X)
        X_submission = tsne.transform(X_submission)
    elif transform == 'pca+':
        pca = PCA(n_components=3).fit(X)
        X = np.hstack((X, pca.transform(X)))
        X_submission = np.hstack((X, pca.transform(X)))
    elif transform == 'tsne+':
        tsne = TSNE(n_components=3).fit(X)
        X = np.hstack((X, tsne.transform(X)))
        X_submission = np.hstack((X_submission, tsne.transform(X_submission)))        
    return X, y, X_submission

In [0]:

start_time = time.time()

logging.basicConfig(level=logging.DEBUG,
                    format='[%(asctime)s]: %(message)s ',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    stream=sys.stdout,
                    filemode="w"
                    )
# load data
logging.info('Load data')
train_df, test_df = load_data(train_data_path='applied/otto/train.csv', test_data_path='applied/otto/test.csv')
X, y, X_submission = process_data(train_df, test_df, transform='none')

[2020-03-12 12:56:27]: Load data 


# X Feature 1,2,3

In [0]:
print(X.shape)
class_row = [0]
for i in range(9):
    # if i ==0:
    #     class_row.append(np.sum([x == i for x in y]))
    # else:
    class_row.append(np.sum([x == i for x in y]) + class_row[i])
print(class_row)
class_matrix = []
for i in range(9):
    class_matrix.append(X[ class_row[i]:class_row[i+1], :]) 
print(class_matrix[-1].shape)

(61878, 93)
[0, 1929, 18051, 26055, 28746, 31485, 45620, 48459, 56923, 61878]
(4955, 93)


In [0]:
# print(class_id)
# print(np.sum(class_id))

In [0]:
four_neigh_nearest_distance_matrix = []
four_neigh_nearest_id_matrix = []
for i in range(9):
    print(i)
    four_neigh = NearestNeighbors(n_neighbors=5, algorithm='kd_tree', radius=1000.0)
    four_neigh.fit(class_matrix[i])
    four_neigh_nearest_distance,four_neigh_nearest_id = four_neigh.kneighbors(X, return_distance=True)
    four_neigh_nearest_distance_matrix.append(four_neigh_nearest_distance)
    four_neigh_nearest_id_matrix.append(four_neigh_nearest_id)

0
1
2
3
4
5
6
7
8


In [0]:
four_neigh_nearest_distance_matrix_ = np.array(four_neigh_nearest_distance_matrix)
print(four_neigh_nearest_distance_matrix_.shape)
feature1 = []
feature2 = []
feature3 = []
feature1 = four_neigh_nearest_distance_matrix_[:,:,0]
print(four_neigh_nearest_distance_matrix_[:,:,0:2].shape)
feature2 = np.sum(four_neigh_nearest_distance_matrix_[:,:,0:2], axis = 2)
feature3 = np.sum(four_neigh_nearest_distance_matrix_[:,:,0:4], axis = 2)
for i in range(9):
    # print(feature1.shape)
    feature1[i, class_row[i]:class_row[i+1]] = four_neigh_nearest_distance_matrix_[i, class_row[i]:class_row[i+1],1]
for i in range(9):
    print(feature2.shape)
    print(four_neigh_nearest_distance_matrix_[i, class_row[i]:class_row[i+1] ,1:3].shape)
    feature2[i, class_row[i]:class_row[i+1]] = np.sum(four_neigh_nearest_distance_matrix_[i, class_row[i]:class_row[i+1] ,1:3], axis = 1)
for i in range(9):
    # print(feature1.shape)
    feature3[i, class_row[i]:class_row[i+1]] = np.sum(four_neigh_nearest_distance_matrix_[i, class_row[i]:class_row[i+1] , 1:5], axis = 1)

# print(np.array(feature1).shape)
# print(four_neigh_nearest_distance_matrix_[0,:,0])
# print(np.array(feature1).shape)

(9, 61878, 5)
(9, 61878, 2)
(9, 61878)
(1929, 2)
(9, 61878)
(16122, 2)
(9, 61878)
(8004, 2)
(9, 61878)
(2691, 2)
(9, 61878)
(2739, 2)
(9, 61878)
(14135, 2)
(9, 61878)
(2839, 2)
(9, 61878)
(8464, 2)
(9, 61878)
(4955, 2)


In [0]:
np.savetxt("applied/otto/feature1.csv",  np.array(feature1).T , delimiter=",")
np.savetxt("applied/otto/feature2.csv",  np.array(feature2).T , delimiter=",")
np.savetxt("applied/otto/feature3.csv",  np.array(feature3).T , delimiter=",")

## X_submission_feature 1,2

In [0]:
four_neigh_nearest_distance_matrix_X_submission = []
four_neigh_nearest_id_matrix_X_submission = []
for i in range(9):
    print(i)
    four_neigh_X_submission  = NearestNeighbors(n_neighbors=5, algorithm='kd_tree', radius=1000.0)
    four_neigh_X_submission.fit(class_matrix[i])
    four_neigh_nearest_distance_X_submission ,four_neigh_nearest_id_X_submission  = four_neigh_X_submission.kneighbors(X_submission, return_distance=True)
    four_neigh_nearest_distance_matrix_X_submission.append(four_neigh_nearest_distance_X_submission )
    four_neigh_nearest_id_matrix_X_submission.append(four_neigh_nearest_id_X_submission )

0
1
2
3
4
5
6
7
8


In [0]:
four_neigh_nearest_distance_matrix_X_submission_ = np.array(four_neigh_nearest_distance_matrix_X_submission)
print(four_neigh_nearest_distance_matrix_X_submission_.shape)
feature1_X_submission = []
feature2_X_submission = []
feature3_X_submission = []
feature1_X_submission = four_neigh_nearest_distance_matrix_X_submission_[:,:,0]
feature2_X_submission = np.sum(four_neigh_nearest_distance_matrix_X_submission_[:,:,0:2], axis = 2)
feature3_X_submission = np.sum(four_neigh_nearest_distance_matrix_X_submission_[:,:,0:4], axis = 2)

(9, 144368, 5)


In [0]:
np.savetxt("applied/otto/feature1_X_submission.csv",  np.array(feature1_X_submission).T , delimiter=",")
np.savetxt("applied/otto/feature2_X_submission.csv",  np.array(feature2_X_submission).T , delimiter=",")
np.savetxt("applied/otto/feature3_X_submission.csv",  np.array(feature3_X_submission).T , delimiter=",")

# X feature 4

In [0]:
X, y, X_submission = process_data(train_df, test_df, transform='None')
transformer = TfidfTransformer(smooth_idf=False)
tfidf = transformer.fit_transform(X)
feature4_X = tfidf.toarray()

In [0]:
print(feature4_X.shape)

(61878, 93)


In [0]:
feature4_class_matrix = []
for i in range(9):
    feature4_class_matrix.append( feature4_X[ class_row[i]:class_row[i+1], :])
print(np.array(feature4_class_matrix)[1].shape) 

(16122, 93)


In [0]:
nearest_distance_feature4_matrix = []
nearest_id_feature4_matrix = []
for i in range(9):
    print(i)
    neigh_feature4_X = NearestNeighbors(n_neighbors=2, algorithm='kd_tree', radius=1000.0)
    neigh_feature4_X.fit(feature4_class_matrix[i])
    nearest_distance_feature4,  nearest_id_feature4 = neigh_feature4_X.kneighbors(feature4_X, return_distance=True)
    nearest_distance_feature4_matrix.append(nearest_distance_feature4)
    nearest_id_feature4_matrix.append(nearest_id_feature4)

0
1
2
3
4
5
6
7
8


In [0]:
# nearest_distance_feature4_matrix_ = np.array(nearest_distance_feature4_matrix)
# print(nearest_distance_feature4_matrix_)
# np.savetxt("applied/otto/feature4_for_test.csv",  nearest_distance_feature4_matrix_[:,:,0].T, delimiter=",")

In [0]:
nearest_distance_feature4_matrix_ = np.array(nearest_distance_feature4_matrix)
print(nearest_distance_feature4_matrix_.shape)
feature4 = []
feature4 = nearest_distance_feature4_matrix_[:,:,0]
for i in range(9):
    feature4[i, class_row[i]:class_row[i+1]] = nearest_distance_feature4_matrix_[i, class_row[i]:class_row[i+1],1]

(9, 61878, 2)


In [0]:
np.savetxt("applied/otto/feature4.csv",  np.array(feature4).T , delimiter=",")

# X submission feature 4

In [0]:
X, y, X_submission = process_data(train_df, test_df, transform='None')
transformer = TfidfTransformer(smooth_idf=False)
tfidf_submission = transformer.fit_transform(X_submission)
feature4_X_submission = tfidf_submission.toarray()

In [0]:
nearest_distance_feature4_matrix_X_submission = []
nearest_id_feature4_matrix_X_submission = []
for i in range(9):
    print(i)
    neigh_feature4_X_submission = NearestNeighbors(n_neighbors=2, algorithm='kd_tree', radius=1000.0)
    neigh_feature4_X_submission.fit(feature4_class_matrix[i])
    nearest_distance_feature4_X_submission,  nearest_id_feature4_X_submission = neigh_feature4_X_submission.kneighbors(feature4_X_submission, return_distance=True)
    nearest_distance_feature4_matrix_X_submission.append(nearest_distance_feature4_X_submission)
    nearest_id_feature4_matrix_X_submission.append(nearest_id_feature4_X_submission)

0
1
2
3
4
5
6
7
8


In [0]:
print(np.array(nearest_distance_feature4_matrix_X_submission)[:,:,0].shape)

(9, 144368)


In [0]:
np.savetxt("applied/otto/feature4_X_submission.csv",  np.array(nearest_distance_feature4_matrix_X_submission)[:,:,0].T , delimiter=",")

## Feature 5 X

In [0]:
X, y, X_submission = process_data(train_df, test_df, transform='None')
X_stack = np.concatenate([X , X_submission], axis = 0)
print(X_stack.shape)

(206246, 93)


In [0]:
X_embedded = TSNE(n_components=3).fit_transform(X_stack)

In [0]:
# np.savetxt("applied/otto/X_embedded.csv", X_embedded , delimiter=",")
print(X_embedded.shape)
print(len(X))
X_embedded_X = X_embedded[:len(X),:]
X_embedded_X_submission = X_embedded[len(X):,:]
print(X_embedded_X.shape)
print(X_embedded_X_submission.shape)

(206246, 3)
61878
(61878, 3)
(144368, 3)


In [0]:
feature5_class_matrix = []
for i in range(9):
    feature5_class_matrix.append( X_embedded_X[ class_row[i]:class_row[i+1], :])
print(np.array(feature5_class_matrix)[-1].shape) 

(4955, 3)


In [0]:
# feature5 = np.concatenate([X_embedded, kmeans.labels_.reshape(-1, 1)], axis = 1)
# neigh_feature5 = NearestNeighbors(n_neighbors=2, algorithm='kd_tree', radius=1000.0)
# neigh_feature5.fit(X_embedded)
# nearest_distance_feature5, nearest_id_feature5 = neigh_feature5.kneighbors(X_embedded, return_distance=True)

In [0]:
nearest_distance_feature5_matrix_X = []
nearest_id_feature5_matrix_X = []
for i in range(9):
    print(i)
    neigh_feature5_X= NearestNeighbors(n_neighbors=2, algorithm='kd_tree', radius=1000.0)
    neigh_feature5_X.fit(feature5_class_matrix[i])
    nearest_distance_feature5_X,  nearest_id_feature5_X = neigh_feature5_X.kneighbors(X_embedded_X, return_distance=True)
    nearest_distance_feature5_matrix_X.append(nearest_distance_feature5_X)
    nearest_id_feature5_matrix_X.append(nearest_id_feature5_X)

0
1
2
3
4
5
6
7
8


In [0]:
nearest_distance_feature5_matrix_X_ = np.array(nearest_distance_feature5_matrix_X)
print(nearest_distance_feature5_matrix_X_.shape)
feature5 = []
feature5 = nearest_distance_feature5_matrix_X_[:,:,0]
for i in range(9):
    feature5[i, class_row[i]:class_row[i+1]] = nearest_distance_feature5_matrix_X_[i, class_row[i]:class_row[i+1],1]

(9, 61878, 2)


In [0]:
print(feature5.shape )

(9, 61878)


In [0]:
np.savetxt("applied/otto/feature5.csv", feature5.T , delimiter=",")

# feature 5 X submission

In [0]:
nearest_distance_feature5_matrix_X_submission = []
nearest_id_feature5_matrix_X_submission = []
for i in range(9):
    print(i)
    neigh_feature5_X_submission = NearestNeighbors(n_neighbors=2, algorithm='kd_tree', radius=1000.0)
    neigh_feature5_X_submission.fit(feature5_class_matrix[i])
    nearest_distance_feature5_X_submission,  nearest_id_feature5_X_submission = neigh_feature5_X_submission.kneighbors(X_embedded_X_submission, return_distance=True)
    nearest_distance_feature5_matrix_X_submission.append(nearest_distance_feature5_X_submission)
    nearest_id_feature5_matrix_X_submission.append(nearest_id_feature5_X_submission)

0
1
2
3
4
5
6
7
8


In [0]:
feature5_X_submission = np.array(nearest_distance_feature5_matrix_X_submission)[:,:,0]
print(feature5_X_submission.shape)

(9, 144368)


In [0]:
np.savetxt("applied/otto/feature5_X_submission.csv", feature5_X_submission.T , delimiter=",")

# feature 6

In [0]:
X, y, X_submission = process_data(train_df, test_df, transform='None')
X_stack = np.concatenate([X , X_submission], axis = 0)
print(X_stack.shape)

(206246, 93)


In [0]:
kmeans2 = KMeans(n_clusters=2, random_state=42).fit(X_stack)
kmeans3 = KMeans(n_clusters=3, random_state=42).fit(X_stack)
kmeans4 = KMeans(n_clusters=4, random_state=42).fit(X_stack)
kmeans5 = KMeans(n_clusters=5, random_state=42).fit(X_stack)
kmeans6 = KMeans(n_clusters=6, random_state=42).fit(X_stack)
kmeans7 = KMeans(n_clusters=7, random_state=42).fit(X_stack)
kmeans8 = KMeans(n_clusters=8, random_state=42).fit(X_stack)
kmeans9 = KMeans(n_clusters=9, random_state=42).fit(X_stack)
kmeans10 = KMeans(n_clusters=10, random_state=42).fit(X_stack)

In [0]:
feature6_stack = [kmeans2.labels_,
                  kmeans3.labels_,
                  kmeans4.labels_,
                  kmeans5.labels_,
                  kmeans6.labels_,
                  kmeans7.labels_,
                  kmeans8.labels_,
                  kmeans9.labels_,
                  kmeans10.labels_,]
print(np.array(feature6_stack).shape)
print(kmeans2.labels_.shape)
feature6_X = np.array(feature6_stack)[:,:len(X)].T
feature6_X_submission = np.array(feature6_stack)[:,len(X):].T
# print(feature6_X.shape)
# print(feature6_X_submission.shape)

(9, 206246)
(206246,)
(61878, 9)
(144368, 9)


In [0]:
np.savetxt("applied/otto/feature6_X.csv",  feature6_X , delimiter=",")
np.savetxt("applied/otto/feature6_X_submission.csv",  feature6_X_submission , delimiter=",")

## feature 7

In [0]:
train_df, test_df = load_data(train_data_path='applied/otto/train.csv', test_data_path='applied/otto/test.csv')
X, y, X_submission = process_data(train_df, test_df, transform='None')
non_zero = np.sum([x==0 for x in X_submission], axis = 1).reshape(-1, 1)
print(non_zero.shape)

(144368, 1)


In [0]:
np.savetxt("applied/otto/feature7_X_submission.csv",  non_zero , delimiter=",")