In [62]:
from traceback import print_tb

import numpy as np
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn import (
    decomposition,
    ensemble,
    feature_selection,
    impute,
    linear_model,
    model_selection,
    pipeline,
    preprocessing,
    svm,

)


In [63]:
def load_data():
    X_train_df = pd.read_csv('./data/X_train.csv',skiprows=1,header=None)
    Y_train_df = pd.read_csv('./data/Y_train.csv',skiprows=1,header=None)
    X_test_df = pd.read_csv('./data/X_test.csv',skiprows=1,header=None)

    X_train = X_train_df.iloc[:,1:]
    Y_train = Y_train_df.iloc[:,1]
    X_test = X_test_df.iloc[:,1:]
    return X_train, Y_train, X_test


In [64]:
def remove_outlier(X_train,y_train):
    remove_outlier_pipe = pipeline.Pipeline(
        steps=[
            
            #("imputer", impute.IterativeImputer(max_iter=10, random_state=42)), # impute missing values
           ("imputer",impute.SimpleImputer(strategy="median")),
           ("scaler", preprocessing.RobustScaler()),
            ("pca", decomposition.PCA(n_components=3)),
            ("outlier", ensemble.IsolationForest(contamination=0.03)), # 3% of data is outlier
        ]
    )
    pred = remove_outlier_pipe.fit_predict(X_train)
    print(pred)
    X_train, y_train = X_train[pred == 1], y_train[pred == 1]
    return X_train, y_train
        



In [65]:
def preprocess(X_train, X_test):
    preprocessor = pipeline.Pipeline(
        steps=[
            
            ("imputer",impute.SimpleImputer(strategy="median")),
           # ("imputer", impute.IterativeImputer(max_iter=10, random_state=42)), # impute missing values
            ("scaler", preprocessing.StandardScaler()),
        ]
    )

    X_train = preprocessor.fit_transform(X_train)
    X_test = preprocessor.transform(X_test)
    return X_train, X_test

In [66]:
def select_features(X_train, y_train, X_test):
    feature_selector = pipeline.Pipeline(
        steps=[
            ('variance_filter', feature_selection.VarianceThreshold(threshold=0)),
            ('correlation_filter', feature_selection.SelectKBest(feature_selection.f_regression, k=300)),
            #('pca', decomposition.PCA(n_components=100)),
            ('lasso',feature_selection.SelectFromModel(
                estimator=linear_model.Lasso(alpha=0.1, random_state=42),
                threshold='mean',  
            )),
            
        ]
    )
    feature_selector.fit(X_train, y_train)
    X_train = feature_selector.transform(X_train)
    print(X_train.shape)
    X_test = feature_selector.transform(X_test)
    print(X_train.shape)
    return X_train, X_test

In [67]:

def train(X_train, y_train, X_test):

    base_models = [
                ("svr", svm.SVR(C=40, epsilon=1e-04)),
                ("gbm", ensemble.GradientBoostingRegressor(learning_rate=0.11)),
                ("etr", ensemble.ExtraTreesRegressor())
    ]
    classifier = ensemble.StackingRegressor(estimators=base_models,final_estimator=linear_model.Ridge(),cv=5, n_jobs=6)
    
    classifier.fit(X_train, y_train)
    y_test = classifier.predict(X_test)
    
    return classifier

In [68]:
def main():
    
    X_train, y_train, X_test = load_data()
    
    X_train_wo_outlier, y_train_wo_outlier = remove_outlier(X_train, y_train)
    X_train, X_test = preprocess(X_train_wo_outlier, X_test)
    X_train, X_test = select_features(X_train, y_train_wo_outlier, X_test)
    model = train(X_train, y_train_wo_outlier, X_test)
    
   # pred = model.named_steps['train'].predict(X_test)
    pred = model.predict(X_test)
    pred = np.vstack((np.arange(X_test.shape[0]), pred)).T
    print("Ready")
    #pred = np.round(pred).astype(int)
    np.savetxt("submission_2_ML_goos_architecture.csv", pred, delimiter=",", header="id,y", comments="", fmt=["%d", "%f"])
    

In [69]:
main()

[1 1 1 ... 1 1 1]
(1175, 101)
(1175, 101)
Ready
