In [1]:
import kagglegym
from sklearn import linear_model as lm
import numpy as np
from pca import pca
import pandas as pd
import time

In [2]:
# Create the kagglegym environment and take its initial observation.
env = kagglegym.make()
observation = env.reset()
# Per-column means of the training frame, used to impute missing values.
mean_values = observation.train.mean(axis=0)
train = observation.train.fillna(mean_values)

In [3]:
def getTimeDurationSince(start):
    """Format the wall-clock time elapsed since ``start`` as a short string.

    Parameters
    ----------
    start : float
        A timestamp previously obtained from ``time.time()``.

    Returns
    -------
    str
        ``'<s>s'`` below one minute, ``'<m>m <s>s'`` below one hour and
        ``'<h>h <m>m <s>s'`` otherwise; seconds carry at most two decimals.
    """
    # Round once, to whole hundredths of a second, *before* splitting into
    # hours/minutes/seconds. Rounding the seconds after the split could
    # produce outputs such as '60.0s' or '1m 60.0s' near a boundary.
    centis = int(round((time.time() - start) * 100))
    if centis < 6000:
        return str(centis / 100.0) + 's'

    total_mins, rem_centis = divmod(centis, 6000)
    secs = rem_centis / 100.0
    if total_mins < 60:
        return str(total_mins) + 'm ' + str(secs) + 's'

    hours, mins = divmod(total_mins, 60)
    return str(hours) + 'h ' + str(mins) + 'm ' + str(secs) + 's'

In [4]:
# Upper and lower thresholds on the target y that delimit "extreme" values.
high_y_cut = 0.093497
low_y_cut = -0.086093
# Row masks over the notebook-level training frame.
y_is_above_cut = train.y > high_y_cut
y_is_below_cut = train.y < low_y_cut
# A row is within the cut when it is neither above nor below it.
y_is_within_cut = ~(y_is_above_cut | y_is_below_cut)

def run_lr(cols_to_use, reduce_components_to=-1, clip_y=False):
    """Fit a linear regression on the kagglegym training data and score it.

    The environment is reset, missing values are filled with the training
    column means, a ``LinearRegression`` is fitted, and the environment is
    stepped until it reports ``done`` using the model's predictions.

    Parameters
    ----------
    cols_to_use : sequence of str
        Feature columns selected from the training and feature frames.
    reduce_components_to : int, default -1
        Number of components requested from the project-local ``pca``
        helper; ``-1`` skips the PCA step entirely.
    clip_y : bool, default False
        When true, only rows whose ``y`` is neither above ``high_y_cut`` nor
        below ``low_y_cut`` are used for fitting, and predictions are clipped
        to ``[low_y_cut, high_y_cut]``.

    Returns
    -------
    The ``'public_score'`` entry of the ``info`` dict returned by the last
    ``env.step`` call.
    """
    start_time = time.time()

    # NOTE(review): presumably rewinds kagglegym's internal position so the
    # shared environment can be replayed from the start -- TODO confirm.
    env.unique_idx = int(env.n/2)
    observation = env.reset()
    train = observation.train
    mean_values = train.mean(axis=0)
    train = train.fillna(mean_values)

    model = lm.LinearRegression()
    train_x = np.array(train[cols_to_use])
    if reduce_components_to != -1:
        # Only the projected data and the eigenvectors are needed; the
        # eigenvectors are reused below to project each test batch.
        train_x, _, _, evecs = pca(train_x, red_cmps_to=reduce_components_to)

    if clip_y:
        # Build the mask from the frame fitted here instead of relying on a
        # notebook-level mask computed from another DataFrame object.
        within_cut = (~(train.y > high_y_cut) & ~(train.y < low_y_cut)).values
        train_x = train_x[within_cut]
        train_y = train.y.values[within_cut]
    else:
        train_y = train.y.values

    model.fit(train_x, train_y)

    while True:
        # Test features are imputed with the *training* means.
        features = observation.features.fillna(mean_values)
        test_x = np.array(features[cols_to_use])
        if reduce_components_to != -1:
            test_x_pca = np.dot(np.transpose(evecs), np.transpose(test_x))
            test_x = np.transpose(test_x_pca)
        observation.target.y = model.predict(test_x)
        if clip_y:
            observation.target.y = observation.target.y.clip(low_y_cut, high_y_cut)
        target = observation.target

        observation, reward, done, info = env.step(target)
        if done:
            break
    components = 'wszystkich' if reduce_components_to == -1 else str(reduce_components_to)
    print('Dla {} składowych głownych czas wyniósł: {}'.format(components, getTimeDurationSince(start_time)))
    return info['public_score']

In [5]:
pca_indexes = ['all', '50', '35', '20', '10', '5', '2']
def run_lr_pca(cols_to_use, clip_y=False):
    """Run ``run_lr`` once per PCA setting and collect the scores.

    The first run uses no PCA (``reduce_components_to=-1``); the remaining
    runs request 50, 35, 20, 10, 5 and 2 components, in that order.

    Returns a ``pandas.Series`` of the scores indexed by ``pca_indexes``.
    """
    component_counts = (-1, 50, 35, 20, 10, 5, 2)
    scores = [
        run_lr(cols_to_use, reduce_components_to=n_components, clip_y=clip_y)
        for n_components in component_counts
    ]
    return pd.Series(scores, pca_indexes)

In [6]:
# Candidate features: every column except the identifiers and the target.
cols_to_use = [col for col in train.columns if col not in ['id','timestamp','y']]
# Correlation coefficient of each feature with y, kept as [name, value] rows.
corrcoefs = []
for col in cols_to_use:
    corrcoefs.append([col, np.corrcoef(train[col].values, train.y.values)[0, 1]])
# Mixed [str, float] rows become a string array; values are parsed back below.
corrcoefs = np.array(corrcoefs)

# `np.float` was removed in NumPy 1.24; the builtin `float` is equivalent.
corrcoefs_vals = corrcoefs[:,1].astype(float)
# A NaN coefficient (e.g. a constant column) would be sorted last by argsort
# and then flipped to the very top of the ranking; treat it as 0 instead.
corrcoefs_abs = np.nan_to_num(np.absolute(corrcoefs_vals))
sort_indices = corrcoefs_abs.argsort()
# Descending by absolute correlation.
sorted_corrcoefs = np.flip(corrcoefs[sort_indices], axis=0)
# Column names only, strongest absolute correlation first.
top_corrcoefs = sorted_corrcoefs[:,0]

# Obliczenia

In [None]:
# Baseline: all feature columns, first as-is, then with clip_y enabled.
all_cols_series = run_lr_pca(cols_to_use)
print(all_cols_series, end='\n\n')

print('Z obcięciem danych dla skrajnych y:')
all_cols_clip_y_series = run_lr_pca(cols_to_use, clip_y=True)
print(all_cols_clip_y_series)

In [None]:
# The first 50 column names in the correlation ranking.
top_50_corr_cols = top_corrcoefs[:50]

top_50_corr_cols_series = run_lr_pca(top_50_corr_cols)
print(top_50_corr_cols_series, end='\n\n')

print('Z obcięciem danych dla skrajnych y:')
top_50_corr_cols_clip_y_series = run_lr_pca(top_50_corr_cols, clip_y=True)
print(top_50_corr_cols_clip_y_series)

In [None]:
# The first 35 column names in the correlation ranking.
top_35_corr_cols = top_corrcoefs[:35]

top_35_corr_cols_series = run_lr_pca(top_35_corr_cols)
print(top_35_corr_cols_series, end='\n\n')

print('Z obcięciem danych dla skrajnych y:')
top_35_corr_cols_clip_y_series = run_lr_pca(top_35_corr_cols, clip_y=True)
print(top_35_corr_cols_clip_y_series)

In [None]:
# The first 20 column names in the correlation ranking.
top_20_corr_cols = top_corrcoefs[:20]

top_20_corr_cols_series = run_lr_pca(top_20_corr_cols)
print(top_20_corr_cols_series, end='\n\n')

print('Z obcięciem danych dla skrajnych y:')
top_20_corr_cols_clip_y_series = run_lr_pca(top_20_corr_cols, clip_y=True)
print(top_20_corr_cols_clip_y_series)

In [None]:
# The first 10 column names in the correlation ranking.
top_10_corr_cols = top_corrcoefs[:10]

top_10_corr_cols_series = run_lr_pca(top_10_corr_cols)
print(top_10_corr_cols_series, end='\n\n')

print('Z obcięciem danych dla skrajnych y:')
top_10_corr_cols_clip_y_series = run_lr_pca(top_10_corr_cols, clip_y=True)
print(top_10_corr_cols_clip_y_series)

In [None]:
# The first 5 column names in the correlation ranking.
top_5_corr_cols = top_corrcoefs[:5]

top_5_corr_cols_series = run_lr_pca(top_5_corr_cols)
print(top_5_corr_cols_series, end='\n\n')

print('Z obcięciem danych dla skrajnych y:')
top_5_corr_cols_clip_y_series = run_lr_pca(top_5_corr_cols, clip_y=True)
print(top_5_corr_cols_clip_y_series)

# Podsumowanie:

In [None]:
# Score series of every experiment, keyed by a numbered label so the summary
# table keeps the experiments in a fixed order.
d = {
    '01. Wszystkie kolumny': all_cols_series,
    '02. Wszystkie kolumn z obcięciem dla skrajnych y': all_cols_clip_y_series,
    '03. 50 najbardziej skorelowanych kolumn': top_50_corr_cols_series,
    '04. 50 najbardziej skorelowanych kolumn z obcięciem dla skrajnych y': top_50_corr_cols_clip_y_series,
    '05. 35 najbardziej skorelowanych kolumn': top_35_corr_cols_series,
    '06. 35 najbardziej skorelowanych kolumn z obcięciem dla skrajnych y': top_35_corr_cols_clip_y_series,
    '07. 20 najbardziej skorelowanych kolumn': top_20_corr_cols_series,
    '08. 20 najbardziej skorelowanych kolumn z obcięciem dla skrajnych y': top_20_corr_cols_clip_y_series,
    '09. 10 najbardziej skorelowanych kolumn': top_10_corr_cols_series,
    '10. 10 najbardziej skorelowanych kolumn z obcięciem dla skrajnych y': top_10_corr_cols_clip_y_series,
    '11. 5 najbardziej skorelowanych kolumn': top_5_corr_cols_series,
    '12. 5 najbardziej skorelowanych kolumn z obcięciem dla skrajnych y': top_5_corr_cols_clip_y_series,
}

def row_align_left(val):
    """Return the CSS declaration for left-aligned text; ``val`` is ignored.

    NOTE(review): not referenced anywhere else in the visible notebook.
    """
    css_rule = 'text-align: left'
    return css_rule

# One row per experiment, one column per PCA setting.
df = pd.DataFrame(d, pca_indexes).T
# Left-align the row labels in the rendered table.
row_heading_style = {'selector': '.row_heading', 'props': [('text-align', 'left')]}
df.style.set_table_styles([row_heading_style])