In [9]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import log_loss, classification_report

In [7]:
from problem import get_train_data, get_test_data

# Load the train and test splits through the project's `problem` module;
# each loader returns a (features, labels) pair.
data_train, labels_train = get_train_data()
data_test, labels_test = get_test_data()

# Model

In [25]:
from sklearn.base import BaseEstimator
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import numpy as np


def compute_rolling_std(X_df, features, time_window, center=False):
    """Append a rolling standard deviation column for each requested feature.

    For every name in ``features`` a column ``<feature>_<time_window>_std`` is
    written to ``X_df``. Missing values left by the rolling window are filled
    forward and then backward, and the result is cast back to the dtype of the
    source column.

    Parameters
    ----------
    X_df : pandas.DataFrame
        Frame holding the source columns. It is modified in place. Offset
        windows such as ``"2h"`` rely on pandas' time-based rolling, which
        needs a datetime-like index.
    features : iterable of str
        Names of the columns to compute the rolling standard deviation on.
    time_window : str, int or offset-like
        Window forwarded unchanged to ``DataFrame.rolling``. It is converted
        with ``str()`` only to build the new column name, so integer windows
        are accepted as well as offset strings.
    center : bool, default False
        Forwarded to ``DataFrame.rolling``.

    Returns
    -------
    pandas.DataFrame
        The same ``X_df`` object, with the new columns added.
    """
    # Build the label once; joining a non-string window directly would raise.
    window_label = str(time_window)
    for feature in features:
        name = "_".join([feature, window_label, "std"])
        X_df[name] = X_df[feature].rolling(time_window, center=center).std()
        # Windows holding a single observation yield NaN; borrow the nearest
        # valid value (previous first, then next) so no gaps remain.
        X_df[name] = X_df[name].ffill().bfill()
        X_df[name] = X_df[name].astype(X_df[feature].dtype)
    return X_df

def add_features(X_df):
    """Add two derived columns, ``AMach_number`` and ``raw_pressure``.

    Both are computed from the ``V``, ``Np`` and ``B`` columns and written to
    ``X_df`` in place; the same frame is returned.

    NOTE(review): the numeric factors look like unit conversions together with
    a particle mass (1.7e-27) and a permeability term (4e-7 * pi) -- confirm
    the units of the input columns against the data description.
    """
    v_col, np_col, b_col = X_df["V"], X_df["Np"], X_df["B"]

    mach = v_col * 1e12 * np.sqrt(np_col * 1.7e-27 * 1e6) * np.sqrt(4e-7 * np.pi) / b_col
    pressure = v_col**2 * np_col * 1.7e-27 * 1e12 * 1e9

    # Both values are computed before either is written, and the Mach number
    # column is inserted first, so the resulting column order is fixed.
    for column, values in (("AMach_number", mach), ("raw_pressure", pressure)):
        X_df[column] = values
    return X_df

class FeatureExtractor(BaseEstimator):
    """Stateless pipeline step that appends engineered columns to a frame.

    ``transform`` adds the columns produced by ``add_features`` and the 2-hour
    rolling standard deviation of ``Beta``, ``Vth``, ``B``, ``Bx`` and ``Bz``.
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the engineered columns appended.

        The helper functions write their new columns in place, so they are
        given a copy. Without it every fit or predict call would add columns
        to the caller's frame, and later cells reusing that frame would see
        different data depending on execution order.
        """
        X_aug = add_features(X.copy())
        return compute_rolling_std(X_aug, ["Beta", "Vth", "B", "Bx", "Bz"], "2h")

def get_estimator():
    """Build the unfitted model pipeline.

    Steps, in order: feature extraction, standard scaling, and a logistic
    regression allowed up to 1000 solver iterations.
    """
    return make_pipeline(
        FeatureExtractor(),
        StandardScaler(),
        LogisticRegression(max_iter=1000),
    )

In [26]:
# Instantiate a fresh, unfitted pipeline.
model = get_estimator()

In [22]:
# Fit the whole pipeline on the training split.
model.fit(data_train, labels_train)

In [30]:
# Standalone scaling of the training frame, separate from the StandardScaler
# that lives inside the pipeline.
# NOTE(review): `train_new` is not referenced by any later visible cell --
# confirm whether this cell is still needed.
scaler = StandardScaler()
train_new = scaler.fit_transform(data_train)

# Evaluate

In [10]:
# Predict class probabilities on the test split with the fitted pipeline.
# `y_pred` must be assigned here so the notebook runs from a fresh kernel;
# it is a 2-D array with one column per class.
y_pred = model.predict_proba(data_test)
log_loss(labels_test, y_pred)

0.16424689004953327

In [11]:
# `y_pred` holds one probability column per class; argmax along axis 1 picks
# the most probable class, turning the probabilities into binary 0/1 labels.
print(classification_report(labels_test, y_pred.argmax(axis=1)))

              precision    recall  f1-score   support

           0       0.94      1.00      0.97    191755
           1       0.87      0.17      0.28     13819

    accuracy                           0.94    205574
   macro avg       0.91      0.58      0.62    205574
weighted avg       0.94      0.94      0.92    205574



In [12]:
from sklearn.model_selection import cross_validate
from problem import get_cv

def evaluation(X, y):
    """Cross-validate a fresh pipeline on ``(X, y)`` and return the results.

    The folds come from the project's ``get_cv`` helper. Scoring is the
    negated log loss, reported for both the training and the validation part
    of each fold, with the folds run in parallel on all available cores.
    """
    estimator = get_estimator()
    return cross_validate(
        estimator,
        X,
        y,
        cv=get_cv(X, y),
        scoring=["neg_log_loss"],
        return_train_score=True,
        n_jobs=-1,
        verbose=1,
    )

In [13]:
# Cross-validate on the training split; `results` is indexed below by
# "train_neg_log_loss" and "test_neg_log_loss".
results = evaluation(data_train, labels_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   36.9s finished


In [14]:
# The scorer reports the negated log loss, so the mean is negated again to
# show a positive loss; the standard deviation does not depend on the sign.
print(
    "Training score Log Loss: {:.3f} +- {:.3f}".format(
        -np.mean(results["train_neg_log_loss"]), np.std(results["train_neg_log_loss"])
    )
)
print(
    "Testing score Log Loss: {:.3f} +- {:.3f} \n".format(
        -np.mean(results["test_neg_log_loss"]), np.std(results["test_neg_log_loss"])
    )
)

Training score Log Loss: 0.229 +- 0.030
Testing score Log Loss: 0.243 +- 0.045 



In [35]:
!ramp-test --submission v2

[38;5;178m[1mTesting Solar wind classification[0m
[38;5;178m[1mReading train and test files from ./data/ ...[0m
[38;5;178m[1mReading cv ...[0m
[38;5;178m[1mTraining submissions/v2 ...[0m
[38;5;178m[1mCV fold 0[0m
^C


In [28]:
!ramp-test --submission starting_kit

[38;5;178m[1mTesting Solar wind classification[0m
[38;5;178m[1mReading train and test files from ./data/ ...[0m
[38;5;178m[1mReading cv ...[0m
[38;5;178m[1mTraining submissions/starting_kit ...[0m
[38;5;178m[1mCV fold 0[0m
	[38;5;178m[1mscore  mixed  pw_ll  pw_prec  pw_rec  ev_prec  ev_rec      time[0m
	[38;5;10m[1mtrain[0m   [38;5;10m[1m0.24[0m   [38;5;150m0.16[0m     [38;5;150m0.78[0m    [38;5;150m0.42[0m     [38;5;150m0.23[0m    [38;5;150m0.26[0m  [38;5;150m9.724659[0m
	[38;5;12m[1mvalid[0m   [38;5;12m[1m0.38[0m   [38;5;105m0.31[0m     [38;5;105m0.78[0m    [38;5;105m0.42[0m     [38;5;105m0.24[0m    [38;5;105m0.29[0m  [38;5;105m0.693141[0m
	[38;5;1m[1mtest[0m    [38;5;1m[1m0.24[0m   [38;5;218m0.15[0m     [38;5;218m0.91[0m    [38;5;218m0.22[0m     [38;5;218m0.18[0m    [38;5;218m0.09[0m  [38;5;218m0.350382[0m
[38;5;178m[1mCV fold 1[0m
	[38;5;178m[1mscore  mixed  pw_ll  pw_prec  pw_rec  ev_prec  ev_rec       ti