In [None]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.pipeline import Pipeline

# Read the segmented-data CSV into a DataFrame and echo it to the cell
# output so the raw table can be inspected before any preprocessing.
raw_csv_path = '../data/data_segmented_1_04_08.csv'
df = pd.read_csv(raw_csv_path)
print(df)

In [2]:
# Collapse the four class labels stored in the last column into a binary
# target: GAIT and RAMP become 0, STEP and UNEVEN become 1.
#
# The result is assigned back to the column rather than calling
# `replace(..., inplace=True)` on `df.iloc[:, -1]`. That chained form edits
# a temporary Series: pandas warns about it, and with Copy-on-Write enabled
# it leaves `df` unchanged.
label_column = df.columns[-1]
binary_labels = {'GAIT': 0, 'RAMP': 0, 'STEP': 1, 'UNEVEN': 1}
# infer_objects() gives the column an integer dtype when every value was
# mapped; values that are not in the mapping are left untouched.
df[label_column] = df[label_column].replace(binary_labels).infer_objects()

In [3]:
# Feature matrix: every column except the first and the last.
# Taken as an explicit copy so that assigning columns on X later does not
# write into a slice of `df` (which pandas reports as SettingWithCopyWarning).
X = df.iloc[:, 1:-1].copy()
# Target vector: the last column of the frame.
y = df.iloc[:, -1]

def z_score_standardization(series):
    """Return `series` centred on its mean and scaled by its standard deviation.

    Parameters
    ----------
    series : object exposing ``mean()`` and ``std()`` and supporting
        arithmetic with their results (called with DataFrame columns here).

    Returns
    -------
    The element-wise z-scores, ``(value - mean) / std``.

    Notes
    -----
    For a pandas Series ``std()`` is the sample standard deviation (ddof=1).
    A constant Series has a standard deviation of 0, so every element
    becomes NaN (0 / 0).
    """
    centered = series - series.mean()
    spread = series.std()
    return centered / spread

# Standardise every feature column with z_score_standardization, then
# discard each column that contains at least one NaN afterwards.
X = X.apply(z_score_standardization)
X = X.dropna(axis='columns')

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import RidgeCV

# Fit a ridge regression whose regularisation strength is chosen by
# cross-validation from five log-spaced candidates between 1e-6 and 1e6.
alpha_grid = np.logspace(-6, 6, num=5)
ridge = RidgeCV(alphas=alpha_grid).fit(X, y)

# The absolute value of each coefficient is used as that feature's importance.
importance = np.abs(ridge.coef_)
feature_names = np.array(X.columns)

# Bar chart with one bar per feature.
fig, ax = plt.subplots()
ax.bar(x=feature_names, height=importance)
ax.set_title("Feature importances via coefficients")
plt.show()

In [None]:
from sklearn.feature_selection import SelectFromModel
from time import time

# Select the strongest features according to the ridge coefficients.
#
# SelectFromModel keeps the features whose importance is greater than or
# equal to `threshold`. The threshold sits THRESHOLD_MARGIN above the
# TOP_RANK-th largest importance, so only features that beat that value by
# at least the margin are kept.
TOP_RANK = 10
THRESHOLD_MARGIN = 0.01

# Clamp the rank to the number of available features: indexing with -10 on a
# shorter array would raise IndexError.
rank = min(TOP_RANK, len(importance))
threshold = np.sort(importance)[-rank] + THRESHOLD_MARGIN

# Time the selection step and report which columns were kept.
tic = time()
sfm = SelectFromModel(ridge, threshold=threshold).fit(X, y)
toc = time()
print(f"Features selected by SelectFromModel: {feature_names[sfm.get_support()]}")
print(f"Done in {toc - tic:.3f}s")

# Rebalance the classes: SMOTE over-sampling followed by Edited Nearest
# Neighbours under-sampling, both configured with sampling_strategy='all'.
#
# SMOTE draws its synthetic samples at random, so it is given a fixed seed;
# without one the resampled data (and the CSV exported from it) would change
# on every run. EditedNearestNeighbours takes no random_state.
RESAMPLING_SEED = 42
over = SMOTE(sampling_strategy='all', random_state=RESAMPLING_SEED)
under = EditedNearestNeighbours(sampling_strategy='all')
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)
# NOTE: X and y are overwritten here, so re-running this cell on its own
# resamples data that has already been resampled.
X, y = pipeline.fit_resample(X, y)

In [7]:
# Keep only the columns chosen by SelectFromModel, attach the target as an
# extra column and write the result to CSV without the index.
selected_columns = feature_names[sfm.get_support()]
selected_features = X[selected_columns]
df = selected_features.join(y)
df.to_csv('../data/data_1_04_08_binary.csv', index = False)