In [1]:
import numpy as np
import pandas as pd

from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import chi2
from sklearn.linear_model import LassoCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score
from sklearn.preprocessing import MinMaxScaler

# Data preparation

In [60]:
# Read the space-separated, header-less feature file and discard column 500
# (presumably an empty column caused by a trailing separator -- TODO confirm).
x_train = pd.read_csv(
    "data/artificial_train.data", sep=" ", header=None
).drop(columns=[500])

# Read the labels and flatten them to a 1-D NumPy array.
y_train = pd.read_csv(
    "data/artificial_train.labels", sep=" ", header=None
).to_numpy().ravel()

# Rescale every feature to [0, 1]; the result replaces x_train as an ndarray.
scaler = MinMaxScaler()
x_train = scaler.fit_transform(x_train)

# PCA

In [65]:
#pca = PCA(n_components=10)

#pca.fit(x_train)

#sum(pca.explained_variance_ratio_)

#x_train=pca.transform(x_train)

# Univariate feature selection

In [66]:
#x_train = SelectKBest(chi2, k=10).fit_transform(x_train, y_train)

# Removing features with low variance

In [67]:
#sel = VarianceThreshold(threshold=0.025)
#x_train=sel.fit_transform(x_train)

# Selecting features based on importance

In [None]:
# Keep the features with the largest absolute LassoCV coefficients.
n_selected_features = 10

lasso = LassoCV().fit(x_train, y_train)
importance = np.abs(lasso.coef_)  # kept for inspection of the ranking

# threshold=-np.inf switches off the magnitude cut-off so that max_features
# alone decides how many columns survive. The previous cut-off
# (11th-largest importance + 0.01) could keep fewer than ten features -- or
# none -- when neighbouring coefficients differ by less than 0.01, and raised
# IndexError when fewer than 11 features were available.
threshold = -np.inf

# prefit=True reuses the estimator fitted above instead of running the
# cross-validated Lasso fit a second time.
sfm = SelectFromModel(
    lasso,
    threshold=threshold,
    max_features=n_selected_features,
    prefit=True,
)
x_train = sfm.transform(x_train)

# Sequential Feature Selection (very slow; `lasso` can be replaced with `reg` or `forest`)

In [None]:
# Disabled: forward sequential selection of 4 features. Every line of the
# cell is commented out, including the continuation line of the
# SequentialFeatureSelector call (leaving it live breaks parsing of the cell).
#lasso = LassoCV().fit(x_train, y_train)
#importance = np.abs(lasso.coef_)

#sfs_forward = SequentialFeatureSelector(lasso, n_features_to_select=4,
#                                        direction='forward').fit(x_train, y_train)

#sfs_forward.get_support()

#x_train=sfs_forward.transform(x_train)

# Models

In [189]:
# Logistic-regression baseline, scored on the same rows it was trained on
# (an in-sample figure, not an estimate of generalisation).
# np.ravel flattens the labels whether y_train is a DataFrame or an ndarray;
# the data-preparation cell already converts it to an ndarray, on which
# .to_numpy() would raise AttributeError.
reg = LogisticRegression().fit(x_train, np.ravel(y_train))

pred = reg.predict(x_train)

balanced_accuracy_score(y_train, pred)

0.612

In [190]:
# Random-forest baseline, scored on the same rows it was trained on
# (an in-sample figure, not an estimate of generalisation).
# np.ravel flattens the labels whether y_train is a DataFrame or an ndarray;
# the data-preparation cell already converts it to an ndarray, on which
# .to_numpy() would raise AttributeError.
# random_state pins the forest's randomness so re-runs give the same model.
forest = RandomForestClassifier(random_state=0).fit(x_train, np.ravel(y_train))

pred = forest.predict(x_train)

balanced_accuracy_score(y_train, pred)

1.0

# 5-fold cross-validation

In [192]:
# Manual k-fold cross-validation: fold i holds out every n_folds-th row
# starting at row i and trains both models on the remaining rows.
# NOTE(review): the scaler and the feature selector in the earlier cells are
# fitted on all rows, so held-out rows have already influenced preprocessing;
# the fold scores may therefore be optimistic -- confirm whether that matters.
n_folds = 5
acc_reg = [0.0] * n_folds
acc_forest = [0.0] * n_folds
for i in range(n_folds):
    # Boolean mask that marks the held-out rows of this fold.
    mask = np.zeros(x_train.shape[0], dtype=bool)
    mask[i::n_folds] = True

    x_test = x_train[mask]
    x_fold = x_train[~mask]

    y_test = y_train[mask]
    y_fold = y_train[~mask]

    reg = LogisticRegression().fit(x_fold, y_fold)
    pred = reg.predict(x_test)
    acc_reg[i] = balanced_accuracy_score(y_test, pred)

    # A fixed random_state makes the per-fold forest scores reproducible
    # across re-runs of the notebook.
    forest = RandomForestClassifier(random_state=0).fit(x_fold, y_fold)
    pred = forest.predict(x_test)
    acc_forest[i] = balanced_accuracy_score(y_test, pred)
    

In [193]:
acc_forest

[0.8688908949572457,
 0.8104293864478029,
 0.8523809523809525,
 0.8322082208220822,
 0.8522792628341377]

In [194]:
acc_reg

[0.6132300208129592,
 0.588104293864478,
 0.6213032581453635,
 0.60996099609961,
 0.6042359530894451]