# OneClassSVM
https://scikit-learn.org/stable/modules/generated/sklearn.svm.OneClassSVM.html

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Read and standarize the dataset
df = pd.read_csv('data/selected_data1.csv')
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
X = df.iloc[:,:-1]
y = df.iloc[:,-1]
df = pd.DataFrame(StandardScaler().fit_transform(X), columns=X.columns.values)
df['target'] = y

# Divide dataframe depending on target
df_correct = df.loc[df['target'] == 1]
df_incorrect = df.loc[df['target'] == 0]

# Print each target shape
print('Target 1 shape: ' + str(df_correct.shape))
print('Target 0 shape: ' + str(df_incorrect.shape))

Target 1 shape: (5049, 25)
Target 0 shape: (4951, 25)


In [2]:
from sklearn.model_selection import train_test_split

df_train, df_test_correct = train_test_split(df_correct, test_size=0.2, random_state=0)
df_test_incorrect = df_incorrect

X_train = df_train.iloc[:,:-1]
X_test_correct = df_test_correct.iloc[:,:-1]
X_test_incorrect = df_test_incorrect.iloc[:,:-1]

In [3]:
from sklearn.decomposition import PCA

pca = PCA(n_components=0.97).fit(X_train)
X_train = pca.transform(X_train)
X_test_correct = pca.transform(X_test_correct)
X_test_incorrect = pca.transform(X_test_incorrect)


**OneClassSVM**

SVMs are max-margin methods, i.e. they do not model a probability distribution. Here the idea is to find a function that is positive for regions with high density of points, and negative for small densities.

**Relevant parameters for tunning**

- **nu**: The proportion of outliers you expect to observe. The parameter nu is an upper bound on the fraction of margin errors and a lower bound of the fraction of support vectors relative to the total number of training examples. For example, if you set it to 0.05 you are guaranteed to find at most 5% of your training examples being misclassified (at the cost of a small margin, though) and at least 5% of your training examples being support vectors.
    - decreasing nu can cause overfitting
    - increasing nu can cause underfitting
- **gamma**: Kernel coefficient for ‘rbf’, ‘poly’ and ‘sigmoid’(if ‘auto’, uses 1 / n_features).

In [4]:
# -1 = outlier
#  1 = inlier
def MyOneClassSVM(kernel='rbf'):
    import collections
    from sklearn.svm import OneClassSVM
    
    model = OneClassSVM(kernel=kernel, gamma='auto', nu=0.2).fit(X_train)
    pred_correct = model.predict(X_test_correct)
    pred_incorrect = model.predict(X_test_incorrect)
    pred_train = model.predict(X_train)

    print("PRED TRAIN", collections.Counter(pred_train))
    print("PRED CORRECT", collections.Counter(pred_correct))
    print("PRED INCORRECT", collections.Counter(pred_incorrect))

### rbf

In [5]:
MyOneClassSVM(kernel='rbf')

PRED TRAIN Counter({1: 3232, -1: 807})
PRED CORRECT Counter({1: 801, -1: 209})
PRED INCORRECT Counter({-1: 3122, 1: 1829})


### poly

In [6]:
MyOneClassSVM(kernel='poly')

PRED TRAIN Counter({1: 3199, -1: 840})
PRED CORRECT Counter({1: 782, -1: 228})
PRED INCORRECT Counter({1: 3378, -1: 1573})


### linear

In [7]:
MyOneClassSVM(kernel='linear')

PRED TRAIN Counter({-1: 3499, 1: 540})
PRED CORRECT Counter({-1: 873, 1: 137})
PRED INCORRECT Counter({-1: 3613, 1: 1338})


### sigmoid

In [8]:
MyOneClassSVM(kernel='sigmoid')

PRED TRAIN Counter({1: 3229, -1: 810})
PRED CORRECT Counter({1: 822, -1: 188})
PRED INCORRECT Counter({1: 4147, -1: 804})


# OneClassSVM (ExtractedData - Correlation - Varianze)

In [1]:
import numpy as np
import pandas as pd

df = pd.read_csv('Z:\Descargas\extracted_data.csv')
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
df = df.drop('id', axis=1)
X = df.iloc[:,:-1]
y = df.iloc[:,-1]

In [3]:
# remove low varianze columns
from sklearn.feature_selection import VarianceThreshold

selector = VarianceThreshold(threshold=(.8 * (1 - .8)))
selector.fit(X)
columns = X.columns[selector.get_support(indices=True)]
X = X[columns]
X.head()

  self.variances_ = np.nanvar(X, axis=0)
  (self.variances_ <= self.threshold)):
  return self.variances_ > self.threshold


Unnamed: 0,Flow rate__ar_coefficient__k_10__coeff_1,Flow rate__ar_coefficient__k_10__coeff_2,Flow rate__ar_coefficient__k_10__coeff_3,Flow rate__ar_coefficient__k_10__coeff_4,"Flow rate__augmented_dickey_fuller__attr_""usedlag""",Flow rate__cid_ce__normalize_True,Flow rate__count_above_mean,Flow rate__count_below_mean,"Flow rate__fft_aggregated__aggtype_""centroid""","Flow rate__fft_aggregated__aggtype_""kurtosis""",...,Zone9_Pressure__sum_of_reoccurring_data_points,Zone9_Pressure__sum_of_reoccurring_values,Zone9_Pressure__sum_values,Zone9_Pressure__symmetry_looking__r_0.1,Zone9_Pressure__symmetry_looking__r_0.15000000000000002,Zone9_Pressure__time_reversal_asymmetry_statistic__lag_1,Zone9_Pressure__time_reversal_asymmetry_statistic__lag_2,Zone9_Pressure__time_reversal_asymmetry_statistic__lag_3,Zone9_Pressure__value_count__value_0,Zone9_Pressure__variance
0,1.097224,-0.119454,-0.018179,0.013075,1.0,2.979672,372.0,544.0,49.084752,8.469481,...,0.0,0.0,5125808.0,0.0,1.0,220032400000.0,432622100000.0,627574400000.0,749.0,188784300.0
1,1.157133,-0.161022,-0.005301,-0.040777,1.0,2.496999,380.0,629.0,51.389448,9.12482,...,0.0,0.0,17361410.0,0.0,0.0,983194200000.0,1959747000000.0,2929644000000.0,752.0,1132615000.0
2,1.076403,-0.057993,-0.032339,-0.005114,1.0,2.028811,425.0,435.0,32.345881,12.249977,...,0.0,0.0,4831920.0,1.0,1.0,415384100000.0,816260600000.0,1207333000000.0,706.0,247846700.0
3,1.083679,-0.072918,-0.031834,-0.004072,1.0,2.406142,425.0,538.0,46.956311,9.386179,...,0.0,0.0,11710680.0,0.0,0.0,457429000000.0,911627600000.0,1364312000000.0,700.0,566258200.0
4,1.050232,-0.047799,-0.0439,0.013567,1.0,2.456812,410.0,525.0,45.288313,9.376401,...,0.0,0.0,6048610.0,0.0,1.0,285422800000.0,562231500000.0,836442200000.0,689.0,219879900.0


In [4]:
# remove 1 column between 2 highly correlated
def correl(data, threshold):
    corr = data.corr()
    print('Corr calculated')
    columns = np.full((corr.shape[0],), True, dtype=bool)
    for i in range(corr.shape[0]):
        for j in range(i+1, corr.shape[0]):
            if corr.iloc[i,j] >= threshold:
                if columns[j]:
                    columns[j] = False
    selected_columns = data.columns[columns]
    data = data[selected_columns]
    return data

X = correl(X, 0.9)

X.head()

Corr calculated


Unnamed: 0,Flow rate__ar_coefficient__k_10__coeff_1,Flow rate__ar_coefficient__k_10__coeff_4,"Flow rate__augmented_dickey_fuller__attr_""usedlag""",Flow rate__cid_ce__normalize_True,Flow rate__count_above_mean,Flow rate__count_below_mean,"Flow rate__fft_aggregated__aggtype_""centroid""","Flow rate__fft_aggregated__aggtype_""kurtosis""","Flow rate__fft_coefficient__coeff_10__attr_""angle""","Flow rate__fft_coefficient__coeff_11__attr_""angle""",...,Zone9_Pressure__partial_autocorrelation__lag_9,Zone9_Pressure__quantile__q_0.7,Zone9_Pressure__range_count__max_1__min_-1,Zone9_Pressure__skewness,Zone9_Pressure__spkt_welch_density__coeff_2,Zone9_Pressure__spkt_welch_density__coeff_5,Zone9_Pressure__spkt_welch_density__coeff_8,Zone9_Pressure__sum_of_reoccurring_data_points,Zone9_Pressure__symmetry_looking__r_0.1,Zone9_Pressure__symmetry_looking__r_0.15000000000000002
0,1.097224,0.013075,1.0,2.979672,372.0,544.0,49.084752,8.469481,2.951076,168.159905,...,-0.002299,0.0,749.0,2.331234,941450500.0,31325780.0,14951980.0,0.0,0.0,1.0
1,1.157133,-0.040777,1.0,2.496999,380.0,629.0,51.389448,9.12482,-105.444874,177.279461,...,-0.015294,0.0,752.0,1.532461,5196042000.0,291949300.0,33340110.0,0.0,0.0,0.0
2,1.076403,-0.005114,1.0,2.028811,425.0,435.0,32.345881,12.249977,65.125545,-55.917921,...,-0.026681,0.0,706.0,2.915235,5852888.0,376989.5,46760.94,0.0,1.0,1.0
3,1.083679,-0.004072,1.0,2.406142,425.0,538.0,46.956311,9.386179,136.341265,-148.918332,...,-0.015482,0.0,700.0,1.60486,3360284000.0,80318150.0,26510420.0,0.0,0.0,0.0
4,1.050232,0.013567,1.0,2.456812,410.0,525.0,45.288313,9.376401,-161.838119,122.674278,...,-0.010105,0.0,689.0,2.391474,105701200.0,14036070.0,6431597.0,0.0,0.0,1.0


In [5]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

df = pd.DataFrame(StandardScaler().fit_transform(X), columns=X.columns.values)
df['target'] = y

# Divide dataframe depending on target
df_correct = df.loc[df['target'] == 1]
df_incorrect = df.loc[df['target'] == 0]

# Divide on train/test
df_train, df_test_correct = train_test_split(df_correct, test_size=0.2, random_state=0)
df_test_incorrect = df_incorrect

X_train = df_train.iloc[:,:-1]
X_test_correct = df_test_correct.iloc[:,:-1]
X_test_incorrect = df_test_incorrect.iloc[:,:-1]

In [6]:
from sklearn.decomposition import PCA

pca = PCA(n_components=0.97).fit(X_train)
X_train = pca.transform(X_train)
X_test_correct = pca.transform(X_test_correct)
X_test_incorrect = pca.transform(X_test_incorrect)

In [9]:
def MyOneClassSVM(kernel='rbf'):
    import collections
    from sklearn.svm import OneClassSVM
    
    model = OneClassSVM(kernel=kernel, gamma='auto', nu=0.1).fit(X_train)
    pred_correct = model.predict(X_test_correct)
    pred_incorrect = model.predict(X_test_incorrect)
    pred_train = model.predict(X_train)

    print("PRED TRAIN", collections.Counter(pred_train))
    print("PRED CORRECT", collections.Counter(pred_correct))
    print("PRED INCORRECT", collections.Counter(pred_incorrect))
    
MyOneClassSVM(kernel='rbf')

PRED TRAIN Counter({1: 3328, -1: 716})
PRED CORRECT Counter({1: 696, -1: 315})
PRED INCORRECT Counter({1: 4091, -1: 868})
