In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC

In [None]:
# Read the feature table and the target table from their CSV files
# (paths are relative to the notebook's working directory).
data, target = (
    pd.read_csv(csv_path)
    for csv_path in ("data/data_n.csv", "data/target_n.csv")
)

In [None]:
# Summarise both frames (columns, dtypes, non-null counts).
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly: wrapping it in print() would append a stray "None" line
# after each report.
print("INFO data")
data.info(verbose=True)
print("INFO target")
target.info(verbose=True)

In [None]:
# Feature frame -> NumPy matrix (one row per sample).
X_train = np.asarray(data.values)
print("INFO data train ndim ", X_train.ndim)
print("INFO data train shape ", X_train.shape)

# Target frame -> NumPy array.
y_train = np.asarray(target.values)
# A one-column DataFrame yields an (n_samples, 1) array, but scikit-learn
# classifiers expect a 1-D label vector and emit a DataConversionWarning
# otherwise. Flatten only in that single-column case so that a multi-column
# target (if that is what the file holds) is left untouched.
if y_train.ndim == 2 and y_train.shape[1] == 1:
    y_train = y_train.ravel()
print("INFO data target ndim ", y_train.ndim)
print("INFO data target shape ", y_train.shape)

In [None]:
# No held-out split exists yet: the "test" names simply alias the training
# arrays, so any score computed on them is a training-set score.
X_test, y_test = X_train, y_train

In [None]:
# Dump the training features and targets for a quick visual check.
for _caption, _values in (("X_train ", X_train), ("y_train ", y_train)):
    print(_caption, _values)

In [None]:
# Supervised SVC parameter grid search

# Candidate hyper-parameter grids explored by cross-validation:
# an RBF kernel swept over gamma and C, and a linear kernel swept over C only.
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

# Each metric gets its own search; the macro-averaged variant is used as the
# model-selection criterion.
scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    # The estimator and the parameter grid are the two leading positional
    # arguments. The previous call placed `n_splits=1` between them, which is
    # a SyntaxError (positional argument after a keyword argument), and
    # `n_splits` is not a GridSearchCV parameter anyway. The number of folds
    # is controlled by `cv`, which needs at least 2 splits; the library
    # default is used here.
    clf = GridSearchCV(SVC(), tuned_parameters, scoring='%s_macro' % score)
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    cv_results = clf.cv_results_
    for mean, std, params in zip(cv_results['mean_test_score'],
                                 cv_results['std_test_score'],
                                 cv_results['params']):
        # Report mean score with a +/- two-standard-deviation band.
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    # NOTE: X_test / y_test are whatever an earlier cell bound them to; if
    # they alias the training arrays, this report is a training-set score.
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()

In [None]:
# DBSCAN
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.cluster import DBSCAN

# #############################################################################
# Compute DBSCAN
db = DBSCAN(eps=0.7, min_samples=100).fit(X_train)
labels = db.labels_
# Boolean mask that is True for the samples DBSCAN marked as core samples.
core_samples_mask = np.zeros_like(labels, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True

# Number of clusters in labels, ignoring noise (label -1) if present.
unique_labels = set(labels)
n_clusters_ = len(unique_labels) - (1 if -1 in unique_labels else 0)
n_noise_ = int(np.count_nonzero(labels == -1))

print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)
# silhouette_score raises ValueError unless the number of distinct labels is
# between 2 and n_samples - 1. With a large min_samples it is easy to end up
# with a single label (e.g. everything flagged as noise), so guard the call
# instead of letting the cell crash.
if 2 <= len(unique_labels) <= len(labels) - 1:
    print("Silhouette Coefficient: %0.3f"
          % metrics.silhouette_score(X_train, labels))
else:
    print("Silhouette Coefficient: undefined "
          "(needs between 2 and n_samples - 1 distinct labels)")
print('Estimated clusters ', labels)

# Plot result (only the first two feature columns are drawn).
# One colour per label; black is reserved for noise.
colors = [plt.cm.Spectral(each)
          for each in np.linspace(0, 1, len(unique_labels))]
for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise.
        col = [0, 0, 0, 1]

    class_member_mask = (labels == k)

    # Core samples of this label: large markers.
    xy = X_train[class_member_mask & core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
             markeredgecolor='k', markersize=14)

    # Non-core samples of this label: small markers.
    xy = X_train[class_member_mask & ~core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
             markeredgecolor='k', markersize=6)

plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()

In [None]:
# BIRCH
from numpy import unique
from numpy import where
from sklearn.cluster import Birch
from matplotlib import pyplot

# Build the BIRCH model (six final clusters) and fit it on the training
# features.
model = Birch(threshold=0.1, n_clusters=6)
model.fit(X_train)

# Cluster index assigned to every training sample, and the distinct indices.
yhat = model.predict(X_train)
clusters = unique(yhat)

# One scatter series per cluster, drawn over the first two feature columns.
for cluster in clusters:
    in_cluster = (yhat == cluster)
    pyplot.scatter(X_train[in_cluster, 0], X_train[in_cluster, 1])

pyplot.show()

In [None]:
# Kmeans
from sklearn.datasets import make_classification
from sklearn.cluster import KMeans
from matplotlib import pyplot

# Define the model. k-means starts from random centroids, so the seed is
# fixed to make the cluster assignment reproducible across kernel restarts.
model = KMeans(n_clusters=6, random_state=0)
# fit the model
model.fit(X_train)
# assign a cluster to each example
yhat = model.predict(X_train)
# Retrieve unique clusters. np.unique is used (rather than a bare `unique`)
# so this cell does not depend on names imported by a different cell.
clusters = np.unique(yhat)
# Create one scatter series per cluster over the first two feature columns.
for cluster in clusters:
    # boolean mask selecting the rows assigned to this cluster
    in_cluster = (yhat == cluster)
    pyplot.scatter(X_train[in_cluster, 0], X_train[in_cluster, 1])
# show the plot
pyplot.show()

In [None]:
# optics clustering
from sklearn.datasets import make_classification
from sklearn.cluster import OPTICS
from matplotlib import pyplot

# Define the model.
# NOTE(review): no `cluster_method` is passed, so the library default is used;
# in scikit-learn `eps` is documented as only taking effect with
# cluster_method='dbscan', so eps=0.8 may have no influence here - confirm the
# intended extraction method.
model = OPTICS(eps=0.8, min_samples=100)
# fit model and predict clusters
yhat = model.fit_predict(X_train)
# Retrieve unique clusters. np.unique is used (rather than a bare `unique`)
# so this cell does not depend on names imported by a different cell.
clusters = np.unique(yhat)
# Create one scatter series per cluster over the first two feature columns.
for cluster in clusters:
    # boolean mask selecting the rows assigned to this cluster
    in_cluster = (yhat == cluster)
    pyplot.scatter(X_train[in_cluster, 0], X_train[in_cluster, 1])
# show the plot
pyplot.show()

In [None]:
# gaussian mixture clustering
from numpy import unique
from numpy import where
from sklearn.datasets import make_classification
from sklearn.mixture import GaussianMixture
from matplotlib import pyplot

# Define the model. The EM fit starts from a randomised initialisation, so
# the seed is fixed: without it the component labels (which a later cell
# reuses as plot colours) change on every run.
model = GaussianMixture(n_components=6, random_state=0)
# fit the model
model.fit(X_train)
# assign a mixture component to each example
yhat = model.predict(X_train)
# retrieve unique clusters
clusters = unique(yhat)
# Create one scatter series per cluster over the first two feature columns.
for cluster in clusters:
    # boolean mask selecting the rows assigned to this cluster
    in_cluster = (yhat == cluster)
    pyplot.scatter(X_train[in_cluster, 0], X_train[in_cluster, 1])
# show the plot
pyplot.show()

In [None]:
"""
Feature selection is a technique used when the target variable is known (supervised learning).
In unsupervised learning there is no exact equivalent, but dimensionality reduction serves a similar purpose: it reduces the number of features to those that explain the most about the dataset. The resulting features are derived from the existing features and may or may not coincide with them.
There are different techniques which are available for doing so:

# PCA
# Linear discriminant analysis
Non-negative Matrix Factorization
Generalized discriminant analysis and many more.
The outcome of feature selection is a subset of the original features that explains the most with respect to the target variable, whereas the outcome of dimensionality reduction may or may not be the original features, because the new features are derived from the given input.
"""


In [None]:
# PCA
import matplotlib.pyplot as plt
from sklearn import decomposition

# Project the training features onto their first six principal components.
pca = decomposition.PCA(n_components=6)
X_reduced = pca.fit_transform(X_train)

# Report the real target dimensionality: the projection keeps six components
# and only the scatter plot below is two-dimensional.
print('Projecting %d-dimensional data to %dD (plotting the first 2 components)'
      % (X_train.shape[1], X_reduced.shape[1]))

plt.figure(figsize=(12,10))
# Points are coloured by `yhat`, i.e. the cluster labels left in the kernel by
# whichever clustering cell ran last before this one.
# pyplot.get_cmap replaces plt.cm.get_cmap, which has been removed from recent
# matplotlib releases; the second argument resamples the map to 10 colours.
plt.scatter(X_reduced[:, 0], X_reduced[:, 1], c=yhat,
            edgecolor='none', alpha=0.7, s=40,
            cmap=plt.get_cmap('nipy_spectral', 10))
plt.colorbar()
plt.title(' PCA projection');

In [None]:
# gaussian mixture clustering after PCA
from numpy import unique
from numpy import where
from sklearn.datasets import make_classification
from sklearn.mixture import GaussianMixture
from matplotlib import pyplot

# Define the model. The EM fit starts from a randomised initialisation, so
# the seed is fixed to make the component assignment reproducible.
model = GaussianMixture(n_components=6, random_state=0)
# fit the model on the PCA-reduced features
model.fit(X_reduced)
# assign a mixture component to each example
yhat = model.predict(X_reduced)
# retrieve unique clusters
clusters = unique(yhat)
# Create one scatter series per cluster over the first two principal
# components.
for cluster in clusters:
    # boolean mask selecting the rows assigned to this cluster
    in_cluster = (yhat == cluster)
    pyplot.scatter(X_reduced[in_cluster, 0], X_reduced[in_cluster, 1])
# show the plot
pyplot.show()
# Print the labels of every sample from index 100 onward.
print(yhat[100:])