In [1]:
import numpy as np
import pandas as pd
import math
import sys

import matplotlib.pyplot as plt 
%matplotlib inline

import random as rand
from collections import defaultdict

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer, FeatureHasher
from sklearn.linear_model import RidgeClassifier
from sklearn.manifold import Isomap 
from sklearn.manifold import MDS
from sklearn.manifold import TSNE 
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score

from sklearn import manifold 

from time import time

from umap import UMAP

In [2]:
adult_test_df = pd.read_csv('datasets/benchmark/adult_clean_test.csv')
adult_train_df = pd.read_csv('datasets/benchmark/adult_clean_train.csv')
titanic_test_df = pd.read_csv('datasets/benchmark/titanic_clean_test.csv')
titanic_train_df = pd.read_csv('datasets/benchmark/titanic_clean_train.csv')
weather_test_df = pd.read_csv('datasets/benchmark/weatherAUS_clean_test.csv')
weather_train_df = pd.read_csv('datasets/benchmark/weatherAUS_clean_train.csv')
wine_test_df = pd.read_csv('datasets/benchmark/wine_test.csv')
wine_train_df = pd.read_csv('datasets/benchmark/wine_train.csv')

In [3]:
def extract_X_y(df, col_y):
    X = df.drop(col_y, axis=1)
    y = df[col_y]
    return X, y

In [4]:
adult_test_X, adult_test_y = extract_X_y(adult_test_df, 'income_>50k')
adult_train_X, adult_train_y = extract_X_y(adult_train_df, 'income_>50k')

titanic_test_X, titanic_test_y = extract_X_y(titanic_test_df, 'Survived')
titanic_train_X, titanic_train_y = extract_X_y(titanic_train_df, 'Survived')

weather_test_X, weather_test_y = extract_X_y(weather_test_df, 'RainTomorrow')
weather_train_X, weather_train_y = extract_X_y(weather_train_df, 'RainTomorrow')

wine_test_X, wine_test_y = extract_X_y(wine_test_df, 'quality')
wine_train_X, wine_train_y = extract_X_y(wine_train_df, 'quality')

In [16]:
adult_test_iso_X = Isomap(n_neighbors=10, n_jobs=-1).fit_transform(adult_test_X.values) # n_components=2

In [15]:
adult_test_iso_X.shape

(9769, 2)

In [19]:
adult_test_tsne_X = TSNE().fit_transform(adult_test_X.values) # n_components=2



In [21]:
adult_test_tsne_X.shape 

(9769, 2)

In [20]:
adult_test_umap_X = UMAP().fit_transform(adult_test_X.values) #n_components=2

In [22]:
adult_test_umap_X.shape

(9769, 2)

In [24]:
adult_test_mds_X = MDS().fit_transform(adult_test_X.values) # n_components=2, 

In [25]:
adult_test_mds_X.shape

(9769, 2)

In [5]:
def compute_score(model, test_X, test_y):
    pred = model.predict(test_X)
    acc = accuracy_score(pred, test_y)
    f1 = f1_score(pred, test_y)
    conf = confusion_matrix(pred, test_y)

    return acc, f1, conf

In [6]:
def fit_compute_score(model, train_X, train_y, test_X, test_y):
    t0 = time()
    model.fit(train_X, train_y)
    duration = time() - t0
    train_acc, train_f1, train_conf = compute_score(model, train_X, train_y)
    test_acc, test_f1, test_conf = compute_score(model, test_X, test_y)
    print("duration", duration)
    print("train accuracy", train_acc)
    print("train f1", train_f1)
    print("train tn, fp, fn, tp", train_conf.ravel()) 
    print("test accuracy", test_acc)
    print("test f1", test_f1)
    print("test tn, fp, fn, tp", test_conf.ravel())
    print()
    
    return train_acc, train_f1, train_conf, test_acc, test_f1, test_conf

In [7]:
def evaluate_reduction(model_factory, model_name, train_X, train_y, test_X, test_y):
    clf_ridge_default = RidgeClassifier()
    print('ridge_default')
    ridge_default_train_acc, ridge_default_train_f1, ridge_default_train_conf, \
        ridge_default_test_acc, ridge_default_test_f1, ridge_default_test_conf = \
        fit_compute_score(clf_ridge_default, train_X, train_y, test_X, test_y)

    n_estimators = 1000
    clf_randfor_default = RandomForestClassifier(n_jobs=-1, n_estimators=n_estimators)
    print('randfor_default')
    randfor_default_train_acc, randfor_default_train_f1, randfor_default_train_conf, \
        randfor_default_test_acc, randfor_default_test_f1, randfor_default_test_conf = \
        fit_compute_score(clf_randfor_default, train_X, train_y, test_X, test_y)

    ridge_train_acc = []
    ridge_train_f1 = []
    ridge_train_conf = []
    ridge_test_acc = []
    ridge_test_f1 = []
    ridge_test_conf = []
    
    randfor_train_acc = []
    randfor_train_f1 = []
    randfor_train_conf = []
    randfor_test_acc = []
    randfor_test_f1 = []
    randfor_test_conf = []

    for i in range(2, 11, 2):
        percent = 0.1*i
        n_features = math.ceil(train_X.shape[1]*percent)

        t0 = time()
        reduction_model = model_factory(n_features)
        train_X_reduced = reduction_model.fit_transform(train_X)
        test_X_reduced  = reduction_model.fit_transform(test_X)
        duration = time() - t0
        print("transform duration", duration, "reduce percent", percent, "n_features", n_features)
        print()

        clf_ridge_hash = RidgeClassifier()
        ridge_name = 'ridge_' + model_name
        print(ridge_name)
        ridge_hash_train_acc, ridge_hash_train_f1, ridge_hash_train_conf, \
            ridge_hash_test_acc, ridge_hash_test_f1, ridge_hash_test_conf = \
            fit_compute_score(clf_ridge_hash, train_X_reduced, train_y, test_X_reduced, test_y)
        ridge_train_acc.append(ridge_hash_train_acc)
        ridge_train_f1.append(ridge_hash_train_f1)
        ridge_train_conf.append(ridge_hash_train_conf)
        ridge_test_acc.append(ridge_hash_test_acc)
        ridge_test_f1.append(ridge_hash_test_f1)
        ridge_test_conf.append(ridge_hash_test_conf)

        clf_randfor_hash = RandomForestClassifier(n_jobs=-1, n_estimators=n_estimators)
        randfor_name = 'randfor_' + model_name
        print(randfor_name)
        randfor_hash_train_acc, randfor_hash_train_f1, randfor_hash_train_conf, \
            randfor_hash_test_acc, randfor_hash_test_f1, randfor_hash_test_conf = \
            fit_compute_score(clf_randfor_hash, train_X_reduced, train_y, test_X_reduced, test_y)
        randfor_train_acc.append(randfor_hash_train_acc)
        randfor_train_f1.append(randfor_hash_train_f1)
        randfor_train_conf.append(randfor_hash_train_conf)
        randfor_test_acc.append(randfor_hash_test_acc)
        randfor_test_f1.append(randfor_hash_test_f1)
        randfor_test_conf.append(randfor_hash_test_conf)
    
    t = np.arange(2, 11, 2)

    fig, ax = plt.subplots(2, 2, figsize=(15,10), tight_layout=True)
    
    ax[0,0].plot(t, ridge_default_train_acc*np.ones(5), label='ridge_default')
    ax[0,0].plot(t, randfor_default_train_acc*np.ones(5), label='randfor_default')
    ax[0,0].plot(t, ridge_train_acc, label=ridge_name)
    ax[0,0].plot(t, randfor_train_acc, label=randfor_name)
    ax[0,0].set(xlabel='reduce %', ylabel='accuracy', title='train')
    ax[0,0].legend()

    ax[0,1].plot(t, ridge_default_test_acc*np.ones(5), label='ridge_default')
    ax[0,1].plot(t, randfor_default_test_acc*np.ones(5), label='randfor_default')
    ax[0,1].plot(t, ridge_test_acc, label=ridge_name)
    ax[0,1].plot(t, randfor_test_acc, label=randfor_name)
    ax[0,1].set(xlabel='reduce %', ylabel='accuracy', title='test')
    ax[0,1].legend()
    
    ax[1,0].plot(t, ridge_default_train_f1*np.ones(5), label='ridge_default')
    ax[1,0].plot(t, randfor_default_train_f1*np.ones(5), label='randfor_default')
    ax[1,0].plot(t, ridge_train_f1, label=ridge_name)
    ax[1,0].plot(t, randfor_train_f1, label=randfor_name)
    ax[1,0].set(xlabel='reduce %', ylabel='f1', title='train')
    ax[1,0].legend()
    
    ax[1,1].plot(t, ridge_default_test_f1*np.ones(5), label='ridge_default')
    ax[1,1].plot(t, randfor_default_test_f1*np.ones(5), label='randfor_default')
    ax[1,1].plot(t, ridge_test_f1, label=ridge_name)
    ax[1,1].plot(t, randfor_test_f1, label=randfor_name)
    ax[1,1].set(xlabel='reduce %', ylabel='f1', title='test')
    ax[1,1].legend()

    plt.show()

In [None]:
# n_neighbors = 10 because warning
evaluate_reduction(lambda n: Isomap(n_neighbors=10, n_jobs=-1, n_components=n), "iso", adult_train_X, adult_train_y, adult_test_X, adult_test_y)

ridge_default
duration 0.09359264373779297
train accuracy 0.8433445089959819
train f1 0.6108462076419353
train tn, fp, fn, tp [28148  4476  1645  4804]
test accuracy 0.8356024158050978
test f1 0.6044334975369458
test tn, fp, fn, tp [6936 1180  426 1227]

randfor_default
duration 14.188953399658203
train accuracy 0.9999488137588616
train f1 0.9998922413793103
train tn, fp, fn, tp [29792     1     1  9279]
test accuracy 0.8556658818712253
test f1 0.678669097538742
test tn, fp, fn, tp [6870  918  492 1489]



  self._fit_transform(X)
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
