In [15]:
import numpy as np, pandas as pd
from matplotlib import pyplot as plt

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_selection import SelectKBest
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.decomposition import PCA, KernelPCA

from WISE_tools import plot_confusion_matrix

%matplotlib inline

# Read in Data

In [4]:
good = pd.read_csv('../data/bright_sample/massive.csv.gz')
variability_df = pd.read_csv('first_order_statistics.csv')
timescale_df = pd.read_csv('timescales.csv')
classification_df = pd.read_csv('classifications.csv')
joined = good.merge(classification_df,on='source_id')
joined = joined.merge(variability_df,left_on='designation',right_on='WISE_Designation')
joined = joined.merge(timescale_df,left_on='designation',right_on='WISE_Designation')
len(joined),joined.columns

(6484,
 Index(['BP-RP', 'Galaxy', 'M_G', 'M_W1', 'W1-W2', 'W2-W3', 'W3-W4', 'a_g_val',
        'allwise_oid', 'dec', 'designation', 'original_ext_source_id',
        'parallax', 'parallax_error', 'phot_bp_mean_mag', 'phot_g_mean_mag',
        'phot_rp_mean_mag', 'r_est', 'r_hi', 'r_len', 'r_lo', 'ra', 'source_id',
        'w1mpro', 'w1mpro_error', 'w2mpro', 'w2mpro_error', 'w3mpro',
        'w3mpro_error', 'w4mpro', 'w4mpro_error', 'CommonName_x', 'MKType',
        'SimbadOType', 'SpT', 'Jmag', 'e_Jmag', 'Hmag', 'e_Hmag', 'Kmag',
        'e_Kmag', 'G-J', 'J-H', 'H-K', 'K-W1', 'J-W1', 'Unnamed: 0_x',
        'CommonName_y', 'Class', 'CoarseClass', 'IsBinary', 'Unnamed: 0_y',
        'WISE_Designation_x', 'N_obs', 'W1chi2', 'W1chi2red', 'W1_MAD',
        'W1_EWM', 'W2chi2', 'W2chi2red', 'W2_MAD', 'W2_EWM', 'W1-W2chi2',
        'W1-W2chi2red', 'W1-W2_MAD', 'W1-W2_EWM', 'Unnamed: 0',
        'WISE_Designation_y', 'N_visits', 'W1_Nzeroes', 'W1_meanDt', 'W1_stdDt',
        'W2_Nzeroes', 'W2_

In [5]:
#Let's throw out classes that we don't care about/aren't specific enough, plus binaries, plus one
#bad entry
training_indices = np.array([c not in np.array(['Misc. Variable','OBA','Unknown/Candidate']) 
 for c in joined['Class'].values]) & np.array([i == 0 for i in joined['IsBinary'].values]) & (joined['J-H'] < 2)
training_data = joined[training_indices].fillna(0).reset_index()

#Our features will be color and G/W1 magnitude.
X = training_data[['M_G','G-J','J-H','H-K','K-W1','W1-W2','W2-W3','W3-W4','M_W1']]
y = training_data['Class']

len(y)

2948

In [18]:
#What features are we going to use? Let's use a PCA, a Kernel PCA, and some of the original
#features to do that!
pca = PCA(n_components=9) #9 linear PCA components
kpca = KernelPCA(n_components=9)
selection = SelectKBest(k=3) #3 original features

combined_features = FeatureUnion([("pca", pca), ("kpca", kpca), ("univ_select", selection)])

# Use combined features to transform dataset:
X_features = combined_features.fit(X, y).transform(X)

tree = DecisionTreeClassifier()
forest = RandomForestClassifier(n_jobs=-1)
gm = GaussianMixture(n_components=(len(np.unique(y))),
                     means_init=[X_features[y == c].mean(axis=0) for c in np.unique(y)])

In [19]:
#Now set up our pipelines for the three algorithms
tree_pipeline = Pipeline([("features", combined_features), ("clf", tree)])
forest_pipeline = Pipeline([("features", combined_features), ("clf", forest)])
gm_pipeline = Pipeline([("features", combined_features), ("clf", gm)])

In [20]:
tree_param_grid = dict(features__pca__n_components = [3, 6, 9],
                       features__pca__whiten = [True,False],
                       features__kpca__kernel = ['linear','poly','rbf','sigmoid'],
                       features__kpca__coef0 = np.linspace(0,9,21),
                       features__kpca__gamma = np.array([0.5,1,2])/len(X.columns),
                       features__kpca__degree = [2,3,4],
                       features__univ_select__k = [1, 2, 3, 4, 5],
                       clf__max_depth = np.arange(2,15,1),
                       clf__max_features = ['sqrt','log2',None])

forest_param_grid = dict(features__pca__n_components = [3, 6, 9],
                         features__pca__whiten = [True,False],
                         features__kpca__kernel = ['linear','poly','rbf','sigmoid'],
                         features__kpca__coef0 = np.linspace(0,9,21),
                         features__kpca__gamma = np.array([0.5,1,2])/len(X.columns),
                         features__kpca__degree = [2,3,4],
                         features__univ_select__k = [1, 2, 3, 4, 5],
                         clf__max_depth = [int(x) for x in np.linspace(10, 110, 11)]+[None],
                         clf__max_features = ['sqrt','log2',None],
                         clf__n_estimators = np.arange(70,140,10))

gm_param_grid = dict(features__pca__n_components = [3, 6, 9],
                     features__pca__whiten = [True,False],
                     features__kpca__kernel = ['linear','poly','rbf','sigmoid'],
                     features__kpca__coef0 = np.linspace(0,9,21),
                     features__kpca__gamma = np.array([0.5,1,2])/len(X.columns),
                     features__kpca__degree = [2,3,4],
                     features__univ_select__k = [1, 2, 3, 4, 5],
                     clf__covariance_type = ['full','tied','diag','spherical'])

In [21]:
tree_grid_search = RandomizedSearchCV(tree_pipeline, tree_param_grid, n_iter=500, cv=4, n_jobs=-1)
forest_grid_search =RandomizedSearchCV(forest_pipeline, forest_param_grid, n_iter=500, cv=4, n_jobs=-1)
gm_grid_search = RandomizedSearchCV(gm_pipeline, gm_param_grid, n_iter=500, cv=4, n_jobs=-1)

print('Finding tree...')
tree_grid_search.fit(X, y)
print('Finding forest...')
forest_grid_search.fit(X, y)
print('Finding Gaussian Mixture...')
gm_grid_search.fit(X, y)

best_tree = tree_grid_search.best_estimator_
best_forest = forest_grid_search.best_estimator_
best_gm = gm_grid_search.best_estimator_

Finding tree...




Finding forest...




Finding Gaussian Mixture...


ValueError: The parameter 'means' should have the shape of (10, 13), but got (10, 21)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)

#CM plots