<a href="https://colab.research.google.com/github/thissop/MAXI-J1535/blob/main/code/notebooks/December-%202021-2022/classifier_decider.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import minmax_scale as normalize
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, mean_squared_error, median_absolute_error
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV


# Processed, merged observation table used by every cell below.
# The URL used to carry a personal GitHub raw-content token (`?token=...`).
# Credentials must not be committed to a notebook, so the token is removed;
# the one that was published here should be treated as compromised.
# NOTE(review): this assumes the repository is publicly readable. If it is
# private, supply credentials out-of-band (e.g. from an environment variable)
# instead of pasting them into the URL.
DATA_URL = 'https://raw.githubusercontent.com/thissop/MAXI-J1535/main/data/processed/fixed_merged.csv'
data_df = pd.read_csv(DATA_URL)

In [None]:
# Partition the observations by QPO presence.
# A row counts as "no QPO" when its first_par1s entry is exactly 0.
qpo_absent_mask = data_df['first_par1s'] == 0

# Boolean selection keeps the file order; sample(frac=1) then shuffles every
# row of each subset (QPO subset first, so the random draws happen in the
# same sequence as before).
has_qpo_df = data_df[~qpo_absent_mask].sample(frac=1)
no_qpo_df = data_df[qpo_absent_mask].sample(frac=1)

# Number of observations that contain at least one QPO.
print(has_qpo_df.shape[0])

57


In [None]:
# Prepare the model inputs for both subsets (QPO present / QPO absent).

FEATURE_COLUMNS = ['hardness', 'tins', 'disk_norm', 'gammas', 'nthcomp_norms', 'intensities']
FREQUENCY_COLUMNS = ['first_par1s', 'second_par1s', 'third_par1s']

# Min-max scale every feature ONCE over the full table. Previously each subset
# was scaled on its own, which put the QPO and no-QPO rows on different scales
# (and used the class label to choose the scaling) before they were
# concatenated into a single classification problem. With one shared scaling a
# given physical value maps to the same number in both subsets; a subset on
# its own therefore no longer necessarily spans the full [0, 1] range.
scaled_features = pd.DataFrame(
    normalize(data_df[FEATURE_COLUMNS].to_numpy(dtype=np.float64)),
    index=data_df.index,
    columns=FEATURE_COLUMNS,
)


def _prepare_subset_arrays(subset_df):
    """Build the (x, y, num_qpos) arrays for one subset of ``data_df``.

    Parameters
    ----------
    subset_df : pandas.DataFrame
        Rows taken from ``data_df`` (any order); its index labels are used to
        look up the already-scaled features.

    Returns
    -------
    x : ndarray, shape (n_rows, 6), float64
        Features in ``FEATURE_COLUMNS`` order, scaled over the full table.
    y : ndarray, shape (n_rows, 3), float64
        The three frequency columns, each min-max scaled within this subset
        (unchanged from the previous behaviour).
    n_qpos : ndarray, shape (n_rows, 1), int
        The ``num_qpos`` column as an integer column vector.
    """
    # Label-based lookup preserves the (shuffled) row order of subset_df.
    x = scaled_features.loc[subset_df.index].to_numpy()
    # Guards against duplicated index labels silently multiplying rows.
    assert len(x) == len(subset_df), "feature rows do not line up with subset rows"

    y = normalize(subset_df[FREQUENCY_COLUMNS].to_numpy(dtype=np.float64))
    n_qpos = subset_df['num_qpos'].to_numpy().reshape(-1, 1).astype(int)
    return x, y, n_qpos


# QPO DATA
x_vals, y_vals, num_qpos = _prepare_subset_arrays(has_qpo_df)

# NO QPO DATA
x_vals_no_qpo, y_vals_no_qpo, num_qpos_no_qpo = _prepare_subset_arrays(no_qpo_df)

In [None]:
# General functions

def split_data(x_vals, y_vals, num_qpos, test_size=0.1, random_state=None):
    """Randomly split features, targets and QPO counts into aligned train/test parts.

    Parameters
    ----------
    x_vals, y_vals, num_qpos : array-like
        Arrays with the same number of rows; all three are split with the same
        random row assignment.
    test_size : float or int, default 0.1
        Forwarded to ``train_test_split``. The default matches the value that
        used to be hard-coded here.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``train_test_split``. ``None`` keeps the previous
        unseeded behaviour; pass an int for a reproducible split.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, qpo_train, qpo_test)``.
    """
    return tuple(
        train_test_split(
            x_vals, y_vals, num_qpos,
            test_size=test_size,
            random_state=random_state,
        )
    )

def knn_predict(k, xtrain, ytrain, xtest, ytest):
    """Fit a k-nearest-neighbours classifier and score it on a test set.

    Parameters
    ----------
    k : int
        Number of neighbours.
    xtrain, xtest : array-like
        Training / test features.
    ytrain, ytest : array-like
        Training / test labels. Both are flattened to 1-D, so they may be
        passed either flat or as ``(n, 1)`` column vectors.

    Returns
    -------
    acc : float
        Fraction of test labels predicted correctly.
    predictions : ndarray
        Predicted label for each row of ``xtest``.
    """
    knn = KNeighborsClassifier(n_neighbors=k)
    # Flatten the training labels the same way the test labels are flattened;
    # a column vector makes scikit-learn emit a DataConversionWarning.
    knn.fit(xtrain, np.ravel(ytrain))
    predictions = knn.predict(xtest)
    # accuracy_score's documented order is (y_true, y_pred).
    acc = accuracy_score(np.ravel(ytest), predictions)

    return acc, predictions

def cullBuffers(x, qpo_classes):
    """Zero out the value slots that lie beyond each row's QPO class.

    For a row with class ``c`` (0, 1, 2 or 3) the first ``c`` of its first
    three values are kept and the remaining slots are set to 0.

    Parameters
    ----------
    x : array-like
        One row per observation; only the first three columns are read.
    qpo_classes : array-like
        One class per row of ``x``; may be flat or an ``(n, 1)`` column vector.

    Returns
    -------
    ndarray, shape (n_rows, 3), float64

    Raises
    ------
    ValueError
        If ``x`` and ``qpo_classes`` differ in length, or a class is not one
        of 0, 1, 2, 3. Such rows used to be dropped silently, which left the
        result misaligned with the inputs.
    """
    classes = np.ravel(np.asarray(qpo_classes))
    n_rows = len(x)
    if n_rows != classes.size:
        raise ValueError(
            "x has %d rows but qpo_classes has %d entries" % (n_rows, classes.size)
        )

    # Pre-allocate once instead of growing the array with vstack per row.
    new_x = np.zeros((n_rows, 3), dtype=np.float64)
    if n_rows == 0:
        return new_x

    if not np.isin(classes, (0, 1, 2, 3)).all():
        raise ValueError("qpo_classes must only contain 0, 1, 2 or 3")

    x = np.asarray(x, dtype=np.float64)
    for slot in range(3):
        keep = classes > slot
        # Only touch column `slot` of x when some row actually needs it.
        if keep.any():
            new_x[keep, slot] = x[keep, slot]

    return new_x

def drop_no_qpo(X_test, y_test, knn_qpo_predictions):
    """Keep only the test rows whose predicted QPO class is non-zero.

    Parameters
    ----------
    X_test, y_test : array-like
        Test features / targets, one row per observation.
    knn_qpo_predictions : array-like
        Predicted class per row; may be flat or an ``(n, 1)`` column vector.

    Returns
    -------
    tuple of ndarray
        ``(X_test, y_test)`` restricted to rows with a non-zero prediction.
    """
    # Flatten first: np.where on a column vector yields (row, col) index
    # pairs, which would select single elements instead of whole rows.
    keep = np.ravel(np.asarray(knn_qpo_predictions)) != 0
    return np.asarray(X_test)[keep], np.asarray(y_test)[keep]

In [None]:
def custom_final_split():
    """Draw one random train/test split of the combined QPO / no-QPO data.

    The QPO-present and QPO-absent arrays are each split with ``split_data``
    and the resulting parts are concatenated (QPO rows first).

    Returns
    -------
    tuple of ndarray
        ``(X_train_combined, qpo_train_combined, X_test_combined,
        qpo_test_combined)``. Note the order: train features, train labels,
        test features, test labels.
    """
    # The y (frequency) splits are not needed for classification, so they are
    # discarded; y is still passed so the split call itself is unchanged.
    X_train, X_test, _, _, qpo_train, qpo_test = split_data(x_vals, y_vals, num_qpos)
    X_train_no_qpo, X_test_no_qpo, _, _, qpo_train_no_qpo, qpo_test_no_qpo = split_data(
        x_vals_no_qpo, y_vals_no_qpo, num_qpos_no_qpo
    )

    X_train_combined = np.concatenate((X_train, X_train_no_qpo))
    X_test_combined = np.concatenate((X_test, X_test_no_qpo))
    qpo_train_combined = np.concatenate((qpo_train, qpo_train_no_qpo))
    qpo_test_combined = np.concatenate((qpo_test, qpo_test_no_qpo))

    return X_train_combined, qpo_train_combined, X_test_combined, qpo_test_combined

In [None]:
# Mean KNN accuracy over many random splits, for each candidate k.
# The previous version looped over [KNeighborsClassifier, RandomForestClassifier]
# but ended in an `elif` with no body, which is a syntax error; the random
# forest is tuned and evaluated in its own cells further down, so only the
# KNN sweep lives here.
knn_vals = [3, 5, 7, 9]
N_RANDOM_SPLITS = 1000
knn_accs_arr = []

for n_neighbors in knn_vals:
    clf = KNeighborsClassifier(n_neighbors=n_neighbors)
    accs_arr = []
    for _ in range(N_RANDOM_SPLITS):
        # These names stay at notebook level on purpose: later cells reuse the
        # most recent split.
        X_train_combined, qpo_train_combined, X_test_combined, qpo_test_combined = custom_final_split()
        clf.fit(X_train_combined, np.ravel(qpo_train_combined))
        clf_predictions = clf.predict(X_test_combined)
        # accuracy_score's documented order is (y_true, y_pred).
        accs_arr.append(accuracy_score(np.ravel(qpo_test_combined), clf_predictions))
    # One entry per k, in the same order as knn_vals.
    knn_accs_arr.append(np.mean(accs_arr))


In [None]:
# Report the best mean accuracy together with the k that produced it.
best_position = np.argmax(knn_accs_arr)
print(knn_accs_arr[best_position], knn_vals[best_position])

0.9330000000000002 3


In [None]:
# Mean accuracy for each candidate k, in the same order as knn_vals.
print(knn_accs_arr)

[0.9330000000000002, 0.9220526315789475, 0.9223684210526317, 0.9237894736842106]


In [None]:
# Tune the random forest's leaf-size and split-size limits with an
# exhaustive grid search over every combination below.
rf_accs = []
rf_params = dict(
    min_samples_leaf=[1, 2, 4],
    min_samples_split=[2, 5, 10],
)
rf = RandomForestClassifier()
clf = GridSearchCV(estimator=rf, param_grid=rf_params)

# X_train_combined / qpo_train_combined are whichever split was drawn last by
# the preceding evaluation loop.
clf.fit(X_train_combined, np.ravel(qpo_train_combined))
clf.best_score_

0.9333333333333333

In [None]:
# Parameter combination that scored best in the grid search above.
clf.best_params_

{'min_samples_leaf': 4, 'min_samples_split': 2}

In [None]:
# Mean random-forest accuracy over 1000 independent random splits, using the
# hyperparameters selected by the grid search.
clf = RandomForestClassifier(min_samples_split=2, min_samples_leaf=4)
accs_arr = []
for i in range(1000):
    split = custom_final_split()
    X_train_combined, qpo_train_combined, X_test_combined, qpo_test_combined = split
    # fit() returns the estimator itself, so fitting and predicting can be chained.
    clf_predictions = clf.fit(X_train_combined, np.ravel(qpo_train_combined)).predict(X_test_combined)
    accs_arr.append(accuracy_score(clf_predictions, qpo_test_combined))

np.mean(accs_arr)

0.9328421052631579

In [None]:
# Head-to-head comparison: score KNN (k=3) and the tuned random forest on the
# SAME 25 random splits.
knn_accs_arr, rf_accs_arr = [], []
clf = RandomForestClassifier(min_samples_split=2, min_samples_leaf=4)
for i in range(25):
    X_train_combined, qpo_train_combined, X_test_combined, qpo_test_combined = custom_final_split()
    train_labels = np.ravel(qpo_train_combined)

    # KNN: a fresh estimator per split (fit() returns the estimator itself).
    knn = KNeighborsClassifier(3).fit(X_train_combined, train_labels)
    knn_predictions = knn.predict(X_test_combined)
    knn_accs_arr.append(accuracy_score(knn_predictions, qpo_test_combined))

    # Random forest: the same estimator object is refit on every split.
    clf_predictions = clf.fit(X_train_combined, train_labels).predict(X_test_combined)
    rf_accs_arr.append(accuracy_score(clf_predictions, qpo_test_combined))

In [None]:
# Mean accuracy over the shared splits: KNN first, then random forest.
knn_mean_acc = np.mean(knn_accs_arr)
rf_mean_acc = np.mean(rf_accs_arr)
print(knn_mean_acc, rf_mean_acc)

0.9242105263157893 0.9221052631578947


In [None]:
import time

In [None]:
# Wall-clock cost of one KNN fit + predict + score on the most recent split.
# time.perf_counter() is the clock intended for measuring short intervals:
# it is monotonic and high-resolution, whereas time.time() can jump if the
# system clock is adjusted.
# NOTE: this is a single run, so the figure is only a rough indication.
knn = KNeighborsClassifier(3)
knn_start = time.perf_counter()
knn.fit(X_train_combined, np.ravel(qpo_train_combined))
knn_predictions = knn.predict(X_test_combined)
knn_acc = accuracy_score(knn_predictions, qpo_test_combined)
knn_end = time.perf_counter()
knn_end - knn_start

0.0051996707916259766

In [None]:
# Wall-clock cost of one random-forest fit + predict + score on the most
# recent split. time.perf_counter() is the clock intended for measuring short
# intervals: it is monotonic and high-resolution, whereas time.time() can jump
# if the system clock is adjusted.
# NOTE: this is a single run, so the figure is only a rough indication.
clf = RandomForestClassifier(min_samples_leaf=4, min_samples_split=2)
rf_start = time.perf_counter()
clf.fit(X_train_combined, np.ravel(qpo_train_combined))
clf_predictions = clf.predict(X_test_combined)
rf_acc = accuracy_score(clf_predictions, qpo_test_combined)
rf_end = time.perf_counter()
rf_end - rf_start

0.16596364974975586

## RESULTS
* Tentative choice: the classification algorithm going forward will be `RandomForestClassifier` with `min_samples_leaf=4` and `min_samples_split=2`.
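* Actually, not sure yet: KNN (k=3) and RF reach almost the same mean accuracy (about 0.924 vs 0.922 over 25 shared splits), and KNN is a lot faster than RF (about 0.005 s vs 0.166 s for a single fit-and-predict run).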