In [1]:
from keras.layers import Input, Dense
from keras.models import Model, Sequential
from keras import regularizers
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.manifold import TSNE
from sklearn import preprocessing 
import matplotlib.pyplot as plt
import pandas as pd 
import numpy as np
import seaborn as sns
# Apply seaborn's "whitegrid" theme to the figures drawn in this notebook.
sns.set(style="whitegrid")
# Seed NumPy's global random number generator so NumPy-driven randomness
# repeats from run to run.
np.random.seed(203)

Using TensorFlow backend.


In [2]:
from sklearn.cluster import DBSCAN, KMeans, AgglomerativeClustering
from sklearn import metrics

In [3]:
def tsne_plot(x1, y1, name="graph.png"):
    """Project ``x1`` to 2-D with t-SNE and scatter-plot it, one colour per class.

    Args:
        x1: 2-D array-like of samples (rows) by features (columns).
        y1: 1-D array-like of numeric class labels, one per row of ``x1``.
            Labels are compared by absolute value, so a label of -1 (e.g.
            DBSCAN noise) is drawn together with class 1.
        name: Path the figure is saved to before it is shown.

    Classes are drawn for every integer from 0 up to the largest absolute
    label; integers with no members are skipped and get no legend entry.
    """
    embedding = TSNE(n_components=2, random_state=0).fit_transform(x1)

    # Take the upper bound from the absolute labels as well, so that a label
    # set such as {-1, 0} still draws the rows folded into class 1.
    classes = np.abs(np.asarray(y1))
    n_classes = int(classes.max()) + 1

    # i / 10 is the historical colour spacing; widen the span when there are
    # more than ten classes so that high class indices keep distinct colours.
    colour_span = float(max(n_classes, 10))

    plt.figure(figsize=(12, 8))
    for i in range(n_classes):
        members = classes == i
        if not members.any():
            continue
        plt.scatter(embedding[members, 0], embedding[members, 1], marker='o',
                    color=plt.cm.nipy_spectral(i / colour_span),
                    linewidths=1, alpha=0.8, label='{}_label'.format(i))

    plt.legend(loc='best')
    plt.savefig(name)
    plt.show()

In [4]:
# '?' marks a missing value in the raw feature files. Letting the parser map it
# to NaN allows every feature column to be held as float64 without a separate
# in-place replace step.
x_train_o = pd.read_csv('financial_data.csv', na_values='?').astype('float64')
y_train_o = pd.read_csv('revealed_businesses.csv')

x_test_o = pd.read_csv("testing_data.csv", na_values='?').astype('float64')

# Left join on Var1 keeps every row of the feature table; rows without a match
# in the label table get NaN in the label table's columns.
data_all = x_train_o.merge(y_train_o, on='Var1', how='left')

# Var66 is used as the label below (presumably supplied by
# revealed_businesses.csv -- confirm); NaN there means "no label".
has_label = data_all['Var66'].notnull()
data_nolabel = data_all[~has_label]
data_label = data_all[has_label]

# Columns removed from the feature matrices: the join key, the label, and Var38.
# NOTE(review): the reason for dropping Var38 is not visible here -- confirm.
non_feature_cols = ['Var1', 'Var66', 'Var38']

data_nolabel_v = data_nolabel.drop(columns=non_feature_cols)
data_nolabel_id = data_nolabel['Var1']

data_label_v = data_label.drop(columns=non_feature_cols)
data_label_id = data_label['Var1']

# Mean imputation. Each subset is filled with its OWN column means, so the same
# missing feature is filled with different values in the two subsets.
# NOTE(review): a column that is entirely NaN within a subset stays NaN.
data_nolabel_v_f = data_nolabel_v.fillna(data_nolabel_v.mean())
data_label_v_f = data_label_v.fillna(data_label_v.mean())


In [5]:
# Imputed labeled features and their Var66 values, as plain NumPy arrays.
X_label, Y_label = data_label_v_f.values, data_label['Var66'].values

In [6]:
# 2-D t-SNE view of the raw (unscaled) labeled features, coloured by Var66.
tsne_plot(x1=X_label, y1=Y_label, name="original.png")

KeyboardInterrupt: 

In [None]:
# The unlabeled rows get one placeholder class so the whole plot is a single
# colour. The vector is all zeros, despite the variable's name.
n_unlabeled = data_nolabel_v_f.shape[0]
y_nolabel_ones = np.zeros(n_unlabeled)
tsne_plot(x1=data_nolabel_v_f.values, y1=y_nolabel_ones, name="nolabel_original.png")

In [None]:
## autoencoder graph: n_features -> 100 -> 50 -> 50 -> 100 -> n_features
n_features = X_label.shape[1]
input_layer = Input(shape=(n_features,))

## encoder: L1 penalty on the first layer's activations (10e-5 is 1e-4)
sparsity_penalty = regularizers.l1(10e-5)
encoded = Dense(100, activation='tanh', activity_regularizer=sparsity_penalty)(input_layer)
encoded = Dense(50, activation='relu')(encoded)

## decoder: widens back out from the 50-unit code
decoded = Dense(50, activation='tanh')(encoded)
decoded = Dense(100, activation='tanh')(decoded)

## reconstruction: one relu unit per input feature
output_layer = Dense(n_features, activation='relu')(decoded)

In [None]:
# Wrap the input -> reconstruction graph in a model and train it to minimise
# mean squared reconstruction error with the Adadelta optimizer.
autoencoder = Model(inputs=input_layer, outputs=output_layer)
autoencoder.compile(loss="mse", optimizer="adadelta")

In [None]:
# Features (x) and Var66 labels (y) of the labeled rows, as NumPy arrays.
x, y = data_label_v_f.values, data_label['Var66'].values

In [None]:
# Min-max scale every feature column, then separate the rows by label value:
# y == 0 goes to x_normal, y == 1 goes to x_bankrupt.
feature_scaler = preprocessing.MinMaxScaler()
x_scale = feature_scaler.fit_transform(x)
x_normal = x_scale[y == 0]
x_bankrupt = x_scale[y == 1]

In [None]:
# Train on the y == 0 rows only; the input doubles as the reconstruction
# target. 20% of those rows are held out for validation.
autoencoder.fit(
    x=x_normal,
    y=x_normal,
    epochs=10,
    batch_size=256,
    validation_split=0.20,
    shuffle=True,
);

In [None]:
#autoencoder.save('3_24_2.h5')

In [None]:
# Encoder-only model: stack the autoencoder's first three layers (indices 0-2)
# so that predict() returns the output of the third one.
hidden_representation = Sequential()
for trained_layer in autoencoder.layers[:3]:
    hidden_representation.add(trained_layer)

In [None]:
# Encode both classes separately.
norm_hid_rep = hidden_representation.predict(x_normal)
bankrupt_hid_rep = hidden_representation.predict(x_bankrupt)

# Matching label vectors: 0 for the y == 0 rows, 1 for the y == 1 rows.
y_n = np.zeros(len(norm_hid_rep))
y_f = np.ones(len(bankrupt_hid_rep))

# Stack encodings and labels in the same order (class 0 first), then plot.
rep_x = np.concatenate((norm_hid_rep, bankrupt_hid_rep), axis=0)
rep_y = np.concatenate((y_n, y_f))
tsne_plot(x1=rep_x, y1=rep_y, name="latent_representation.png")

In [None]:
# cluser = DBSCAN(eps=0.1, min_samples=10).fit(rep_x)
# labels = cluser.labels_
# y_pred = pd.DataFrame(labels, columns=["Var66"])
# y_pred.loc[y_pred['Var66'] == -1, 'Var66'] = 1
# tsne_plot(rep_x, labels, "{}.png".format('DBSCAN'))
# y_pred['Var66'].value_counts()

In [None]:
# cluster = KMeans(n_clusters=2).fit(rep_x)
# labels = cluster.labels_
# y_pred = pd.DataFrame(labels, columns=["Var66"])
# y_pred['Var66'].value_counts()
# tsne_plot(rep_x, labels, "{}.png".format('KMeans'))

In [None]:
# from sklearn import mixture
# cluster = mixture.GaussianMixture(n_components=2, covariance_type='full').fit(rep_x)
# labels = cluster.predict(rep_x)
# y_pred = pd.DataFrame(labels, columns=["Var66"])
# tsne_plot(rep_x, labels, "{}.png".format('KMeans'))
# y_pred['Var66'].value_counts()

In [None]:
# y_pred.loc[y_pred['Var66'] == 0, 'Var66'] = 2
# y_pred.loc[y_pred['Var66'] == 1, 'Var66'] = 0
# y_pred.loc[y_pred['Var66'] == 2, 'Var66'] = 1

In [None]:
# The encoder was trained on features scaled with the per-column min/max of the
# LABELED rows. Fitting a fresh scaler on the unlabeled rows would map the same
# raw value to a different input, so learn the scaling from the labeled features
# and only apply it here. MinMaxScaler.fit is deterministic, so this reproduces
# the scaling used for training. Unlabeled values outside the labeled range
# fall outside [0, 1].
label_scaler = preprocessing.MinMaxScaler().fit(data_label_v_f.values)
x_scale_no = label_scaler.transform(data_nolabel_v_f.values)
test_x = hidden_representation.predict(x_scale_no)

In [None]:
# Single placeholder class (0) for every unlabeled row; plot their encodings.
test_y = np.zeros(len(data_nolabel_v_f))
tsne_plot(x1=test_x, y1=test_y, name="test.png")

In [None]:
# cluser = DBSCAN(eps=0.1, min_samples=10).fit(data_nolabel_v_f.values)
# labels = cluser.labels_
# y_pred = pd.DataFrame(labels, columns=["Var66"])
# y_pred.loc[y_pred['Var66'] == -1, 'Var66'] = 1
# tsne_plot(data_nolabel_v_f.values, labels, "{}.png".format('DBSCAN'))
# y_pred['Var66'].value_counts()

In [None]:
# Density-based clustering of the encoded unlabeled rows. DBSCAN marks rows it
# cannot assign to any cluster with the label -1.
clusterer = DBSCAN(eps=0.1, min_samples=5)
cluser = clusterer.fit(test_x)  # fit() returns the estimator itself
labels = cluser.labels_
y_pred = pd.DataFrame(labels, columns=["Var66"])
# The -1 rows are relabelled in the following cell; show the raw counts here.
y_pred['Var66'].value_counts()

In [None]:
# Rewrite DBSCAN's -1 label as 1 in the prediction frame.
# NOTE(review): if DBSCAN also produced a cluster numbered 1, those rows become
# indistinguishable from the relabelled ones -- confirm this is intended.
noise_rows = y_pred['Var66'] == -1
y_pred.loc[noise_rows, 'Var66'] = 1

plot_file = "{}.png".format('DBSCAN')
tsne_plot(x1=test_x, y1=labels, name=plot_file)

In [None]:
# from sklearn import mixture
# cluster = mixture.GaussianMixture(n_components=2, covariance_type='full').fit(test_x)
# labels = cluster.predict(test_x)
# y_pred = pd.DataFrame(labels, columns=["Var66"])
# tsne_plot(test_x, labels, "{}.png".format('KMeans'))
# y_pred['Var66'].value_counts()

In [None]:
# cluster = KMeans(n_clusters=2).fit(test_x)
# labels = cluster.labels_
# y_pred = pd.DataFrame(labels, columns=["Var66"])
# y_pred.loc[y_pred['Var66'] == -1, 'Var66'] = 1
# tsne_plot(test_x, labels, "{}.png".format('DBSCAN'))
# y_pred['Var66'].value_counts()