In [74]:
import pandas as pd
import numpy as np
import keras
from sklearn.model_selection import train_test_split
from keras.models import Model
from keras.layers import Embedding, Dense, BatchNormalization, Dropout, Input, Flatten
from keras.optimizers import SGD
from keras.utils import to_categorical

In [2]:
def rebalance(frame, col='hab_lbl', factor=1):
    max_size = frame[col].value_counts().max()
    lst = [frame]
    for class_index, group in frame.groupby(col):
        lst.append(group.sample(int((max_size-len(group)) / factor), replace=True))
    frame_new = pd.concat(lst)
    
    return frame_new

In [3]:
df = pd.read_csv('dataset-rocky-no-STemp.csv')
df = df.drop('P. Habitable', axis=1)
y = df['hab_lbl']
#df.drop('hab_lbl', axis=1, inplace=True)

In [4]:
from sklearn.preprocessing import LabelEncoder

In [5]:
cat_vars = df.columns[np.where(df.dtypes == 'int64')]
cat_vars = cat_vars.tolist()
cat_vars.remove('hab_lbl')

In [6]:
cont_vars = df.columns[np.where(df.dtypes != 'int64')].tolist()

In [7]:
label_encoders = {}

In [8]:
for cat_col in cat_vars:
    label_encoders[cat_col] = LabelEncoder()
    df[cat_col] = label_encoders[cat_col].fit_transform(df[cat_col])

In [9]:
cat_vars

['P. Zone Class',
 'P. Mass Class',
 'P. Composition Class',
 'P. Atmosphere Class']

In [10]:
train_df, test_df = train_test_split(df, train_size=0.7)



In [11]:
train_df = rebalance(train_df)

In [12]:
train_df.drop('hab_lbl', axis=1, inplace=True)
test_df.drop('hab_lbl', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [13]:
train_df.head()

Unnamed: 0,P. Zone Class,P. Mass Class,P. Composition Class,P. Atmosphere Class,P. Min Mass (EU),P. Mass (EU),P. Radius (EU),P. Density (EU),P. Gravity (EU),P. Esc Vel (EU),...,S. Appar Mag,S. Mag from Planet,S. Size from Planet (deg),S. Hab Zone Min (AU),S. Hab Zone Max (AU),P. HZD,P. HZC,P. HZA,P. HZI,P. ESI
1015,2,1,1,2,18.217914,26.81,2.5,1.72,4.3,3.28,...,12.8,-31.9,5.0042,1.133,2.644,-2.31,-0.11,0.46,0.3,0.24
1102,2,1,1,2,18.217914,14.46,2.17,1.41,3.06,2.58,...,15.4,-34.4,17.5499,0.76,1.788,-2.42,-0.13,-0.43,0.29,0.25
898,2,1,1,2,18.217914,20.8,2.36,1.58,3.72,2.97,...,13.9,-30.4,3.1627,0.605,1.437,-2.1,-0.12,0.63,0.31,0.28
1037,2,1,1,2,18.217914,6.25,1.76,1.15,2.02,1.89,...,15.2,-29.5,2.2966,0.517,1.238,-1.91,-0.14,-0.07,0.34,0.37
806,2,3,1,2,18.217914,2.86,1.1,2.16,2.38,1.61,...,16.5,-29.3,5.1659,0.066,0.174,-1.78,-0.93,-0.28,0.33,0.41


In [14]:
y = to_categorical(np.array(y))

In [50]:
cat_inps = [list(df[x]) for x in cat_vars]
cont_inp = np.array(list(df[cont_vars]))

In [51]:
df[cont_vars].shape

(1713, 37)

In [55]:
lci = len(cat_inps)

In [82]:
model_cat_inps = [Input(shape=(1,)) for _ in cat_inps]
model_cont_inp = Input(shape=(1, 37), name='cont_inp')

embeddings = [Embedding(input_dim=len(np.unique(x)),
                        output_dim=round(1.6 * len(np.unique(x)) ** 0.56)
                       )(y) for x, y in zip(cat_inps, model_cat_inps)]
bn1 = BatchNormalization(name='first_bn')(model_cont_inp)

concat = keras.layers.concatenate([*embeddings, bn1], name='concatenate')

relu = Dense(5, activation='relu', name='dense1')(concat)
bn = BatchNormalization(name='bn1')(relu)
drop = Dropout(0.2, name='dropout1')(bn)

relu = Dense(5, activation='relu', name='dense2')(drop)
bn = BatchNormalization()(relu)
drop = Dropout(0.2)(bn)

flat = Flatten()(drop)
out = Dense(3, activation='softmax', name='dense3')(flat)

model = Model(inputs=[*model_cat_inps, model_cont_inp], outputs=out)

In [83]:
model.compile(SGD(0.1), loss='categorical_crossentropy', metrics=['accuracy'])

In [84]:
model.layers[-2].output.shape

TensorShape([Dimension(None), Dimension(None)])

In [85]:
len(cat_inps)

4

In [91]:
model.fit(x=[*cat_inps, np.array(df[cont_vars]).reshape(1713, 1, 37)], y=y, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7efb9f54b400>