In [None]:
import os
import numpy as np
import pandas as pd
import keras
from keras.layers import Input, Dense, Dropout
from keras.models import Model
from keras.utils import to_categorical
import pickle

from sklearn.pipeline import Pipeline

In [None]:
# Master switch for the preprocessing cells below: when True they read their
# cached results from `pickles/`; when False they recompute from `df` and
# re-write the pickles.
load = True  # set to False to rebuild the pickled artefacts

In [None]:
# Load the raw samples into `df`. Three modes:
#   * read_multiple=True -> merge every file under ./SPLIT and cache the result as CSV;
#   * load=False         -> read the previously merged CSV;
#   * otherwise          -> leave `df` as None (later cells read their pickles instead).
df = None
read_multiple = False
if read_multiple:
    import glob

    columns = ['id', 'text', 'birthyear', 'fame', 'gender', 'occupation']
    # sorted(): glob returns files in arbitrary order; sorting keeps the merged
    # frame identical from run to run.
    samples = sorted(glob.glob(os.path.join(os.getcwd(), 'SPLIT', '*')))
    dfs = []
    for sample in samples:
        print(sample)
        # append directly; do not rebind `df` to a partial frame inside the loop
        dfs.append(pd.read_csv(sample, names=columns))

    # ignore_index=True already produces a clean 0..n-1 index, so the former
    # `df.reset_index()` call (whose result was discarded anyway) is not needed.
    df = pd.concat(dfs, axis=0, ignore_index=True)
    del dfs
    # index=False: otherwise the index is written as an extra unnamed column
    # that comes back as data when the file is read again below.
    df.to_csv('all_data_cleaned.csv', index=False)
elif not load:  # skip if the pickled files are present
    df = pd.read_csv('all_data_cleaned.csv')

In [None]:
# Shuffle the rows once so the positional train/validation split further down
# is not biased by file order. A DataFrame has no unambiguous truth value
# (`if df:` raises ValueError), so the "was anything loaded?" check compares
# against None instead.
if df is not None:
    df = df.sample(frac=1)  # shuffle it!
# Preview as the cell's last expression so the notebook actually renders it
# (an expression nested inside an `if` block is never displayed).
df.head() if df is not None else None

In [None]:
# Regression target: birth years rescaled linearly with min-max normalisation.
if not load:
    # bounds are taken from the distinct years present in the data
    birthyears = df.birthyear.unique()
    _min, _max = min(birthyears), max(birthyears)

    def normalize_birthyear(year):
        """Map a raw birth year onto the range spanned by _min (-> 0) and _max (-> 1)."""
        span = _max - _min
        return (year - _min) / span

    birthyear_labels = df.birthyear.apply(normalize_birthyear).values

    # cache the targets so later runs can skip the raw data
    with open('pickles/years.pickle', 'wb') as fh:
        pickle.dump(birthyear_labels, fh, protocol=pickle.HIGHEST_PROTOCOL)
else:
    with open('pickles/years.pickle', 'rb') as fh:
        birthyear_labels = pickle.load(fh)

In [None]:
# One-hot targets for the three categorical columns, stored in a dict keyed by
# column name and cached as a pickle.
if not load:
    labels_to_onehot = ['fame', 'gender', 'occupation']
    labels = {}

    for label in labels_to_onehot:
        column_values = df[label].values  # the values in the respective column
        print('{} unique classes in {}'.format(len(df[label].unique()), label))
        labels[label] = pd.get_dummies(column_values)  # one-hot

    with open('pickles/labels.pickle', 'wb') as fh:
        pickle.dump(labels, fh, protocol=pickle.HIGHEST_PROTOCOL)
else:
    with open('pickles/labels.pickle', 'rb') as fh:
        labels = pickle.load(fh)

# unpack into one variable per task
fame_labels, gender_labels, occ_labels = (
    labels[key] for key in ('fame', 'gender', 'occupation')
)

In [None]:
# Text tokenizer, created with num_words=vocab_size and fitted on df.text,
# or restored from its pickle when `load` is set.
vocab_size = 15000
tokenizer = None

if not load:
    from keras.preprocessing.text import Tokenizer

    tokenizer = Tokenizer(num_words=vocab_size)
    tokenizer.fit_on_texts(df.text)

    # persist the fitted tokenizer
    with open('pickles/tokenizer.pickle', 'wb') as fh:
        pickle.dump(tokenizer, fh, protocol=pickle.HIGHEST_PROTOCOL)
else:
    with open('pickles/tokenizer.pickle', 'rb') as fh:
        tokenizer = pickle.load(fh)

In [None]:
# Vocabulary size: passed as `num_words` to the tokenizer and used as the width
# of the model's input layer.
# NOTE(review): this repeats the assignment made in the tokenizer cell; keep
# the two values in sync (or drop this cell).
vocab_size = 15000


In [None]:
# Feature matrix X: tokenizer.texts_to_matrix(..., mode='tfidf') over df.text,
# or the pickled copy when `load` is set.
if not load:
    X = tokenizer.texts_to_matrix(df.text, mode='tfidf')
    with open('pickles/textmatrix.pickle', 'wb') as fh:
        pickle.dump(X, fh, protocol=pickle.HIGHEST_PROTOCOL)
else:
    with open('pickles/textmatrix.pickle', 'rb') as fh:
        X = pickle.load(fh)
# can safely delete df

In [None]:
# Row counts for the positional train/validation split.
num_items = len(birthyear_labels)
test_size = int(num_items * 0.2)  # rows held out as the validation set
SIZE = num_items - test_size      # rows used for training
SIZE

In [None]:
# Split features and every target 80:20 by position: the first SIZE rows are
# used for training, the remaining rows for validation.
train_rows, val_rows = slice(None, SIZE), slice(SIZE, None)

x_train, x_val = X[train_rows], X[val_rows]

birthyear_train, birthyear_val = birthyear_labels[train_rows], birthyear_labels[val_rows]

fame_train, fame_val = fame_labels[train_rows], fame_labels[val_rows]

gender_train, gender_val = gender_labels[train_rows], gender_labels[val_rows]

occ_train, occ_val = occ_labels[train_rows], occ_labels[val_rows]

In [None]:
# Multi-output network: a shared trunk (Dense -> Dropout -> Dense) feeding four
# task-specific heads (birth year regression + three softmax classifiers).
shape = (vocab_size,)
input_layer = Input(shape)

in_activation = 'relu'
branch_activation = 'selu'  # NOTE(review): defined but not used by any layer below
mid_activation = 'relu'
out_activation = 'softmax'  # sigmoid/tanh/relu

year_activation = 'sigmoid'

# Output-layer names; the same constants key the loss, loss-weight and metric
# dictionaries passed to compile().
_year = 'birthyear_out'
_fame = 'fame_out'
_gend = 'gender_out'
_occu = 'occ_out'


INPUT_DIM = 1024
DROPOUT = 0.3
HIDDEN_DIM = 1024
MID_DIM = 128

loss_fn = 'categorical_crossentropy'


def _make_head(trunk, units, activation, name):
    """Attach one task head to `trunk`: a MID_DIM hidden layer, then the named output layer."""
    # A real local name replaces the former module-level `_` scratch variable,
    # which shadowed IPython's "last output" variable in the notebook.
    mid = Dense(units=MID_DIM, activation=mid_activation)(trunk)
    return Dense(units=units, activation=activation, name=name)(mid)


# before splitting categories
hidden_layer = Dense(units=INPUT_DIM, activation=in_activation)(input_layer)

dropout_layer = Dropout(DROPOUT)(hidden_layer)

branch_layer = Dense(units=HIDDEN_DIM, activation=in_activation)(dropout_layer)

# Heads are built in the same order as before (year, fame, gender, occupation)
# so layer creation order is unchanged.
birthyear_out = _make_head(branch_layer, 1, year_activation, _year)
fame_out = _make_head(branch_layer, fame_labels.shape[1], out_activation, _fame)
gender_out = _make_head(branch_layer, gender_labels.shape[1], out_activation, _gend)
occ_out = _make_head(branch_layer, occ_labels.shape[1], out_activation, _occu)

OUTPUTS = [birthyear_out, fame_out, gender_out, occ_out]
model = Model(
    inputs=input_layer,
    outputs=OUTPUTS)


model.compile(optimizer='adam',
              loss={
                _year: 'mse',
                _fame: loss_fn,
                _gend: loss_fn,
                _occu: loss_fn},
              loss_weights={
                _year: 1.2,
                _fame: 1.1,
                _gend: 1.0,
                _occu: 1.2},
              # keyed by the name constants, like `loss` and `loss_weights`
              metrics={
                _year: 'mae',
                _fame: 'accuracy',
                _gend: 'accuracy',
                _occu: 'accuracy'}
             )

model.summary()

In [None]:
# Training callbacks: TensorBoard logging, early stopping and best-model checkpointing.
board = keras.callbacks.TensorBoard(log_dir='./tensorboard/final_run-10241024-20epoch',
                                    histogram_freq=0,
                                    write_graph=True,
                                    write_images=True)

MIN_CHANGE_REQUIRED = 0  # change in values between epochs
EPOCHS_TO_WAIT = 2
# NOTE(review): this callback monitors 'val_loss', so it only has an effect when
# it is included in the callbacks passed to fit() and validation data is supplied.
earlystop = keras.callbacks.EarlyStopping(monitor='val_loss',
                                          min_delta=MIN_CHANGE_REQUIRED,  # was declared but never passed
                                          patience=EPOCHS_TO_WAIT,
                                          baseline=None,
                                          restore_best_weights=False)

# Make sure the checkpoint directory exists before training tries to write into it.
os.makedirs('models', exist_ok=True)
chkpt = keras.callbacks.ModelCheckpoint(filepath='models/best_model.h5',
                                        monitor='occ_out_loss',
                                        save_best_only=True)


In [None]:
# Train on all four targets at once; y_train lists them in the same order as
# the model's outputs (birthyear, fame, gender, occupation).
start_epoch, end_epoch = 0, 20
bs = 32
split_fac = 0
y_train = [birthyear_train, fame_train, gender_train, occ_train]
callbacks = [board, chkpt]

fit_options = dict(
    epochs=end_epoch,
    batch_size=bs,
    callbacks=callbacks,
    initial_epoch=start_epoch,
    validation_split=split_fac,
)
model.fit(x_train, y_train, **fit_options)


In [None]:
# Reclaim memory left over from training.
# `model` is deliberately NOT reset to None here: the following cell calls
# model.save(...), which fails with AttributeError on None when the notebook is
# run top to bottom. The trained model is released anyway once `model` is
# rebound by keras.models.load_model further down.
import gc
gc.collect()

In [None]:
# Persist the trained model under models/<model_name>.
model_name = "30dropout20epoch.h5"
save_path = 'models/' + model_name
model.save(save_path)

In [None]:
# Reload the model stored as <cwd>/models/<model_name>.
best = 'best_model.h5'  # checkpoint file name; not referenced by the code in this cell
models_dir = os.path.join(os.getcwd(), 'models')
model_path = os.path.join(models_dir, model_name)
model = keras.models.load_model(model_path)

In [None]:
# Evaluate on the held-out rows; targets are listed in the model's output order.
y_val = [birthyear_val, fame_val, gender_val, occ_val]
scores = model.evaluate(x_val, y_val)
scores

In [None]:
# Bounds used to map the normalised birth-year output back to calendar years.
# NOTE(review): hard-coded; they must equal the min/max that were used when the
# birth-year labels were normalised -- confirm against the underlying data.
_min = 1940
_max = 2008
def predict_user(model, user_vec, actual=None, verbose=False):
    """Predict all four attributes for one validation user and compare with the truth.

    Args:
        model: trained model whose ``predict`` returns four arrays in the order
            birthyear, fame, gender, occupation.
        user_vec: feature vector of a single user (without batch dimension).
        actual: positional index of that user in the validation targets
            (``birthyear_val``, ``fame_val``, ``gender_val``, ``occ_val``).
            Required, despite the ``None`` default kept for compatibility.
        verbose: if True, print predicted vs. real values, but only when at
            least one categorical prediction is wrong.

    Returns:
        ``(year_error, errors)``: the absolute birth-year error in whole years
        and a list of three booleans (fame, gender, occupation) that are True
        where the prediction differs from the real label.

    Raises:
        ValueError: if ``actual`` is None (there is nothing to compare against).
    """
    if actual is None:
        raise ValueError("predict_user needs `actual`, the row index of the user in the validation set")

    # Explicit (1, n_features) batch instead of a nested Python list.
    batch = np.asarray(user_vec)[np.newaxis, ...]
    result = model.predict(batch)

    birth_pred = result[0][0][0]
    fame_probs = result[1][0]
    gender_probs = result[2][0]
    occ_probs = result[3][0]

    # round() rather than bare int(): the normalise/de-normalise round trip can
    # land a hair below the integer year, which truncation would turn into year-1.
    span = _max - _min
    year_pred = int(round(birth_pred * span + _min))
    year_real = int(round(birthyear_val[actual] * span + _min))

    # Class names come from the one-hot column labels the model was trained
    # against, so output unit i always maps to column i.
    fame_pred = fame_val.columns[fame_probs.argmax()]
    fame_real = fame_val.iloc[actual].idxmax()

    gend_pred = gender_val.columns[gender_probs.argmax()]
    gend_real = gender_val.iloc[actual].idxmax()

    occu_pred = occ_val.columns[occ_probs.argmax()]
    occu_real = occ_val.iloc[actual].idxmax()

    # one flag per categorical task: True means the prediction was wrong
    errors = [fame_pred != fame_real, gend_pred != gend_real, occu_pred != occu_real]

    if verbose and any(errors):  # only print wrong predictions!
        print('Birthyear:\t{} ({})'.format(year_pred, year_real))
        print('Fame status:\t{} ({})'.format(fame_pred, fame_real))
        print('Gender:   \t{} ({})'.format(gend_pred, gend_real))
        print('Occupation:\t{} ({})'.format(occu_pred, occu_real))
        # print some of the text...
        print("--------------------------------------------")

    return abs(year_pred - year_real), errors

In [None]:
# Score every validation row individually and tally the mistakes per task.
tests = test_size
birthyear_errors = []
errors = dict.fromkeys(('fame', 'gender', 'occupation'), 0)
for i in range(tests):
    year_diff, error = predict_user(model, x_val[i], actual=i, verbose=False)
    birthyear_errors.append(year_diff)
    # `error` carries one wrong/right flag per task, in the same order as the keys
    for task, wrong in zip(('fame', 'gender', 'occupation'), error):
        errors[task] += wrong


print('Average off by {} years'.format(sum(birthyear_errors)/len(birthyear_errors)))
print('Fame err: {}/{}'.format(errors['fame'], tests))
print('Gender err: {}/{}'.format(errors['gender'], tests))
print('Occupation err: {}/{}'.format(errors['occupation'], tests))

# TODO: create object with which labels were misclassified


In [None]:
# Display the number of rows held out as the validation set.
test_size