In [1]:
import os
import numpy as np
import pandas as pd
import keras
from keras.layers import Input, Dense, Dropout
from keras.models import Model
from keras.utils import to_categorical
import pickle

from sklearn.pipeline import Pipeline

Using TensorFlow backend.


In [2]:
load = True  # if pickle models are saved or not!

In [3]:
df = None
read_multiple = False
if read_multiple:
    import glob
    samples = glob.glob(os.path.join(os.getcwd(), 'SPLIT') + '/*')
    dfs = []
    for sample in samples:
        print(sample)
        df = pd.read_csv(sample, names = ['id', 'text', 'birthyear', 'fame', 'gender', 'occupation'])
        dfs.append(df)

    df = pd.concat(dfs, axis=0, ignore_index=True)
    del dfs
    df.reset_index()
    df.to_csv('all_data_cleaned.csv')
elif not load:  # skip if the pickled files are present
    df = pd.read_csv('all_data_cleaned.csv')

In [4]:
if df:
    df = df.sample(frac=1)  # shuffle it!
    df.head()

In [5]:
if load:
    with open('pickles/years.pickle', 'rb') as handle:
        birthyear_labels = pickle.load(handle)
else:
    # squish the birthyears to a scale from 0->1
    birthyears = df.birthyear.unique()
    _min = min(birthyears)
    _max = max(birthyears)

    def normalize_birthyear(year):
        return (year-_min)/(_max-_min)

    birthyear_labels = df.birthyear.apply(normalize_birthyear)
    birthyear_labels = birthyear_labels.values

    with open('pickles/years.pickle', 'wb') as handle:
        pickle.dump(birthyear_labels, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [6]:
if load:
    with open('pickles/labels.pickle', 'rb') as handle:
        labels = pickle.load(handle)
else:
    labels_to_onehot = ['fame', 'gender', 'occupation']
    labels = {}

    for label in labels_to_onehot:
        unique_classes = len(df[label].unique())
        print('{} unique classes in {}'.format(unique_classes, label))
        labels_nd = df[label].values  # the values in the respective column
        labels[label] = pd.get_dummies(labels_nd)  # one-hot
    with open('pickles/labels.pickle', 'wb') as handle:
        pickle.dump(labels, handle, protocol=pickle.HIGHEST_PROTOCOL)

fame_labels = labels['fame']
gender_labels = labels['gender']
occ_labels = labels['occupation']

In [7]:
tokenizer = None
vocab_size = 15000

if load:
    with open('pickles/tokenizer.pickle', 'rb') as handle:
        tokenizer = pickle.load(handle)
else:
    from keras.preprocessing.text import Tokenizer

    tokenizer = Tokenizer(num_words=vocab_size)
    tokenizer.fit_on_texts(df.text)
    
    # saving
    with open('pickles/tokenizer.pickle', 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [8]:
vocab_size = 15000


In [9]:
if load:
    with open('pickles/textmatrix.pickle', 'rb') as handle:
        X = pickle.load(handle)
else:
    X = tokenizer.texts_to_matrix(df.text, mode='tfidf')
    with open('pickles/textmatrix.pickle', 'wb') as handle:
        pickle.dump(X, handle, protocol=pickle.HIGHEST_PROTOCOL)
# can safely delete df

In [10]:
num_items = birthyear_labels.shape[0]
test_size = int(num_items * 0.2)  # the amount of rows to use as validation set
SIZE = num_items - test_size
SIZE

27061

In [11]:
# split train and validation set 80:20
x_train, x_val = X[:SIZE], X[SIZE:]

birthyear_train, birthyear_val = birthyear_labels[:SIZE], birthyear_labels[SIZE:]

fame_train, fame_val = fame_labels[:SIZE], fame_labels[SIZE:]

gender_train, gender_val = gender_labels[:SIZE], gender_labels[SIZE:]

occ_train, occ_val = occ_labels[:SIZE], occ_labels[SIZE:]

In [37]:
shape = (vocab_size,)
input_layer = Input(shape)

in_activation = 'relu'
branch_activation = 'selu'
mid_activation = 'relu'
out_activation = 'softmax'  # sigmoid/tanh/relu

year_activation = 'sigmoid'

_year = 'birthyear_out'
_fame = 'fame_out'
_gend = 'gender_out'
_occu = 'occ_out'


INPUT_DIM = 1024
DROPOUT = 0.6
HIDDEN_DIM = 512
MID_DIM = 128

loss_fn = 'categorical_crossentropy'

# before splitting categories
hidden_layer = Dense(units=INPUT_DIM, activation=in_activation)(input_layer)

dropout_layer = Dropout(DROPOUT)(hidden_layer)

branch_layer = Dense(units=HIDDEN_DIM, activation=in_activation)(dropout_layer)
# birthyear
_ = Dense(units=MID_DIM, activation=mid_activation)(branch_layer)
birthyear_out = Dense(units=1, activation=year_activation, name=_year)(_)

# fame
_ = Dense(units=MID_DIM, activation=mid_activation)(branch_layer)
fame_out = Dense(units=fame_labels.shape[1], activation=out_activation, name=_fame)(_)

# gender
_ = Dense(units=MID_DIM, activation=mid_activation)(branch_layer)
gender_out = Dense(units=gender_labels.shape[1], activation=out_activation, name=_gend)(_)

# occupation
_ = Dense(units=MID_DIM, activation=mid_activation)(branch_layer)
occ_out = Dense(units=occ_labels.shape[1], activation=out_activation, name=_occu)(_)

OUTPUTS = [birthyear_out, fame_out, gender_out, occ_out]
model = Model(
    inputs=input_layer,
    outputs=OUTPUTS)
#model = Model(input=input_layer, outputs=fame_out)


model.compile(optimizer='adam',
              loss={
                _year: 'mse',
                _fame: loss_fn,
                _gend: loss_fn,
                _occu: loss_fn},
              loss_weights={
                _year: 1.8,
                _fame: 1.0,
                _gend: 1.0,
                _occu: 1.5},
              metrics={
                'birthyear_out': 'mae',
                'fame_out': 'accuracy',
                'gender_out': 'accuracy',
                'occ_out': 'accuracy'}
             )

#model.compile(optimizer='adam', loss=loss_fn, metrics=['accuracy'])
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, 15000)        0                                            
__________________________________________________________________________________________________
dense_25 (Dense)                (None, 1024)         15361024    input_5[0][0]                    
__________________________________________________________________________________________________
dropout_5 (Dropout)             (None, 1024)         0           dense_25[0][0]                   
__________________________________________________________________________________________________
dense_26 (Dense)                (None, 512)          524800      dropout_5[0][0]                  
__________________________________________________________________________________________________
dense_27 (

In [38]:
board = keras.callbacks.TensorBoard(log_dir='./tensorboard/final_run-1', histogram_freq=0, write_graph=True, write_images=True)

MIN_CHANGE_REQUIRED = 0  # change in values between epochs
EPOCHS_TO_WAIT = 2
earlystop = keras.callbacks.EarlyStopping(monitor='val_loss',
                                          patience=EPOCHS_TO_WAIT,
                                          baseline=None,
                                          restore_best_weights=False)

chkpt = keras.callbacks.ModelCheckpoint(filepath='models/best_model.h5', monitor='val_loss', save_best_only=True)


In [39]:
y_train = [birthyear_train, fame_train, gender_train, occ_train]
start_epoch = 0
end_epoch = 30
bs = 32
split_fac = 0.1
callbacks = [board, chkpt]
model.fit(x_train, y_train,
          epochs=end_epoch,
          batch_size=bs,
          callbacks=callbacks,
          initial_epoch=start_epoch,
          validation_split=split_fac,
         )


Train on 24354 samples, validate on 2707 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30


Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30


Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x2240c228be0>

In [23]:
model = None
import gc
gc.collect()

70

In [16]:
model_name = "model-30epochs-60dropout-final.h5"
#model.save('model/' + model_name)

In [24]:
model_path = os.path.join(os.getcwd(), model_name)
model = keras.models.load_model(model_path)

OSError: Unable to open file (unable to open file: name = 'C:\Users\Tollef\Desktop\Spring2019\TextAnalysis\project\PAN-celebrity-profiling\data\model-35epochs-60dropout.h5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)

In [34]:
scores = model.evaluate(x_val, [birthyear_val, fame_val, gender_val, occ_val])
scores



[1.8442904035338454,
 0.024066422071310652,
 0.48857188614227115,
 0.18288909037379275,
 0.7530065819059402,
 0.11878173513295293,
 0.7992609017175476,
 0.9315594974219666,
 0.7739837398550199]

In [35]:
_min = 1940
_max = 2008
def predict_user(model, user_vec, actual=None, verbose=False):
    result = model.predict([[user_vec]])
    fame = ["rising", "star", "superstar"]
    gender = ["female", "male", "nonbinary"]
    occupation = ["creator", "manager", "performer", "politics", "professional", "religious", "science", "sports"]

    birth_pred = result[0][0][0]
    fame_pred = result[1][0]
    gender_pred = result[2][0]
    occ_pred = result[3][0]
    
    errors = False  # check for errors on the fame/gender/occupation metrics
    
    year_pred = int(birth_pred * (_max - _min) + _min)
    year_real = int(birthyear_val[actual] * (_max - _min) + _min)
    
    fame_pred = fame[fame_pred.argmax()]
    fame_real = fame_val.iloc[actual].idxmax()
    
    gend_pred = gender[gender_pred.argmax()]
    gend_real = gender_val.iloc[actual].idxmax()
    
    occu_pred = occupation[occ_pred.argmax()]
    occu_real = occ_val.iloc[actual].idxmax()
    
    errors = [fame_pred != fame_real, gend_pred != gend_real, occu_pred != occu_real]

    if verbose and sum(errors) != 0:  # only print wrong predictions!
        #print("Predicted values (real ones in parentheses)")
        print('Birthyear:\t{} ({})'.format(year_pred, year_real))
        print('Fame status:\t{} ({})'.format(fame_pred, fame_real))
        print('Gender:   \t{} ({})'.format(gend_pred, gend_real))
        print('Occupation:\t{} ({})'.format(occu_pred, occu_real))
        # print some of the text...
        print("--------------------------------------------")
        
    return abs(year_pred - year_real), errors

In [36]:
birthyear_errors = []
errors = {'fame': 0, 'gender': 0, 'occupation': 0}
# test_size
tests = 1000
for i in range(tests):
    year_diff, error = predict_user(model, x_val[i], actual=i, verbose=False)
    birthyear_errors.append(year_diff)
    errors['fame'] += error[0]
    errors['gender'] += error[1]
    errors['occupation'] += error[2]
    
    
print('Average off by {} years'.format(sum(birthyear_errors)/len(birthyear_errors)))
print('Fame err: {}/{}'.format(errors['fame'], tests))
print('Gender err: {}/{}'.format(errors['gender'], tests))
print('Occupation err: {}/{}'.format(errors['occupation'], tests))

# create object with which labels were misclassified


Average off by 8.034 years
Fame err: 202/1000
Gender err: 70/1000
Occupation err: 241/1000


In [None]:
test_size