In [1]:
import os
import numpy as np
import pandas as pd
import keras
from keras.layers import Input, Dense, Dropout
from keras.models import Model
from keras.utils import to_categorical
import pickle

from sklearn.pipeline import Pipeline

Using TensorFlow backend.


In [None]:
# Build one DataFrame from the CSV shards in ./SPLIT, or reload the merged cache.
df = None
read_multiple = True
if read_multiple:
    import glob

    # The shard files carry no header row, so column names are supplied here.
    column_names = ['id', 'text', 'birthyear', 'fame', 'gender', 'occupation']
    # sorted() keeps the concatenation order independent of filesystem listing order.
    samples = sorted(glob.glob(os.path.join(os.getcwd(), 'SPLIT', '*')))
    if not samples:
        raise FileNotFoundError('no shard files found under ./SPLIT')

    dfs = []
    for sample in samples:
        print(sample)
        dfs.append(pd.read_csv(sample, names=column_names))

    # ignore_index=True already yields a clean 0..n-1 index, so no reset is needed.
    df = pd.concat(dfs, axis=0, ignore_index=True)
    del dfs
    # index=False: otherwise the cached file gains an extra unnamed index column
    # that shows up as 'Unnamed: 0' when it is read back in the else-branch.
    df.to_csv('all_data_cleaned.csv', index=False)
else:
    df = pd.read_csv('all_data_cleaned.csv')

C:\Users\Tollef\Desktop\Spring2019\TextAnalysis\project\PAN-celebrity-profiling\data\SPLIT\feed00.csv
C:\Users\Tollef\Desktop\Spring2019\TextAnalysis\project\PAN-celebrity-profiling\data\SPLIT\feed01.csv
C:\Users\Tollef\Desktop\Spring2019\TextAnalysis\project\PAN-celebrity-profiling\data\SPLIT\feed02.csv
C:\Users\Tollef\Desktop\Spring2019\TextAnalysis\project\PAN-celebrity-profiling\data\SPLIT\feed03.csv
C:\Users\Tollef\Desktop\Spring2019\TextAnalysis\project\PAN-celebrity-profiling\data\SPLIT\feed04.csv
C:\Users\Tollef\Desktop\Spring2019\TextAnalysis\project\PAN-celebrity-profiling\data\SPLIT\feed05.csv
C:\Users\Tollef\Desktop\Spring2019\TextAnalysis\project\PAN-celebrity-profiling\data\SPLIT\feed06.csv
C:\Users\Tollef\Desktop\Spring2019\TextAnalysis\project\PAN-celebrity-profiling\data\SPLIT\feed07.csv
C:\Users\Tollef\Desktop\Spring2019\TextAnalysis\project\PAN-celebrity-profiling\data\SPLIT\feed08.csv


In [None]:
# Shuffle the rows with a fixed seed. The tf-idf matrix is cached on disk in a
# later cell, so the row order must be the same on every run or the cached
# features will not line up with the labels derived from df.
# NOTE(review): a cache written before this seed was introduced must be rebuilt once.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)  # shuffle it!
df.head()

In [None]:
# Rescale birthyears linearly so the earliest observed year maps to 0
# and the latest observed year maps to 1.
birthyears = df.birthyear.unique()
_min, _max = min(birthyears), max(birthyears)


def normalize_birthyear(year):
    """Return ``year`` rescaled relative to the observed ``_min`` / ``_max``."""
    span = _max - _min
    return (year - _min) / span


birthyear_labels = df.birthyear.apply(normalize_birthyear).values
birthyear_labels

In [None]:
# Categorical targets that get a one-hot encoding, keyed by column name.
labels_to_onehot = ['fame', 'gender', 'occupation']
labels = {}

for label in labels_to_onehot:
    column_values = df[label].values  # raw values of this column
    class_count = len(df[label].unique())
    print('{} unique classes in {}'.format(class_count, label))
    # one indicator column per distinct class
    labels[label] = pd.get_dummies(column_values)

In [None]:
# Short names for the three one-hot label frames.
fame_labels, gender_labels, occ_labels = (
    labels[name] for name in ('fame', 'gender', 'occupation')
)

In [None]:
# Load the fitted tokenizer from disk, or fit a new one and cache it.
load = True
# Defined unconditionally: the model cells read vocab_size even when the
# tokenizer is loaded from the cache (it used to exist only in the else-branch).
vocab_size = 15000
tokenizer = None
if load:
    # NOTE: pickle.load can execute arbitrary code; only load a file you created yourself.
    with open('tokenizer.pickle', 'rb') as handle:
        tokenizer = pickle.load(handle)
    # Keep vocab_size in step with the cached tokenizer, if it records a limit.
    if getattr(tokenizer, 'num_words', None):
        vocab_size = tokenizer.num_words
else:
    from keras.preprocessing.text import Tokenizer

    tokenizer = Tokenizer(num_words=vocab_size)
    tokenizer.fit_on_texts(df.text)

    # saving
    with open('tokenizer.pickle', 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Load the cached tf-idf document matrix, or compute it from the texts and cache it.
if load:
    # NOTE: pickle.load can execute arbitrary code; only load a cache you wrote yourself.
    with open('textmatrix.pickle', 'rb') as handle:
        # Bind the matrix to X. It was previously bound to `tokenizer`, which
        # clobbered the tokenizer and left X undefined for the split below.
        X = pickle.load(handle)
    # NOTE(review): the cached rows must be in the same order as df's rows — confirm
    # the cache was built from the same shuffle.
else:
    X = tokenizer.texts_to_matrix(df.text, mode='tfidf')
    with open('textmatrix.pickle', 'wb') as handle:
        pickle.dump(X, handle, protocol=pickle.HIGHEST_PROTOCOL)
# df is still read by the next cell (row count), so it must not be deleted yet

In [None]:
# Row counts for the split: 20% of the rows are held out for validation.
num_items = len(df)
test_size = int(0.2 * num_items)  # number of validation rows
SIZE = num_items - test_size      # number of training rows
SIZE

In [None]:
# 80:20 split by position: the first SIZE rows train, the remainder validate.
train_rows = slice(None, SIZE)
val_rows = slice(SIZE, None)

x_train, x_val = X[train_rows], X[val_rows]
birthyear_train, birthyear_val = birthyear_labels[train_rows], birthyear_labels[val_rows]
fame_train, fame_val = fame_labels[train_rows], fame_labels[val_rows]
gender_train, gender_val = gender_labels[train_rows], gender_labels[val_rows]
occ_train, occ_val = occ_labels[train_rows], occ_labels[val_rows]

In [None]:
#del df

In [None]:
# Multi-output network: a shared trunk over the vocab_size-wide input,
# followed by one small head per prediction target.
shape = (vocab_size,)
input_layer = Input(shape)
# before splitting categories
hidden_layer = Dense(units=1024, activation='relu')(input_layer)

dropout_layer = Dropout(0.3)(hidden_layer)

branch_layer = Dense(units=512, activation='relu')(dropout_layer)

# birthyear: a single sigmoid unit
birthyear_hidden = Dense(units=128, activation='relu')(branch_layer)
birthyear_out = Dense(units=1, activation='sigmoid', name='birthyear_out')(birthyear_hidden)

# fame
fame_hidden = Dense(units=128, activation='relu')(branch_layer)
fame_out = Dense(units=fame_labels.shape[1], activation='softmax', name='fame_out')(fame_hidden)

# gender
gender_hidden = Dense(units=128, activation='relu')(branch_layer)
gender_out = Dense(units=gender_labels.shape[1], activation='softmax', name='gender_out')(gender_hidden)

# occupation
# The activation here was the empty string, which is not a valid Keras
# activation name; use relu like the other three heads.
occ_hidden = Dense(units=128, activation='relu')(branch_layer)
occ_out = Dense(units=occ_labels.shape[1], activation='softmax', name='occ_out')(occ_hidden)

# `inputs=` is the supported keyword; `input=` is a deprecated legacy spelling.
model = Model(inputs=input_layer, outputs=[birthyear_out, fame_out, gender_out, occ_out])
loss_fn = 'categorical_crossentropy'

# Losses and metrics are keyed by output-layer name.
model.compile(optimizer='adam',
              loss={'birthyear_out': 'mse', 'fame_out': loss_fn, 'gender_out': loss_fn, 'occ_out': loss_fn},
              metrics={'birthyear_out': 'mae', 'fame_out': 'accuracy', 'gender_out': 'accuracy', 'occ_out': 'accuracy'}
             )

model.summary()

In [None]:
# Multi-output network with configurable head activations: a shared trunk
# over the vocab_size-wide input, followed by one small head per target.
shape = (vocab_size,)
input_layer = Input(shape)

out_activation = 'softmax'  # sigmoid/tanh/relu
mid_activation = 'relu'

# before splitting categories
hidden_layer = Dense(units=1024, activation='relu')(input_layer)

dropout_layer = Dropout(0.3)(hidden_layer)

branch_layer = Dense(units=512, activation='selu')(dropout_layer)

# birthyear: a single sigmoid unit
birthyear_hidden = Dense(units=128, activation=mid_activation)(branch_layer)
birthyear_out = Dense(units=1, activation='sigmoid', name='birthyear_out')(birthyear_hidden)

# fame
fame_hidden = Dense(units=128, activation=mid_activation)(branch_layer)
fame_out = Dense(units=fame_labels.shape[1], activation=out_activation, name='fame_out')(fame_hidden)

# gender
gender_hidden = Dense(units=128, activation=mid_activation)(branch_layer)
gender_out = Dense(units=gender_labels.shape[1], activation=out_activation, name='gender_out')(gender_hidden)

# occupation
occ_hidden = Dense(units=128, activation=mid_activation)(branch_layer)
occ_out = Dense(units=occ_labels.shape[1], activation=out_activation, name='occ_out')(occ_hidden)

# `inputs=` is the supported keyword; `input=` is a deprecated legacy spelling.
model = Model(inputs=input_layer, outputs=[birthyear_out, fame_out, gender_out, occ_out])
loss_fn = 'categorical_crossentropy'

# Losses and metrics are keyed by output-layer name.
model.compile(optimizer='adam',
              loss={'birthyear_out': 'mse',
                    'fame_out': loss_fn,
                    'gender_out': loss_fn,
                    'occ_out': loss_fn},
              # birthyear is a regression output trained with mse, so track mean
              # absolute error; 'accuracy' is not meaningful for a continuous target.
              metrics={'birthyear_out': 'mae',
                       'fame_out': 'accuracy',
                       'gender_out': 'accuracy',
                       'occ_out': 'accuracy'}
             )

model.summary()

In [None]:
# TensorBoard logging callback; logs go to ./tensorboard/run7.
board = keras.callbacks.TensorBoard(
    log_dir='./tensorboard/run7',
    histogram_freq=0,
    write_graph=True,
    write_images=True,
)

In [None]:
# Train on all four targets at once; the target order matches the model's outputs list.
start_epoch = 0
y_train = [birthyear_train, fame_train, gender_train, occ_train]
model.fit(
    x_train,
    y_train,
    batch_size=32,
    epochs=50,
    initial_epoch=start_epoch,
    callbacks=[board],
)


In [None]:
# Persist the trained model to an HDF5 file.
# NOTE(review): the file name says 7 epochs, but the fit call in this notebook
# uses epochs=50 — confirm the name is still accurate.
model.save(filepath="model-7epochs.h5")

In [None]:
# Score the model on the held-out rows; the target order matches the model's outputs list.
val_targets = [birthyear_val, fame_val, gender_val, occ_val]
scores = model.evaluate(x_val, val_targets)
scores

In [None]:
def predict_user(model, user_vec, actual=None, verbose=False):
    """Predict one validation user's attributes and compare them with the truth.

    Reads the notebook globals ``birthyear_val``, ``fame_val``, ``gender_val``,
    ``occ_val``, ``_min`` and ``_max``.

    Parameters
    ----------
    model
        Trained multi-output model whose ``predict`` returns, in order, the
        birthyear, fame, gender and occupation outputs.
    user_vec
        Feature vector of a single user (one row of the validation matrix).
    actual : int
        Positional index of this user in the validation labels. Required; the
        ``None`` default is kept only for signature compatibility.
    verbose : bool
        When true, print predicted vs. real values, but only for users with at
        least one wrong fame/gender/occupation prediction.

    Returns
    -------
    tuple
        ``(abs_year_error, errors)`` where ``errors`` is the list
        ``[fame_wrong, gender_wrong, occupation_wrong]`` of booleans.

    Raises
    ------
    ValueError
        If ``actual`` is ``None``.
    """
    if actual is None:
        raise ValueError("predict_user needs `actual`, the row's index in the validation labels")

    def to_year(normalized):
        # Invert the 0->1 birthyear scaling. Round instead of truncating:
        # float error can leave an exact year at e.g. 1979.9999, which int()
        # alone would turn into 1979.
        return int(round(float(normalized * (_max - _min) + _min)))

    # Feed the model a batch containing just this one row.
    batch = np.asarray(user_vec).reshape(1, -1)
    result = model.predict(batch)

    birth_pred = result[0][0][0]
    fame_pred = result[1][0]
    gender_pred = result[2][0]
    occ_pred = result[3][0]

    year_pred = to_year(birth_pred)
    year_real = to_year(birthyear_val[actual])

    # Class names come from the one-hot column labels rather than hard-coded
    # lists, so the predicted index and the real label share one vocabulary.
    fame_pred = fame_val.columns[fame_pred.argmax()]
    fame_real = fame_val.iloc[actual].idxmax()

    gend_pred = gender_val.columns[gender_pred.argmax()]
    gend_real = gender_val.iloc[actual].idxmax()

    occu_pred = occ_val.columns[occ_pred.argmax()]
    occu_real = occ_val.iloc[actual].idxmax()

    # one flag per categorical target: True means the prediction was wrong
    errors = [fame_pred != fame_real, gend_pred != gend_real, occu_pred != occu_real]

    if verbose and any(errors):  # only print wrong predictions!
        print('Birthyear:\t{} ({})'.format(year_pred, year_real))
        print('Fame status:\t{} ({})'.format(fame_pred, fame_real))
        print('Gender:   \t{} ({})'.format(gend_pred, gend_real))
        print('Occupation:\t{} ({})'.format(occu_pred, occu_real))
        print("--------------------------------------------")

    return abs(year_pred - year_real), errors

In [None]:
# Run every validation row through predict_user and tally the mistakes.
tests = test_size
errors = {'fame': 0, 'gender': 0, 'occupation': 0}
birthyear_errors = []

for i in range(tests):
    year_diff, error = predict_user(model, x_val[i], actual=i, verbose=True)
    birthyear_errors.append(year_diff)
    # error holds one wrong/right flag per categorical target, in this order
    for key, wrong in zip(('fame', 'gender', 'occupation'), error):
        errors[key] += wrong

mean_year_error = sum(birthyear_errors) / len(birthyear_errors)
print('Average off by {} years'.format(mean_year_error))
print('Fame err: {}/{}'.format(errors['fame'], tests))
print('Gender err: {}/{}'.format(errors['gender'], tests))
print('Occupation err: {}/{}'.format(errors['occupation'], tests))

# TODO: create an object recording which labels were misclassified


In [None]:
# Display the number of rows held out as the validation set.
test_size