In [1]:
from pymongo import MongoClient
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import datetime
import seaborn
from sklearn import linear_model
import numpy as np
import pickle
import xgboost as xgb
import time

In [2]:
# pull data from Mongodb server and return Numpy arrays
def gather_mongo_data():
    client = MongoClient('127.0.0.1', 3001)
    db = client.meteor
    images = list(db.facebook.find({}))

    df = pd.DataFrame(images)
    df = df.dropna(subset = ['normalized_log_likes'])
    # drop data with missing username
    df = df[df['user'] != 'profile.php']

    # build numpy array from DataFrame
    # there has to be a better way to do this -- I haven't investigated it yet
    likes = np.zeros((len(df), 1))
    pool = np.zeros((len(df), 2048))
    categories = np.zeros((len(df), 1008))
    facedata = np.zeros((len(df), 3))

    j = 0
    for i in df.index:
        likes[j, :] = df['normalized_log_likes'][i]
        pool[j, :] = df['inception_pool'][i]
        categories[j, :] = df['inception_classification'][i]
        facedata[j, :] = [df['faces'][i]['num'], df['faces'][i]['total'], df['faces'][i]['largest']]
        j += 1

    users = set(df['user'])
    users = list(users)
    user_hot = np.zeros((len(df), len(users)))
    user_num = np.zeros(len(df))

    j = 0
    for i in df.index:
        user_index = users.index(df['user'][i])
        user_num[j] = user_index
        user_hot[j, user_index] = 1
        j += 1

    predictors = np.hstack((pool, facedata))
    
    return (predictors, likes, users, user_hot, user_num)

In [3]:
# calculate the n*(n-1)/2 comparisons of elements and measure how many of them are made correctly
def correct_comparisons(y, pred_y):
    comparison_true = (y.reshape(1,-1) - y.reshape(-1, 1)) > 0
    comparison_est = (pred_y.reshape(1,-1) - pred_y.reshape(-1,1)) > 0
        
    return ((np.sum(comparison_true == comparison_est) - len(y))/2, (len(y)**2 - len(y))/2)

In [4]:
# calculate all correct comparisons across test user set
def calculate_correct_comparisons(test_observations, test_users, user_hot, predicted_likes, likes):
    total_correct = 0
    total_comparisons = 0

    test_user_hot = user_hot[test_observations, :]
    
    for i in range(len(test_users)):
        user_test_set = np.any(test_user_hot[:, [test_users[i]]], axis = 1).nonzero()[0]
        
        ypred = np.ravel(predicted_likes[user_test_set])
        y = np.ravel(likes[user_test_set])

        (correct, total) = correct_comparisons(y, ypred)
        total_correct += correct
        total_comparisons += total

    return float(total_correct)/total_comparisons

In [5]:
# split the facebook dataset into training, validation, and test components

def split_datasets(users, user_hot, predictors, likes, seed=5):
    print('Seed value is: ' + str(seed))
    np.random.seed(seed)

    # choose 100 random users to be the test set
    test_users = np.random.choice(len(users), 100)
    test_set = np.any(user_hot[:, test_users], axis = 1).nonzero()[0]
    # choose ~100 random users to be the validation set
    validation_users = [v for v in np.random.choice(len(users), 103) if v not in test_users]
    validation_set = np.any(user_hot[:, validation_users], axis = 1).nonzero()[0]
    
    # training set is everything left
    training_users = [v for v in range(len(users)) if v not in test_users and v not in validation_users]
    training_set = [v for v in range(len(predictors)) if v not in test_set and v not in validation_set]

    print("Training set length: " + str(len(training_set)))
    print("Test set length: " + str(len(test_set)))
    print("Validation set length: " + str(len(validation_set)))
    
    return {"training": {"observations": training_set, "users": training_users, "X": predictors[training_set, :], "y": likes[training_set]}, 
            "validation": {"observations": validation_set, "users": validation_users, "X": predictors[validation_set, :], "y": likes[validation_set]}, 
            "test": {"observations": test_set, "users": test_users, "X": predictors[test_set, :], "y": likes[test_set]}}

In [6]:
(predictors, likes, users, user_hot, user_num) = gather_mongo_data()

In [7]:
data = split_datasets(users, user_hot, predictors, likes, seed=5)

training_data = xgb.DMatrix(data["training"]["X"], label=data["training"]["y"])
validation_data = xgb.DMatrix(data["validation"]["X"], label=data["validation"]["y"])

Seed value is: 5
Training set length: 68280
Test set length: 1851
Validation set length: 1435


In [41]:
best_score = 1
best_params = []

eta = 0.1

for subsample in [0.5, 1]:
    for max_depth in [1, 2, 3, 4, 6]:
        params = {"nthread": 4, "eta": eta, "max_depth": max_depth, "subsample": subsample, "silent": 1}
        print("params: " + str(eta) + "," + str(max_depth) + ", " + str(subsample))

        num_early_stop = 50

        trained = xgb.train(params, training_data, num_boost_round=5000, evals = [[validation_data, "validation"]], 
                  feval = lambda preds, dtrain: 
                            list([["error", (1 - calculate_correct_comparisons(data["validation"]["observations"], data["validation"]["users"], user_hot, preds, dtrain.get_label()))]]),
                  early_stopping_rounds=num_early_stop, verbose_eval=False)

        if trained.best_score < best_score:
            print("   ! new best score: " + str(trained.best_score))
            best_score = trained.best_score
            best_params = [max_depth, eta, subsample]

params: 0.3,1, 0.5
   ! new best score: 0.440609
params: 0.3,2, 0.5
params: 0.3,3, 0.5
params: 0.3,4, 0.5
params: 0.3,6, 0.5
params: 0.1,1, 0.5
params: 0.1,2, 0.5
   ! new best score: 0.436762
params: 0.1,3, 0.5
params: 0.1,4, 0.5
params: 0.1,6, 0.5
params: 0.03,1, 0.5


KeyboardInterrupt: 

In [43]:
best_score = 1
best_params = []

eta = 0.1

norm_likes = likes

for j in range(user_hot.shape[1]):
        like_error = np.mean(likes[user_hot[:,j] == 1])
        norm_likes[user_hot[:,j] == 1] -= like_error

norm_data = split_datasets(users, user_hot, predictors, norm_likes, seed=5)
training_data = xgb.DMatrix(norm_data["training"]["X"], label=norm_data["training"]["y"])
        
for subsample in [0.5, 1]:
    for max_depth in [1, 2, 3, 4, 6]:
        params = {"nthread": 4, "eta": eta, "max_depth": max_depth, "subsample": subsample, "silent": 1}
        print("params: " + str(eta) + "," + str(max_depth) + ", " + str(subsample))

        num_early_stop = 50

        trained = xgb.train(params, training_data, num_boost_round=5000, evals = [[validation_data, "validation"]], 
                  feval = lambda preds, dtrain: 
                            list([["error", (1 - calculate_correct_comparisons(data["validation"]["observations"], data["validation"]["users"], user_hot, preds, dtrain.get_label()))]]),
                  early_stopping_rounds=num_early_stop, verbose_eval=False)

        if trained.best_score < best_score:
            print("   ! new best score: " + str(trained.best_score))
            best_score = trained.best_score
            best_params = [max_depth, eta, subsample]

Seed value is: 5
Training set length: 68280
Test set length: 1851
Validation set length: 1435
params: 0.1,1, 0.5
   ! new best score: 0.444363
params: 0.1,2, 0.5
   ! new best score: 0.437438
params: 0.1,3, 0.5
   ! new best score: 0.428826
params: 0.1,4, 0.5
   ! new best score: 0.426167
params: 0.1,6, 0.5
params: 0.1,1, 1
params: 0.1,2, 1


KeyboardInterrupt: 

In [36]:
trained = xgb.train(params, training_data, num_boost_round=1, evals = [[validation_data, "validation"]], 
                      feval = lambda preds, dtrain: 
                                list([["error", (1 - calculate_correct_comparisons(data["validation"]["observations"], data["validation"]["users"], user_hot, preds, dtrain.get_label()))]]),
                      early_stopping_rounds=40, verbose_eval=20)

[0]	validation-error:0.477304
Will train until validation-error hasn't improved in 40 rounds.


In [37]:
norm_data = split_datasets(users, user_hot, predictors, likes, seed=5)

training_data = xgb.DMatrix(norm_data["training"]["X"], label=norm_data["training"]["y"])
validation_data = xgb.DMatrix(norm_data["validation"]["X"], label=norm_data["validation"]["y"])

eta = 0.05
subsample = 1.0
max_depth = 4

score = calculate_correct_comparisons(norm_data["validation"]["observations"], norm_data["validation"]["users"], user_hot, norm_data["validation"]["y"]*0, norm_data["validation"]["y"])
print("starting score: " + str(score))

user_train = user_hot[norm_data["training"]["observations"], :]
norm_likes = likes[norm_data["training"]["observations"]]

for i in range(50):
    params = {"nthread": 4, "eta": eta, "max_depth": max_depth, "subsample": subsample, "silent": 1}
    print("params: " + str(eta) + "," + str(max_depth) + ", " + str(subsample))


    if (i != 0):
        trained = xgb.train(params, training_data, num_boost_round=10, 
              verbose_eval=True, xgb_model='save.xgb')
    else:
        trained = xgb.train(params, training_data, num_boost_round=10,
                  verbose_eval=True)

    trained.save_model('save.xgb')
    
    score = calculate_correct_comparisons(norm_data["validation"]["observations"], norm_data["validation"]["users"], user_hot, trained.predict(validation_data), norm_data["validation"]["y"])
    
    pred_likes = trained.predict(training_data)
    like_errors = []
    
    for j in norm_data["training"]["users"]:
        like_error = np.mean(norm_likes[user_train[:, j] == 1] - pred_likes[user_train[:,j] == 1])
        like_errors.append(like_error)
        norm_likes[user_train[:,j] == 1] -= like_error * 0.2

#     plt.figure()
#     plt.hist(like_errors)
#     plt.show()
        
    training_data = xgb.DMatrix(norm_data["training"]["X"], label=norm_likes)
    
    eta = eta * 1.07
    
    print("   ! new score: " + str(score))

Seed value is: 5
Training set length: 68280
Test set length: 1851
Validation set length: 1435
starting score: 0.499720184057
params: 0.05,4, 1.0
   ! new score: 0.55227117274
params: 0.0495,4, 1.0
   ! new score: 0.553351573187
params: 0.049005,4, 1.0
   ! new score: 0.554773970899
params: 0.04851495,4, 1.0
   ! new score: 0.555007150852
params: 0.0480298005,4, 1.0
   ! new score: 0.557136861087
params: 0.047549502495,4, 1.0
   ! new score: 0.55778199229
params: 0.04707400747,4, 1.0
   ! new score: 0.557929672926
params: 0.0466032673953,4, 1.0
   ! new score: 0.558598122124
params: 0.0461372347214,4, 1.0
   ! new score: 0.559196617336
params: 0.0456758623742,4, 1.0
   ! new score: 0.560727832359
params: 0.0452191037504,4, 1.0
   ! new score: 0.560510197737
params: 0.0447669127129,4, 1.0
   ! new score: 0.560867740331
params: 0.0443192435858,4, 1.0
   ! new score: 0.561023193633
params: 0.0438760511499,4, 1.0
   ! new score: 0.561372963562
params: 0.0434372906384,4, 1.0
   ! new score: 

KeyboardInterrupt: 

In [39]:
norm_data = split_datasets(users, user_hot, predictors, likes, seed=5)

training_data = xgb.DMatrix(norm_data["training"]["X"], label=norm_data["training"]["y"])
validation_data = xgb.DMatrix(norm_data["validation"]["X"], label=norm_data["validation"]["y"])

eta = 0.05
subsample = 1.0
max_depth = 4
per_round = 10

score = calculate_correct_comparisons(norm_data["validation"]["observations"], norm_data["validation"]["users"], user_hot, norm_data["validation"]["y"]*0, norm_data["validation"]["y"])
print("starting score: " + str(score))

user_train = user_hot[norm_data["training"]["observations"], :]
norm_likes = likes[norm_data["training"]["observations"]]

for i in range(50):
    params = {"nthread": 4, "eta": eta, "max_depth": max_depth, "subsample": subsample, "silent": 1}
    print("params: " + str(eta) + "," + str(max_depth) + ", " + str(subsample))


    if (i != 0):
        trained = xgb.train(params, training_data, num_boost_round=per_round, 
              verbose_eval=True, xgb_model='save.xgb')
    else:
        trained = xgb.train(params, training_data, num_boost_round=per_round,
                  verbose_eval=True)

    trained.save_model('save.xgb')
    
    score = calculate_correct_comparisons(norm_data["validation"]["observations"], norm_data["validation"]["users"], user_hot, trained.predict(validation_data), norm_data["validation"]["y"])
    
    pred_likes = trained.predict(training_data)
    like_errors = []
    
    for j in norm_data["training"]["users"]:
        like_error = np.mean(norm_likes[user_train[:, j] == 1] - pred_likes[user_train[:,j] == 1])
        like_errors.append(like_error)
        norm_likes[user_train[:,j] == 1] -= like_error * 0.2

#     plt.figure()
#     plt.hist(like_errors)
#     plt.show()
        
    training_data = xgb.DMatrix(norm_data["training"]["X"], label=norm_likes)
    
    eta = eta * 1.07
    
    print("   " + str((i+1)*per_round) + "-score: " + str(score))

Seed value is: 5
Training set length: 68280
Test set length: 1851
Validation set length: 1435
starting score: 0.499720184057
params: 0.05,4, 1.0
   10-score: 0.55227117274
params: 0.0535,4, 1.0
   20-score: 0.554369792314
params: 0.057245,4, 1.0
   30-score: 0.55485169755
params: 0.06125215,4, 1.0
   40-score: 0.555877689342
params: 0.0655398005,4, 1.0
   50-score: 0.55570669071
params: 0.070127586535,4, 1.0
   60-score: 0.556895908469
params: 0.0750365175925,4, 1.0
   70-score: 0.557408904365
params: 0.0802890738239,4, 1.0
   80-score: 0.558706939435
params: 0.0859093089916,4, 1.0
   90-score: 0.56120196493
params: 0.091922960621,4, 1.0
   100-score: 0.56164500684
params: 0.0983575678645,4, 1.0
   110-score: 0.561124238279
params: 0.105242597615,4, 1.0
   120-score: 0.561761596816
params: 0.112609579448,4, 1.0
   130-score: 0.563215085188
params: 0.120492250009,4, 1.0
   140-score: 0.563728081084
params: 0.12892670751,4, 1.0
   150-score: 0.565585748041
params: 0.137951577036,4, 1.0
 

KeyboardInterrupt: 

In [37]:
# from sklearn.ensemble import GradientBoostingRegressor
# gbr = GradientBoostingRegressor(learning_rate=0.1, n_estimators=1, max_depth=3, warm_start=True, verbose=1)

for i in range(15, 25):
    gbr.set_params(n_estimators = (i+1)*10)
    gbr.fit(train_data, np.ravel(train_label))
    
    print(calculate_correct_comparisons(validation_users, user_hot, predictors, likes))

      Iter       Train Loss   Remaining Time 
       151           0.2185            3.87m
       152           0.2184            3.58m
       153           0.2183            3.56m
       154           0.2182            3.16m
       155           0.2182            2.77m
       156           0.2181            2.13m
       157           0.2180            1.55m
       158           0.2179            1.01m
       159           0.2178           29.57s
       160           0.2178            0.00s
0.565026099925
      Iter       Train Loss   Remaining Time 
       161           0.2177            3.62m
       162           0.2176            3.29m
       163           0.2176            2.84m
       164           0.2175            2.44m
       165           0.2174            2.05m
       166           0.2173            1.65m
       167           0.2172            1.23m
       168           0.2172           49.02s
       169           0.2171           24.47s
       170           0.2170           

In [None]:
calculate_correct_comparisons(user_train, user_hot, predictors, likes)



In [32]:
user_hot.shape

(71566, 5863)