### Image data
The goal of this notebook is to train and evaluate an HT risk classifier using statistics computed from the images, and from the faces detected in those images, associated with the set of ads provided for CP1 during the MEMEX Winter QPR 2017.
[Most of the code comes from Mayank's repo]

### Input files
1. Image-based features of the clusters, for training and testing

### Outputs
1. eval file

In [1]:
import codecs
import json
import os
import random
import re
import warnings
from random import shuffle

import numpy as np

In [2]:
def convert_string_to_float_list(string):
    """
    Parse the text form of a flat list of numbers, e.g. '[0.5, 1, -2.0]', into a list of floats.
    :param string: text of the form '[a, b, ...]'. Surrounding whitespace is ignored and the first
    and last remaining characters are dropped as the enclosing brackets.
    :return: list of floats; empty when there is nothing between the brackets
    :raises ValueError: if a comma-separated item cannot be converted by float()
    """
    inner = string.strip()[1:-1].strip()
    if not inner:
        # '[]' is a legitimate (empty) vector; float('') would raise
        return []
    # split on the bare comma: float() ignores surrounding blanks, so '1,2' and '1, 2' both parse
    return [float(item) for item in inner.split(',')]

In [3]:
def l2_norm_on_matrix(matrix):
    """
    Takes a np.matrix style object and l2-normalizes it.
    :param matrix: 2-D array-like handed to sklearn.preprocessing.normalize
    :return matrix: the result of normalize() called with its default arguments
    """
    from sklearn.preprocessing import normalize
    # Suppress warnings for this call only; a bare filterwarnings("ignore") would silence every
    # warning for the rest of the session.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        return normalize(matrix)

In [4]:
def sample_and_extend(list_of_vectors, total_samples):
    """
    Oversampling code for balanced training. We will do deep re-sampling, assuming that the vectors contain
    atoms.
    :param list_of_vectors: the list of vectors that are going to be re-sampled (randomly). Must not be empty.
    :param total_samples: The total number of vectors that we want in the list. Make sure that this number
    is higher than the length of list_of_vectors
    :return: the over-sampled data as built by np.append: the original vectors first, followed by the
    re-sampled copies
    :raises ValueError: if list_of_vectors is empty or already holds total_samples (or more) vectors
    """
    num_vectors = len(list_of_vectors)
    if num_vectors >= total_samples:
        raise ValueError('Check your lengths!')
    if num_vectors == 0:
        # nothing to draw from: the refill loop below could never reach the requested size
        raise ValueError('Cannot oversample an empty list of vectors.')

    # list(...) so the indices can be shuffled in place even where range() is lazy
    indices = list(range(num_vectors))
    shuffle(indices)
    desired_samples = total_samples - num_vectors
    # keep doubling the pool with a reshuffled copy of itself until it can cover the request
    while desired_samples > len(indices):
        new_indices = list(indices)
        shuffle(new_indices)
        indices += new_indices
    # list(...) copies each picked vector so the extra rows do not alias the originals
    new_data = [list(list_of_vectors[i]) for i in indices[0:desired_samples]]
    return np.append(list_of_vectors, new_data, axis=0)

In [5]:
def prepare_for_ML_classification(pos_neg_file, normalize=False):
    """
    Read labelled feature vectors from a tab-separated file and group them by label.
    Every non-blank line needs at least three tab-separated columns: the second column holds the
    feature vector in the text form parsed by convert_string_to_float_list, the third column the
    label (0 or 1). The first column is not read here.
    :param pos_neg_file: The file generated in one of the preprocess_filtered_* files
    :param normalize: if True, each class is returned as the output of l2_norm_on_matrix instead
    of a plain list of float lists
    :return: A dictionary where a 0,1 label references the vectors of that class. A class without
    any vector is left out of the dictionary.
    """
    features_by_label = {0: list(), 1: list()}
    with codecs.open(pos_neg_file, 'r', 'utf-8') as f:
        for line in f:
            # Strip only the line terminator: slicing off the last character would eat the label
            # of a final line that has no trailing newline.
            line = line.rstrip('\r\n')
            if not line:
                continue
            cols = re.split('\t', line)
            label = int(cols[2])
            if label in features_by_label:
                features_by_label[label].append(convert_string_to_float_list(cols[1]))
            else:
                print('error; label not recognized')

    result = dict()
    for label in (0, 1):
        features = features_by_label[label]
        if len(features) == 0:
            # same rule with and without normalization: an empty class gets no entry
            continue
        if normalize:
            result[label] = l2_norm_on_matrix(np.matrix(features))
        else:
            result[label] = features
    return result

In [6]:
def prepare_train_test_data(pos_neg_file, train_percent = 0.3, randomize=True, balanced_training=True, data_vectors=None):
    """
    Split the positive (label 1) and negative (label 0) vectors into training and test sets.
    :param pos_neg_file: file read with prepare_for_ML_classification; takes precedence over data_vectors
    :param train_percent: fraction of each class that goes into the training data (rounded up)
    :param randomize: If true, we'll randomize the data we're reading in from pos_neg_file. Otherwise, the initial
    train_percent fraction goes into the training data and the rest of it in the test data
    :param balanced_training: if True, we will equalize positive and negative training samples by oversampling
    the lesser class. For example, if we have 4 positive samples and 7 negative samples, we will randomly re-sample
    3 positive samples from the 4 positive samples, meaning there will be repetition. Use with caution.
    :param data_vectors: this should be set if pos_neg_file is None. It is mostly for internal uses, so
    that we can re-use this function by invoking it from some of the other _prepare_ files.
    :return: dictionary containing training/testing data/labels under the keys 'train_data',
    'train_labels', 'test_data' and 'test_labels'; positives come first in each of them
    :raises Exception: if neither pos_neg_file nor data_vectors is given
    :raises ValueError: if class 0 or class 1 is missing or has no samples
    """
    import math
    if pos_neg_file:
        data = prepare_for_ML_classification(pos_neg_file)
    elif data_vectors:
        data = data_vectors
    else:
        raise Exception('Neither pos_neg_file nor data_vectors argument is specified. Exiting.')

    # Fail early with a readable message instead of a KeyError/IndexError further down.
    for label in (0, 1):
        if label not in data or len(data[label]) == 0:
            raise ValueError('No samples found for class %d; both classes are required.' % label)

    train_pos_num = int(math.ceil(len(data[1])*train_percent))
    train_neg_num = int(math.ceil(len(data[0])*train_percent))

    # positives are split first so the shuffles happen in a fixed order (pos, then neg)
    train_data_pos, test_data_pos = _split_class(data[1], train_pos_num, randomize)
    train_data_neg, test_data_neg = _split_class(data[0], train_neg_num, randomize)

    if balanced_training and train_pos_num != train_neg_num:
        # oversample the smaller class up to the size of the larger one
        target_num = max(train_pos_num, train_neg_num)
        if train_pos_num < target_num:
            train_data_pos = sample_and_extend(train_data_pos, total_samples=target_num)
        else:
            train_data_neg = sample_and_extend(train_data_neg, total_samples=target_num)
        train_pos_num = train_neg_num = target_num

    results = dict()
    results['train_data'] = np.append(train_data_pos, train_data_neg, axis=0)
    results['train_labels'] = np.append([1] * train_pos_num, [0] * train_neg_num)
    results['test_data'] = np.append(test_data_pos, test_data_neg, axis=0)
    # label counts come from the test data itself so the two can never disagree
    results['test_labels'] = np.append([1] * len(test_data_pos), [0] * len(test_data_neg))

    return results


def _split_class(samples, train_num, randomize):
    """Split the samples of one class into (train, test); test is never empty."""
    if randomize:
        order = list(range(len(samples)))
        shuffle(order)
        train_part = [samples[i] for i in order[0:train_num]]
        test_part = [samples[i] for i in order[train_num:]]
    else:
        train_part = samples[0:train_num]
        test_part = samples[train_num:]
    if train_num >= len(samples):
        # everything went into training: fall back to the last sample as the single test item
        test_part = [samples[-1]]
    return train_part, test_part

In [7]:
def train_and_test_classifier(train_data, train_labels, test_data, test_labels, classifier_model, test_ids=None):
    """
    Fit the requested model on the training data, predict on the test data and print some metrics.
    Hyperparameters must be changed manually, we do not take them in as input.
    This method is for BINARY CLASSIFICATION only, although there is some support for regression.
    :param train_data: training vectors, one row per sample
    :param train_labels: labels of the training vectors
    :param test_data: test vectors, one row per sample
    :param test_labels: true labels of the test vectors; only used for the printed metrics
    :param classifier_model: one of 'random_forest', 'knn', 'logistic_regression' or
    'linear_regression' (the last one is a regressor; be careful)
    :param test_ids: if given, no metric is computed and [test_ids, predicted_probabilities] is returned
    :return: [test_ids, predicted_probabilities] when test_ids is given; otherwise
    [precision, recall, f-score] of class 1 for the classifiers and None for 'linear_regression'
    :raises ValueError: if classifier_model is not supported, or if test_ids is combined with
    'linear_regression', which has no class probabilities to return
    """
    supported_models = ('random_forest', 'knn', 'logistic_regression', 'linear_regression')
    if classifier_model not in supported_models:
        raise ValueError('Unknown classifier_model %r; expected one of %s'
                         % (classifier_model, ', '.join(supported_models)))
    is_regressor = classifier_model == 'linear_regression'
    if is_regressor and test_ids is not None:
        raise ValueError('linear_regression does not produce class probabilities; '
                         'test_ids cannot be used with it')

    from sklearn.linear_model import LogisticRegression, LinearRegression
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.metrics import roc_auc_score, accuracy_score, precision_recall_fscore_support

    if classifier_model == 'random_forest':
        model = RandomForestClassifier()
    elif classifier_model == 'knn':
        model = KNeighborsClassifier(n_neighbors=9, weights='uniform')
    elif classifier_model == 'logistic_regression':
        model = LogisticRegression()
    else:
        model = LinearRegression()

    model.fit(train_data, train_labels)
    predicted_labels = model.predict(test_data)
    if is_regressor:
        predicted_probabilities = None
    else:
        predicted_probabilities = model.predict_proba(test_data)

    if test_ids is not None:
        return [test_ids, predicted_probabilities]

    if is_regressor:
        # the regressor output is already a continuous score
        auc_scores = predicted_labels
    else:
        # Rank by the probability of class 1: hard 0/1 predictions collapse the ROC curve to a
        # single operating point.
        positive_column = list(model.classes_).index(1)
        auc_scores = predicted_probabilities[:, positive_column]
    print('AUC (Area Under Curve):  %s' % roc_auc_score(test_labels, auc_scores))

    if is_regressor:
        return None

    print('Accuracy:  %s' % accuracy_score(test_labels, predicted_labels))
    print('Class 0\tClass 1')
    # one array per metric, each indexed by class
    per_class_scores = precision_recall_fscore_support(test_labels, predicted_labels)
    metric_names = ['Precision: ', 'Recall: ', 'F-score: ', 'Support: ']
    for metric_name, values in zip(metric_names, per_class_scores):
        print('%s %s' % (metric_name, values))
    # precision, recall and f-score of the positive class
    return [per_class_scores[0][1], per_class_scores[1][1], per_class_scores[2][1]]

In [8]:
# Notebook parameters: data location and which split ("train" or "test") to work on.
data_dir = "../data"
prefix = "train"
#prefix = "test"
# "train" has its own input file; any other prefix falls back to the test file
input_file = "train_adjusted.json" if prefix == "train" else "test_adjusted_unlabelled.json"

In [9]:
# Seed the `random` module so the shuffled train/test split (and the oversampling, which uses the
# same shuffle) is identical on every Restart & Run All.
SEED = 42
random.seed(SEED)

train_percent = 0.5
pos_neg_file = os.path.join(data_dir, prefix+"_images_faces_stats_mayank.tsv")
data_dict = prepare_train_test_data(pos_neg_file, train_percent=train_percent)
# 'logistic_regression', 'random_forest', 'knn'
data_dict['classifier_model'] = 'logistic_regression'
# data_dict keys match the keyword arguments of train_and_test_classifier
results = train_and_test_classifier(**data_dict)

ImportError: dlopen(/Users/svebor/anaconda/lib/python2.7/site-packages/scipy-0.18.1-py2.7-macosx-10.6-x86_64.egg/scipy/sparse/linalg/isolve/_iterative.so, 2): Library not loaded: libmkl_intel_lp64.dylib
  Referenced from: /Users/svebor/anaconda/lib/python2.7/site-packages/scipy-0.18.1-py2.7-macosx-10.6-x86_64.egg/scipy/sparse/linalg/isolve/_iterative.so
  Reason: image not found