In [182]:
import os
import glob
import json
import numpy as np

from tqdm import tqdm
from collections import Counter
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import label_ranking_average_precision_score

First task is to get the data into the system. The data is located at `/media/kashgar/data/pnn_training/ph_straight_mdonly/` and `/media/kashgar/data/pnn_training/ph_straight_videos/`.

Counting the json files.

In [159]:
PATHS = ['/media/kashgar/data/pnn_training/ph_straight_mdonly/*.json', '/media/kashgar/data/pnn_training/ph_straight_videos/*.json']

In [160]:
n = 0

for path in PATHS:
    for file in glob.glob(path):
        n += 1
        this = file
print(n)

72255


See what's in one json file.

In [161]:
with open(this, 'r') as f:
    data = json.load(f)
    print(json.dumps(data, indent=4, sort_keys=True))

{
    "age_limit": 18,
    "categories": [
        "Pornstar",
        "Reality",
        "Teen",
        "POV",
        "Small Tits",
        "For Women",
        "HD"
    ],
    "comment_count": 154,
    "dislike_count": 5521,
    "display_id": "ph58b4678809eed",
    "downloadtime": "2017-09-25 05:42:22",
    "duration": 479,
    "ext": "mp4",
    "extractor": "PornHub",
    "extractor_key": "PornHub",
    "format": "0 - unknown",
    "format_id": "0",
    "http_headers": {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.7",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "en-us,en;q=0.5",
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)"
    },
    "id": "ph58b4678809eed",
    "like_count": 17462,
    "playlist": null,
    "playlist_index": null,
    "pornstars": [
        "/pornstar/alexa-grace"
    ],
    "productio

Let's get all keys from all json files.

In [162]:
keys = set()

for path in PATHS:
    for file in tqdm(glob.glob(path)):
        with open(file, 'r') as f:
            data = json.load(f)
            keys.update(list(data.keys()))
print(keys)

100%|██████████| 52191/52191 [00:02<00:00, 23123.66it/s]
100%|██████████| 20064/20064 [00:01<00:00, 17602.93it/s]

{'http_headers', 'webpage_url', 'playlist_index', 'requested_subtitles', 'display_id', 'format_id', 'playlist', 'production', 'ext', 'dislike_count', 'downloadtime', 'thumbnails', 'comment_count', 'like_count', 'url', 'categories', 'thumbnail', 'webpage_url_basename', 'duration', 'extractor_key', 'view_count', 'tags', 'age_limit', 'uploader', 'protocol', 'title', 'extractor', 'id', 'pornstars', 'format'}





In [163]:
keys

{'age_limit',
 'categories',
 'comment_count',
 'dislike_count',
 'display_id',
 'downloadtime',
 'duration',
 'ext',
 'extractor',
 'extractor_key',
 'format',
 'format_id',
 'http_headers',
 'id',
 'like_count',
 'playlist',
 'playlist_index',
 'pornstars',
 'production',
 'protocol',
 'requested_subtitles',
 'tags',
 'thumbnail',
 'thumbnails',
 'title',
 'uploader',
 'url',
 'view_count',
 'webpage_url',
 'webpage_url_basename'}

Keys of interest:
* `categories`
* `pornstars`
* `production`
* `tags`
* `title`, `view_count`, `comment_count`, `like_count`, `dislike_count` (maybe)

For `title`, we'd have to tokenize it and then clean up the results, probably with stemming or the like. For the others in the last bullet point we'd have to discretize it somehow. 

Now, let's count each of the features, and see how they're distributed. 

In [164]:
c = Counter()
koi = ['categories', 'pornstars', 'production', 'tags']

for path in PATHS:
    for file in tqdm(glob.glob(path)):
        with open(file, 'r') as f:
            data = json.load(f)
            for k in koi:
                these = data[k]
                these = [k + '__' + x for x in these]
                c.update(these)

100%|██████████| 52191/52191 [00:02<00:00, 17730.74it/s]
100%|██████████| 20064/20064 [00:01<00:00, 17637.24it/s]


In [165]:
q = list(c.items())
q = sorted(q, key=lambda x:x[1], reverse=True)

In [166]:
n = 0
for tag, count in q:
    if count >= 500:
        n += 1

In [167]:
print(n, len(q))

326 43995


There are 43,995 features, but only 820 occur in more than 100 instances.

Tentatively, we will limit features of interest to only these 820 features. The problem is, given our original pool of 72,225 videos, if we eliminate many of these features, we'll probably eliminate a lot of these videos as well. How many videos will be left over?

In [168]:
vocab = set([k for k, v in q if v >= 500])
n = 0
for path in PATHS:
    for file in tqdm(glob.glob(path)):
        with open(file, 'r') as f:
            data = json.load(f)
            this = []
            for k in koi:
                these = data[k]
                these = [k + '__' + x for x in these]
                this += these
            this = [x for x in this if x in vocab]
            if this:
                n += 1
print(n)

100%|██████████| 52191/52191 [00:02<00:00, 21160.70it/s]
100%|██████████| 20064/20064 [00:00<00:00, 21144.11it/s]

72255





Doesn't seem to be a problem.

In [169]:
features = []
for path in PATHS:
    for file in tqdm(glob.glob(path)):
        with open(file, 'r') as f:
            data = json.load(f)
            this = []
            for k in koi:
                these = data[k]
                these = [k + '__' + x for x in these]
                this += these
            this = [x for x in this if x in vocab]
            features.append(this)

100%|██████████| 52191/52191 [00:02<00:00, 19710.89it/s]
100%|██████████| 20064/20064 [00:00<00:00, 20761.89it/s]


In [170]:
l_c = Counter()
for f in features:
    l_c.update([len(f)])


In [171]:
l_c.most_common()

[(17, 7172),
 (16, 7057),
 (18, 6786),
 (15, 6530),
 (19, 5917),
 (14, 5511),
 (20, 5026),
 (13, 4607),
 (21, 3704),
 (12, 3700),
 (11, 3025),
 (22, 2463),
 (10, 2404),
 (9, 1893),
 (23, 1347),
 (8, 1316),
 (7, 987),
 (6, 735),
 (24, 696),
 (5, 387),
 (25, 357),
 (4, 232),
 (26, 151),
 (3, 102),
 (27, 59),
 (28, 39),
 (29, 18),
 (2, 15),
 (1, 9),
 (30, 5),
 (31, 2),
 (32, 2),
 (36, 1)]

In [172]:
f_c = Counter()
for f in features:
    f_c.update(f)
f_c.most_common()

[('production__professional', 65964),
 ('categories__HD', 32369),
 ('categories__Pornstar', 27458),
 ('tags__blowjob', 25909),
 ('tags__brunette', 20806),
 ('categories__Amateur', 18389),
 ('tags__big tits', 17611),
 ('tags__cumshot', 17238),
 ('categories__Teen', 16793),
 ('categories__Big Tits', 16029),
 ('tags__blonde', 15754),
 ('tags__hardcore', 14703),
 ('categories__Hardcore', 14245),
 ('tags__natural tits', 13728),
 ('tags__amateur', 13646),
 ('tags__young', 12747),
 ('categories__Brunette', 12252),
 ('tags__teen', 10818),
 ('tags__small tits', 10653),
 ('tags__big boobs', 10301),
 ('categories__Blonde', 10108),
 ('categories__Anal', 10016),
 ('tags__facial', 9595),
 ('tags__teenager', 9498),
 ('tags__anal', 9108),
 ('tags__babe', 9081),
 ('tags__homemade', 8764),
 ('tags__big dick', 8701),
 ('categories__Masturbation', 8672),
 ('categories__Babe', 8665),
 ('categories__Blowjob', 8221),
 ('categories__Big Dick', 8195),
 ('categories__MILF', 8122),
 ('tags__orgasm', 7845),
 ('ta

Use sklearn's multilabel binarizer to make features. 

In [173]:
mlb = MultiLabelBinarizer()
X = mlb.fit_transform(features)

In [174]:
X.shape

(72255, 326)

Features are ready. Now it's time to move into modeling it.

It turns out that tied-weight autoencoders are a little unintuitive to implement in Keras. I will try to use an implementation similar to https://github.com/AmirAlavi/tied-autoencoder-keras/blob/master/tied_autoencoder_keras/autoencoders.py.

In [175]:
from keras import backend as K
from keras.engine import InputSpec
from keras.layers import Dense, Dropout
from keras.engine.topology import Layer


class DenseLayerAutoencoder(Dense):
    def __init__(self, layer_sizes, l2_normalize=False, dropout=0.0, *args, **kwargs):
        self.layer_sizes = layer_sizes
        self.l2_normalize = l2_normalize
        self.dropout = dropout
        self.kernels = []
        self.biases = []
        self.biases2 = []
        self.uses_learning_phase = True
        super().__init__(units=1, *args, **kwargs)  # 'units' not used

    def compute_output_shape(self, input_shape):
        return input_shape

    def build(self, input_shape):
        assert len(input_shape) >= 2
        input_dim = input_shape[-1]
        self.input_spec = InputSpec(min_ndim=2, axes={-1: input_dim})

        for i in range(len(self.layer_sizes)):

            self.kernels.append(
                self.add_weight(
                    shape=(
                        input_dim,
                        self.layer_sizes[i]),
                    initializer=self.kernel_initializer,
                    name='ae_kernel_{}'.format(i),
                    regularizer=self.kernel_regularizer,
                    constraint=self.kernel_constraint))

            if self.use_bias:
                self.biases.append(
                    self.add_weight(
                        shape=(
                            self.layer_sizes[i],
                        ),
                        initializer=self.bias_initializer,
                        name='ae_bias_{}'.format(i),
                        regularizer=self.bias_regularizer,
                        constraint=self.bias_constraint))
            input_dim = self.layer_sizes[i]

        if self.use_bias:
            for n, i in enumerate(range(len(self.layer_sizes)-2, -1, -1)):
                self.biases2.append(
                    self.add_weight(
                        shape=(
                            self.layer_sizes[i],
                        ),
                        initializer=self.bias_initializer,
                        name='ae_bias2_{}'.format(n),
                        regularizer=self.bias_regularizer,
                        constraint=self.bias_constraint))
            self.biases2.append(self.add_weight(
                        shape=(
                            input_shape[-1],
                        ),
                        initializer=self.bias_initializer,
                        name='ae_bias2_{}'.format(len(self.layer_sizes)),
                        regularizer=self.bias_regularizer,
                        constraint=self.bias_constraint))

        self.built = True

    def call(self, inputs):
        return self.decode(self.encode(inputs))

    def _apply_dropout(self, inputs):
        dropped =  K.dropout(inputs, self.dropout)
        return K.in_train_phase(dropped, inputs)

    def encode(self, inputs):
        latent = inputs
        for i in range(len(self.layer_sizes)):
            if self.dropout > 0:
                latent = self._apply_dropout(latent)
            latent = K.dot(latent, self.kernels[i])
            if self.use_bias:
                latent = K.bias_add(latent, self.biases[i])
            if self.activation is not None:
                latent = self.activation(latent)
        if self.l2_normalize:
            latent = latent / K.l2_normalize(latent, axis=-1)
        return latent

    def decode(self, latent):
        recon = latent
        for i in range(len(self.layer_sizes)):
            if self.dropout > 0:
                recon = self._apply_dropout(recon)
            recon = K.dot(recon, K.transpose(self.kernels[len(self.layer_sizes) - i - 1]))
            if self.use_bias:
                recon = K.bias_add(recon, self.biases2[i])
            if self.activation is not None:
                recon = self.activation(recon)
        return recon

    def get_config(self):
        config = {
            'layer_sizes': self.layer_sizes
        }
        base_config = super().get_config()
        base_config.pop('units', None)
        return dict(list(base_config.items()) + list(config.items()))

    @classmethod
    def from_config(cls, config):
        return cls(**config)

In [185]:
from keras.models import Model
from keras.layers import Input
from keras import regularizers

In [186]:
inputs = Input(shape=(X.shape[-1],))
x = DenseLayerAutoencoder([50], activation='sigmoid', kernel_regularizer=regularizers.l2(0.001))(inputs)
model = Model(inputs=inputs, outputs=x)

In [187]:
model.summary()

Model: "model_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_8 (InputLayer)         (None, 326)               0         
_________________________________________________________________
dense_layer_autoencoder_7 (D (None, 326)               16676     
Total params: 16,676
Trainable params: 16,676
Non-trainable params: 0
_________________________________________________________________


In [188]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[label_ranking_average_precision_score])

TypeError: len is not well defined for symbolic Tensors. (dense_layer_autoencoder_7_target:0) Please call `x.shape` rather than `len(x)` for shape information.

In [184]:
model.fit(X, X, epochs=50, batch_size=32, validation_split=0.2, shuffle=True)

Train on 57804 samples, validate on 14451 samples
Epoch 1/50


FailedPreconditionError:  Error while reading resource variable _AnonymousVar132 from Container: localhost. This could mean that the variable was uninitialized. Not found: Resource localhost/_AnonymousVar132/N10tensorflow3VarE does not exist.
	 [[node ReadVariableOp_109 (defined at /home/vqmalic/projects/ptropes/venv/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py:1751) ]] [Op:__inference_keras_scratch_graph_3222210]

Function call stack:
keras_scratch_graph


In [124]:
n = 5
x0 = X[n]
x0_pred = model.predict(X[n:n+1])[0]

In [125]:
for i in range(len(x0)):
    print(x0[i], x0_pred[i])

0 0.0055862367
1 0.25297603
0 0.13905764
0 0.0024789274
0 0.052024603
0 0.118876845
0 0.0027572215
0 0.0036103725
0 0.07977435
0 0.11586332
0 0.2215958
0 0.012039751
0 0.13759279
0 0.11255753
0 0.020021826
0 0.004940301
0 0.007494867
0 0.16869831
0 0.005712062
0 0.009024918
0 0.005071312
0 0.0069226325
0 0.03320676
0 0.0031497777
0 0.036252916
0 0.002589941
0 0.055633783
0 0.006905526
0 0.021773517
0 0.05048102
0 0.034555912
0 0.047313362
0 0.06365639
0 0.0060328543
0 0.0014701188
0 0.0046596527
0 0.0035141706
0 0.017704278
0 0.009564221
1 0.44552124
0 0.017811418
0 0.1984475
0 0.0054636598
0 0.055607855
0 0.0026760995
0 0.02800566
0 0.0017437339
0 0.038307697
0 0.072633594
0 0.11158806
0 0.012283921
0 0.12199268
0 0.026130974
0 0.0070393384
0 0.0076319575
0 0.032125533
0 0.09201354
0 0.0017075539
1 0.01068005
0 0.005044192
0 0.041381568
0 0.37995452
1 0.051893383
0 0.017658144
0 0.05188206
0 0.030171067
0 0.00883469
0 0.035774767
0 0.0063951015
0 0.0037451982
0 0.009423584
0 0.0920989

In [126]:
w = model.layers[-1].get_weights()[0]

In [127]:
w.shape

(820, 50)

In [146]:
for t in range(w.shape[-1]):
    print("*"*50)
    print(t)
    print("*"*50)
    vec = w[:, t]

    sort = vec.argsort()[::-1]

    for i in sort[:10]:
        print("\t", vec[i], labels[i])
    print("*"*50)

**************************************************
0
**************************************************
	 0.0009763749 tags__masturbation
	 0.00096291694 tags__cum
	 0.0009267547 categories__Webcam
	 0.0008441281 categories__Amateur
	 0.0008105555 tags__busty
	 0.00080281094 tags__cam
	 0.00072215695 tags__blowjob
	 0.0007168276 tags__masturbate
	 0.00071373273 tags__dark haired
	 0.0007084691 tags__anal
**************************************************
**************************************************
1
**************************************************
	 0.0012164691 tags__blowjob
	 0.0012132467 tags__hardcore
	 0.0010412809 tags__russian
	 0.0010254694 tags__redhead
	 0.00096230383 tags__anal
	 0.0009469682 tags__cum
	 0.00090844487 categories__Russian
	 0.000832817 categories__Creampie
	 0.0008315255 tags__big tits
	 0.0007576238 tags__facial
**************************************************
**************************************************
2
***********************************

In [137]:
for i, cls in enumerate(labels):
    if '__foot' in cls:
        print(i, cls)

372 tags__foot
373 tags__foot fetish
374 tags__footjob


In [141]:
np.round(model.layers[-1].get_weights()[0], 4)

array([[-1.e-04, -3.e-04, -3.e-04, ..., -2.e-04, -0.e+00, -2.e-04],
       [ 8.e-04,  5.e-04,  4.e-04, ...,  1.e-03,  7.e-04,  3.e-04],
       [-1.e-04,  3.e-04, -1.e-04, ...,  1.e-04, -2.e-04,  6.e-04],
       ...,
       [-0.e+00, -0.e+00, -0.e+00, ..., -0.e+00, -0.e+00, -0.e+00],
       [ 0.e+00,  0.e+00,  0.e+00, ...,  0.e+00,  0.e+00,  0.e+00],
       [-4.e-04, -5.e-04, -6.e-04, ..., -4.e-04, -3.e-04, -4.e-04]],
      dtype=float32)