In [24]:
import os
import glob
import json
from tqdm import tqdm
from collections import Counter

First task is to get the data into the system. The data is located at `/media/kashgar/data/pnn_training/ph_straight_mdonly/` and `/media/kashgar/data/pnn_training/ph_straight_videos/`.

Counting the json files.

In [6]:
# Glob patterns for the two metadata directories (one JSON file per video).
PATHS = [
    '/media/kashgar/data/pnn_training/ph_straight_mdonly/*.json',
    '/media/kashgar/data/pnn_training/ph_straight_videos/*.json',
]

In [8]:
# Count every JSON file matched by the patterns in PATHS.
n = 0

for pattern in PATHS:
    matches = glob.glob(pattern)
    n += len(matches)
    if matches:
        # Keep the last path seen so one example file can be inspected later.
        this = matches[-1]
print(n)

72255


See what's in one json file.

In [11]:
# Parse the sample file, then pretty-print it with alphabetically sorted keys.
with open(this, 'r') as fh:
    data = json.load(fh)

print(json.dumps(data, indent=4, sort_keys=True))

{
    "age_limit": 18,
    "categories": [
        "Pornstar",
        "Reality",
        "Teen",
        "POV",
        "Small Tits",
        "For Women",
        "HD"
    ],
    "comment_count": 154,
    "dislike_count": 5521,
    "display_id": "ph58b4678809eed",
    "downloadtime": "2017-09-25 05:42:22",
    "duration": 479,
    "ext": "mp4",
    "extractor": "PornHub",
    "extractor_key": "PornHub",
    "format": "0 - unknown",
    "format_id": "0",
    "http_headers": {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.7",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "en-us,en;q=0.5",
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)"
    },
    "id": "ph58b4678809eed",
    "like_count": 17462,
    "playlist": null,
    "playlist_index": null,
    "pornstars": [
        "/pornstar/alexa-grace"
    ],
    "productio

Let's get all keys from all json files.

In [17]:
# Union of the top-level keys found across every JSON file.
keys = set()

for pattern in PATHS:
    for json_path in tqdm(glob.glob(pattern)):
        with open(json_path, 'r') as fh:
            keys.update(json.load(fh).keys())
print(keys)

100%|██████████| 52191/52191 [06:16<00:00, 138.66it/s] 
100%|██████████| 20064/20064 [04:45<00:00, 70.25it/s]

{'age_limit', 'comment_count', 'duration', 'extractor_key', 'http_headers', 'url', 'downloadtime', 'webpage_url', 'pornstars', 'dislike_count', 'id', 'requested_subtitles', 'playlist_index', 'display_id', 'like_count', 'thumbnails', 'protocol', 'playlist', 'categories', 'ext', 'view_count', 'thumbnail', 'title', 'uploader', 'production', 'tags', 'webpage_url_basename', 'format', 'format_id', 'extractor'}





In [18]:
# Echo the collected key set as the cell's result.
keys

{'age_limit',
 'categories',
 'comment_count',
 'dislike_count',
 'display_id',
 'downloadtime',
 'duration',
 'ext',
 'extractor',
 'extractor_key',
 'format',
 'format_id',
 'http_headers',
 'id',
 'like_count',
 'playlist',
 'playlist_index',
 'pornstars',
 'production',
 'protocol',
 'requested_subtitles',
 'tags',
 'thumbnail',
 'thumbnails',
 'title',
 'uploader',
 'url',
 'view_count',
 'webpage_url',
 'webpage_url_basename'}

Keys of interest:
* `categories`
* `pornstars`
* `production`
* `tags`
* `title`, `view_count`, `comment_count`, `like_count`, `dislike_count` (maybe)

For `title`, we'd have to tokenize it and then clean up the results, probably with stemming or the like. For the other keys in the last bullet point, we'd have to discretize them somehow.

Now, let's count each of the features, and see how they're distributed. 

In [26]:
# Tally how often each feature occurs, where a feature is "<key>__<value>"
# for every value stored under one of the keys of interest.
# NOTE(review): this iterates data[key] directly, so it assumes each value is
# a list of strings; a bare string would be split into single characters.
# Confirm this holds for `production`.
c = Counter()
koi = ['categories', 'pornstars', 'production', 'tags']

for pattern in PATHS:
    for json_path in tqdm(glob.glob(pattern)):
        with open(json_path, 'r') as fh:
            data = json.load(fh)
        for key in koi:
            c.update(key + '__' + value for value in data[key])

100%|██████████| 52191/52191 [00:02<00:00, 17533.57it/s]
100%|██████████| 20064/20064 [00:01<00:00, 17463.83it/s]


In [30]:
# (feature, count) pairs ordered by descending count; ties keep first-seen order.
q = c.most_common()

In [33]:
# Number of features that occur at least 100 times.
n = sum(1 for _, count in q if count >= 100)

In [35]:
# Features meeting the frequency threshold vs. total number of distinct features.
print(n, len(q))

820 43995


There are 43,995 features, but only 820 occur in at least 100 instances.

Tentatively, we will limit the features of interest to only these 820 features. The problem is that, given our original pool of 72,255 videos, if we eliminate many of these features, we'll probably eliminate a lot of the videos as well. How many videos will be left over?

In [44]:
# Vocabulary: the features that occur at least 100 times.
vocab = {feature for feature, count in q if count >= 100}

# Count the videos that still have at least one in-vocabulary feature.
n = 0
for pattern in PATHS:
    for json_path in tqdm(glob.glob(pattern)):
        with open(json_path, 'r') as fh:
            data = json.load(fh)
        candidates = [key + '__' + value for key in koi for value in data[key]]
        kept = [feature for feature in candidates if feature in vocab]
        if kept:
            n += 1
print(n)

100%|██████████| 52191/52191 [00:02<00:00, 21419.37it/s]
100%|██████████| 20064/20064 [00:00<00:00, 20536.58it/s]

72255





Doesn't seem to be a problem.

In [46]:
# One list of in-vocabulary features per video, in glob iteration order.
features = []
for pattern in PATHS:
    for json_path in tqdm(glob.glob(pattern)):
        with open(json_path, 'r') as fh:
            data = json.load(fh)
        candidates = [key + '__' + value for key in koi for value in data[key]]
        features.append([feature for feature in candidates if feature in vocab])

100%|██████████| 52191/52191 [00:02<00:00, 20006.03it/s]
100%|██████████| 20064/20064 [00:00<00:00, 20430.80it/s]


In [61]:
# Histogram of how many in-vocabulary features each video has.
l_c = Counter(len(video_features) for video_features in features)


In [63]:
# (features per video, number of videos) pairs, most common count first.
l_c.most_common()

[(18, 7525),
 (19, 7511),
 (20, 7108),
 (17, 7009),
 (21, 5814),
 (16, 5805),
 (15, 4777),
 (22, 4155),
 (14, 3790),
 (13, 2963),
 (23, 2777),
 (12, 2342),
 (11, 1966),
 (24, 1653),
 (10, 1617),
 (9, 1272),
 (8, 915),
 (25, 859),
 (7, 727),
 (6, 484),
 (26, 449),
 (5, 237),
 (27, 179),
 (4, 105),
 (28, 84),
 (3, 53),
 (29, 46),
 (30, 15),
 (31, 8),
 (2, 7),
 (32, 2),
 (37, 1)]

Use sklearn's multilabel binarizer to make features. 