In [1]:
# Import data
import pandas as pd
import numpy as np

# To show all data in dataframe
pd.set_option('display.max_colwidth', None)

raw = pd.read_json("kym.json")                   # 28799 meme templates from Know Your Meme, 16 features
#vision = pd.read_json("kym_vision.json").T         # computer vision data with links to meme instances
#about = pd.read_json("kym_spotlight.json").T       # text entities from the About section on Know Your Meme

In [2]:
# Changing large integers to readable dates
# Selecting memes that were edited in 2019–2021Q2
# That makes ~10k memes in total

import datetime
from collections import Counter

memes = raw[raw['category'] == "Meme"] # Selecting only memes

# Changing large integers to readable dates
memes['added'] = memes['added'].apply(lambda x: datetime.datetime.fromtimestamp(x) if not pd.isnull(x) else x)
memes['last_update_source'] = memes['last_update_source'].apply(lambda x: datetime.datetime.fromtimestamp(x) if not pd.isnull(x) else x)
years = []
for year in memes['last_update_source']:
    years.append(int(str(year)[:4]) if not pd.isnull(year) else 0)

# Filtering out memes that were last updated in 2019-2021
memes['last_update_source'] = years
fresh = memes[memes['last_update_source'] >= 2019]

# Removing (two) entries that don't have a description
descriptions = []
missing_desc = []
for title, d in zip(fresh['title'], fresh['content']):
    if 'about' in d.keys() and 'text' in d['about'].keys():
        descriptions.append(d['about']['text'][0])
    else:
        missing_desc.append(title)

fresh = fresh[~fresh['title'].isin(missing_desc)]
fresh['about'] = descriptions

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  memes['added'] = memes['added'].apply(lambda x: datetime.datetime.fromtimestamp(x) if not pd.isnull(x) else x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  memes['last_update_source'] = memes['last_update_source'].apply(lambda x: datetime.datetime.fromtimestamp(x) if not pd.isnull(x) else x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/inde

In [4]:
# Prepare data for Toloka
# template_image_url and about
toloka = fresh[['template_image_url', 'about']]
toloka.head()
# toloka.to_csv("toloka.tsv", header = False, index = False, sep="\t") # Quotes might be messed up, see Toloka

Unnamed: 0,template_image_url,about
0,https://i.kym-cdn.com/entries/icons/original/000/000/021/relevant-to-my-interests.jpg,"""This is Relevant To My Interests"" is an expression used to convey approval and enthusiasm over a given topic. It is most commonly used on media-sharing websites in the form of text-based comments or image macros featuring photographs of concerned-looking animals."
1,https://i.kym-cdn.com/entries/icons/original/000/000/023/roflcopter-2.jpg,"ROFLcopter is an Internet slang term used as a superlative of the abbreviation ROFL (""rolling on the floor laughing""). Like the slang term Lollerskates, ROFLcopter inspired a GIF animation of an ASCII helicopter with spinning ""ROFL"" blades."
2,https://i.kym-cdn.com/entries/icons/original/000/000/022/bitches-dont-know.jpg,"Bitches Don't Know is an exploitable image macro based upon a picture of a man wearing a shirt saying ""BITCHES DONT KNOW BOUT MY DICK"". The exploitable is commonly used by replacing the man's face and wording on his shirt via Photoshop to provide the appearance that another character is wearing the shirt."
3,https://i.kym-cdn.com/entries/icons/original/000/000/016/brittnee.jpg,"""Leave Britney Alone"" is a viral video in which YouTube celebrity Chris Crocker cries while pleading with viewers to leave pop star Britney Spears alone."
4,https://i.kym-cdn.com/entries/icons/original/000/000/015/oreally.jpg,"O RLY is internet slang for “OH REALLY?” with implicit sarcasm. Since gaining traction through with usage on Internet forums, O RLY has become a popular deadpan response to any statement that is deemed either highly doubtful or obviously true."


In [72]:
# Exploration: siblings are weird

c = 0
for title, url, sib in zip(raw['title'], raw['url'], raw['siblings']):
    if type(sib) == list:
        print(title, url, sib)

    c += 1
    if c == 10:
        break

ROFLcopter https://knowyourmeme.com/memes/roflcopter ['https://knowyourmeme.com/memes/lollerskates', 'https://knowyourmeme.com/memes/roflcopter']
Leave Britney Alone https://knowyourmeme.com/memes/leave-britney-alone ['https://knowyourmeme.com/memes/leave-britney-alone']
O RLY? https://knowyourmeme.com/memes/o-rly ['https://knowyourmeme.com/memes/normie', 'https://knowyourmeme.com/memes/oh-hai', 'https://knowyourmeme.com/memes/epic-thread', 'https://knowyourmeme.com/memes/op-is-a-faggot', 'https://knowyourmeme.com/memes/you-win-the-internet', 'https://knowyourmeme.com/memes/u-mad', 'https://knowyourmeme.com/memes/enjoy-your-aids', 'https://knowyourmeme.com/memes/y-helo-thar', 'https://knowyourmeme.com/memes/what-is-this-i-dont-even', 'https://knowyourmeme.com/memes/inb4--2', 'https://knowyourmeme.com/memes/i-has-a-shuvel', 'https://knowyourmeme.com/memes/what-has-been-seen-cannot-be-unseen', 'https://knowyourmeme.com/memes/i-did-it-for-the-lulz', 'https://knowyourmeme.com/memes/pics-or

In [74]:
# Web entity detection in Google's Vision AI data
vision.head()

Unnamed: 0,labelAnnotations,safeSearchAnnotation,webDetection,error
https://knowyourmeme.com/memes/this-is-relevant-to-my-interests,"[{'mid': '/m/01yrx', 'description': 'Cat', 'sc...","{'adult': 'VERY_UNLIKELY', 'spoof': 'VERY_LIKE...","{'webEntities': [{'entityId': '/m/04wpw', 'sco...",
https://knowyourmeme.com/memes/leave-britney-alone,"[{'mid': '/m/0dzct', 'description': 'Face', 's...","{'adult': 'UNLIKELY', 'spoof': 'LIKELY', 'medi...","{'webEntities': [{'entityId': '/m/03bzkh4', 's...",
https://knowyourmeme.com/memes/i-like-turtles,"[{'mid': '/m/068jd', 'description': 'Photograp...","{'adult': 'UNLIKELY', 'spoof': 'VERY_LIKELY', ...","{'webEntities': [{'entityId': '/m/09dzg', 'sco...",
https://knowyourmeme.com/memes/bitches-dont-know,"[{'mid': '/m/09j2d', 'description': 'Clothing'...","{'adult': 'VERY_UNLIKELY', 'spoof': 'VERY_LIKE...","{'webEntities': [{'entityId': '/m/09kqc', 'sco...",
https://knowyourmeme.com/memes/o-rly,"[{'mid': '/m/04hgtk', 'description': 'Head', '...","{'adult': 'VERY_UNLIKELY', 'spoof': 'VERY_LIKE...","{'webEntities': [{'entityId': '/m/06ng23', 'sc...",


In [103]:
c = 0
print(instances.keys())
print()
for instances in vision['webDetection']:
    if 'partialMatchingImages' in instances:
        print(instances['partialMatchingImages'])
    print()
    #print(instances['webEntities'])

    c += 1
    if c == 20:
        break

dict_keys(['webEntities', 'fullMatchingImages', 'partialMatchingImages', 'pagesWithMatchingImages', 'visuallySimilarImages', 'bestGuessLabels'])

[{'url': 'https://cdn2.psychologytoday.com/assets/styles/manual_crop_4_3_600x450/public/blogs/1023/2011/12/81873-72368.gif?itok=3hTXTKTS'}, {'url': 'https://play-lh.googleusercontent.com/a-/AOh14Gi1NGbpcgf9juih2-clnO0eDbdlHZlXEw-zv9MPig'}, {'url': 'https://qph.fs.quoracdn.net/main-thumb-1781520-200-9TfEHEPI2J0GJWjND9yHMAXDVbBCoGci.jpeg'}, {'url': 'https://slideplayer.com/slide/14333424/89/images/29/Relevance%28NOUN%29+connected+with+the+matter+at+hand.jpg'}, {'url': 'https://img.desmotivaciones.es/201106/25342_384764295137_711920137_3685334_4987243_n.jpg'}, {'url': 'https://i.pinimg.com/236x/7d/02/00/7d0200b95accd637b839b4231a00a8f6.jpg'}, {'url': 'https://i.kym-cdn.com/photos/images/newsfeed/000/041/227/19042.jpg'}, {'url': 'https://i.warosu.org/data/tg/thumb/0094/61/1272375777042s.jpg'}, {'url': 'https://slideplayer.gr/slide/12065239/69/ima