In [7]:
# Import data
import pandas as pd
import numpy as np
pd.options.mode.chained_assignment = None  # remove warning

# To show all data in dataframe
pd.set_option('display.max_colwidth', None)

raw = pd.read_json("data/kym.json")                 # 28799 templates from Know Your Meme, 16 features
events = pd.read_json("data/events.json")           # 15406 memes with timestamps from origin and spread
# vision = pd.read_json("data/kym_vision.json").T         # computer vision data with links to meme instances
# about = pd.read_json("data/kym_spotlight.json").T       # text entities from the About section on Know Your Meme

In [115]:
# Function definitions

# Finds SPREAD keywords in memes' spread data
# Input: kym dataframe. Output: tuple (list of True/False, list of frequency dictionaries of SPREAD keywords).
def matchSpread(memes):
    match = []
    freq_dics = []
    for d in memes['content']:
        if 'spread' in d.keys() and 'text' in d['spread'].keys():
            dic = {keyword: 0 for keyword in SPREAD}
            for text in d['spread']['text']:
                words = text.split(" ")
                for word in words:
                    if word.lower() in SPREAD:
                        dic[word.lower()] += 1
            if sum (dic.values()) == 0:
                match.append(False)
            else:
                match.append(True)
            freq_dics.append(dic)
        else:
            match.append(False)
    return (match, freq_dics)

In [10]:
# See available options for parameters

# Types
types = set()
typedata = raw[raw['details'].map(lambda x: 'type' in x.keys())]['details']
for d in typedata:
    for t in d['type']:
        types.add(t.split("https://knowyourmeme.com/types/")[1])
print("Types available:", types)

# Spread (keyword search)
example_platforms = ["Facebook", "Twitter", "Instagram", "Snapchat", "YouTube", "WhatsApp", "TikTok", "Reddit", "Pinterest", "Tumblr", "LinkedIn", "9GAG", "4chan"]
print("\nPlatforms available (not conclusive):", example_platforms)

Types available: {'optical-illusion', 'song', 'politician', 'webcomic', 'social-game', 'cartoon', 'news-publication', 'remix', 'advertisement', 'scientist', 'podcast', 'character', 'performance', 'axiom', 'animal', 'tv-personality', 'hashtag', 'video-game', 'manga', 'competition', 'emoticon', 'auction', 'reference', 'fauna', 'model', 'gamer', 'cliche', 'catchphrase', 'book', 'hoax', 'blog', 'controversy', 'movement', 'social-media-page', 'social-network', 'athlete', 'influencer', 'award-ceremony', 'photoshop', 'application', 'lip-dub', 'musician', 'flash-mob', 'image-macro', 'theater', 'film', 'art', 'vlogger', 'holiday', 'visual-effect', 'hacker', 'organization', 'prank', 'activist', 'actor', 'viral-debate', 'parody', 'comedian', 'businessperson', 'dance', 'historical-figure', 'shock-media', 'raid', 'disaster', 'law', 'promotion', 'tv-show', 'filmmaker', 'viral-video', 'pop-culture-reference', 'crime', 'fetish', 'slang', 'tabletop-games', 'campaign', 'web-series', 'sport', 'sound-effe

In [133]:
# Select parameters for extraction

DATE_FROM = input("Select from date (e.g. 2019-05-30). You can leave this empty. ")
DATE_TO = input("Select to date (e.g. 2021-12-25). You can leave this empty. ")

# Select if you want to show results with empty about section
keepEmptyAbout = False

# Meme has at least 1 of the selected types. Keep in mind that roughly half of the memes don't have a type specified.
print('\nWhen choosing types, use "." for OR. Use ";" for AND. For example: "snowclone;image-macro.cliche". Leave empty to include all types.')
TYPE = input("Select types: ")

SPREAD = []  # "Spread" section of meme contains at least 1 instance of at least 1 of selected keywords. Leave empty to not filter by spread.

print('\nWhen choosing tags, use same system as with type. Leave empty to include all tags.')
TAGS = input("Select tags: ")

Select from date (e.g. 2019-05-30). You can leave this empty.  
Select to date (e.g. 2021-12-25). You can leave this empty.  



When choosing types, use "." for OR. Use ";" for AND. For example: "snowclone;image-macro.cliche". Leave empty to include all types.


Select types:  



When choosing tags, use same system as with type. Leave empty to include all tags.


Select tags:  russia


In [137]:
# Extract data based on parameters

from datetime import datetime
from collections import Counter

# Select only memes
memes = raw[raw['category'] == "Meme"]              

# Drop duplicates (same title)
memes = memes.loc[memes.astype(str).drop_duplicates(subset=['title']).index]
events = events.loc[events.astype(str).drop_duplicates(subset=['title']).index]

# Merge memes and events (timestamps)
memes = pd.merge(memes, events, on="title")

# Lower-case all parameters just in case
TYPE = TYPE.lower()
SPREAD = [word.lower() for word in SPREAD]
TAGS = TAGS.lower()

# Add "type" as separate column of strings
type_col = []
for d in memes['details']:
    if 'type' in d.keys():
        t_string = ""
        for t in d['type']:
            t_string += t.split("https://knowyourmeme.com/types/")[1] + ", "
        type_col.append(t_string.strip(", "))
    else:
        type_col.append("")
memes['type'] = type_col

# Drop unnecessary columns
memes = memes.drop(columns=["meta", "category", "ld", "additional_references", "search_keywords", "parent", "siblings", "children"])

# Changing large integers to readable dates
memes['added'] = memes['added'].apply(lambda x: datetime.fromtimestamp(x) if not pd.isnull(x) else x)
memes['last_update_source'] = memes['last_update_source'].apply(lambda x: datetime.fromtimestamp(x) if not pd.isnull(x) else x)

dates = []
for date in memes['last_update_source']:
    dates.append(datetime.strptime(str(date)[:10], "%Y-%m-%d") if not pd.isnull(date) else 0)
memes['last_update_source'] = dates


# Selecting memes that were last updated in DATE_FROM - DATE_TO
if len(DATE_FROM) > 1 and len(DATE_TO) > 1:
    memes = memes[(memes['last_update_source'] >= DATE_FROM) & (memes['last_update_source'] <= DATE_TO)]
elif len(DATE_FROM) > 1:
    memes = memes[memes['last_update_source'] >= DATE_FROM]
elif len(DATE_TO) > 1:
    memes = memes[memes['last_update_source'] <= DATE_TO]
                                                     
# Removing entries with empty "About" section
if not keepEmptyAbout:
    descriptions = []
    missing_desc = []
    for title, d in zip(memes['title'], memes['content']):
        if 'about' in d.keys() and 'text' in d['about'].keys():
            descriptions.append(d['about']['text'][0])
        else:
            missing_desc.append(title)
    memes = memes[~memes['title'].isin(missing_desc)]
    memes['about'] = descriptions  # Add "About" section as a separate column

# Select memes by TYPE    
if len(TYPE) > 1:
    typestring = TYPE.replace('.', '") | type.str.contains("').replace(';', '") & type.str.contains("')
    typequery = 'type.str.contains("' + typestring + '")'
    memes = memes.query(typequery, engine='python')

# Convert tag column to strings
tag_col = []
for taglist in memes['tags']:
    tag_col.append(", ".join(taglist))
memes['tags'] = tag_col  

# Select memes by TAGS    
if len(TAGS) > 1:
    tagstring = TAGS.replace('.', '") | tags.str.contains("').replace(';', '") & tags.str.contains("')
    tagquery = 'tags.str.contains("' + tagstring + '")'
    memes = memes.query(tagquery, engine='python')
    
# Select memes by SPREAD (content:spread contains at least 1 specified keyword)
if len(SPREAD) > 0:
    spread_match, freq_dics = matchSpread(memes)
    memes = memes[spread_match]

# Results
memes = memes.drop(columns=["content"])  # Drop more columns for visual quality
print("Found {} memes matching these criteria.".format(len(memes)))
memes.head()

Found 59 memes matching these criteria.


Unnamed: 0,title,url,last_update_source,template_image_url,added,details,tags,origin,spread,type,about
6,In Soviet Russia...,https://knowyourmeme.com/memes/in-soviet-russia,2021-04-17,https://i.kym-cdn.com/entries/icons/original/000/000/093/insovietrussia.jpg,NaT,"{'status': 'confirmed', 'origin': 'Leave It To Me (Musical)', 'year': '1938', 'type': ['https://knowyourmeme.com/types/snowclone']}","russia, russian reversal, image macro, soviet, yakov smirnoff, bob hope, jimmy stewart","[[1938-01-01T00:00:00, 1938, The earliest recorded version dates back to the 1938 Cole Porter]]","[[1958-01-01T00:00:00, 1958, comedian Bob Hope told a variation of the joke], [1958-04-07T00:00:00, April 7th, 1958, comedian Bob Hope told a variation of the joke], [2000-06-27T00:00:00, June 27th, 2000, the episode], [2002-01-01T00:00:00, 2002, The Simpsons]]",snowclone,"In Soviet Russia, also known as the Russian Reversal, is a joke format and phrasal template popularized by stand-up comedian Yakov Smirnoff. The joke was extremely popular in the late 1980s but fell out of vogue in the 90s. Online, the joke format has been given new life among various online communities particularly in ironic memes. The snowclone follows a variation on this template:"
422,PhotoExtreme/Фотоэкстрим,https://knowyourmeme.com/memes/photoextreme%D1%84%D0%BE%D1%82%D0%BE%D1%8D%D0%BA%D1%81%D1%82%D1%80%D0%B8%D0%BC,2018-07-18,https://i.kym-cdn.com/entries/icons/original/000/004/195/4.jpg,2010-10-20 09:18:51,"{'status': 'submission', 'origin': 'Encounter http://world.en.cx/', 'year': None}","russia, russian, staged, photo, encounter, participatory, game",[],[],,"PhotoExtreme is one of the games from the Russian website Encounter Urban Games, were very specific rules are described and photographs are taken that meet those requirements. The most famous being ""A Cheating Wife"":"
1539,Wadded Jacket (ватник),https://knowyourmeme.com/memes/wadded-jacket-%D0%B2%D0%B0%D1%82%D0%BD%D0%B8%D0%BA,2021-04-13,https://i.kym-cdn.com/entries/icons/original/000/017/933/VATnik.jpg,2015-05-01 22:30:45,"{'status': 'submission', 'origin': 'vk.com', 'year': '2011', 'type': ['https://knowyourmeme.com/types/character']}","russia, root vatnik, ukraine, politics, soviet, jacket, vatnik, anton chadsky","[[2011-09-09T00:00:00, September 9, 2011, Wadded Jacket was created]]","[[2012-01-01T00:00:00, 2012, Wadded Jacket spread significantly], [2014-01-01T00:00:00, 2014, A Vk.com group created for War Jacket boasts over 500,000 followers], [2014-01-01T00:00:00, 2014, The character increased], [2015-05-01T00:00:00, May 2015, A Vk.com group created for War Jacket boasts over 500,000 followers]]",character,Wadded Jacket (ватник) is a Russian character meme often used to mock patriotic Russians on the Internet which uses a grotesque character based on a wadded jacket to satire those considered to be ignorant Russian patriots.
1950,Ra Ra Rasputin,https://knowyourmeme.com/memes/ra-ra-rasputin,2021-04-18,https://i.kym-cdn.com/entries/icons/original/000/023/251/maxresdefault.jpg,2017-06-20 21:43:39,"{'status': 'submission', 'origin': 'Boney M', 'year': '1978', 'type': ['https://knowyourmeme.com/types/song']}","russia, funk, rasputin, boney m, pace audio","[[1978-01-01T00:00:00, 1978, "" Rasputin "" was released]]","[[2011-10-24T00:00:00, October 24th, 2011, nastyhobbit uploaded a video of several people trying the game 's dance that gained over 2.3 million views ( shown below], [2016-01-01T00:00:00, 2016, The track began], [2016-03-14T00:00:00, March 14th, 2016, was], [2016-04-05T00:00:00, April 5th, 2016, The most popular animated video of the song was released], [2016-11-07T00:00:00, November 7th, 2016, This style was popularized], [2021-05-19T00:00:00, May 19th, OfficialBillyBones uploaded a version with General Grievous that gained 1.8 million views ( shown below , right )], [2021-10-09T00:00:00, October 9th, they released the same animation]]",song,"Ra Ra Rasputin refers to the song ""Rasputin"" by European pop disco group Boney M. that sensationalizes the life of Grigori Rasputin, friend and advisor of Tsar Nicholas II in early 20th-century Russia. It grew popular online in 2016 after several animated videos using the song went viral."
1987,Jared Kushner Russian Collusion Speech,https://knowyourmeme.com/memes/jared-kushner-russian-collusion-speech,2020-05-04,https://i.kym-cdn.com/entries/icons/original/000/023/498/jared.jpg,2017-07-25 17:24:14,"{'status': 'submission', 'origin': 'Twitter', 'year': '2017', 'type': ['https://knowyourmeme.com/types/image-macro']}","jared kushner, politics, donald trump, russia, russiagate, news, ivanka trump, media, washington dc, white house","[[2016-01-01T00:00:00, 2016, President Donald Trump son - in - law and special counsel Jared Kushner held a press conference], [2017-07-24T00:00:00, July 24th, 2017, President Donald Trump son - in - law and special counsel Jared Kushner held a press conference]]",[],image-macro,Jared Kushner Russia Collusion Speech refers to a series of image macros and online jokes about Jared Kushner's press conference regarding his involvement in the Russiagate scandal.


In [13]:
# Collecting most important verbs in events (timestamps)
verbs = ["posted", "linked", "coined", "submitted", "made", "taken", "is", "was", "recorded", "uploaded", "began", "released", "created", "appeared", "used", "begun", "began", "launched", "featured"]
count = {key:0 for key in verbs}
for stamplist in memes["origin"]:
    for stamp in stamplist:
        for verb in verbs:
            if verb in stamp[2]:
                count[verb] += 1
                
print("Verb frequency:", dict(sorted(count.items(), key=lambda item: item[1], reverse=True)))

Verb frequency: {'was': 3150, 'is': 2442, 'posted': 1151, 'uploaded': 1120, 'released': 618, 'began': 460, 'submitted': 451, 'created': 372, 'appeared': 233, 'launched': 219, 'used': 208, 'made': 155, 'featured': 144, 'taken': 113, 'coined': 63, 'recorded': 44, 'linked': 8, 'begun': 6}


In [113]:
# Extract data for Toloka

# toloka = memes[['template_image_url', 'about']]
# toloka.head()
# toloka.to_csv("out/toloka.tsv", header = False, index = False, sep="\t")