In [3]:
# Import data and modules

import pandas as pd
import numpy as np

# Remove warning
pd.options.mode.chained_assignment = None

# To show all data in dataframe
pd.set_option('display.max_colwidth', None)

raw = pd.read_json("data/kym.json")        # meme templates from Know Your Meme, 16 features
events = pd.read_json("data/events.json")  # memes with timestamps from origin and spread

In [4]:
# Functions

# Finds SPREAD keywords in memes' spread data
# Input: kym dataframe. Output: tuple (list of True/False, list of frequency dictionaries of SPREAD keywords).
def matchSpread(memes):
    match = []
    freq_dics = []
    for d in memes['content']:
        if 'spread' in d.keys() and 'text' in d['spread'].keys():
            dic = {keyword: 0 for keyword in SPREAD}
            for text in d['spread']['text']:
                words = text.split(" ")
                for word in words:
                    if word.lower() in SPREAD:
                        dic[word.lower()] += 1
            if sum (dic.values()) == 0:
                match.append(False)
            else:
                match.append(True)
            freq_dics.append(dic)
        else:
            match.append(False)
    return (match, freq_dics)

In [6]:
# See available options for parameters

# Types
types = set()
typedata = raw[raw['details'].map(lambda x: 'type' in x.keys())]['details']
for d in typedata:
    for t in d['type']:
        types.add(t.split("https://knowyourmeme.com/types/")[1])
print("Types available:", types)

# Spread (keyword search)
example_platforms = ["Facebook", "Twitter", "Instagram", "Snapchat", "YouTube", "WhatsApp", "TikTok", "Reddit", "Pinterest", "Tumblr", "LinkedIn", "9GAG", "4chan"]
print("\nPlatforms available (not conclusive):", example_platforms)

Types available: {'song', 'writer', 'technology', 'fauna', 'filmmaker', 'cartoon', 'auction', 'election', 'comedian', 'participatory-media', 'convention', 'shock-media', 'tabletop-games', 'lip-dub', 'character', 'flash-mob', 'athlete', 'country', 'campaign', 'theater', 'artist', 'sound-effect', 'controversy', 'media-host', 'leak', 'viral-debate', 'actor', 'tv-show', 'law', 'prank', 'raid', 'exploitable', 'forum', 'company', 'activist', 'optical-illusion', 'video-game', 'reference', 'news-publication', 'performance', 'webcomic', 'advertisement', 'hashtag', 'conspiracy-theory', 'slang', 'emoticon', 'competition', 'protest', 'visual-effect', 'creepypasta', 'blog', 'axiom', 'comic-book', 'book', 'creator', 'product', 'programmer', 'religion', 'businessperson', 'movement', 'historical-figure', 'vlogger', 'hack', 'fetish', 'disaster', 'trial', 'parody', 'pop-culture-reference', 'food', 'award-ceremony', 'catchphrase', 'manga', 'album', 'organization', 'social-media-page', 'viral-video', 'mus

In [7]:
# Select parameters for extraction

DATE_FROM = input("Select from date (e.g. 2019-05-30). You can leave this empty. ")
DATE_TO = input("Select to date (e.g. 2021-12-25). You can leave this empty. ")

# Select if you want to show results with empty about section
keepEmptyAbout = False

# Meme has at least 1 of the selected types. Keep in mind that roughly half of the memes don't have a type specified.
print('\nWhen choosing types, use "." for OR. Use ";" for AND. For example: "snowclone;image-macro.cliche". Leave empty to include all types.')
TYPE = input("Select types: ")

SPREAD = []  # "Spread" section of meme contains at least 1 instance of at least 1 of selected keywords. Leave empty to not filter by spread.

print('\nWhen choosing tags, use same system as with type. Leave empty to include all tags.')
TAGS = input("Select tags: ")

Select from date (e.g. 2019-05-30). You can leave this empty.  
Select to date (e.g. 2021-12-25). You can leave this empty.  



When choosing types, use "." for OR. Use ";" for AND. For example: "snowclone;image-macro.cliche". Leave empty to include all types.


Select types:  



When choosing tags, use same system as with type. Leave empty to include all tags.


Select tags:  


In [None]:
# Extract data based on parameters

from datetime import datetime
from collections import Counter

def extract_data(raw, events, DATE_FROM="", DATE_TO="", TYPE="", SPREAD=[], TAGS="", keepEmptyAbout=True):
    # Select only memes
    memes = raw[raw['category'] == "Meme"] 

    # Drop duplicates (same title)
    memes = memes.loc[memes.astype(str).drop_duplicates(subset=['title']).index]
    events = events.loc[events.astype(str).drop_duplicates(subset=['title']).index]

    # Merge memes and events (timestamps)
    memes = pd.merge(memes, events, on="title")

    # Lower-case all parameters just in case
    TYPE = TYPE.lower()
    SPREAD = [word.lower() for word in SPREAD]
    TAGS = TAGS.lower()

    # Add "type" as separate column of strings
    type_col = []
    for d in memes['details']:
        if 'type' in d.keys():
            t_string = ""
            for t in d['type']:
                t_string += t.split("https://knowyourmeme.com/types/")[1] + ", "
            type_col.append(t_string.strip(", "))
        else:
            type_col.append("")
    memes['type'] = type_col

    # Drop unnecessary columns
    memes = memes.drop(columns=["meta", "category", "ld", "additional_references", "search_keywords", "parent", "siblings", "children"])

    # Changing large integers to readable dates
    memes['added'] = memes['added'].apply(lambda x: datetime.fromtimestamp(x) if not pd.isnull(x) else x)
    memes['last_update_source'] = memes['last_update_source'].apply(lambda x: datetime.fromtimestamp(x) if not pd.isnull(x) else x)

    dates = []
    for date in memes['last_update_source']:
        dates.append(datetime.strptime(str(date)[:10], "%Y-%m-%d") if not pd.isnull(date) else 0)
    memes['last_update_source'] = dates


    # Selecting memes that were last updated in DATE_FROM - DATE_TO
    if len(DATE_FROM) > 1 and len(DATE_TO) > 1:
        memes = memes[(memes['last_update_source'] >= DATE_FROM) & (memes['last_update_source'] <= DATE_TO)]
    elif len(DATE_FROM) > 1:
        memes = memes[memes['last_update_source'] >= DATE_FROM]
    elif len(DATE_TO) > 1:
        memes = memes[memes['last_update_source'] <= DATE_TO]

    # Removing entries with empty "About" section
    if not keepEmptyAbout:
        descriptions = []
        missing_desc = []
        for title, d in zip(memes['title'], memes['content']):
            if 'about' in d.keys() and 'text' in d['about'].keys():
                descriptions.append(d['about']['text'][0])
            else:
                missing_desc.append(title)
        memes = memes[~memes['title'].isin(missing_desc)]
        memes['about'] = descriptions  # Add "About" section as a separate column
        print("Removed {} entries with no About section.".format(len(missing_desc)))

    # Select memes by TYPE    
    if len(TYPE) > 1:
        typestring = TYPE.replace('.', '") | type.str.contains("').replace(';', '") & type.str.contains("')
        typequery = 'type.str.contains("' + typestring + '")'
        memes = memes.query(typequery, engine='python')

    # Select memes by TAGS    
    if len(TAGS) > 1:
        # Convert tag column to strings
        tag_col = []
        for taglist in memes['tags']:
            tag_col.append(", ".join(taglist))
        memes['tags'] = tag_col  

        tagstring = TAGS.replace('.', '") | tags.str.contains("').replace(';', '") & tags.str.contains("')
        tagquery = 'tags.str.contains("' + tagstring + '")'
        memes = memes.query(tagquery, engine='python')

    # Select memes by SPREAD (content:spread contains at least 1 specified keyword)
    if len(SPREAD) > 0:
        spread_match, freq_dics = matchSpread(memes)
        memes = memes[spread_match]

    print("Found {} memes matching these criteria.".format(len(memes)))
    return memes

In [15]:
memes = extract_data(raw, events, DATE_FROM="", DATE_TO="", TYPE="", SPREAD=[], TAGS="", keepEmptyAbout=True)

# Create dictionary of STATUS values
# TODO add this as separate column? @Riccardo

status_dict = {}
c = 0
for d in memes['details']:
    if 'status' in d.keys():
        status = d['status']
        if status in status_dict.keys():
            status_dict[status] += 1
        else:
            status_dict[status] = 1
            
print(status_dict)

Found 12654 memes matching these criteria.
{'confirmed': 3266, 'deadpool': 4478, 'submission': 4908, 'unlisted': 2}


In [125]:
# Export data: reaction, snowclone, exploitable
# KYM URL, IMAGE URL, ABOUT Text, Origin Text, [Taglist], Year, Last Updates, [Other Types]
memes = memes.drop(columns=["origin"])

# Add origin as separate column
origin_col = []
missing_origin = []
for title, d in zip(memes['title'], memes['content']):
        if 'origin' in d.keys() and 'text' in d['origin'].keys():
            origin_col.append(" ".join(d['origin']['text']))
        else:
            missing_origin.append(title)
memes = memes[~memes['title'].isin(missing_origin)]  # remove memes that have no "origin" specified
memes['origin'] = origin_col

# Add year as separate column
year_col = []
missing_year = []
for title, d in zip(memes['title'], memes['details']):
    if 'year' in d.keys() and d['year'] != None:
        year_col.append(d['year'])
    else:
        missing_year.append(title)
memes = memes[~memes['title'].isin(missing_year)]  # remove memes that have no "year" specified
memes['year'] = year_col

# Convert "type" back to a list
type_col = []
for typelist in memes['type']:
    t = typelist.split(", ")
    if TYPE in t:
        t.remove(TYPE)
    type_col.append(t)
memes['other_types'] = type_col

# Remove rows with empty "tag" list
missing_tags = []
for title, taglist in zip(memes['title'], memes['tags']):
    if len(taglist) == 0:
        missing_tags.append(title)
memes = memes[~memes['title'].isin(missing_tags)]

# Drop unnecessary columns
memes = memes.drop(columns=["title", "details", "spread", "content", "added"])

# Reorder columns
columns = ["url", "template_image_url", "about", "origin", "tags", "year", "last_update_source", "other_types"]
memes = memes[columns]

# TODO: export as csv (separately for three types: filter those out in the top)
memes.to_csv("out/{}.csv".format(TYPE), header = True, index = False)

In [107]:
# Collecting most important verbs in events (timestamps)
verbs = ["posted", "linked", "coined", "submitted", "made", "taken", "is", "was", "recorded", "uploaded", "began", "released", "created", "appeared", "used", "begun", "began", "launched", "featured"]
count = {key:0 for key in verbs}
for stamplist in memes["origin"]:
    for stamp in stamplist:
        for verb in verbs:
            if verb in stamp[2]:
                count[verb] += 1
                
print("Verb frequency:", dict(sorted(count.items(), key=lambda item: item[1], reverse=True)))

IndexError: string index out of range

In [113]:
# Extract data for Toloka

# toloka = memes[['template_image_url', 'about']]
# toloka.head()
# toloka.to_csv("out/toloka.tsv", header = False, index = False, sep="\t")