In [33]:
# Import data and modules

import pandas as pd
import numpy as np

# Silence pandas' chained-assignment (SettingWithCopy) warning
pd.set_option('mode.chained_assignment', None)

# Never truncate long cell contents when a dataframe is displayed
pd.options.display.max_colwidth = None

# Meme templates from Know Your Meme, 16 features
raw = pd.read_json("data/kym.json")
# Memes with timestamps from origin and spread
events = pd.read_json("data/events.json")

In [59]:
def matchSpread(memes, keywords=None):
    """
    Finds SPREAD keywords in memes' spread data.

    :param memes: kym dataframe (anything whose ``memes['content']`` iterates over
                  the per-meme content dictionaries)
    :param keywords: keywords to look for (matched case-insensitively against
                     space-separated words). When omitted, the module-level ``SPREAD``
                     list is used, as in the original one-argument call. (optional)
    :return: tuple ``(match, freq_dics)``. ``match[i]`` is True when the Spread section
             of meme ``i`` contains at least one keyword; ``freq_dics[i]`` maps every
             keyword to its number of occurrences in that section. Both lists have one
             entry per meme, so they stay aligned with the rows of ``memes``.
    """
    if keywords is None:
        # Legacy call style: rely on a SPREAD list defined at notebook level.
        keywords = SPREAD
    keywords = [keyword.lower() for keyword in keywords]

    match = []
    freq_dics = []
    for d in memes['content']:
        dic = {keyword: 0 for keyword in keywords}
        if 'spread' in d and 'text' in d['spread']:
            for text in d['spread']['text']:
                for word in text.split(" "):
                    word = word.lower()
                    if word in dic:
                        dic[word] += 1
        # Memes without a Spread section keep an all-zero dictionary so that
        # freq_dics[i] always belongs to the same meme as match[i].
        match.append(sum(dic.values()) > 0)
        freq_dics.append(dic)
    return (match, freq_dics)

In [80]:
# Extract data based on parameters

from datetime import datetime
from collections import Counter

def _kym_types(details):
    """Return the type slugs listed in a meme's ``details`` dict (empty list if it has no 'type')."""
    if 'type' not in details:
        return []
    return [t.split("https://knowyourmeme.com/types/")[1] for t in details['type']]


def _epoch_to_day(timestamp):
    """Convert epoch seconds to a midnight-truncated local-time Timestamp; missing values become NaT."""
    if pd.isnull(timestamp):
        return pd.NaT
    return pd.Timestamp(datetime.fromtimestamp(timestamp)).normalize()


def _about_text(content):
    """Return the first paragraph of a meme's About section, or None if there is none."""
    if 'about' in content and 'text' in content['about'] and len(content['about']['text']) > 0:
        return content['about']['text'][0]
    return None


def _label_mask(labels, expression):
    """Boolean mask of rows whose label list satisfies a "." (OR) / ";" (AND) expression."""
    joined = pd.Series([", ".join(values) for values in labels], index=labels.index, dtype=object)
    mask = pd.Series(False, index=labels.index, dtype=bool)
    # ";" binds tighter than ".": "a;b.c" means (a AND b) OR c.
    for alternative in expression.split('.'):
        all_terms = pd.Series(True, index=labels.index, dtype=bool)
        for term in alternative.split(';'):
            all_terms = all_terms & joined.str.contains(term).astype(bool)
        mask = mask | all_terms
    return mask


def _mentions_keyword(content, keywords):
    """True if any space-separated word of the meme's Spread text is one of ``keywords`` (lower-case)."""
    if 'spread' not in content or 'text' not in content['spread']:
        return False
    return any(
        word.lower() in keywords
        for text in content['spread']['text']
        for word in text.split(" ")
    )


def extract_data(raw, events, DATE_FROM="", DATE_TO="", TYPE="", SPREAD=[], TAGS="", keep_empty_about=True):
    """
    Returns selection of memes from KYM data merged with Events data. Removes duplicates, drops less relevant columns, parses dates.

    :param raw: Know Your Meme dataset
    :param events: Events dataset (timestamps)
    :param DATE_FROM: starting date (YYYY-MM-DD) (optional)
    :param DATE_TO: end date (YYYY-MM-DD) (optional)
    :param TYPE: meme type(s). Use "." for OR. Use ";" for AND. Example: "snowclone;image-macro.cliche". (optional)
    :param SPREAD: meme spread by keyword search. "Spread" section of meme contains at least 1 instance of at least 1 of selected keywords. (optional)
    :param TAGS: meme tag(s). Use "." for OR. Use ";" for AND. Example: "reddit.video". (optional)
    :param keep_empty_about: include/exclude memes with empty About section (optional)
    :return: dataframe of the selected memes with the extra columns "type" (list of type
             slugs) and "about" (first About paragraph, "" when missing). "added" is
             converted to datetimes and "last_update_source" to midnight-truncated dates
             (NaT when unknown).
    """
    # Select only memes
    memes = raw[raw['category'] == "Meme"]

    # Drop duplicates (same title); astype(str) makes dict/list cells hashable
    memes = memes.loc[memes.astype(str).drop_duplicates(subset=['title']).index]
    events = events.loc[events.astype(str).drop_duplicates(subset=['title']).index]

    # Merge memes and events (timestamps)
    memes = pd.merge(memes, events, on="title")

    # Lower-case all parameters just in case (None is treated like "not given")
    TYPE = (TYPE or "").lower()
    SPREAD = [word.lower() for word in (SPREAD or [])]
    TAGS = (TAGS or "").lower()

    # Add "type" as separate column (list of type slugs per meme)
    memes['type'] = pd.Series([_kym_types(d) for d in memes['details']], index=memes.index, dtype=object)

    # Drop unnecessary columns
    memes = memes.drop(columns=["meta", "category", "ld", "search_keywords", "parent", "siblings", "children"])

    # Changing large integers to readable dates
    memes['added'] = memes['added'].apply(lambda x: datetime.fromtimestamp(x) if not pd.isnull(x) else x)
    # Unknown update dates stay NaT (not 0) so the column keeps a datetime dtype
    # and can be compared against DATE_FROM / DATE_TO without a TypeError.
    memes['last_update_source'] = pd.to_datetime(memes['last_update_source'].apply(_epoch_to_day))

    # Selecting memes that were last updated in DATE_FROM - DATE_TO
    # (memes with an unknown update date never satisfy a date bound)
    if len(DATE_FROM) > 1:
        memes = memes[memes['last_update_source'] >= pd.Timestamp(DATE_FROM)]
    if len(DATE_TO) > 1:
        memes = memes[memes['last_update_source'] <= pd.Timestamp(DATE_TO)]

    # Add "About" section as a separate column, optionally removing entries without one
    about_texts = [_about_text(content) for content in memes['content']]
    has_about = pd.Series([text is not None for text in about_texts], index=memes.index, dtype=bool)
    if not keep_empty_about:
        memes = memes[has_about]
        about_texts = [text for text in about_texts if text is not None]
        print("Removed {} entries with no About section.".format(int((~has_about).sum())))
    memes = memes.assign(about=["" if text is None else text for text in about_texts])

    # Select memes by TYPE
    if len(TYPE) > 0:
        memes = memes[_label_mask(memes['type'], TYPE)]

    # Select memes by TAGS
    if len(TAGS) > 0:
        memes = memes[_label_mask(memes['tags'], TAGS)]

    # Select memes by SPREAD (content:spread contains at least 1 specified keyword)
    if len(SPREAD) > 0:
        keywords = set(SPREAD)
        spread_match = pd.Series(
            [_mentions_keyword(content, keywords) for content in memes['content']],
            index=memes.index,
            dtype=bool,
        )
        memes = memes[spread_match]

    print("Found {} memes matching these criteria.".format(len(memes)))
    return memes

In [58]:
# See available options for parameters

# Types: collect every type slug that occurs in the KYM data
has_type = raw['details'].map(lambda details: 'type' in details.keys())
typedata = raw[has_type]['details']
types = {
    type_url.split("https://knowyourmeme.com/types/")[1]
    for details in typedata
    for type_url in details['type']
}
# print("Types available:", types)

# Spread (keyword search)
example_platforms = [
    "Facebook", "Twitter", "Instagram", "Snapchat", "YouTube", "WhatsApp", "TikTok",
    "Reddit", "Pinterest", "Tumblr", "LinkedIn", "9GAG", "4chan",
]
# print("\nPlatforms available (not conclusive):", example_platforms)

In [81]:
# Export data by type (reaction, snowclone, exploitable)
TYPE = "snowclone"
memes = extract_data(raw, events, TYPE=TYPE)
memes = memes.drop(columns=["origin"])  # this is NOT the meme Origin section, but from Events data

# Add origin as separate column.
# The Origin text sits either directly in content['origin']['text'] or is spread
# over content['origin']['subsections'][<name>]['text'].
origin_col = []
missing_origin = []
for title, content in zip(memes['title'], memes['content']):
    origin = content.get('origin', {})
    if 'text' in origin:
        origin_text = " ".join(origin['text'])
    else:
        parts = [
            " ".join(subsection['text'])
            for subsection in origin.get('subsections', {}).values()
            if 'text' in subsection
        ]
        origin_text = " ".join(parts).strip()
        if not parts:
            missing_origin.append(title)
    origin_col.append(origin_text)
#memes = memes[~memes['title'].isin(missing_origin)]  # remove memes that have no "origin" specified
memes['origin'] = origin_col

# Add year as separate column ("" when the year is unknown)
year_col = []
missing_year = []
for title, details in zip(memes['title'], memes['details']):
    year = details.get('year')
    if year is None:
        missing_year.append(title)
        year_col.append("")
    else:
        year_col.append(year)
#memes = memes[~memes['title'].isin(missing_year)]  # remove memes that have no "year" specified
memes['year'] = year_col

# Add new column other_types without selected TYPE.
# Build new lists instead of calling list.remove(), which would also alter the
# lists stored in the "type" column (both columns would share the same objects).
memes['other_types'] = [
    [meme_type for meme_type in type_list if meme_type != TYPE]
    for type_list in memes['type']
]

# Remove rows with empty "tag" list
# missing_tags = []
# for title, taglist in zip(memes['title'], memes['tags']):
#     if len(taglist) == 0:
#         missing_tags.append(title)
# memes = memes[~memes['title'].isin(missing_tags)]

# Check if "about" is missing
missing_about = [
    title
    for title, content in zip(memes['title'], memes['content'])
    if not ('about' in content and 'text' in content['about'])
]

# Remove memes that have NEITHER "about" nor "origin"
no_about_origin = set(missing_origin).intersection(set(missing_about))
# .copy() so the columns added below go to an independent frame, not a slice
memes = memes[~memes['title'].isin(no_about_origin)].copy()

# Add Imgflip references as separate column (some have multiple Imgflip references)
memes['imgflip'] = [
    [reference for reference in references.values() if "imgflip" in reference]
    for references in memes['additional_references']
]

# Merge "about" and "origin" into 1 column
# Add new column exist_about_origin for About/Origin: whether we have both, only origin, or only about
about_origin = []
existing_data = []
for about, origin in zip(memes['about'], memes['origin']):
    if about == "":
        about_origin.append(origin)
        existing_data.append("origin")
    elif origin == "":
        about_origin.append(about)
        existing_data.append("about")
    else:
        about_origin.append(about + " " + origin)
        existing_data.append("both")
memes['about_origin'] = about_origin
memes['exist_about_origin'] = existing_data

# Reorder and select columns
columns = ["url", "template_image_url", "about_origin", "tags", "year", "last_update_source", "other_types", "imgflip", "exist_about_origin"]
memes = memes[columns]

# Export all three types as csv
# memes.to_csv("out/{}.csv".format(TYPE), header = True, index = False)

Found 56 memes matching these criteria.


In [34]:
# Extract data for Toloka
# toloka = memes[['template_image_url', 'about_origin', 'url']]
# toloka.head()
# toloka.to_csv("out/toloka.tsv", header = False, index = False, sep="\t")

In [82]:
# Extract data for interactive datatable
# memes = extract_data(raw, events)
# memes = memes[['title', 'url', 'about']]
# memes.to_csv("out/small_noco.csv", header = True, index = False)