In [1]:
# !pip install opencv-python-headless
# !pip install textract

In [2]:
import pandas as pd
import os
import cv2
import textract
pd.set_option('display.max_colwidth', None)
pd.set_option('display.precision', 2)

In [4]:
def text_image_percent(df):
    text = df['body'].count()
    text_nan = df['body'].isna().sum()
    img = df['image_url'].count()
    img_nan = df['image_url'].isna().sum()
    text_ratio = (text/(text+text_nan))*100
    img_ratio = (img/(img+img_nan))*100
    return text_ratio, img_ratio

In [5]:
dataset_names = ["dataset_memes.csv", "dataset_CoronavirusMemes.csv", "dataset_dank_meme.csv", \
                "dataset_dankmeme.csv", "dataset_funny.csv", "dataset_HistoryMemes.csv", \
                "dataset_me_irl.csv", "dataset_MemeEconomy.csv", "dataset_TheLeftCantMeme.csv", \
                "dataset_TheRightCantMeme.csv"]

In [6]:
dataset_memes = pd.read_csv('dataset_memes.csv', index_col=0)
dataset_CoronavirusMemes = pd.read_csv('dataset_CoronavirusMemes.csv', index_col=0)
dataset_dank_meme = pd.read_csv('dataset_dank_meme.csv', index_col=0)
dataset_dankmeme = pd.read_csv('dataset_dankmeme.csv', index_col=0)
dataset_funny = pd.read_csv('dataset_funny.csv', index_col=0)
dataset_HistoryMemes = pd.read_csv('dataset_HistoryMemes.csv', index_col=0)
dataset_me_irl = pd.read_csv('dataset_me_irl.csv', index_col=0)
dataset_MemeEconomy = pd.read_csv('dataset_MemeEconomy.csv', index_col=0)
dataset_TheLeftCantMeme = pd.read_csv('dataset_TheLeftCantMeme.csv', index_col=0)
# dataset_TheRightCantMeme = pd.read_csv('dataset_TheRightCantMeme.csv', index_col=0)
all_datasets = [dataset_memes, dataset_CoronavirusMemes, dataset_dank_meme, \
               dataset_dankmeme, dataset_funny, dataset_HistoryMemes, \
               dataset_me_irl, dataset_MemeEconomy, dataset_TheLeftCantMeme]

## Distribution of text and image data

In [7]:
df = pd.DataFrame(columns=["name", "text ratio", "image ratio"])
for i, dt in enumerate(all_datasets):
    txt, img = text_image_percent(dt)
    name = dataset_names[i].split('.')[0].split('_')[1:]
    name = '_'.join(name)
    df.loc[len(df)] = [name, txt, img]
df

Unnamed: 0,name,text ratio,image ratio
0,memes,0.85,100.0
1,CoronavirusMemes,0.32,96.0
2,dank_meme,0.91,99.09
3,dankmeme,3.89,98.94
4,funny,4.45,97.07
5,HistoryMemes,2.49,99.79
6,me_irl,1.93,99.19
7,MemeEconomy,6.03,79.76
8,TheLeftCantMeme,10.45,88.44


## Distribution of scores

In [8]:
df = pd.DataFrame(columns=["name", "mean", "min", "25%", "50%", "75%", "max"])
for i, dt in enumerate(all_datasets):
    describe = dt['score'].describe()
    mean = describe['mean']
    min1 = describe['min']
    q25 = describe['25%']
    q50 = describe['50%']
    q75 = describe['75%']
    max1 = describe['max']
    name = dataset_names[i].split('.')[0].split('_')[1:]
    name = '_'.join(name)
    df.loc[len(df)] = [name, mean, min1, q25, q50, q75, max1]
df

Unnamed: 0,name,mean,min,25%,50%,75%,max
0,memes,1111.89,0.0,18.0,52.0,220.25,46405.0
1,CoronavirusMemes,138.15,0.0,9.0,39.0,214.0,1628.0
2,dank_meme,244.69,0.0,5.0,23.0,99.0,6107.0
3,dankmeme,139.85,0.0,30.0,92.0,184.0,1770.0
4,funny,2545.7,0.0,19.0,91.0,557.25,100260.0
5,HistoryMemes,1247.22,0.0,46.5,125.0,676.5,23840.0
6,me_irl,1446.26,0.0,32.0,76.5,275.75,70605.0
7,MemeEconomy,316.1,0.0,5.0,23.0,179.0,5616.0
8,TheLeftCantMeme,298.2,0.0,114.0,231.0,428.5,1509.0


## Author with the most submissions

In [9]:
df = pd.DataFrame(columns=["name", "author", "submissions"])
for i, dt in enumerate(all_datasets):
    author = list(dt['author'].value_counts().index)[0]
    value = list(dt['author'].value_counts().values)[0]
    name = dataset_names[i].split('.')[0].split('_')[1:]
    name = '_'.join(name)
    df.loc[len(df)] = [name, author, value]
df

Unnamed: 0,name,author,submissions
0,memes,superbloggity,9
1,CoronavirusMemes,schaefjl,29
2,dank_meme,Longjumping_Honey723,179
3,dankmeme,ryan_godzez,67
4,funny,Doctorphotograph,6
5,HistoryMemes,usefulrustychain,16
6,me_irl,33-9,21
7,MemeEconomy,My_Bird_Buddy,33
8,TheLeftCantMeme,DjDeadpig6934,55


## Top contributor

In [10]:
df = pd.DataFrame(columns=["name", "author", "bestAvgScore"])
for i, dt in enumerate(all_datasets):
    dt = dt.dropna(subset=["author"])
    authors = list(set(dt['author']))
    max_author = ""
    max_val = -1
    for auth in authors:
        avg_score = dt[dt['author']==auth]['score'].mean()
        if avg_score > max_val:
            max_val = avg_score
            max_author = auth
    name = dataset_names[i].split('.')[0].split('_')[1:]
    name = '_'.join(name)
    df.loc[len(df)] = [name, max_author, max_val]
df

Unnamed: 0,name,author,bestAvgScore
0,memes,Ehrenlauch3000,39581.0
1,CoronavirusMemes,Tattoosnscars,1628.0
2,dank_meme,RodriguezCarol780,3351.0
3,dankmeme,emilienordbach,865.5
4,funny,CeleryintheButt,76331.0
5,HistoryMemes,Hextor26,23840.0
6,me_irl,boddah666,44032.0
7,MemeEconomy,Kaidiwoomp,5616.0
8,TheLeftCantMeme,sushiman402,1013.0


## Authors contributing to #subreddits

In [11]:
df = pd.DataFrame(columns=["author", "#subreddit"])
set_authors = set()
for i, dt in enumerate(all_datasets):
    dt = dt.dropna(subset=["author"])
    author = list(dt['author'])
    for auth in author:
        set_authors.add(auth)
list_authors = list(set_authors)
cnt_authors = {}
for i, dt in enumerate(all_datasets):
    present = list(dt['author'])
    for auth in list_authors:
        if auth in present:
            if auth not in cnt_authors:
                cnt_authors[auth] = 0
            cnt_authors[auth] += 1
cnt_authors = sorted(cnt_authors.items(), key=lambda x: x[1], reverse=True)
for val in cnt_authors[:10]:
    df.loc[len(df)] = [val[0], val[1]]
df

Unnamed: 0,author,#subreddit
0,CutZealousideal5274,4
1,Twelve-Majestic-Lies,3
2,Darksabre_ALERTEAM,3
3,Life-Membership-1411,3
4,RealMundiRiki,3
5,prlugo4162,2
6,Large-Wheel-4181,2
7,Budget-Advance8904,2
8,peakpointhelmet,2
9,ActivatedFamiliar,2


## Post with highest #comments

In [12]:
df = pd.DataFrame(columns=["name", "postUrl", "author", "score", "highestNumComments"])
for i, dt in enumerate(all_datasets):
    max_comment = dt[dt['num_comments']==dt['num_comments'].max()]
    post_url = max_comment['post_url'].values[0]
    author = max_comment['author'].values[0]
    comment = max_comment['num_comments'].values[0]
    score = max_comment['score'].values[0]
    name = dataset_names[i].split('.')[0].split('_')[1:]
    name = '_'.join(name)
    df.loc[len(df)] = [name, post_url, author, score, comment]
df

Unnamed: 0,name,postUrl,author,score,highestNumComments
0,memes,https://www.reddit.com/r/memes/comments/13nrue2/it_is_not_that_hard/,formulas792,16263,2508
1,CoronavirusMemes,https://www.reddit.com/r/CoronavirusMemes/comments/oq0nn5/anti_vaxxers_are_the_stupidest_people_on_earth/,Christ_Puncher_,658,118
2,dank_meme,https://www.reddit.com/r/dank_meme/comments/13nzsop/google_is_wrong/,lutherkane2,4329,300
3,dankmeme,https://www.reddit.com/r/dankmeme/comments/10s2ruj/choice/,Super_Pea_3592,240,35
4,funny,https://www.reddit.com/r/funny/comments/13hpjhs/this_burger_king_found_his_burger_queen/,robinnuber,70868,2160
5,HistoryMemes,https://www.reddit.com/r/HistoryMemes/comments/13k9wad/some_names_were_more_creative_than_others/,Hextor26,23840,877
6,me_irl,https://www.reddit.com/r/me_irl/comments/13f6v6q/me_irl/,RogalikYT,42641,2655
7,MemeEconomy,https://www.reddit.com/r/MemeEconomy/comments/1346x59/easy_profits/,accrueddukas,2467,125
8,TheLeftCantMeme,https://www.reddit.com/r/TheLeftCantMeme/comments/127ixy0/why_the_fuck_is_a_guy_getting_deepthroated_in_a/,ScribhneoirIldanach,1118,441


## Distribution of #comments

In [13]:
df = pd.DataFrame(columns=["name", "mean", "min", "25%", "50%", "75%", "max"])
for i, dt in enumerate(all_datasets):
    describe = dt['num_comments'].describe()
    mean = describe['mean']
    min1 = describe['min']
    q25 = describe['25%']
    q50 = describe['50%']
    q75 = describe['75%']
    max1 = describe['max']
    name = dataset_names[i].split('.')[0].split('_')[1:]
    name = '_'.join(name)
    df.loc[len(df)] = [name, mean, min1, q25, q50, q75, max1]
df

Unnamed: 0,name,mean,min,25%,50%,75%,max
0,memes,27.69,0.0,1.0,4.0,11.0,2508.0
1,CoronavirusMemes,7.49,0.0,0.0,2.0,7.0,118.0
2,dank_meme,4.85,0.0,0.0,1.0,2.0,300.0
3,dankmeme,2.55,0.0,0.0,1.0,3.0,35.0
4,funny,74.97,0.0,6.0,13.0,42.0,2160.0
5,HistoryMemes,29.45,0.0,2.0,5.0,22.0,877.0
6,me_irl,20.12,0.0,1.0,2.0,6.0,2655.0
7,MemeEconomy,5.56,0.0,0.0,2.0,5.0,125.0
8,TheLeftCantMeme,56.27,1.0,17.0,37.0,74.0,441.0
