In [1]:
import pandas as pd

# Location of the memes CSV (columns seen below: template, mem_href, mem_description).
DATASET_URL = "https://github.com/mrsndmn/memes-dataset/raw/main/memes-dataset.csv"

df = pd.read_csv(DATASET_URL)
# Peek at five random rows.
df.sample(n=5)

Unnamed: 0,template,mem_href,mem_description
13706,Drake-Hotline-Bling,/i/5w3916,\nIMAGE DESCRIPTION:\nyou get infinite life; die
10970,Disaster-Girl,/i/5thvus,\nIMAGE DESCRIPTION:\nTHEY REMINDED THE TEACHE...
36160,Unsettled-Tom,/i/4pmdsl,\nIMAGE DESCRIPTION:\nme: does an unboxing vid...
17992,Expanding-Brain,/i/5s2tjx,\nIMAGE DESCRIPTION:\nUPVOTING MEMES FOR POINT...
31935,Third-World-Skeptical-Kid,/i/2h85g7,\nIMAGE DESCRIPTION:\nSO YOU'RE TELLING ME YOU...


In [2]:
from IPython.display import Image
from IPython.core.display import HTML
from bs4 import BeautifulSoup
import requests
import pandas as pd
from tqdm.auto import tqdm

def show_meme(meme_url_path, timeout=10):
    """
    Fetch a meme page from imgflip.com and return its picture for inline display.

    Parameters
    ----------
    meme_url_path : str
        Site-relative path of the meme page, as stored in the ``mem_href``
        column (for example ``"/i/61bzbr"``).
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up, so a stalled
        connection cannot hang the notebook.

    Returns
    -------
    IPython.display.Image
        Image object pointing at the meme picture URL.

    Raises
    ------
    requests.HTTPError
        If the page request returns an error status.
    ValueError
        If the page contains no ``<img id="im">`` element with a ``src``.
    """
    from urllib.parse import urljoin

    meme_url = "https://imgflip.com" + meme_url_path
    meme_url_resp = requests.get(meme_url, timeout=timeout)
    # Fail early on 4xx/5xx instead of parsing an error page.
    meme_url_resp.raise_for_status()

    meme_parser = BeautifulSoup(meme_url_resp.content, 'html.parser')
    image_element = meme_parser.find("img", {"id": "im"})
    image_src = image_element.get('src') if image_element is not None else None
    if not image_src:
        raise ValueError("Meme image not found on page: " + meme_url)

    # urljoin resolves protocol-relative ("//host/x.jpg"), site-relative and
    # absolute ``src`` values alike against the page URL.
    image_url = urljoin(meme_url, image_src)
    return Image(url=image_url)

# Example call: display the meme stored at this page path.
show_meme("/i/61bzbr")

In [3]:
# Number of rows that have no description.
df['mem_description'].isna().sum()

532

# Предобработка: заполняем пропуски и убираем служебный префикс из описаний

In [4]:
# Rows with missing values are kept; every NaN becomes an empty string.
# The frame is rebound rather than mutated in place.
df = df.fillna("")

In [5]:
desc_prefix = "\nIMAGE DESCRIPTION:\n"


def normalize_description(text, prefix=desc_prefix):
    """
    Return ``text`` without the leading ``prefix`` (when present), lowercased.

    Lowercasing is applied to every description, not only to the ones that
    start with the prefix, so the whole column is normalized consistently.
    """
    if text.startswith(prefix):
        text = text[len(prefix):]
    return text.lower()


# ``fillna("")`` keeps this cell usable on its own even if NaNs are still present.
df['mem_description'] = df['mem_description'].fillna("").apply(normalize_description)
df['mem_description'].head()

0                       if you have a twin your lucky 
1    me; dad; me and my dad when mom screams who at...
2        me; a kid wearing the same shirt i'm wearing 
3                                         brands; you 
4     among us be like; your the impostor; no you are 
Name: mem_description, dtype: object

In [6]:
from collections import Counter
import string
from sklearn.base import TransformerMixin, BaseEstimator

class CharacterCounterTransformer(TransformerMixin, BaseEstimator):
    """
    Transformer that derives features from a text column: for every ASCII
    lowercase letter it adds a column with the number of times that letter
    occurs in the lowercased text.

    Parameters
    ----------
    transform_column : str
        Name of the DataFrame column that holds the text.
    """

    # Names of the generated columns, 'a'..'z'. Kept as a class attribute
    # because code outside the class reads it to select these columns.
    ascii_lowercase_list = list(string.ascii_lowercase)

    def __init__(self, transform_column):
        self.transform_column = transform_column

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """
        Return a copy of ``X`` with one letter-count column per ASCII letter.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing the ``transform_column`` text column.

        Returns
        -------
        pandas.DataFrame
            Copy of ``X`` with columns 'a'..'z' added (or overwritten).

        Raises
        ------
        TypeError
            If ``X`` is not a DataFrame.
        """
        # Explicit check instead of ``assert``, which is stripped under ``python -O``.
        if not isinstance(X, pd.DataFrame):
            raise TypeError("X must be dataframe")

        letters = self.ascii_lowercase_list

        # Missing or non-string cells are treated as empty text rather than
        # failing on ``.lower()``.
        descriptions = X[self.transform_column].fillna("").astype(str).str.lower()

        letter_counts = []
        for description in descriptions:
            occurrences = Counter(description)
            # Counter returns 0 for letters that never occur.
            letter_counts.append([occurrences[letter] for letter in letters])

        # An explicit integer frame keeps the (n_rows, 26) shape even when X is empty.
        counts = pd.DataFrame(letter_counts, columns=letters, dtype="int64")

        X = X.copy()
        # Positional assignment: row order matches X, the index is not consulted.
        X[letters] = counts.to_numpy()

        return X

# Обучаем модель

In [7]:
from sklearn.linear_model import SGDClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import numpy as np

# One seed for both the global NumPy RNG and the classifier.
SEED = 42
np.random.seed(SEED)

steps = [
    # Adds the per-letter count columns computed from the description text.
    ("count_characters", CharacterCounterTransformer('mem_description')),
    # Standardizes the letter-count columns; only the listed columns are passed on.
    ("preprocessing", ColumnTransformer([
        ("scale_chars_counts", StandardScaler(), CharacterCounterTransformer.ascii_lowercase_list),
    ])),
    # Explicit random_state: the fit no longer depends on the state of the
    # global RNG, so re-running this cell out of order gives the same model.
    ("classifier", SGDClassifier(random_state=SEED)),
]

clf_pipeline = Pipeline(steps)
clf_pipeline.fit( df, df['template'] )

Pipeline(steps=[('count_characters',
                 CharacterCounterTransformer(transform_column='mem_description')),
                ('preprocessing',
                 ColumnTransformer(transformers=[['scale_chars_counts',
                                                  StandardScaler(),
                                                  ['a', 'b', 'c', 'd', 'e', 'f',
                                                   'g', 'h', 'i', 'j', 'k', 'l',
                                                   'm', 'n', 'o', 'p', 'q', 'r',
                                                   's', 't', 'u', 'v', 'w', 'x',
                                                   'y', 'z']]])),
                ('classifier', SGDClassifier())])

# Получаем предсказания, сохраняем их в файл, запускаем тест

In [8]:
from sklearn.metrics import accuracy_score, f1_score

# Predictions are made on the same frame the pipeline is evaluated against.
predictions = clf_pipeline.predict( df )
# accuracy_score takes (y_true, y_pred); the value is the same either way,
# but the documented order keeps the call consistent with other metrics.
accuracy_score(df['template'], predictions)

0.10232735597100344

In [9]:
# One predicted template per line, without a header row or an index column.
pd.Series(predictions, name="predictions").to_csv("predictions.csv", header=False, index=False)

In [10]:
# Show the first lines of the saved predictions file.
!head predictions.csv

Futurama-Fry
Hide-the-Pain-Harold
Drake-Hotline-Bling
Futurama-Fry
Roll-Safe-Think-About-It
This-Is-Fine
Expanding-Brain
Futurama-Fry
Futurama-Fry
This-Is-Fine


In [11]:
from sklearn.metrics import accuracy_score
import pandas as pd

def test_meme_predictions(
    predictions_path='predictions.csv',
    dataset_url="https://github.com/mrsndmn/memes-dataset/raw/main/memes-dataset.csv",
    min_accuracy=0.05,
):
    """
    Check that the saved predictions reach a minimal accuracy on the training data.

    Parameters
    ----------
    predictions_path : str, optional
        Header-less CSV with one predicted template per row.
    dataset_url : str, optional
        Location of the dataset CSV that holds the true ``template`` column.
    min_accuracy : float, optional
        Accuracy that must be strictly exceeded for the check to pass.

    Raises
    ------
    AssertionError
        If the number of predictions differs from the number of dataset rows,
        or if the accuracy does not exceed ``min_accuracy``.
    """
    predictions = pd.read_csv(predictions_path, header=None)
    original_df = pd.read_csv(dataset_url)

    predicted_templates = predictions.iloc[:, 0]
    # A row-count mismatch means the file was produced from different data.
    assert len(predicted_templates) == len(original_df), (
        f"expected {len(original_df)} predictions, got {len(predicted_templates)}"
    )

    accuracy = accuracy_score(original_df['template'], predicted_templates)
    assert accuracy > min_accuracy, (
        f"accuracy {accuracy:.4f} does not exceed the required {min_accuracy}"
    )

# Run the check; an AssertionError here means the accuracy threshold was not reached.
test_meme_predictions()

# Посмотрим, как моделька обучилась, попробуем с ней поиграться

In [12]:
# How many times each template is predicted over the whole frame.
(
    pd.Series(clf_pipeline.predict(df))
    .value_counts()
)

Futurama-Fry                                             8626
Expanding-Brain                                          2952
Leonardo-Dicaprio-Cheers                                 2843
Success-Kid                                              2536
Roll-Safe-Think-About-It                                 2153
Buff-Doge-vs-Cheems                                      1873
Always-Has-Been                                          1760
This-Is-Fine                                             1725
Woman-Yelling-At-Cat                                     1658
I-Bet-Hes-Thinking-About-Other-Women                     1566
Evil-Toddler                                             1038
Shut-Up-And-Take-My-Money-Fry                            1036
X-X-Everywhere                                           1030
Is-This-A-Pigeon                                          986
50421420/Disappointed-Black-Guy                           967
Two-Buttons                                               930
Bike-Fal

In [13]:
# Ten random memes whose true template is "Futurama-Fry": link and description only.
(
    df.loc[df['template'] == "Futurama-Fry", ['mem_href', 'mem_description']]
    .sample(10)
)

Unnamed: 0,mem_href,mem_description
19832,/i/4cs0ed,me trying to make friends
19211,/i/5n2m7y,do you think the black power ranger got offend...
19137,/i/5p8n8a,trying to find out if i broke facebook's polic...
19711,/i/547lsd,not sure if; soccer or football
19025,/i/5v57lw,me when i go upsairs and i see my parents room...
19421,/i/5jq1d9,"mum during a fight ""my house!!!""; mum during c..."
19790,/i/4fqw64,not sure if reaper leviathan is tryna hug me; ...
19041,/i/5tls2u,be on cams on among us be like; 3 hours later
19528,/i/3so2wo,if apartments are called apartments; why are t...
19526,/i/1fuzq1,"when i see lovers' names carved in a tree, i d..."


In [14]:
# Display the meme stored at this page path.
show_meme("/i/20xwrd")

In [15]:
# Display the meme stored at this page path.
show_meme("/i/5qj2hp")

In [16]:
# ok, at least it guessed "i bet hes thinking" :)

clf_pipeline.predict( pd.DataFrame( [ {"mem_description": "never has been"}, {"mem_description": "i bet he's thinkning about another woman"} ] ) )

array(['Futurama-Fry', 'I-Bet-Hes-Thinking-About-Other-Women'],
      dtype='<U53')