# Memotion 2.0 Dataset

In this notebook, the Memotion 2.0 dataset is loaded, including the train, validation, and test sets. The Memotion 2.0 dataset does not contain Hindi text, so it is well suited to the task of meme classification in English.

## Setup

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split


If the error `ImportError: cannot import name 'is_directory' from 'PIL._util'` occurs, refer to the following link for instructions on downgrading Pillow:
https://www.datasciencelearner.com/importerror-cannot-import-name-pillow-version-from-pil-solved/

## Load the dataset, and have a look at the data

In [20]:
# load data
#df_train = pd.read_csv('Memotion2/memotion_train.csv')
#df_val = pd.read_csv('Memotion2/memotion_val.csv')
#df_test = pd.read_csv('memotion2_test/memotion_test.csv')
# store data
#df_train.to_pickle('df_train_pickle')
#df_val.to_pickle('df_val_pickle')
#df_test.to_pickle('df_test_pickle')

In [21]:
# Restore the train / validation / test frames from their pickle files, in that order.
# NOTE: pickle can execute arbitrary code on load -- only read files you created yourself.
df_train, df_val, df_test = (
    pd.read_pickle(pickle_name)
    for pickle_name in ('df_train_pickle', 'df_val_pickle', 'df_test_pickle')
)
# Preview the first rows of the training split.
print(df_train.head())

   Id                                          Image url  \
0   1  https://preview.redd.it/9jkzlvt8p4q31.jpg?widt...   
1   2  https://i.pinimg.com/originals/fd/c8/e2/fdc8e2...   
2   3  https://i.pinimg.com/originals/85/10/13/851013...   
3   4  https://i.imgur.com/07ZcrjZ_d.webp?maxwidth=52...   
4   5  https://i.imgur.com/MGddJxr_d.jpg?maxwidth=520...   

                                            ocr_text      humour  \
0    enters into a wrong class teacher and students        funny   
1  dj if you single make some nooooissssseeee my ...       funny   
2                       everyone sit like a lady me        funny   
3  when youre about to watch a 2 hour educational...       funny   
4   government work from home olympic swimmers nsdf   very_funny   

          sarcastic      offensive      motivational overall_sentiment  \
0     not_sarcastic         slight  not_motivational           neutral   
1     not_sarcastic  not_offensive  not_motivational           neutral   
2     no

In [22]:
# turn the string labels into numerical labels
# can choose between binary and multi
#from format import str2label
#str2label(df_train, "binary", "train")
#str2label(df_val, "binary", "val")


In [23]:
# Read the pickled label frames for the train and validation splits, in that order.
# NOTE: pickle can execute arbitrary code on load -- only read files you created yourself.
df_train_label, df_val_label = (
    pd.read_pickle(label_file) for label_file in ('train_label', 'val_label')
)
# Preview the first rows of the training labels.
print(df_train_label.head())

   Id                                          Image url  \
0   1  https://preview.redd.it/9jkzlvt8p4q31.jpg?widt...   
1   2  https://i.pinimg.com/originals/fd/c8/e2/fdc8e2...   
2   3  https://i.pinimg.com/originals/85/10/13/851013...   
3   4  https://i.imgur.com/07ZcrjZ_d.webp?maxwidth=52...   
4   5  https://i.imgur.com/MGddJxr_d.jpg?maxwidth=520...   

                                            ocr_text humour sarcastic  \
0    enters into a wrong class teacher and students       1         1   
1  dj if you single make some nooooissssseeee my ...      1         1   
2                       everyone sit like a lady me       1         1   
3  when youre about to watch a 2 hour educational...      1         1   
4   government work from home olympic swimmers nsdf       1         1   

  offensive motivational overall_sentiment classification_based_on  
0         1            1                 2          image_and_text  
1         1            1                 2          image_and_

## Data Preprocessing

In [3]:
import os
from PIL import Image
# Empty containers for the training images and their embeddings.
train_images, train_images_embeddings = [], []
# Matching empty containers for the validation split.
val_images, val_images_embeddings = [], []
# Directories that hold the image files of each split.
train_path, val_path = "./image folder/train_images/", "./image folder/val_images/"


### Color Enhancement
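Enhance the contrast of the images with Pillow's `ImageEnhance.Contrast`, using an enhancement factor of 2 (a factor of 1.0 would leave the image unchanged), to make the images more vivid.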

In [3]:
# enhance the contrast of the train images
from PIL import ImageEnhance
enhanced_train_path = "./image folder/enhanced_train_images/"

def enhance_contrast(factor, input_path, output_path):
    """Write a contrast-adjusted RGB copy of every file found in ``input_path``.

    Args:
        factor: Enhancement factor passed to ``ImageEnhance.Contrast.enhance``.
        input_path: Directory whose files are opened as images.
        output_path: Directory that receives the results under the same file
            names. It is created if it does not exist yet.

    Returns:
        None. The results are written to disk.
    """
    # make sure the destination exists so that save() cannot fail on a missing folder
    os.makedirs(output_path, exist_ok=True)
    # iterate through all the entries in the directory
    for filename in os.listdir(input_path):
        source_file = os.path.join(input_path, filename)
        # sub-directories cannot be opened as images, so skip them
        if not os.path.isfile(source_file):
            continue
        # the context manager releases the file handle as soon as the pixels are read
        with Image.open(source_file) as img:
            # convert the image to RGB (this also forces the pixel data to be loaded)
            rgb_img = img.convert('RGB')
        # enhance the contrast of the image and save it under the original name
        enhanced_img = ImageEnhance.Contrast(rgb_img).enhance(factor)
        enhanced_img.save(os.path.join(output_path, filename))

# run on the training images with factor 2 (in Pillow a factor of 1.0 returns the image unchanged)
enhance_contrast(2, train_path, enhanced_train_path)

In [15]:
'''
from PIL import ImageEnhance
# enhance the contrast by 2, for report writing
current_path = "./"
Img = Image.open("./image folder/train_images/1.jpg")
Img = Img.convert('RGB')
enhancer = ImageEnhance.Contrast(Img)
enhancer.enhance(2).save(current_path + "contrast_2.jpg")
'''

## PCA Image Compression

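Compress each image with PCA to make it less misleading: the grayscale version of the image is reconstructed from a limited number of principal components, and the reconstruction is blended with the original image.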

In [1]:
import skimage
from sklearn.decomposition import PCA


In [8]:
# compress the images
compressed_train_path = "./image folder/compressed_train_images/"

def _rescale_to_uint8(array):
    """Min-max rescale ``array`` to the 0-255 range and cast it to ``uint8``."""
    low, high = array.min(), array.max()
    if high == low:
        # constant image: min-max scaling would divide by zero, so keep the values as they are
        return np.clip(array, 0, 255).astype(np.uint8)
    scaled = (array - low) / (high - low)
    return (255 * scaled).astype(np.uint8)


def compress_image(input_path, output_path, num_components, num_images=7000, alpha=0.5):
    """PCA-compress the images ``1.jpg`` .. ``<num_images>.jpg`` of ``input_path``.

    For each image the grayscale version is projected onto its leading
    principal components and reconstructed. The reconstruction is then
    alpha-blended with the original pixels, min-max rescaled to 0-255 and
    saved under the same file name in ``output_path``. Files that already
    exist in ``output_path`` are skipped, so an interrupted run can be resumed.

    Args:
        input_path: Directory containing the numbered ``.jpg`` source images.
        output_path: Directory for the results; created if it does not exist.
        num_components: ``n_components`` for ``sklearn.decomposition.PCA``.
            Integer values are capped at the smaller image side, because PCA
            rejects more components than ``min(height, width)``.
        num_images: Highest image number to process (default 7000, the value
            that used to be hard-coded).
        alpha: Blend weight of the PCA reconstruction; the original image is
            weighted with ``1 - alpha`` (default 0.5).

    Returns:
        None. The results are written to disk.

    Raises:
        FileNotFoundError: If a numbered source image is missing.
    """
    os.makedirs(output_path, exist_ok=True)
    for index in range(1, num_images + 1):
        filename = str(index) + ".jpg"
        target_file = os.path.join(output_path, filename)
        # judge if the file is already compressed
        if os.path.exists(target_file):
            continue
        print(filename)

        # load the image; the context manager closes the file handle afterwards
        with Image.open(os.path.join(input_path, filename)) as image:
            # grayscale version that PCA is fitted on
            image_gray = np.array(image.convert('L'))
            # Grayscale inputs stay 2-D. Every other mode (palette, RGBA, CMYK, ...)
            # is converted to a real 3-channel RGB array instead of slicing raw channels.
            if image.mode == 'L':
                original = np.array(image)
            else:
                original = np.array(image.convert('RGB'))

        n_components = num_components
        if isinstance(num_components, (int, np.integer)):
            # PCA cannot extract more components than the smaller image side
            n_components = min(num_components, *image_gray.shape)
        # a fixed random_state makes the randomized SVD solver reproducible across runs
        pca = PCA(n_components=n_components, random_state=0)
        image_compressed = pca.fit_transform(image_gray)
        image_decompressed = pca.inverse_transform(image_compressed)

        # add the color back: replicate the reconstruction over the three channels
        if original.ndim == 3:
            image_decompressed = np.stack([image_decompressed] * 3, axis=-1)

        # blend the images using alpha blending
        blended_image = alpha * image_decompressed + (1 - alpha) * original

        # normalize, convert to uint8 and save the image
        Image.fromarray(_rescale_to_uint8(blended_image)).save(target_file)

# compress the training images, passing 50 as the number of PCA components per image
compress_image(train_path, compressed_train_path, 50)

441.jpg
442.jpg
443.jpg
444.jpg
445.jpg
446.jpg
447.jpg
448.jpg
449.jpg
450.jpg
451.jpg
452.jpg
453.jpg
454.jpg
455.jpg
456.jpg
457.jpg
458.jpg
459.jpg
460.jpg
461.jpg
462.jpg
463.jpg
464.jpg
465.jpg
466.jpg
467.jpg
468.jpg
469.jpg
470.jpg
471.jpg
472.jpg
473.jpg
474.jpg
475.jpg
476.jpg
477.jpg
478.jpg
479.jpg
480.jpg
481.jpg
482.jpg
483.jpg
484.jpg
485.jpg
486.jpg
487.jpg
488.jpg
489.jpg
490.jpg
491.jpg
492.jpg
493.jpg
494.jpg
495.jpg
496.jpg
497.jpg
498.jpg
499.jpg
500.jpg
501.jpg
502.jpg
503.jpg
504.jpg
505.jpg
506.jpg
507.jpg
508.jpg
509.jpg
510.jpg
511.jpg
512.jpg
513.jpg
514.jpg
515.jpg
516.jpg
517.jpg
518.jpg
519.jpg
520.jpg
521.jpg
522.jpg
523.jpg
524.jpg
525.jpg
526.jpg
527.jpg
528.jpg
529.jpg
530.jpg
531.jpg
532.jpg
533.jpg
534.jpg
535.jpg
536.jpg
537.jpg
538.jpg
539.jpg
540.jpg
541.jpg
542.jpg
543.jpg
544.jpg
545.jpg
546.jpg
547.jpg
548.jpg
549.jpg
550.jpg
551.jpg
552.jpg
553.jpg
554.jpg
555.jpg
556.jpg
557.jpg
558.jpg
559.jpg
560.jpg
561.jpg
562.jpg
563.jpg
564.jpg
565.jpg


In [13]:
# for testing
'''
path = "./image folder/train_images/1.jpg"

# load the image
image = Image.open(path)

# convert the image to grayscale
image_gray = image.convert('L')
image_gray = np.array(image_gray)

pca = PCA(n_components=50)
image_compressed = pca.fit_transform(image_gray)
image_decompressed = pca.inverse_transform(image_compressed)

# add the color back to the image
if np.array(image).shape != image_decompressed.shape:
    image_decompressed = np.stack([image_decompressed]*3, axis=-1)
    image_array = np.array(image)
    image = image_array[:, :, :3]
else:
    image = np.array(image)

# Blend the images using alpha blending
alpha = 0.5
blended_image = alpha * image_decompressed + (1 - alpha) * image

# normalize the image
blended_image = (blended_image- blended_image.min()) / (blended_image.max() - blended_image.min())

# convert the image to uint8
blended_image = (255*blended_image).astype(np.uint8)

# save the image
Image.fromarray(blended_image).save("./blended.jpg")
'''

## Combined Dataset

Combine the color-enhanced dataset and the PCA-compressed dataset with the original dataset.

In [12]:
import shutil
compressed_train_path = "./image folder/compressed_train_images/"
enhanced_train_path = "./image folder/enhanced_train_images/"
train_path = "./image folder/train_images/"
train_path_fusion = "./image folder/train_images_fusion/"


def _copy_with_suffix(source_dir, target_dir, suffix):
    """Copy every file of ``source_dir`` into ``target_dir`` as ``<stem><suffix>.jpg``.

    Args:
        source_dir: Directory whose files are copied.
        target_dir: Existing directory that receives the copies.
        suffix: Text appended to the file stem, e.g. ``"_original"``.
    """
    for filename in os.listdir(source_dir):
        # splitext only strips the final extension, so names such as "a.b.jpg" and
        # "a.c.jpg" keep distinct stems instead of both collapsing to "a"
        stem = os.path.splitext(filename)[0]
        # the copies always get a ".jpg" ending, as in the original naming scheme
        new_filename = stem + suffix + ".jpg"
        shutil.copy(os.path.join(source_dir, filename), os.path.join(target_dir, new_filename))


# create the fusion directory up front; shutil.copy does not create missing folders
os.makedirs(train_path_fusion, exist_ok=True)

# copy the compressed, enhanced and original images (in that order) into the fusion
# folder, tagging each file name with the dataset it came from
for source_dir, suffix in (
    (compressed_train_path, "_compressed"),
    (enhanced_train_path, "_enhanced"),
    (train_path, "_original"),
):
    _copy_with_suffix(source_dir, train_path_fusion, suffix)

