In [1]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/My Drive/Data/

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/Data


In [2]:
import numpy as np, pandas as pd
np.random.seed(42)
import os
import pickle
import collections
from PIL import Image
import plotly.express as px
import plotly.graph_objects as go
import re
from tqdm import notebook

In [3]:
IMAGES_DIR = '/content/drive/My Drive/Data/memotion_dataset_7k/images'
image_filenames = os.listdir(IMAGES_DIR)
file_extentions = [filename.split('.')[-1] for filename in image_filenames]

images_paths = [os.path.join(IMAGES_DIR,filename) for filename in image_filenames]

REF_FILE = '/content/drive/My Drive/Data/memotion_dataset_7k/reference_df_pickle'
LABELS_FILE = '/content/drive/My Drive/Data/memotion_dataset_7k/labels_pd_pickle'

with open(REF_FILE, 'rb') as handle:
    reference_df_ = pickle.load(handle)

with open(LABELS_FILE, 'rb') as handle:
    labels_pd_ = pickle.load(handle)

In [4]:
image_formats = collections.Counter(file_extentions)
print(f'Num Images: {len(images_paths)}')

print('Image formats found: ', image_formats)
image_formats_df = pd.DataFrame.from_dict(image_formats, orient='index').reset_index()
image_formats_df

Num Images: 6994
Image formats found:  Counter({'jpg': 4951, 'png': 1675, 'jpeg': 345, 'JPG': 16, 'PNG': 4, 'bmp': 2, 'jpe': 1})


Unnamed: 0,index,0
0,jpg,4951
1,png,1675
2,jpeg,345
3,JPG,16
4,PNG,4
5,jpe,1
6,bmp,2


In [5]:
labels_pd_.columns

Index(['image_name', 'text_ocr', 'text_corrected', 'humour', 'sarcasm',
       'offensive', 'motivational', 'overall_sentiment'],
      dtype='object')

In [6]:
def get_train_val_split(train_frac, df, id_col):
    """
    Splits dataframe into train and val keeping percentage of
    labels same in both splits.
    Args:
        train_frac: Fraction of samples to use for train
        df: pd.DataFrame to split
        id_col: Column that uniquely identifies every row.
    Returns:
        split_df
    """
    val_frac = 1 - train_frac
    assert val_frac + train_frac == 1
    labels = set(df.label)
    split_df = None
    df = df.sample(frac=1) #shuffle df

    for lbl in notebook.tqdm(labels, total = len(labels)):
        lbl_df = df[df.label == lbl].copy()
        temp_df_train = lbl_df.sample(frac=train_frac).copy()
        temp_df_val = lbl_df[~lbl_df[id_col].isin(temp_df_train[id_col])].copy()
        temp_df_train['split'] = 'train'
        temp_df_val['split'] = 'val'
        if not isinstance(split_df,pd.DataFrame):
            split_df = temp_df_train.copy()
            split_df = pd.concat([split_df, temp_df_val])
        else:
            split_df = pd.concat([split_df, temp_df_train, temp_df_val])
    
    assert len(split_df) == len(df)
    return split_df

In [7]:
#  Negative and Very Negative => -1
# Positive and Very Positive => 1
# Neutral => 0

task_a_labels = {
    'negative': -1 ,
    'very_negative': -1,
    'neutral' : 0,
    'positive' : 1,
    'very_positive': 1,
}

task_a_labels_df = labels_pd_[['image_name','overall_sentiment']].copy()
task_a_labels_df['label'] = task_a_labels_df['overall_sentiment'].map(task_a_labels)
task_a_labels_df.label.value_counts()

 1    4160
 0    2201
-1     631
Name: label, dtype: int64

In [8]:
task_a_labels_df

Unnamed: 0,image_name,overall_sentiment,label
0,image_1.jpg,very_positive,1
1,image_2.jpeg,very_positive,1
2,image_3.JPG,positive,1
3,image_4.png,positive,1
4,image_5.png,neutral,0
...,...,...,...
6987,image_6988.jpg,neutral,0
6988,image_6989.jpg,neutral,0
6989,image_6990.png,positive,1
6990,image_6991.jpg,very_positive,1


In [9]:
task_a_split_df = get_train_val_split(
    train_frac = 0.90,
    df = task_a_labels_df,
    id_col= 'image_name',
)

  0%|          | 0/3 [00:00<?, ?it/s]

In [10]:
print( f' Humor labels: {set(labels_pd_["humour"])}')
print( f' Sarcasm labels: {set(labels_pd_["sarcasm"])}')
print( f' Offensive labels: {set(labels_pd_["offensive"])}')
print( f' Motivational labels: {set(labels_pd_["motivational"])}')



humour_labels_dict = {'funny':1, 'hilarious':1, 'not_funny':0, 'very_funny':1}
sarcasm_labels_dict = {'general':1, 'twisted_meaning':1, 'not_sarcastic':0, 'very_twisted':1}
motivational_labels_dict = { 'motivational':1, 'not_motivational':0 }
offensive_labels_dict = { 'hateful_offensive':1, 'slight':1, 'not_offensive':0, 'very_offensive':1}

task_b_labels_df = labels_pd_.copy()

task_b_labels_df['humour'] = labels_pd_['humour'].map(humour_labels_dict)
task_b_labels_df['sarcasm'] = labels_pd_['sarcasm'].map(sarcasm_labels_dict)
task_b_labels_df['offensive'] = labels_pd_['offensive'].map(offensive_labels_dict)
task_b_labels_df['motivational'] = labels_pd_['motivational'].map(motivational_labels_dict)

 Humor labels: {'not_funny', 'very_funny', 'funny', 'hilarious'}
 Sarcasm labels: {'twisted_meaning', 'very_twisted', 'general', 'not_sarcastic'}
 Offensive labels: {'hateful_offensive', 'slight', 'not_offensive', 'very_offensive'}
 Motivational labels: {'not_motivational', 'motivational'}


In [11]:
print(task_b_labels_df.humour.value_counts(),'\n')
print(task_b_labels_df.sarcasm.value_counts(),'\n')
print(task_b_labels_df.offensive.value_counts(),'\n')
print(task_b_labels_df.motivational.value_counts(),'\n')

print('Total:\n',
     pd.concat(
        [
            task_b_labels_df['humour'],
            task_b_labels_df['sarcasm'],
            task_b_labels_df['offensive'],
            task_b_labels_df['motivational'],
        ],
        ignore_index= True,
        axis = 0,
    ).value_counts()      
)

1    5341
0    1651
Name: humour, dtype: int64 

1    5448
0    1544
Name: sarcasm, dtype: int64 

1    4279
0    2713
Name: offensive, dtype: int64 

0    4525
1    2467
Name: motivational, dtype: int64 

Total:
 1    17535
0    10433
dtype: int64


In [12]:
image_sizes = [Image.open(filepath).size for filepath in images_paths]



In [13]:
image_widths = [size_[0] for size_ in image_sizes]
image_heights = [size_[1] for size_ in image_sizes]
image_size_df = pd.DataFrame(data = {'Width':image_widths, 'Height':image_heights })

In [14]:
image_size_df.sample(5)

Unnamed: 0,Width,Height
4422,1151,2048
351,236,275
5333,888,500
1541,650,400
1721,646,960


In [15]:
image_size_df.describe()

Unnamed: 0,Width,Height
count,6994.0,6994.0
mean,587.075064,546.478124
std,256.801682,250.016236
min,100.0,123.0
25%,480.0,392.0
50%,500.0,500.0
75%,640.0,648.0
max,4961.0,5553.0


In [16]:
fig = go.Figure()


fig_1 = go.Histogram(x=image_size_df['Height'], nbinsx= 100, name='Height') #
fig_2 = go.Histogram(x=image_size_df['Width'], nbinsx=100, name = 'Width')


fig.add_trace(fig_1)
fig.add_trace(fig_2)

fig.show(interactive = False)

In [17]:
labels_pd_.columns

Index(['image_name', 'text_ocr', 'text_corrected', 'humour', 'sarcasm',
       'offensive', 'motivational', 'overall_sentiment'],
      dtype='object')

In [18]:
class TextCleaner:
    """Basic Text cleaner that removes excess whitespaces and URLs"""
    
#     url_re = r"\b(?:https?://|www\.)[a-z0-9-]+(\.[a-z0-9-]+)+(?:[/?].*)?"
    
    url_re_1 = r"\b(?:https?://|www\.)[a-z0-9-]+(\.[a-z0-9-]+)+(?:[/?].*)?" #removes most urls
    url_re_2 = r"(w{3}\.)*[a-zA-Z0-9]+\.{1}(co){1}[m]{0,1}\s{0,1}" # removes any.com urls
    url_re_3 = r"(w{3}\.)*[a-zA-Z0-9]+\.{1}(net){1}\s{0,1}" # removes any.net urls
    
    def clean(self, text):
        text = str(text)
        excess_whitespace_removed = ' '.join(text.split())
        s1 = re.sub(self.url_re_1, "", excess_whitespace_removed)
        s2 = re.sub(self.url_re_2, "", s1)
        s3 = re.sub(self.url_re_3, "", s2)
        
        return s3


text_cleaner = TextCleaner() 
s = "Je veux que: https://site.english.com/this/is/a/url/path/component#fragment quickmeme.net meme.co asy.com 9gag.com"
print(f" Text: {s}\n Cleaned Text:  {text_cleaner.clean(s)}")

 Text: Je veux que: https://site.english.com/this/is/a/url/path/component#fragment quickmeme.net meme.co asy.com 9gag.com
 Cleaned Text:  Je veux que: 


In [19]:
text_df = labels_pd_[['image_name','text_corrected']].copy()

In [20]:
text_df.head()

Unnamed: 0,image_name,text_corrected
0,image_1.jpg,LOOK THERE MY FRIEND LIGHTYEAR NOW ALL SOHALIK...
1,image_2.jpeg,The best of #10 YearChallenge! Completed in le...
2,image_3.JPG,Sam Thorne @Strippin ( Follow Follow Saw every...
3,image_4.png,10 Year Challenge - Sweet Dee Edition
4,image_5.png,10 YEAR CHALLENGE WITH NO FILTER 47 Hilarious ...


In [21]:
#check if df contains any columns with null values
text_df.columns[text_df.isna().any()].tolist()

['text_corrected']

In [22]:
#images with no text
nulls_samples = text_df[pd.isnull(text_df).any(axis=1)]
nulls_samples

Unnamed: 0,image_name,text_corrected
119,image_120.jpg,
4799,image_4800.jpg,
6781,image_6782.jpg,
6784,image_6785.jpg,
6786,image_6787.jpg,


In [23]:
#lets drop the null values
text_df.dropna(subset=['text_corrected'],inplace=True)

#reset index
text_df.index = pd.RangeIndex(len(text_df.index))
# text_df[pd.isnull(text_df).any(axis=1)]

In [24]:
text_len_df = text_df.copy()

text_df.loc[:,'char_len'] = text_df.text_corrected\
                                .map(text_cleaner.clean)\
                                .str.len()

text_df.loc[:,'word_len'] = text_df.text_corrected\
                                .map(text_cleaner.clean)\
                                .map(lambda x: [str(word) for word in str(x).split()])\
                                .map(len)

In [25]:
text_df.head()

Unnamed: 0,image_name,text_corrected,char_len,word_len
0,image_1.jpg,LOOK THERE MY FRIEND LIGHTYEAR NOW ALL SOHALIK...,95,16
1,image_2.jpeg,The best of #10 YearChallenge! Completed in le...,127,24
2,image_3.JPG,Sam Thorne @Strippin ( Follow Follow Saw every...,157,32
3,image_4.png,10 Year Challenge - Sweet Dee Edition,37,7
4,image_5.png,10 YEAR CHALLENGE WITH NO FILTER 47 Hilarious ...,99,18


In [26]:
labels_pd_.columns

Index(['image_name', 'text_ocr', 'text_corrected', 'humour', 'sarcasm',
       'offensive', 'motivational', 'overall_sentiment'],
      dtype='object')

In [27]:
labels_df = labels_pd_.copy()
labels_df.drop(['text_ocr', 'text_corrected'], axis=1, inplace=True)

In [28]:
char_df = text_df.sort_values(['char_len'],ascending=True)
char_df = pd.merge(char_df, labels_df, how='inner', on=['image_name'])

word_df = text_df.sort_values(['word_len'],ascending=True)
word_df = pd.merge(word_df, labels_df, how='inner', on=['image_name'])

In [29]:
char_df.columns

Index(['image_name', 'text_corrected', 'char_len', 'word_len', 'humour',
       'sarcasm', 'offensive', 'motivational', 'overall_sentiment'],
      dtype='object')

In [30]:
char_df.head(5)[['text_corrected','char_len','overall_sentiment']]

Unnamed: 0,text_corrected,char_len,overall_sentiment
0,HI,2,positive
1,Me,2,negative
2,NO.,3,positive
3,MEME,4,positive
4,SOON,4,very_positive


In [31]:
char_df.tail(5)[['text_corrected','char_len','overall_sentiment']]

Unnamed: 0,text_corrected,char_len,overall_sentiment
6982,3:00 Außerhalb Lang TWANT YOU TO DRAW ME Quick...,482,positive
6983,Here's to the girls: To the girls who don't wa...,504,positive
6984,friends hgcaps My wife's an incredible woman. ...,507,very_positive
6985,IMAGINE IF PRESIDENT OBAMA: Mange your - HAD B...,545,positive
6986,A LOO WITH A VIEW: The mystery dumper lays cab...,996,positive


In [32]:
word_df.head(5)[['text_corrected','word_len','overall_sentiment']]

Unnamed: 0,text_corrected,word_len,overall_sentiment
0,SURPRISE!,1,neutral
1,SWEET! memegenerator.net,1,positive
2,Remember,1,positive
3,OKAY,1,neutral
4,Fact#379,1,positive


In [33]:
word_df.tail(5)[['text_corrected','word_len','overall_sentiment']]

Unnamed: 0,text_corrected,word_len,overall_sentiment
6982,Boys cry Girls masturbate Boys have feelings G...,77,very_positive
6983,friends hgcaps My wife's an incredible woman. ...,93,very_positive
6984,Here's to the girls: To the girls who don't wa...,94,positive
6985,IMAGINE IF PRESIDENT OBAMA: Mange your - HAD B...,96,positive
6986,A LOO WITH A VIEW: The mystery dumper lays cab...,187,positive


In [34]:
text_df['char_len'].describe()

count    6987.000000
mean       79.039216
std        50.421469
min         2.000000
25%        45.000000
50%        68.000000
75%       101.000000
max       996.000000
Name: char_len, dtype: float64

In [35]:
text_df['word_len'].describe()

count    6987.000000
mean       14.416058
std         9.001141
min         1.000000
25%         8.000000
50%        13.000000
75%        19.000000
max       187.000000
Name: word_len, dtype: float64

In [36]:
_fig_text = go.Figure()

char_len_fig = go.Histogram(x=text_df['char_len'], name="Num chars", nbinsx=100)
word_len_fig = go.Histogram(x=text_df['word_len'], name="Num words", nbinsx=100)


_fig_text.add_trace(char_len_fig, )
_fig_text.add_trace(word_len_fig, )



_fig_text.show(interactive=False)

In [37]:
!python -m pip install -U pycld3 langcodes

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [38]:
 !pip install language_data

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [39]:
import cld3
import langcodes

In [40]:
def detect_language(text:str) -> str:
    """Detects the language of the string. 
    Returns 'unknown' if the probability is < 0.5 and is not reliable
    """
    lang, probability, is_reliable, _ = cld3.get_language(text)
    if probability >= 0.5 and is_reliable:
        return lang
    else:
#         print(lang,probability ,is_reliable)
        return 'fr-Cyrl'

def detect_languages(text:str, num:int = 3) :
    """Detects the language of the string. 
    Returns 'unknown' if the probability is < 0.5 and is not reliable
    """
    langs = []
    for lng in cld3.get_frequent_languages(
        text,
        num_langs=3
    ):  
        lang, probability, is_reliable, _ = lng
        if probability >= 0.5 and is_reliable:
            langs.append(lang)
    
    return tuple(langs)
    
def get_language_name(lang:str) -> str:
    """Converts language code to language name"""
    return langcodes.Language.get(lang).language_name('en')



In [41]:
text_df.loc[:,'cld3_preds'] = text_df.text_corrected\
                                .map( text_cleaner.clean )\
                                .map( detect_language )\
                                .map( get_language_name )

In [42]:
print("Languages detected: ", set(text_df['cld3_preds']))
print("Num Languages detected: ", len(set(text_df['cld3_preds'])))

Languages detected:  {'Latin', 'Corsican', 'Japanese', 'Hindi', 'Malagasy', 'Nyanja', 'Welsh', 'Italian', 'Albanian', 'Sundanese', 'Igbo', 'Indonesian', 'Russian', 'Chinese', 'German', 'Lithuanian', 'Galician', 'Swahili', 'Czech', 'Hawaiian', 'Norwegian', 'Cebuano', 'Javanese', 'Polish', 'Bosnian', 'Māori', 'Greek', 'Malay', 'Irish', 'Swedish', 'Hmong', 'Catalan', 'French', 'Shona', 'Tajik', 'Hungarian', 'Hausa', 'Spanish', 'Serbian', 'Finnish', 'Yoruba', 'English', 'Haitian Creole', 'Afrikaans', 'Samoan', 'Western Frisian', 'Luxembourgish', 'Portuguese', 'Maltese', 'Danish', 'Kazakh', 'Somali', 'Dutch', 'Xhosa', 'Filipino', 'Bulgarian', 'Uzbek', 'Zulu', 'Scottish Gaelic', 'Ukrainian'}
Num Languages detected:  60


In [43]:
text_df['cld3_preds'].value_counts()

English            5564
French              998
Portuguese           29
Norwegian            29
Luxembourgish        22
German               21
Malagasy             20
Dutch                18
Western Frisian      17
Catalan              17
Afrikaans            16
Galician             14
Danish               14
Latin                12
Maltese              11
Welsh                10
Irish                10
Scottish Gaelic       9
Hawaiian              9
Italian               9
Igbo                  9
Filipino              7
Sundanese             7
Cebuano               6
Polish                6
Javanese              6
Somali                6
Bulgarian             5
Greek                 5
Japanese              5
Haitian Creole        5
Swahili               4
Serbian               4
Spanish               4
Swedish               4
Czech                 4
Finnish               3
Russian               3
Indonesian            3
Albanian              3
Xhosa                 3
Chinese         

In [44]:
with pd.option_context('display.max_colwidth', -1): 
    print(
        text_df[~text_df.cld3_preds.isin(['English','unknown'])][['text_corrected','cld3_preds']].sample(5)
    )

                                                                                                          text_corrected  \
2261  WHEN YOU REALIZE WONDER WOMAN IS ONLY IGI WONDERWOMANDATMAN COMIC XS IG WERDY.COMIC.MEMES 5 WEEKS AWAY! <3 - Joker   
2089  EVEN HITLER HAD A GIRLFRIEND YOU ARE LITERALLY LESS DESIRABLE THAN HITLER VALENTINE'S DAY FUN FACT:                  
3096  socially amazing penguin's entry MemeCenter                                                                          
3225  SORRY I'M BETTER THAN KIRK U MAD BRO? quickmeme.con                                                                  
5600  HEY! THAT'S MY SPOT! memegenerator.net                                                                               

     cld3_preds  
2261  French     
2089  French     
3096  Norwegian  
3225  French     
5600  French     


In [45]:
text_df['ext'] = text_df['image_name'].apply(lambda x: x.split('.')[-1])

In [46]:
text_df.head()

Unnamed: 0,image_name,text_corrected,char_len,word_len,cld3_preds,ext
0,image_1.jpg,LOOK THERE MY FRIEND LIGHTYEAR NOW ALL SOHALIK...,95,16,English,jpg
1,image_2.jpeg,The best of #10 YearChallenge! Completed in le...,127,24,English,jpeg
2,image_3.JPG,Sam Thorne @Strippin ( Follow Follow Saw every...,157,32,English,JPG
3,image_4.png,10 Year Challenge - Sweet Dee Edition,37,7,French,png
4,image_5.png,10 YEAR CHALLENGE WITH NO FILTER 47 Hilarious ...,99,18,English,png


In [47]:
text_df_1 = text_df.loc[(text_df.ext == 'jpg') | (text_df.ext == 'png') | (text_df.ext == 'jpeg'), ]
text_df_1.head()

Unnamed: 0,image_name,text_corrected,char_len,word_len,cld3_preds,ext
0,image_1.jpg,LOOK THERE MY FRIEND LIGHTYEAR NOW ALL SOHALIK...,95,16,English,jpg
1,image_2.jpeg,The best of #10 YearChallenge! Completed in le...,127,24,English,jpeg
3,image_4.png,10 Year Challenge - Sweet Dee Edition,37,7,French,png
4,image_5.png,10 YEAR CHALLENGE WITH NO FILTER 47 Hilarious ...,99,18,English,png
5,image_6.jpg,"1998: ""Don't get in car with strangers"" 2008: ...",177,31,English,jpg


In [48]:
text_en_df = text_df_1.loc[text_df_1.cld3_preds == 'English', ]
text_en_df.cld3_preds.value_counts()

English    5544
Name: cld3_preds, dtype: int64

In [49]:
import shutil

In [50]:
for image_path in text_en_df.image_name:
  src_fpath = '/content/drive/My Drive/Data/memotion_dataset_7k/images/'+ image_path
  print(src_fpath)
  dest_fpath = '/content/drive/My Drive/Data/memotion_dataset_7k/en_images/'+ image_path
  print(dest_fpath)
  os.makedirs(os.path.dirname(dest_fpath), exist_ok=True)
  shutil.copy(src_fpath, dest_fpath)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
/content/drive/My Drive/Data/memotion_dataset_7k/images/image_3887.jpg
/content/drive/My Drive/Data/memotion_dataset_7k/en_images/image_3887.jpg
/content/drive/My Drive/Data/memotion_dataset_7k/images/image_3888.jpg
/content/drive/My Drive/Data/memotion_dataset_7k/en_images/image_3888.jpg
/content/drive/My Drive/Data/memotion_dataset_7k/images/image_3889.jpg
/content/drive/My Drive/Data/memotion_dataset_7k/en_images/image_3889.jpg
/content/drive/My Drive/Data/memotion_dataset_7k/images/image_3890.png
/content/drive/My Drive/Data/memotion_dataset_7k/en_images/image_3890.png
/content/drive/My Drive/Data/memotion_dataset_7k/images/image_3892.jpg
/content/drive/My Drive/Data/memotion_dataset_7k/en_images/image_3892.jpg
/content/drive/My Drive/Data/memotion_dataset_7k/images/image_3894.jpg
/content/drive/My Drive/Data/memotion_dataset_7k/en_images/image_3894.jpg
/content/drive/My Drive/Data/memotion_dataset_7k/images/image_389

In [51]:
 len(os.listdir('/content/drive/My Drive/Data/memotion_dataset_7k/en_images/'))

5544

In [52]:
text_en_df.head()

Unnamed: 0,image_name,text_corrected,char_len,word_len,cld3_preds,ext
0,image_1.jpg,LOOK THERE MY FRIEND LIGHTYEAR NOW ALL SOHALIK...,95,16,English,jpg
1,image_2.jpeg,The best of #10 YearChallenge! Completed in le...,127,24,English,jpeg
4,image_5.png,10 YEAR CHALLENGE WITH NO FILTER 47 Hilarious ...,99,18,English,png
5,image_6.jpg,"1998: ""Don't get in car with strangers"" 2008: ...",177,31,English,jpg
8,image_9.jpg,Fornite died in 10 months but Minecraft never ...,94,19,English,jpg


In [53]:
text_en_df.tail()

Unnamed: 0,image_name,text_corrected,char_len,word_len,cld3_preds,ext
6982,image_6988.jpg,Tuesday is Mardi Gras Wednesday is Valentine's...,208,36,English,jpg
6983,image_6989.jpg,MUST WATCH MOVIES OF 2017 ITI Chennai memes MA...,168,25,English,jpg
6984,image_6990.png,LESS MORE TALKING PLANNING SODA JUNK FOOD COMP...,327,46,English,png
6985,image_6991.jpg,When I have time is a fantasy. no one has time...,70,15,English,jpg
6986,image_6992.jpg,"The starting point for every good idea is ""Wha...",66,12,English,jpg


In [54]:
text_df.shape

(6987, 6)

In [55]:
text_df.tail()

Unnamed: 0,image_name,text_corrected,char_len,word_len,cld3_preds,ext
6982,image_6988.jpg,Tuesday is Mardi Gras Wednesday is Valentine's...,208,36,English,jpg
6983,image_6989.jpg,MUST WATCH MOVIES OF 2017 ITI Chennai memes MA...,168,25,English,jpg
6984,image_6990.png,LESS MORE TALKING PLANNING SODA JUNK FOOD COMP...,327,46,English,png
6985,image_6991.jpg,When I have time is a fantasy. no one has time...,70,15,English,jpg
6986,image_6992.jpg,"The starting point for every good idea is ""Wha...",66,12,English,jpg


In [56]:
labels_df.head()

Unnamed: 0,image_name,humour,sarcasm,offensive,motivational,overall_sentiment
0,image_1.jpg,hilarious,general,not_offensive,not_motivational,very_positive
1,image_2.jpeg,not_funny,general,not_offensive,motivational,very_positive
2,image_3.JPG,very_funny,not_sarcastic,not_offensive,not_motivational,positive
3,image_4.png,very_funny,twisted_meaning,very_offensive,motivational,positive
4,image_5.png,hilarious,very_twisted,very_offensive,not_motivational,neutral


In [57]:
label_en_df = text_df.merge(labels_df, how='left', on='image_name')
label_en_df.head()

Unnamed: 0,image_name,text_corrected,char_len,word_len,cld3_preds,ext,humour,sarcasm,offensive,motivational,overall_sentiment
0,image_1.jpg,LOOK THERE MY FRIEND LIGHTYEAR NOW ALL SOHALIK...,95,16,English,jpg,hilarious,general,not_offensive,not_motivational,very_positive
1,image_2.jpeg,The best of #10 YearChallenge! Completed in le...,127,24,English,jpeg,not_funny,general,not_offensive,motivational,very_positive
2,image_3.JPG,Sam Thorne @Strippin ( Follow Follow Saw every...,157,32,English,JPG,very_funny,not_sarcastic,not_offensive,not_motivational,positive
3,image_4.png,10 Year Challenge - Sweet Dee Edition,37,7,French,png,very_funny,twisted_meaning,very_offensive,motivational,positive
4,image_5.png,10 YEAR CHALLENGE WITH NO FILTER 47 Hilarious ...,99,18,English,png,hilarious,very_twisted,very_offensive,not_motivational,neutral


In [58]:
label_en_df.shape

(6987, 11)

In [59]:
label_en_df = label_en_df.loc[label_en_df.cld3_preds == 'English', ]
label_en_df.cld3_preds.value_counts()

English    5564
Name: cld3_preds, dtype: int64

In [60]:
label_en_df.shape

(5564, 11)

In [61]:
label_en_df.drop(['cld3_preds', 'ext'], axis=1, inplace=True)

In [62]:
label_df_A = label_en_df.drop(['humour', 'sarcasm', 'offensive', 'motivational'], axis=1)
label_df_A.head()

Unnamed: 0,image_name,text_corrected,char_len,word_len,overall_sentiment
0,image_1.jpg,LOOK THERE MY FRIEND LIGHTYEAR NOW ALL SOHALIK...,95,16,very_positive
1,image_2.jpeg,The best of #10 YearChallenge! Completed in le...,127,24,very_positive
2,image_3.JPG,Sam Thorne @Strippin ( Follow Follow Saw every...,157,32,positive
4,image_5.png,10 YEAR CHALLENGE WITH NO FILTER 47 Hilarious ...,99,18,neutral
5,image_6.jpg,"1998: ""Don't get in car with strangers"" 2008: ...",177,31,negative


In [63]:
label_df_B = label_en_df.drop(['overall_sentiment'], axis=1)
label_df_B.head()

Unnamed: 0,image_name,text_corrected,char_len,word_len,humour,sarcasm,offensive,motivational
0,image_1.jpg,LOOK THERE MY FRIEND LIGHTYEAR NOW ALL SOHALIK...,95,16,hilarious,general,not_offensive,not_motivational
1,image_2.jpeg,The best of #10 YearChallenge! Completed in le...,127,24,not_funny,general,not_offensive,motivational
2,image_3.JPG,Sam Thorne @Strippin ( Follow Follow Saw every...,157,32,very_funny,not_sarcastic,not_offensive,not_motivational
4,image_5.png,10 YEAR CHALLENGE WITH NO FILTER 47 Hilarious ...,99,18,hilarious,very_twisted,very_offensive,not_motivational
5,image_6.jpg,"1998: ""Don't get in car with strangers"" 2008: ...",177,31,hilarious,general,slight,motivational


In [64]:
label_df_A['overall_sentiment'] = label_df_A['overall_sentiment'].map(task_a_labels)
label_df_B['humour'] = label_df_B['humour'].map(humour_labels_dict)
label_df_B['sarcasm'] = label_df_B['sarcasm'].map(sarcasm_labels_dict)
label_df_B['offensive'] = label_df_B['offensive'].map(offensive_labels_dict)
label_df_B['motivational'] = label_df_B['motivational'].map(motivational_labels_dict)

In [65]:
label_df_A.head()

Unnamed: 0,image_name,text_corrected,char_len,word_len,overall_sentiment
0,image_1.jpg,LOOK THERE MY FRIEND LIGHTYEAR NOW ALL SOHALIK...,95,16,1
1,image_2.jpeg,The best of #10 YearChallenge! Completed in le...,127,24,1
2,image_3.JPG,Sam Thorne @Strippin ( Follow Follow Saw every...,157,32,1
4,image_5.png,10 YEAR CHALLENGE WITH NO FILTER 47 Hilarious ...,99,18,0
5,image_6.jpg,"1998: ""Don't get in car with strangers"" 2008: ...",177,31,-1


In [66]:
label_df_B.head()

Unnamed: 0,image_name,text_corrected,char_len,word_len,humour,sarcasm,offensive,motivational
0,image_1.jpg,LOOK THERE MY FRIEND LIGHTYEAR NOW ALL SOHALIK...,95,16,1,1,0,0
1,image_2.jpeg,The best of #10 YearChallenge! Completed in le...,127,24,0,1,0,1
2,image_3.JPG,Sam Thorne @Strippin ( Follow Follow Saw every...,157,32,1,0,0,0
4,image_5.png,10 YEAR CHALLENGE WITH NO FILTER 47 Hilarious ...,99,18,1,1,1,0
5,image_6.jpg,"1998: ""Don't get in car with strangers"" 2008: ...",177,31,1,1,1,1


In [67]:
label_df_A.to_csv('/content/drive/My Drive/Data/memotion_dataset_7k/Labels_A.csv', index=False)
label_df_B.to_csv('/content/drive/My Drive/Data/memotion_dataset_7k/Labels_B.csv', index=False)

#TEST DATA CLEANING

In [68]:
IMAGES_DIR = '/content/drive/My Drive/Data/test_data/2000_data'
image_filenames = os.listdir(IMAGES_DIR)
file_extentions = [filename.split('.')[-1] for filename in image_filenames]

images_paths = [os.path.join(IMAGES_DIR,filename) for filename in image_filenames]

images_paths[:5]

['/content/drive/My Drive/Data/test_data/2000_data/stevejobs_5630a55873e81.jpeg',
 '/content/drive/My Drive/Data/test_data/2000_data/pepe_6403.jpg',
 '/content/drive/My Drive/Data/test_data/2000_data/seal_weddell-seal-is-foddxj.jpg',
 '/content/drive/My Drive/Data/test_data/2000_data/success_happy-birthday-success-kid-guess-who-just-turned-10-27337421.png',
 '/content/drive/My Drive/Data/test_data/2000_data/hillary_dcea5b8f4ef438f0db5bb7eb8fb8402a249c6c109b28c069cabfaf5d6fbf87e4.jpg']

In [69]:
labels_pd_ = pd.read_csv('/content/drive/MyDrive/Data/test_data/2000_testdata.csv')

In [70]:
image_formats = collections.Counter(file_extentions)
print(f'Num Images: {len(images_paths)}')

print('Image formats found: ', image_formats)
image_formats_df = pd.DataFrame.from_dict(image_formats, orient='index').reset_index()
image_formats_df

Num Images: 2000
Image formats found:  Counter({'jpg': 1371, 'png': 526, 'jpeg': 95, 'JPG': 7, 'webp': 1})


Unnamed: 0,index,0
0,jpeg,95
1,jpg,1371
2,png,526
3,JPG,7
4,webp,1


In [71]:
labels_pd_.columns

Index(['Image_name', 'Image_URL', 'OCR_extracted_text', 'corrected_text',
       'Overall_Sentiment', 'humour', 'sarcasm', 'motivational', 'offensive'],
      dtype='object')

In [72]:
labels_pd_.head()

Unnamed: 0,Image_name,Image_URL,OCR_extracted_text,corrected_text,Overall_Sentiment,humour,sarcasm,motivational,offensive
0,chuck_chuck_norris_meme_10.jpg,https://gtmemes.com/wp-content/uploads/2019/03...,Some magicians can walk on water Chuck Norris...,Some magicians can walk on water Chuck Norris...,1,1,1,1,0
1,dr_evil_NDBB96K.png,https://i.imgur.com/NDBB96K.png,ONE MILLION DOLLARS made on imgur,ONE MILLION DOLLARS made on imgur,1,1,1,1,1
2,misog_2109e457d636565e2e06dce39874c5231e1.jpg,https://media0ch-a.akamaihd.net/83/96/9e457d63...,Me: Mom can my friend sleep over? Mom: That's ...,Me: Mom can my friend sleep over? Mom: That's ...,-1,1,1,1,1
3,obama_2691536739_469698809820026_263513986_n.jpg,http://politicalmemes.com/wp-content/uploads/2...,THIS GUY INHERITED A MESS. DID HE WHINE ABOUT ...,THIS GUY INHERITED A MESS. DID HE WHINE ABOUT ...,-1,1,1,1,1
4,kim_threat-kim-jong-un-allegedly-working-on-mu...,https://pics.me.me/threat-kim-jong-un-allegedl...,THREAT: Kim Jong Un allegedly working on multi...,THREAT: Kim Jong Un allegedly working on multi...,-1,1,1,1,1


In [73]:
print(labels_pd_.humour.value_counts(),'\n')
print(labels_pd_.sarcasm.value_counts(),'\n')
print(labels_pd_.offensive.value_counts(),'\n')
print(labels_pd_.motivational.value_counts(),'\n')

print('Total:\n',
     pd.concat(
        [
            labels_pd_['humour'],
            labels_pd_['sarcasm'],
            labels_pd_['offensive'],
            labels_pd_['motivational'],
        ],
        ignore_index= True,
        axis = 0,
    ).value_counts()      
)

1    1793
0      83
Name: humour, dtype: int64 

1    1875
0       1
Name: sarcasm, dtype: int64 

1    959
0    917
Name: offensive, dtype: int64 

1    1557
0     319
Name: motivational, dtype: int64 

Total:
 1    6184
0    1320
dtype: int64


In [74]:
image_sizes = [Image.open(filepath).size for filepath in images_paths]

In [75]:
image_widths = [size_[0] for size_ in image_sizes]
image_heights = [size_[1] for size_ in image_sizes]
image_size_df = pd.DataFrame(data = {'Width':image_widths, 'Height':image_heights })

In [76]:
image_size_df.sample(5)

Unnamed: 0,Width,Height
1655,617,480
786,480,320
1324,450,365
1055,640,492
952,249,209


In [77]:
image_size_df.describe()

Unnamed: 0,Width,Height
count,2000.0,2000.0
mean,594.093,553.438
std,262.829603,250.178436
min,130.0,60.0
25%,500.0,396.0
50%,500.0,512.0
75%,640.0,654.0
max,3216.0,4356.0


In [78]:
fig = go.Figure()


fig_1 = go.Histogram(x=image_size_df['Height'], nbinsx= 100, name='Height') #
fig_2 = go.Histogram(x=image_size_df['Width'], nbinsx=100, name = 'Width')


fig.add_trace(fig_1)
fig.add_trace(fig_2)

fig.show(interactive = False)

In [79]:
labels_pd_.columns

Index(['Image_name', 'Image_URL', 'OCR_extracted_text', 'corrected_text',
       'Overall_Sentiment', 'humour', 'sarcasm', 'motivational', 'offensive'],
      dtype='object')

In [80]:
class TextCleaner:
    """Basic Text cleaner that removes excess whitespaces and URLs"""
    
#     url_re = r"\b(?:https?://|www\.)[a-z0-9-]+(\.[a-z0-9-]+)+(?:[/?].*)?"
    
    url_re_1 = r"\b(?:https?://|www\.)[a-z0-9-]+(\.[a-z0-9-]+)+(?:[/?].*)?" #removes most urls
    url_re_2 = r"(w{3}\.)*[a-zA-Z0-9]+\.{1}(co){1}[m]{0,1}\s{0,1}" # removes any.com urls
    url_re_3 = r"(w{3}\.)*[a-zA-Z0-9]+\.{1}(net){1}\s{0,1}" # removes any.net urls
    
    def clean(self, text):
        text = str(text)
        excess_whitespace_removed = ' '.join(text.split())
        s1 = re.sub(self.url_re_1, "", excess_whitespace_removed)
        s2 = re.sub(self.url_re_2, "", s1)
        s3 = re.sub(self.url_re_3, "", s2)
        
        return s3


text_cleaner = TextCleaner() 
s = "Je veux que: https://site.english.com/this/is/a/url/path/component#fragment quickmeme.net meme.co asy.com 9gag.com"
print(f" Text: {s}\n Cleaned Text:  {text_cleaner.clean(s)}")

 Text: Je veux que: https://site.english.com/this/is/a/url/path/component#fragment quickmeme.net meme.co asy.com 9gag.com
 Cleaned Text:  Je veux que: 


In [81]:
text_df = labels_pd_[['Image_name','corrected_text']].copy()

In [82]:
text_df.head()

Unnamed: 0,Image_name,corrected_text
0,chuck_chuck_norris_meme_10.jpg,Some magicians can walk on water Chuck Norris...
1,dr_evil_NDBB96K.png,ONE MILLION DOLLARS made on imgur
2,misog_2109e457d636565e2e06dce39874c5231e1.jpg,Me: Mom can my friend sleep over? Mom: That's ...
3,obama_2691536739_469698809820026_263513986_n.jpg,THIS GUY INHERITED A MESS. DID HE WHINE ABOUT ...
4,kim_threat-kim-jong-un-allegedly-working-on-mu...,THREAT: Kim Jong Un allegedly working on multi...


In [83]:
#check if df contains any columns with null values
text_df.columns[text_df.isna().any()].tolist()

['corrected_text']

In [84]:
#images with no text
nulls_samples = text_df[pd.isnull(text_df).any(axis=1)]
nulls_samples

Unnamed: 0,Image_name,corrected_text
222,sports_yzpsp7txypndvzwfuzlb.jpg,
271,trump_Trump-Memes-8.jpg,
348,misog_111jh6qu4wyg20204j16vkd.jpg,
355,friends_thomas_freaked_out_meme_by_wildcat1999...,
513,friends_thomas_and_friends_meme_9_by_thethomag...,
623,trump_HellToupee1500-5ada42951d64040039145c86.jpg,
852,got_GoT_Meme_2.jpg,
878,misog_257ulhmkmisicg8e7twizsr.jpg,
983,racis_179trumpracist.jpg,
1003,bethe_223Conspiracy-Ted1.jpg,


In [85]:
#lets drop the null values
text_df.dropna(subset=['corrected_text'],inplace=True)

#reset index
text_df.index = pd.RangeIndex(len(text_df.index))
# text_df[pd.isnull(text_df).any(axis=1)]

In [86]:
text_len_df = text_df.copy()

text_df.loc[:,'char_len'] = text_df.corrected_text\
                                .map(text_cleaner.clean)\
                                .str.len()

text_df.loc[:,'word_len'] = text_df.corrected_text\
                                .map(text_cleaner.clean)\
                                .map(lambda x: [str(word) for word in str(x).split()])\
                                .map(len)

In [87]:
text_df.head()

Unnamed: 0,Image_name,corrected_text,char_len,word_len
0,chuck_chuck_norris_meme_10.jpg,Some magicians can walk on water Chuck Norris...,68,12
1,dr_evil_NDBB96K.png,ONE MILLION DOLLARS made on imgur,33,6
2,misog_2109e457d636565e2e06dce39874c5231e1.jpg,Me: Mom can my friend sleep over? Mom: That's ...,82,15
3,obama_2691536739_469698809820026_263513986_n.jpg,THIS GUY INHERITED A MESS. DID HE WHINE ABOUT ...,105,20
4,kim_threat-kim-jong-un-allegedly-working-on-mu...,THREAT: Kim Jong Un allegedly working on multi...,230,39


In [88]:
labels_pd_.columns

Index(['Image_name', 'Image_URL', 'OCR_extracted_text', 'corrected_text',
       'Overall_Sentiment', 'humour', 'sarcasm', 'motivational', 'offensive'],
      dtype='object')

In [89]:
labels_df = labels_pd_.copy()
labels_df.drop(['corrected_text'], axis=1, inplace=True)

In [90]:
char_df = text_df.sort_values(['char_len'],ascending=True)
char_df = pd.merge(char_df, labels_df, how='inner', on=['Image_name'])

word_df = text_df.sort_values(['word_len'],ascending=True)
word_df = pd.merge(word_df, labels_df, how='inner', on=['Image_name'])

In [91]:
char_df.columns

Index(['Image_name', 'corrected_text', 'char_len', 'word_len', 'Image_URL',
       'OCR_extracted_text', 'Overall_Sentiment', 'humour', 'sarcasm',
       'motivational', 'offensive'],
      dtype='object')

In [92]:
char_df.head(5)[['corrected_text','char_len','Overall_Sentiment']]

Unnamed: 0,corrected_text,char_len,Overall_Sentiment
0,HELLO,5,-1
1,#NAME?,6,1
2,#NAME?,6,-1
3,HUMANS,6,1
4,#NAME?,6,-1


In [93]:
char_df.tail(5)[['corrected_text','char_len','Overall_Sentiment']]

Unnamed: 0,corrected_text,char_len,Overall_Sentiment
1853,There are some words I've known since I was a ...,378,-1
1854,Women really are blamed for everything. Includ...,388,-1
1855,Giorgio A. Tsoukalos @Tsoukalos I have never -...,413,1
1856,What would you rather have? A GF Not Good At ...,430,1
1857,Why Men are always happy...? 1. Their last nam...,464,1


In [94]:
word_df.head(5)[['corrected_text','char_len','Overall_Sentiment']]

Unnamed: 0,corrected_text,char_len,Overall_Sentiment
0,"""DICTATOR""",10,1
1,HELLO,5,-1
2,#NAME?,6,-1
3,Meanwhile,9,1
4,#NAME?,6,1


In [95]:
word_df.tail(5)[['corrected_text','char_len','Overall_Sentiment']]

Unnamed: 0,corrected_text,char_len,Overall_Sentiment
1853,The reason you're still upset about the electi...,352,-1
1854,Pro @promax 4 Trump Follow my girlfriend now e...,361,1
1855,What would you rather have? A GF Not Good At ...,430,1
1856,Giorgio A. Tsoukalos @Tsoukalos I have never -...,413,1
1857,Why Men are always happy...? 1. Their last nam...,464,1


In [96]:
text_df['char_len'].describe()

count    1858.000000
mean       85.307320
std        56.096173
min         5.000000
25%        47.000000
50%        72.500000
75%       107.000000
max       464.000000
Name: char_len, dtype: float64

In [97]:
text_df['word_len'].describe()

count    1858.000000
mean       15.408504
std         9.996417
min         1.000000
25%         9.000000
50%        13.000000
75%        19.000000
max        88.000000
Name: word_len, dtype: float64

In [98]:
_fig_text = go.Figure()

char_len_fig = go.Histogram(x=text_df['char_len'], name="Num chars", nbinsx=100)
word_len_fig = go.Histogram(x=text_df['word_len'], name="Num words", nbinsx=100)


_fig_text.add_trace(char_len_fig, )
_fig_text.add_trace(word_len_fig, )



_fig_text.show(interactive=False)

In [99]:
!python -m pip install -U pycld3 langcodes

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [100]:
 !pip install language_data

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [101]:
import cld3
import langcodes

In [102]:
def detect_language(text:str) -> str:
    """Detects the language of the string. 
    Returns 'unknown' if the probability is < 0.5 and is not reliable
    """
    lang, probability, is_reliable, _ = cld3.get_language(text)
    if probability >= 0.5 and is_reliable:
        return lang
    else:
#         print(lang,probability ,is_reliable)
        return 'fr-Cyrl'

def detect_languages(text:str, num:int = 3) :
    """Detects the language of the string. 
    Returns 'unknown' if the probability is < 0.5 and is not reliable
    """
    langs = []
    for lng in cld3.get_frequent_languages(
        text,
        num_langs=3
    ):  
        lang, probability, is_reliable, _ = lng
        if probability >= 0.5 and is_reliable:
            langs.append(lang)
    
    return tuple(langs)
    
def get_language_name(lang:str) -> str:
    """Converts language code to language name"""
    return langcodes.Language.get(lang).language_name('en')



In [103]:
text_df.loc[:,'cld3_preds'] = text_df.corrected_text\
                                .map( text_cleaner.clean )\
                                .map( detect_language )\
                                .map( get_language_name )

In [104]:
print("Languages detected: ", set(text_df['cld3_preds']))
print("Num Languages detected: ", len(set(text_df['cld3_preds'])))

Languages detected:  {'Latin', 'Corsican', 'Japanese', 'Malagasy', 'Welsh', 'Italian', 'Igbo', 'Russian', 'Lithuanian', 'German', 'Galician', 'Czech', 'Estonian', 'Hawaiian', 'Norwegian', 'Polish', 'Māori', 'Malay', 'Irish', 'Swedish', 'French', 'Catalan', 'Shona', 'Serbian', 'Finnish', 'Yoruba', 'English', 'Afrikaans', 'Western Frisian', 'Luxembourgish', 'Portuguese', 'Maltese', 'Danish', 'Dutch', 'Somali', 'Xhosa', 'Filipino', 'Bulgarian', 'Uzbek', 'Zulu', 'Scottish Gaelic'}
Num Languages detected:  41


In [105]:
text_df['cld3_preds'].value_counts()

English            1503
French              243
Portuguese           10
Norwegian             8
Malagasy              6
Luxembourgish         5
Danish                5
Latin                 5
Irish                 5
Māori                 5
German                4
Galician              4
Western Frisian       4
Japanese              3
Dutch                 3
Russian               3
Uzbek                 3
Hawaiian              3
Polish                2
Estonian              2
Italian               2
Somali                2
Catalan               2
Maltese               2
Welsh                 2
Xhosa                 2
Corsican              2
Swedish               2
Zulu                  2
Igbo                  2
Afrikaans             2
Malay                 1
Scottish Gaelic       1
Filipino              1
Czech                 1
Yoruba                1
Bulgarian             1
Shona                 1
Serbian               1
Lithuanian            1
Finnish               1
Name: cld3_preds

In [108]:
with pd.option_context('display.max_colwidth', -1): 
    print(
        text_df[~text_df.cld3_preds.isin(['English','unknown'])][['corrected_text','cld3_preds']].sample(5)
    )

                                             corrected_text cld3_preds
1080  BULLSHIT. usa HD memecenter.com MemeCenter             Norwegian
559   OK THEN WE CAN BE FRIENDS memegenerator.es             French   
905   "DICTATOR"                                             Latin    
1603  I was a different man  Logan. I had HAIR. imgflip.com  French   
1556  LORDE VOLDEMORT                                        French   


In [109]:
text_df['ext'] = text_df['Image_name'].apply(lambda x: x.split('.')[-1])
text_df = text_df.loc[(text_df.ext == 'jpg') | (text_df.ext == 'png') | (text_df.ext == 'jpeg'), ]

In [110]:
text_df.head()

Unnamed: 0,Image_name,corrected_text,char_len,word_len,cld3_preds,ext
0,chuck_chuck_norris_meme_10.jpg,Some magicians can walk on water Chuck Norris...,68,12,English,jpg
1,dr_evil_NDBB96K.png,ONE MILLION DOLLARS made on imgur,33,6,French,png
2,misog_2109e457d636565e2e06dce39874c5231e1.jpg,Me: Mom can my friend sleep over? Mom: That's ...,82,15,English,jpg
3,obama_2691536739_469698809820026_263513986_n.jpg,THIS GUY INHERITED A MESS. DID HE WHINE ABOUT ...,105,20,English,jpg
4,kim_threat-kim-jong-un-allegedly-working-on-mu...,THREAT: Kim Jong Un allegedly working on multi...,230,39,English,png


In [111]:
text_en_df = text_df.loc[text_df.cld3_preds == 'English', ]
text_en_df.cld3_preds.value_counts()

English    1496
Name: cld3_preds, dtype: int64

In [112]:
import shutil

In [113]:
for image_path in text_en_df.Image_name:
  src_fpath = '/content/drive/MyDrive/Data/test_data/2000_data/'+ image_path
  print(src_fpath)
  dest_fpath = '/content/drive/My Drive/Data/test_data/en_test_images/'+ image_path
  print(dest_fpath)
  os.makedirs(os.path.dirname(dest_fpath), exist_ok=True)
  shutil.copy(src_fpath, dest_fpath)

/content/drive/MyDrive/Data/test_data/2000_data/chuck_chuck_norris_meme_10.jpg
/content/drive/My Drive/Data/test_data/en_test_images/chuck_chuck_norris_meme_10.jpg
/content/drive/MyDrive/Data/test_data/2000_data/misog_2109e457d636565e2e06dce39874c5231e1.jpg
/content/drive/My Drive/Data/test_data/en_test_images/misog_2109e457d636565e2e06dce39874c5231e1.jpg
/content/drive/MyDrive/Data/test_data/2000_data/obama_2691536739_469698809820026_263513986_n.jpg
/content/drive/My Drive/Data/test_data/en_test_images/obama_2691536739_469698809820026_263513986_n.jpg
/content/drive/MyDrive/Data/test_data/2000_data/kim_threat-kim-jong-un-allegedly-working-on-multiple-attack-missiles-36036632.png
/content/drive/My Drive/Data/test_data/en_test_images/kim_threat-kim-jong-un-allegedly-working-on-multiple-attack-missiles-36036632.png
/content/drive/MyDrive/Data/test_data/2000_data/big_bang_cfe5c1d4d28c66694e01996f6ed2e70765b442c958e94178ee32f0b47d8497ec.jpg
/content/drive/My Drive/Data/test_data/en_test_ima

In [114]:
 len(os.listdir('/content/drive/My Drive/Data/test_data/en_test_images/'))

1496

In [115]:
text_en_df.head()

Unnamed: 0,Image_name,corrected_text,char_len,word_len,cld3_preds,ext
0,chuck_chuck_norris_meme_10.jpg,Some magicians can walk on water Chuck Norris...,68,12,English,jpg
2,misog_2109e457d636565e2e06dce39874c5231e1.jpg,Me: Mom can my friend sleep over? Mom: That's ...,82,15,English,jpg
3,obama_2691536739_469698809820026_263513986_n.jpg,THIS GUY INHERITED A MESS. DID HE WHINE ABOUT ...,105,20,English,jpg
4,kim_threat-kim-jong-un-allegedly-working-on-mu...,THREAT: Kim Jong Un allegedly working on multi...,230,39,English,png
6,big_bang_cfe5c1d4d28c66694e01996f6ed2e70765b44...,YOU CANT-RUIN A FRIENDSHIP WITH SEX THATS LIK...,90,17,English,jpg


In [116]:
text_en_df.tail()

Unnamed: 0,Image_name,corrected_text,char_len,word_len,cld3_preds,ext
1852,godfather_4882890.jpg,IT'S DONE,9,2,English,jpg
1853,bean_mr-bean-funny-quotes-fresh-why-men-are-al...,Why Men are always happy...? 1. Their last nam...,464,88,English,jpg
1855,best_2018_5cc9ad1804d3f.jpeg,What are the Mario Bros views on Consent ? Mar...,211,45,English,jpeg
1856,spector_MNfsifQ.jpg,"""I DON'T HAVE DREAMS, I HAVE GOALS. NOW IT'S O...",65,14,English,jpg
1857,spector_562d262b20b5b.jpeg,"""When I got here, I dominated. They thought I ...",277,53,English,jpeg


In [117]:
text_df.shape

(1850, 6)

In [118]:
text_df.tail()

Unnamed: 0,Image_name,corrected_text,char_len,word_len,cld3_preds,ext
1853,bean_mr-bean-funny-quotes-fresh-why-men-are-al...,Why Men are always happy...? 1. Their last nam...,464,88,English,jpg
1854,hillary_trump-supporter-pledges-to-occupy-demo...,TRUMP SUPPORTER PLEDGE TO KILL HILLARY OCCUPY...,55,8,Italian,png
1855,best_2018_5cc9ad1804d3f.jpeg,What are the Mario Bros views on Consent ? Mar...,211,45,English,jpeg
1856,spector_MNfsifQ.jpg,"""I DON'T HAVE DREAMS, I HAVE GOALS. NOW IT'S O...",65,14,English,jpg
1857,spector_562d262b20b5b.jpeg,"""When I got here, I dominated. They thought I ...",277,53,English,jpeg


In [119]:
labels_df.head()

Unnamed: 0,Image_name,Image_URL,OCR_extracted_text,Overall_Sentiment,humour,sarcasm,motivational,offensive
0,chuck_chuck_norris_meme_10.jpg,https://gtmemes.com/wp-content/uploads/2019/03...,Some magicians can walk on water Chuck Norris...,1,1,1,1,0
1,dr_evil_NDBB96K.png,https://i.imgur.com/NDBB96K.png,ONE MILLION DOLLARS made on imgur,1,1,1,1,1
2,misog_2109e457d636565e2e06dce39874c5231e1.jpg,https://media0ch-a.akamaihd.net/83/96/9e457d63...,Me: Mom can my friend sleep over? Mom: That's ...,-1,1,1,1,1
3,obama_2691536739_469698809820026_263513986_n.jpg,http://politicalmemes.com/wp-content/uploads/2...,THIS GUY INHERITED A MESS. DID HE WHINE ABOUT ...,-1,1,1,1,1
4,kim_threat-kim-jong-un-allegedly-working-on-mu...,https://pics.me.me/threat-kim-jong-un-allegedl...,THREAT: Kim Jong Un allegedly working on multi...,-1,1,1,1,1


In [120]:
label_en_df = text_df.merge(labels_df, how='left', on='Image_name')
label_en_df.head()

Unnamed: 0,Image_name,corrected_text,char_len,word_len,cld3_preds,ext,Image_URL,OCR_extracted_text,Overall_Sentiment,humour,sarcasm,motivational,offensive
0,chuck_chuck_norris_meme_10.jpg,Some magicians can walk on water Chuck Norris...,68,12,English,jpg,https://gtmemes.com/wp-content/uploads/2019/03...,Some magicians can walk on water Chuck Norris...,1,1,1,1,0
1,dr_evil_NDBB96K.png,ONE MILLION DOLLARS made on imgur,33,6,French,png,https://i.imgur.com/NDBB96K.png,ONE MILLION DOLLARS made on imgur,1,1,1,1,1
2,misog_2109e457d636565e2e06dce39874c5231e1.jpg,Me: Mom can my friend sleep over? Mom: That's ...,82,15,English,jpg,https://media0ch-a.akamaihd.net/83/96/9e457d63...,Me: Mom can my friend sleep over? Mom: That's ...,-1,1,1,1,1
3,obama_2691536739_469698809820026_263513986_n.jpg,THIS GUY INHERITED A MESS. DID HE WHINE ABOUT ...,105,20,English,jpg,http://politicalmemes.com/wp-content/uploads/2...,THIS GUY INHERITED A MESS. DID HE WHINE ABOUT ...,-1,1,1,1,1
4,kim_threat-kim-jong-un-allegedly-working-on-mu...,THREAT: Kim Jong Un allegedly working on multi...,230,39,English,png,https://pics.me.me/threat-kim-jong-un-allegedl...,THREAT: Kim Jong Un allegedly working on multi...,-1,1,1,1,1


In [121]:
label_en_df.shape

(1850, 13)

In [122]:
label_en_df = label_en_df.loc[label_en_df.cld3_preds == 'English', ]
label_en_df.cld3_preds.value_counts()

English    1496
Name: cld3_preds, dtype: int64

In [123]:
label_en_df.shape

(1496, 13)

In [124]:
label_en_df.drop(['cld3_preds', 'ext'], axis=1, inplace=True)

In [125]:
label_en_df.head()

Unnamed: 0,Image_name,corrected_text,char_len,word_len,Image_URL,OCR_extracted_text,Overall_Sentiment,humour,sarcasm,motivational,offensive
0,chuck_chuck_norris_meme_10.jpg,Some magicians can walk on water Chuck Norris...,68,12,https://gtmemes.com/wp-content/uploads/2019/03...,Some magicians can walk on water Chuck Norris...,1,1,1,1,0
2,misog_2109e457d636565e2e06dce39874c5231e1.jpg,Me: Mom can my friend sleep over? Mom: That's ...,82,15,https://media0ch-a.akamaihd.net/83/96/9e457d63...,Me: Mom can my friend sleep over? Mom: That's ...,-1,1,1,1,1
3,obama_2691536739_469698809820026_263513986_n.jpg,THIS GUY INHERITED A MESS. DID HE WHINE ABOUT ...,105,20,http://politicalmemes.com/wp-content/uploads/2...,THIS GUY INHERITED A MESS. DID HE WHINE ABOUT ...,-1,1,1,1,1
4,kim_threat-kim-jong-un-allegedly-working-on-mu...,THREAT: Kim Jong Un allegedly working on multi...,230,39,https://pics.me.me/threat-kim-jong-un-allegedl...,THREAT: Kim Jong Un allegedly working on multi...,-1,1,1,1,1
6,big_bang_cfe5c1d4d28c66694e01996f6ed2e70765b44...,YOU CANT-RUIN A FRIENDSHIP WITH SEX THATS LIK...,90,17,http://www.quickmeme.com/img/cf/cfe5c1d4d28c66...,YOU CANT-RUIN A FRIENDSHIP WITH SEX THATS LIK...,1,1,1,1,0


In [126]:
label_df_A = label_en_df.drop(['humour', 'sarcasm', 'offensive', 'motivational','OCR_extracted_text', 'Image_URL'], axis=1)
label_df_A.head()

Unnamed: 0,Image_name,corrected_text,char_len,word_len,Overall_Sentiment
0,chuck_chuck_norris_meme_10.jpg,Some magicians can walk on water Chuck Norris...,68,12,1
2,misog_2109e457d636565e2e06dce39874c5231e1.jpg,Me: Mom can my friend sleep over? Mom: That's ...,82,15,-1
3,obama_2691536739_469698809820026_263513986_n.jpg,THIS GUY INHERITED A MESS. DID HE WHINE ABOUT ...,105,20,-1
4,kim_threat-kim-jong-un-allegedly-working-on-mu...,THREAT: Kim Jong Un allegedly working on multi...,230,39,-1
6,big_bang_cfe5c1d4d28c66694e01996f6ed2e70765b44...,YOU CANT-RUIN A FRIENDSHIP WITH SEX THATS LIK...,90,17,1


In [127]:
label_df_A.columns = ['image_name', 'text_corrected', 'char_len', 'word_len', 'overall_sentiment']
label_df_A.head()

Unnamed: 0,image_name,text_corrected,char_len,word_len,overall_sentiment
0,chuck_chuck_norris_meme_10.jpg,Some magicians can walk on water Chuck Norris...,68,12,1
2,misog_2109e457d636565e2e06dce39874c5231e1.jpg,Me: Mom can my friend sleep over? Mom: That's ...,82,15,-1
3,obama_2691536739_469698809820026_263513986_n.jpg,THIS GUY INHERITED A MESS. DID HE WHINE ABOUT ...,105,20,-1
4,kim_threat-kim-jong-un-allegedly-working-on-mu...,THREAT: Kim Jong Un allegedly working on multi...,230,39,-1
6,big_bang_cfe5c1d4d28c66694e01996f6ed2e70765b44...,YOU CANT-RUIN A FRIENDSHIP WITH SEX THATS LIK...,90,17,1


In [128]:
label_df_B = label_en_df.drop(['Overall_Sentiment', 'OCR_extracted_text', 'Image_URL'], axis=1)
label_df_B.head()

Unnamed: 0,Image_name,corrected_text,char_len,word_len,humour,sarcasm,motivational,offensive
0,chuck_chuck_norris_meme_10.jpg,Some magicians can walk on water Chuck Norris...,68,12,1,1,1,0
2,misog_2109e457d636565e2e06dce39874c5231e1.jpg,Me: Mom can my friend sleep over? Mom: That's ...,82,15,1,1,1,1
3,obama_2691536739_469698809820026_263513986_n.jpg,THIS GUY INHERITED A MESS. DID HE WHINE ABOUT ...,105,20,1,1,1,1
4,kim_threat-kim-jong-un-allegedly-working-on-mu...,THREAT: Kim Jong Un allegedly working on multi...,230,39,1,1,1,1
6,big_bang_cfe5c1d4d28c66694e01996f6ed2e70765b44...,YOU CANT-RUIN A FRIENDSHIP WITH SEX THATS LIK...,90,17,1,1,1,0


In [129]:
label_df_B.columns = ['image_name', 'text_corrected', 'char_len', 'word_len', 'humour', 'sarcasm', 'offensive', 'motivational']
label_df_B.head()

Unnamed: 0,image_name,text_corrected,char_len,word_len,humour,sarcasm,offensive,motivational
0,chuck_chuck_norris_meme_10.jpg,Some magicians can walk on water Chuck Norris...,68,12,1,1,1,0
2,misog_2109e457d636565e2e06dce39874c5231e1.jpg,Me: Mom can my friend sleep over? Mom: That's ...,82,15,1,1,1,1
3,obama_2691536739_469698809820026_263513986_n.jpg,THIS GUY INHERITED A MESS. DID HE WHINE ABOUT ...,105,20,1,1,1,1
4,kim_threat-kim-jong-un-allegedly-working-on-mu...,THREAT: Kim Jong Un allegedly working on multi...,230,39,1,1,1,1
6,big_bang_cfe5c1d4d28c66694e01996f6ed2e70765b44...,YOU CANT-RUIN A FRIENDSHIP WITH SEX THATS LIK...,90,17,1,1,1,0


In [130]:
label_df_A.to_csv('/content/drive/My Drive/Data/test_data/Labels_test_A.csv', index=False)
label_df_B.to_csv('/content/drive/My Drive/Data/test_data/Labels_test_B.csv', index=False)