In [None]:
import torch 
import matplotlib.pyplot as plt 
import numpy as np  
import argparse 
import pickle  
import os 
from torchvision import transforms  
from PIL import Image 



In [None]:
pip install pickle-mixin



In [None]:
# Locations of the pretrained captioning artefacts. The gdown cells in this
# notebook save files with these names under /content.
ENCODER_PATH = '/content/encoder-5-3000.pkl' 
DECODER_PATH = '/content/decoder-5-3000.pkl'
VOCAB_PATH =   '/content/vocab.pkl'
  

# Layer sizes used as the default arguments of PretrainedResNet, which passes
# them on to EncoderCNN / DecoderRNN.
EMBED_SIZE = 256
HIDDEN_SIZE = 512
NUM_LAYERS = 1

In [None]:

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  
  

def load_image(image_path, transform=None):
  """Load an image from disk and prepare it for the caption encoder.

  The image is converted to 3-channel RGB before resizing. Some of the meme
  PNGs carry an alpha channel (shape (H, W, 4)), which the 3-channel
  Normalize transform used by the caller cannot process.

  Args:
    image_path: Path of the image file to open.
    transform: Optional callable applied to the resized PIL image. Its result
      gets a leading batch dimension via ``unsqueeze(0)``.

  Returns:
    The transformed tensor with a batch dimension when ``transform`` is
    given, otherwise the resized RGB PIL image.
  """
  # The context manager releases the file handle; convert() returns a fully
  # loaded copy that stays valid after the file is closed.
  with Image.open(image_path) as source:
    image = source.convert('RGB')
  image = image.resize([224, 224], Image.LANCZOS)
  if transform is not None:
    image = transform(image).unsqueeze(0)
  return image

In [None]:
# Cache of (vocab, encoder, decoder) keyed by the loading configuration, so
# that captioning many images does not rebuild ResNet-152 for every call.
_CAPTION_MODEL_CACHE = {}


def _load_caption_models(encoder_path, decoder_path, vocab_path,
                         embed_size, hidden_size, num_layers):
    """Return (vocab, encoder, decoder), loading them on first use only."""
    key = (encoder_path, decoder_path, vocab_path,
           embed_size, hidden_size, num_layers)
    if key not in _CAPTION_MODEL_CACHE:
        # NOTE: pickle.load can execute arbitrary code; only load a
        # vocabulary file that comes from a trusted source.
        with open(vocab_path, 'rb') as f:
            vocab = pickle.load(f)

        encoder = EncoderCNN(embed_size).to(device)
        decoder = DecoderRNN(embed_size, hidden_size,
                             len(vocab), num_layers).to(device)

        # map_location lets checkpoints load on whatever device is available
        # (e.g. a CPU-only runtime).
        encoder.load_state_dict(torch.load(encoder_path, map_location=device))
        decoder.load_state_dict(torch.load(decoder_path, map_location=device))

        _CAPTION_MODEL_CACHE[key] = (vocab, encoder.eval(), decoder.eval())
    return _CAPTION_MODEL_CACHE[key]


def PretrainedResNet(image_path, encoder_path=ENCODER_PATH,
                     decoder_path=DECODER_PATH,
                     vocab_path=VOCAB_PATH,
                     embed_size=EMBED_SIZE,
                     hidden_size=HIDDEN_SIZE,
                     num_layers=NUM_LAYERS):
    """Generate a caption for one image with the encoder/decoder pair.

    Args:
        image_path: Path of the image to caption.
        encoder_path: File holding the EncoderCNN state dict.
        decoder_path: File holding the DecoderRNN state dict.
        vocab_path: Pickled vocabulary exposing ``idx2word`` and ``len()``.
        embed_size: Embedding size passed to both models.
        hidden_size: Hidden size passed to DecoderRNN.
        num_layers: Number of layers passed to DecoderRNN.

    Returns:
        The caption as a title-cased string. ``<start>`` tokens are dropped
        and the caption stops at the first ``<end>`` token (or at the end of
        the sampled sequence when no ``<end>`` was produced).
    """
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))])

    vocab, encoder, decoder = _load_caption_models(
        encoder_path, decoder_path, vocab_path,
        embed_size, hidden_size, num_layers)

    image_tensor = load_image(image_path, transform).to(device)

    # Pure inference: no gradients are needed.
    with torch.no_grad():
        feature = encoder(image_tensor)
        sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids[0].cpu().numpy()

    # Select words by token instead of slicing a fixed number of characters,
    # so a caption that never reaches '<end>' is not truncated mid-word.
    words = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        if word == '<end>':
            break
        if word != '<start>':
            words.append(word)
    return ' '.join(words).title()

In [None]:
# PretrainedResNet returns only the caption string, so the image shown next to
# it is opened separately here.
image_path = '/content/featmeme.jpg'
predicted_label = PretrainedResNet(image_path=image_path)
image = Image.open(image_path)
plt.figure(figsize=(24,24))
plt.imshow(image)
print(predicted_label)

In [None]:
!wget "https://drive.google.com/file/d/1EV61z5FdWnzezG6U1Nup7m5VfW7xPUVY"

--2020-10-29 10:27:50--  https://drive.google.com/file/d/1EV61z5FdWnzezG6U1Nup7m5VfW7xPUVY
Resolving drive.google.com (drive.google.com)... 173.194.76.100, 173.194.76.138, 173.194.76.102, ...
Connecting to drive.google.com (drive.google.com)|173.194.76.100|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://drive.google.com/file/d/1EV61z5FdWnzezG6U1Nup7m5VfW7xPUVY/ [following]
--2020-10-29 10:27:50--  https://drive.google.com/file/d/1EV61z5FdWnzezG6U1Nup7m5VfW7xPUVY/
Reusing existing connection to drive.google.com:443.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://drive.google.com/file/d/1EV61z5FdWnzezG6U1Nup7m5VfW7xPUVY/edit [following]
--2020-10-29 10:27:51--  https://drive.google.com/file/d/1EV61z5FdWnzezG6U1Nup7m5VfW7xPUVY/edit
Reusing existing connection to drive.google.com:443.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: ‘1EV61z5FdWnzezG6U1Nup7m5VfW7xPUVY

In [None]:
pip install gdown



In [None]:
!gdown --id 1EV61z5FdWnzezG6U1Nup7m5VfW7xPUVY

Downloading...
From: https://drive.google.com/uc?id=1EV61z5FdWnzezG6U1Nup7m5VfW7xPUVY
To: /content/decoder-5-3000.pkl
36.9MB [00:00, 117MB/s] 


In [None]:
!gdown --id 1oLHalPIRIgDV0bW1Yex1SLKDSRbn6-gG

Downloading...
From: https://drive.google.com/uc?id=1oLHalPIRIgDV0bW1Yex1SLKDSRbn6-gG
To: /content/encoder-5-3000.pkl
235MB [00:01, 166MB/s]


In [None]:
!wget https://i.barkpost.com/wp-content/uploads/2015/02/featmeme.jpg

--2020-10-31 17:14:04--  https://i.barkpost.com/wp-content/uploads/2015/02/featmeme.jpg
Resolving i.barkpost.com (i.barkpost.com)... 104.22.37.153, 104.22.36.153, 172.67.29.37, ...
Connecting to i.barkpost.com (i.barkpost.com)|104.22.37.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 151538 (148K) [image/jpeg]
Saving to: ‘featmeme.jpg’


2020-10-31 17:14:04 (16.0 MB/s) - ‘featmeme.jpg’ saved [151538/151538]



In [None]:
!gdown --id 1kUwrv1jp9PdHFCP9dWys2c38Z6vInEG0

Downloading...
From: https://drive.google.com/uc?id=1kUwrv1jp9PdHFCP9dWys2c38Z6vInEG0
To: /content/vocab.pkl
  0% 0.00/360k [00:00<?, ?B/s]100% 360k/360k [00:00<00:00, 51.8MB/s]


In [None]:
!gdown --id 1iAVOW8wgbarroLLbuiB8L56FpBL4oQhQ

Permission denied: https://drive.google.com/uc?id=1iAVOW8wgbarroLLbuiB8L56FpBL4oQhQ
Maybe you need to change permission over 'Anyone with the link'?


In [None]:
!wget https://i.redd.it/uyrdtpig1o831.jpg

--2020-10-29 07:34:27--  https://i.redd.it/uyrdtpig1o831.jpg
Resolving i.redd.it (i.redd.it)... 151.101.1.140, 151.101.65.140, 151.101.129.140, ...
Connecting to i.redd.it (i.redd.it)|151.101.1.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 73645 (72K) [image/jpeg]
Saving to: ‘uyrdtpig1o831.jpg’


2020-10-29 07:34:27 (2.81 MB/s) - ‘uyrdtpig1o831.jpg’ saved [73645/73645]



In [None]:
!wget https://grandjurytarget.files.wordpress.com/2015/06/friends-tv-show.jpg

--2020-10-29 07:34:29--  https://grandjurytarget.files.wordpress.com/2015/06/friends-tv-show.jpg
Resolving grandjurytarget.files.wordpress.com (grandjurytarget.files.wordpress.com)... 192.0.72.26, 192.0.72.27
Connecting to grandjurytarget.files.wordpress.com (grandjurytarget.files.wordpress.com)|192.0.72.26|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 34616 (34K) [image/jpeg]
Saving to: ‘friends-tv-show.jpg’


2020-10-29 07:34:30 (7.13 MB/s) - ‘friends-tv-show.jpg’ saved [34616/34616]



In [None]:
import nltk
import pickle
import argparse
from collections import Counter
from pycocotools.coco import COCO


class Vocabulary(object):
    """Two-way mapping between words and consecutive integer ids."""

    def __init__(self):
        # Next id to hand out; also the number of distinct words added so far.
        self.idx = 0
        self.word2idx = {}
        self.idx2word = {}

    def add_word(self, word):
        """Register ``word`` under the next free id; known words are ignored."""
        if word in self.word2idx:
            return
        new_id = self.idx
        self.word2idx[word] = new_id
        self.idx2word[new_id] = word
        self.idx = new_id + 1

    def __call__(self, word):
        """Return the id of ``word``, or the id of '<unk>' for unknown words."""
        if word in self.word2idx:
            return self.word2idx[word]
        return self.word2idx['<unk>']

    def __len__(self):
        """Number of words stored in the vocabulary."""
        return len(self.word2idx)

def build_vocab(json, threshold):
    """Create a Vocabulary from the captions of a COCO annotation file.

    Args:
        json: Annotation file handed to ``COCO``.
        threshold: Minimum number of occurrences a token needs to be kept.

    Returns:
        A Vocabulary holding '<pad>', '<start>', '<end>', '<unk>' (in that
        order) followed by every token seen at least ``threshold`` times.
    """
    coco = COCO(json)
    ann_ids = coco.anns.keys()
    token_counts = Counter()

    for position, ann_id in enumerate(ann_ids, start=1):
        caption = str(coco.anns[ann_id]['caption'])
        token_counts.update(nltk.tokenize.word_tokenize(caption.lower()))

        # Progress report every 1000 captions.
        if position % 1000 == 0:
            print("[{}/{}] Tokenized the captions.".format(position, len(ann_ids)))

    vocab = Vocabulary()
    for special in ('<pad>', '<start>', '<end>', '<unk>'):
        vocab.add_word(special)

    for word, cnt in token_counts.items():
        if cnt >= threshold:
            vocab.add_word(word)
    return vocab

def main(args):
    """Build the vocabulary described by ``args`` and pickle it to disk.

    Reads ``args.caption_path``, ``args.threshold`` and ``args.vocab_path``.
    """
    vocab = build_vocab(json=args.caption_path, threshold=args.threshold)
    destination = args.vocab_path
    with open(destination, 'wb') as out_file:
        pickle.dump(vocab, out_file)
    print("Total vocabulary size: {}".format(len(vocab)))
    print("Saved the vocabulary wrapper to '{}'".format(destination))




In [None]:
import torch
import torch.nn as nn
import torchvision.models as models
from torch.nn.utils.rnn import pack_padded_sequence


class EncoderCNN(nn.Module):
    """Image encoder: ResNet-152 backbone, then a linear layer and batch norm."""

    def __init__(self, embed_size):
        """Load the pretrained ResNet-152 and swap its fc layer for a projection."""
        super().__init__()
        backbone = models.resnet152(pretrained=True)
        # Keep every child module except the final fully connected layer.
        self.resnet = nn.Sequential(*list(backbone.children())[:-1])
        self.linear = nn.Linear(backbone.fc.in_features, embed_size)
        self.bn = nn.BatchNorm1d(embed_size, momentum=0.01)

    def forward(self, images):
        """Return one feature vector of size ``embed_size`` per input image."""
        # The backbone runs without gradient tracking.
        with torch.no_grad():
            pooled = self.resnet(images)
        flat = pooled.reshape(pooled.size(0), -1)
        return self.bn(self.linear(flat))


class DecoderRNN(nn.Module):
    """LSTM caption decoder fed with an image feature vector."""

    def __init__(self, embed_size, hidden_size, vocab_size, num_layers, max_seq_length=20):
        """Set the hyper-parameters and build the layers.

        Args:
            embed_size: Size of word embeddings (and of the image feature).
            hidden_size: LSTM hidden size.
            vocab_size: Number of words the decoder can emit.
            num_layers: Number of stacked LSTM layers.
            max_seq_length: Number of tokens generated by ``sample``.
        """
        super(DecoderRNN, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)
        # Attribute name (with its "seg" spelling) is kept as-is in case
        # other code reads it.
        self.max_seg_length = max_seq_length

    def forward(self, features, captions, lengths):
        """Decode image feature vectors and generate caption logits.

        The image feature is prepended to the embedded caption, the sequence
        is packed according to ``lengths`` and run through the LSTM; the
        linear layer is applied to the packed LSTM output data.
        """
        embeddings = self.embed(captions)
        embeddings = torch.cat((features.unsqueeze(1), embeddings), 1)
        packed = pack_padded_sequence(embeddings, lengths, batch_first=True)
        hiddens, _ = self.lstm(packed)
        outputs = self.linear(hiddens[0])
        return outputs

    def sample(self, features, states=None):
        """Generate captions for given image features using greedy search.

        Args:
            features: Image features, one row per image.
            states: Optional initial LSTM state.

        Returns:
            Tensor of predicted word ids with shape
            (batch_size, max_seq_length).
        """
        sampled_ids = []
        inputs = features.unsqueeze(1)
        for i in range(self.max_seg_length):
            hiddens, states = self.lstm(inputs, states)
            outputs = self.linear(hiddens.squeeze(1))
            # Greedy choice: highest-scoring word at this step.
            _, predicted = outputs.max(1)
            sampled_ids.append(predicted)
            # Feed the chosen word back in as the next input.
            inputs = self.embed(predicted)
            inputs = inputs.unsqueeze(1)
        sampled_ids = torch.stack(sampled_ids, 1)
        # The original built this tensor but never returned it, so callers
        # received None.
        return sampled_ids


In [None]:
import json

In [None]:
f=open("test.jsonl")

In [None]:
# test.jsonl holds one JSON object per line (JSON Lines), so the file is parsed
# line by line; json.load() on the whole file fails after the first object.
# The `with` also closes the handle opened in the previous cell.
with f:
    data = [json.loads(line) for line in f if line.strip()]

In [None]:
import json

# Read the raw lines of the JSON Lines file, then decode and echo each record.
with open('/content/test.jsonl', 'r') as json_file:
    json_list = json_file.readlines()

for json_str in json_list:
    result = json.loads(json_str)
    print("result: {}".format(result))
    

result: {'id': 16395, 'img': 'img/16395.png', 'text': 'handjobs sold seperately'}
result: {'id': 37405, 'img': 'img/37405.png', 'text': 'introducing fidget spinner for women'}
result: {'id': 94180, 'img': 'img/94180.png', 'text': "happy pride month let's go beat up lesbians"}
result: {'id': 54321, 'img': 'img/54321.png', 'text': 'laughs in [majority of u.s crime rate]'}
result: {'id': 97015, 'img': 'img/97015.png', 'text': 'finds out those 72 virgins.. are goats'}
result: {'id': 73506, 'img': 'img/73506.png', 'text': 'look your dad fucked goats.. i capped him...sorry..'}
result: {'id': 5429, 'img': 'img/05429.png', 'text': 'claims that her ancestors created a safe space for jews these are her ancestors'}
result: {'id': 70691, 'img': 'img/70691.png', 'text': 'overdose'}
result: {'id': 69421, 'img': 'img/69421.png', 'text': "when they laugh about your 9cm in the change room but they haven't seen your 9 mm yet"}
result: {'id': 50162, 'img': 'img/50162.png', 'text': "roses are black, viole

In [None]:
import pandas as pd

In [None]:
# `result` is only the last decoded record (a dict of scalars), which pandas
# cannot turn into rows by itself; build the frame from every line instead.
df = pd.DataFrame([json.loads(json_str) for json_str in json_list],
                  columns=['id', 'img', 'text'])

In [None]:
result.keys()

dict_keys(['id', 'img', 'text'])

In [None]:
# `result[key = ...]` is not valid Python. Look the record up by its
# zero-padded image name instead; evaluates to None when it is not present.
next((rec for rec in map(json.loads, json_list) if rec['img'] == 'img/01258.png'), None)

In [None]:
import json

In [None]:
def load_jsonl(input_path) -> list:
    """Read a JSON Lines file into a list of decoded records.

    Args:
        input_path: Path of a UTF-8 text file with one JSON value per line.

    Returns:
        The decoded values in file order. Blank lines are skipped.
    """
    data = []
    with open(input_path, 'r', encoding='utf-8') as f:
        for line in f:
            # strip() removes surrounding whitespace only; the previous
            # rstrip('\n|\r') also treated '|' as a character to strip.
            line = line.strip()
            if not line:
                continue
            data.append(json.loads(line))
    print('Loaded {} records from {}'.format(len(data), input_path))
    return data

In [None]:
import pandas as pd
webpage_data = load_jsonl('/content/test.jsonl')
db_cols = ['id', 'img', 'label', 'text']
# One row per record; a column missing from a record becomes NaN.
db_data = [[record.get(col, float('nan')) for col in db_cols]
           for record in webpage_data]
df = pd.DataFrame(db_data, columns=db_cols)

Loaded 1000 records from /content/test.jsonl


In [None]:
df.head(10)

Unnamed: 0,id,img,label,text
0,16395,img/16395.png,,handjobs sold seperately
1,37405,img/37405.png,,introducing fidget spinner for women
2,94180,img/94180.png,,happy pride month let's go beat up lesbians
3,54321,img/54321.png,,laughs in [majority of u.s crime rate]
4,97015,img/97015.png,,finds out those 72 virgins.. are goats
5,73506,img/73506.png,,look your dad fucked goats.. i capped him...so...
6,5429,img/05429.png,,claims that her ancestors created a safe space...
7,70691,img/70691.png,,overdose
8,69421,img/69421.png,,when they laugh about your 9cm in the change r...
9,50162,img/50162.png,,"roses are black, violets are black everything ..."


In [None]:
# Integer literals may not have leading zeros in Python 3. Ids are stored as
# plain integers, so the row is selected by matching the `id` column.
df.loc[df["id"] == 1258, "id"]

In [None]:
# `.ix` has been removed from pandas and 01263 is not a valid literal; select
# the row(s) whose id is 1263 with a boolean mask instead.
print(df.loc[df['id'] == 1263])

In [None]:
# Leading zeros are a syntax error for integer literals; ids are plain ints.
df.loc[df['id'].isin([1263])]

In [None]:
dd=df.values

In [None]:
c = 1247
# Print the first row of `dd` whose first column (the id) equals `c`.
hit = next((row for row in dd if row[0] == c), None)
if hit is not None:
  print(hit)

[1247 'img/01247.png' 1 "you can't be racist if there is no other race"]


In [None]:
import glob

In [None]:
import cv2

In [None]:
caption={}

In [None]:
# Caption every meme image and store the result under the image's file name
# (the loop previously used an undefined variable `name` as the key).
i = 0
file = None
try:
  for i, file in enumerate(glob.glob("/content/drive/My Drive/Colab Notebooks/Memes Project/img/*.png"), start=1):
    caption[os.path.basename(file)] = PretrainedResNet(image_path=file)
    print(i,end=",")
except TypeError as err:
  # Report which file failed and why, instead of only the path.
  print(file, err)
    


In [None]:
len(trainc)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Smoke test on the first row only (note the break).
for i in dd:
  # Drop the leading "img/" from the stored path. A dedicated name is used so
  # the global `path` (the data root used by other cells) is not overwritten.
  image_name = i[1][4:]
  # PretrainedResNet returns just the caption string.
  predicted_label = PretrainedResNet(image_path='/content/{}'.format(image_name))
  print(image_name)
  break

42953.png


In [None]:
trainc

[]

In [None]:
import os.path
os.path.sep


'/'

In [None]:
glob.glob(os.path.join('Users','mayankyadav', 'Downloads','Project','data','img','*.png'))

[]

In [None]:
!wget https://www.kaggle.com/parthplc/facebook-hateful-meme-dataset

--2020-10-29 12:13:32--  https://www.kaggle.com/parthplc/facebook-hateful-meme-dataset
Resolving www.kaggle.com (www.kaggle.com)... 35.244.233.98
Connecting to www.kaggle.com (www.kaggle.com)|35.244.233.98|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: ‘facebook-hateful-meme-dataset.1’

facebook-hateful-me     [ <=>                ]  30.84K  --.-KB/s    in 0.007s  

2020-10-29 12:13:33 (4.03 MB/s) - ‘facebook-hateful-meme-dataset.1’ saved [31576]



In [None]:
import pandas as pd
webpage_data = load_jsonl('/content/test_unseen.jsonl')
db_cols = ['id', 'img','text']
# One row per record; a column missing from a record becomes NaN.
db_data = [[record.get(col, float('nan')) for col in db_cols]
           for record in webpage_data]
df = pd.DataFrame(db_data, columns=db_cols)

Loaded 2000 records from /content/test_unseen.jsonl


In [None]:
df.head(10)

Unnamed: 0,id,img,text
0,15740,img/15740.png,when someone tells you how to bbq
1,38794,img/38794.png,when they say white folks don't know how to cook
2,60792,img/60792.png,the original derp-face
3,71824,img/71824.png,okay here you go! you piece of shit!
4,4796,img/04796.png,xboxone farming 1619 simulator
5,75310,img/75310.png,how roaches be when you shut off the kitchen l...
6,63425,img/63425.png,when they say white people have no culture
7,72360,img/72360.png,picking up jewish girls the old fashioned way
8,17203,img/17203.png,mmm...this gas bill is getting expensive
9,76803,img/76803.png,pol the look on your face when the press think...


In [None]:
len(dict)

2000

In [None]:
# Map each image path to its meme text.
# NOTE(review): the name `dict` shadows the builtin; it is kept because other
# cells read this variable under that name.
dict = {img_path: meme_text for img_path, meme_text in zip(df['img'], df['text'])}

In [None]:
dict

{'img/15740.png': 'when someone tells you how to bbq',
 'img/38794.png': "when they say white folks don't know how to cook",
 'img/60792.png': 'the original derp-face',
 'img/71824.png': 'okay here you go! you piece of shit!',
 'img/04796.png': 'xboxone farming 1619 simulator',
 'img/75310.png': 'how roaches be when you shut off the kitchen light',
 'img/63425.png': 'when they say white people have no culture',
 'img/72360.png': 'picking up jewish girls the old fashioned way',
 'img/17203.png': 'mmm...this gas bill is getting expensive',
 'img/76803.png': "pol the look on your face when the press think you're a hero but you're just bringing back fresh meat for the tribe",
 'img/96013.png': 'getting stoned in the u.s. getting stoned in the middle east',
 'img/78093.png': 'asian parents be like... ling ling, dinner is ready',
 'img/08179.png': 'i aint understand why they shot him all he did was rob a liquor store steal a cop car punch a pregnant lady and run from da police',
 'img/60425.

In [None]:
dict4=dict.copy()

In [None]:
l=list(dict.keys())

In [None]:
len(l)

2000

In [None]:

from PIL import Image
import matplotlib.pyplot as plt 

In [None]:
prob=[]

In [None]:
img=Image.open(path+"/"+"img/60792.png")
img
np.array(img).shape

(341, 512, 4)

In [None]:
# Score every meme: caption the image, append the caption to the meme text and
# run the text classifier on the combined string.
# NOTE: `model`, `x_tokenizer`, `sequence` and `max_text_length` are created in
# the keras cell further down, and `path` in its own cell; run those first.
prob = []  # reset so that re-running this cell does not append duplicates
for i, file in enumerate(l, start=1):
  predicted_label = PretrainedResNet(path+"/"+file)
  x_test = [dict[file]+"."+predicted_label]
  x_test_tokenized = x_tokenizer.texts_to_sequences(x_test)
  x_testing = sequence.pad_sequences(x_test_tokenized, maxlen=max_text_length)
  prob.append(model.predict(x_testing)[0][0])
  print(i,end=",")

In [None]:
prob

[]

In [None]:
path="/content/data"

In [None]:
# Load the text classifier and rebuild the tokenizer it is used with.
from tensorflow import keras
# NOTE(review): load_model is given a path ending in ".json" — confirm this
# file really is a full saved model; a JSON architecture file is normally
# loaded with model_from_json instead.
model = keras.models.load_model('/content/drive/My Drive/Colab Notebooks/Memes Project/Project-20201030T073607Z-001.zip (Unzipped Files)/Project/abc2.json')
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing import text, sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D
from sklearn.model_selection import train_test_split
# Weights come from a separate HDF5 file.
model.load_weights("/content/drive/My Drive/Colab Notebooks/Memes Project/Project-20201030T073607Z-001.zip (Unzipped Files)/Project/abc.h5")
import numpy as np
# The tokenizer is fitted on the `comment_text` column of train.csv; missing
# values are replaced by a single space first.
train_df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Memes Project/Project-20201030T073607Z-001.zip (Unzipped Files)/Project/train.csv').fillna(' ')
x = train_df['comment_text'].values
max_features = 20000
max_text_length = 400
x_tokenizer = tf.keras.preprocessing.text.Tokenizer(max_features)
x_tokenizer.fit_on_texts(list(x))
# Manual single-sentence check, left disabled:
#x_test=["you can't be racist if there is no other race."]
#x_test=np.array(x_test)
#x_test_tokenized = x_tokenizer.texts_to_sequences(x_test)
#x_testing = sequence.pad_sequences(x_test_tokenized, maxlen=max_text_length)
#model.predict(x_testing)

In [None]:
arr = df["id"].to_numpy()
# Threshold the scores: anything not strictly below 0.5 is labelled 1.
label = [1 if not (score < 0.5) else 0 for score in prob]
label

In [None]:
# Materialise the rows: a bare zip object can be iterated only once and shows
# nothing useful when displayed. zip stops at the shortest input, so all three
# sequences must have the same length for every id to be included.
dataset = list(zip(arr, prob, label))

In [None]:
dataset

<zip at 0x7f57de391cc8>

In [None]:
import csv
# newline="" is what the csv module expects for files it writes; without it
# some platforms emit an extra blank line after every row.
# NOTE(review): no header row is written — confirm whether the submission
# format requires one.
with open("/content/drive/My Drive/Colab Notebooks/Memes Project/submission_format_phase_1.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerows(dataset)

In [None]:
# NOTE(review): this pre-signed URL carries an access key id, a signature and
# an expiry timestamp in its query string. Do not commit such URLs; pass the
# link in at run time instead.
!wget -O data.zip 'https://drivendata-competition-fb-hateful-memes-data.s3.amazonaws.com/XjiOc5ycDBRRNwbhRlgH.zip?AWSAccessKeyId=AKIARVBOBDCY4MWEDJKS&Signature=18jU0Xxm%2FBfvPASl32CcUuTXVEo%3D&Expires=1604709308'

--2020-10-31 16:16:34--  https://drivendata-competition-fb-hateful-memes-data.s3.amazonaws.com/XjiOc5ycDBRRNwbhRlgH.zip?AWSAccessKeyId=AKIARVBOBDCY4MWEDJKS&Signature=18jU0Xxm%2FBfvPASl32CcUuTXVEo%3D&Expires=1604709308
Resolving drivendata-competition-fb-hateful-memes-data.s3.amazonaws.com (drivendata-competition-fb-hateful-memes-data.s3.amazonaws.com)... 52.218.192.42
Connecting to drivendata-competition-fb-hateful-memes-data.s3.amazonaws.com (drivendata-competition-fb-hateful-memes-data.s3.amazonaws.com)|52.218.192.42|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4225379472 (3.9G) [application/zip]
Saving to: ‘data.zip’


2020-10-31 16:20:17 (18.1 MB/s) - ‘data.zip’ saved [4225379472/4225379472]



In [None]:
# NOTE(review): the archive password is hardcoded here; prefer reading it from
# an environment variable or a prompt so it is not stored in the notebook.
!unzip -P EWryfbZyNviilcDF data.zip

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: data/img/14695.png      
  inflating: data/img/86572.png      
  inflating: data/img/98523.png      
  inflating: data/img/14527.png      
  inflating: data/img/50149.png      
  inflating: data/img/49630.png      
  inflating: data/img/64207.png      
  inflating: data/img/62531.png      
  inflating: data/img/14896.png      
  inflating: data/img/56983.png      
  inflating: data/img/45196.png      
  inflating: data/img/89067.png      
  inflating: data/img/83095.png      
  inflating: data/img/24581.png      
  inflating: data/img/65430.png      
  inflating: data/img/31074.png      
  inflating: data/img/75608.png      
  inflating: data/img/89264.png      
  inflating: data/img/59642.png      
  inflating: data/img/31472.png      
  inflating: data/img/96342.png      
  inflating: data/img/49782.png      
  inflating: data/img/41650.png      
  inflating: data/img/76910.png      
  inflating: data/img/5