# process memotion dataset

### 1. remove crashed images...

In [4]:
import pandas as pd
import os
import cv2
import numpy as np
from PIL import Image
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

home = '/home/chen_zhang06/HatefulMemes/'

In [5]:
def preprocess_text(memo):
    import re
    from collections import Counter

    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    text_to_remove = ["imgflip.com", "imgflip", "quickmeme.com", "quickmeme",
                      "memecenter.com", "memecenter", "memegenerator.net",
                      "memegenerator", "9gag.com", "arhtisticlicense.com",
                      "starecat.com", "gapbagap.net", "dudelol.com"]

    pat = r'\b(?:{})\b'.format('|'.join(text_to_remove))

    memo.text = memo.text.str.lower()
    # Remove URLs
    memo.text = memo.text.str.replace(url_pattern, "")
    # Remove words in 'text_to_remove'
    memo.text = memo.text.str.replace(pat, '')
    # Remove any character that's not a letter or a number
    memo.text = memo.text.replace(r"[\W_]+", " ", regex=True)
    memo = memo.dropna()
    return memo



In [None]:
# getting csv and image data
path2img = os.path.join(home, "memotion_dataset_7k/images/")
path2data = os.path.join(home, "memotion_dataset_7k/labels.csv")
df = pd.read_csv(path2data, delimiter=',')
nRow, nCol = df.shape
print(f'There are {nRow} rows and {nCol} columns: \n {list(df.columns)}\n')
print('--------------', df["offensive"].unique())
# print(df.head)

for i in df["offensive"].unique():
    print(f"# of '{i}' memes \t: {len(df[df['offensive']==i])}")

columns = ["image_name", "text_corrected", 'offensive']
# hateful_offensive = df[df["offensive"]=="hateful_offensive"][columns]
hateful_offensive = df[df["offensive"].isin(df["offensive"].unique())][columns]
# print(hateful_offensive)
# very_offensive    = df[df["offensive"]=="very_offensive"][columns]
# not_offensive          = df[df["offensive"]=="not_offensive"][columns]

memo = df[columns]
memo.columns = ["id", "text", 'label']
print(memo)


In [None]:
# locate empty or mismatched files: 
img_dir = os.path.join(home, 'memotion_dataset_7k/images/')
emptyfile = []
for i in memo['id']: 
    try: 
        path = os.path.join(img_dir, i)
        img = Image.open(path)
        im = np.array(img).astype(np.float32)
    except: 
        emptyfile.append(i)
print(len(emptyfile))

In [23]:
# remove those lines in the pd file, and remove the 'slight' offensive category, as it might be confusing
print(memo.shape)
memo.drop(memo[memo['id'].isin(emptyfile)].index, inplace = True)
memo.drop(memo[memo['label'] == 'slight'].index, inplace = True)
print(memo.shape)

(4155, 3)
(4155, 3)


### 2. Re-assign label values, move those images to another folder

In [None]:
# then re-assign the label value: 
hatefullist = ['hateful_offensive', 'very_offensive']
# nonhatefullist = ['not_offensive', 'slight', 'very_offensive']
nonhatefullist = ['not_offensive']
memo.loc[memo['label'].isin(hatefullist), 'label'] = 1
memo.loc[memo['label'].isin(nonhatefullist), 'label'] = 0

# Add missing columns so it looks like Hateful Memes data
memo["img"] = ("img/" + memo["id"])
memo["img"] = memo["img"].str.lower()
# remove extension in "image_name" & get the number of the image & add 100k, cause the HM data ID's goes up to 99k
memo["id"] = memo["id"].str.split(".").str.get(0).str.split("_").str.get(1).astype(int) + 100_000
# Re-order columns
memo = memo[["id", "img", 'label', "text"]]

In [25]:
# Pre-process the text in memes
memo = preprocess_text(memo)
print(memo)

          id                 img label  \
0     100001     img/image_1.jpg     0   
1     100002    img/image_2.jpeg     0   
2     100003     img/image_3.jpg     0   
3     100004     img/image_4.png     1   
4     100005     img/image_5.png     1   
...      ...                 ...   ...   
6986  106987  img/image_6987.jpg     0   
6987  106988  img/image_6988.jpg     1   
6988  106989  img/image_6989.jpg     0   
6990  106991  img/image_6991.jpg     0   
6991  106992  img/image_6992.jpg     0   

                                                   text  
0     look there my friend lightyear now all sohalik...  
1     the best of 10 yearchallenge completed in less...  
2     sam thorne strippin follow follow saw everyone...  
3                   10 year challenge sweet dee edition  
4     10 year challenge with no filter 47 hilarious ...  
...                                                 ...  
6986  if you re going on and on and on about your pr...  
6987  tuesday is mardi gras wed



In [31]:
# copy these files into another folder: memotion_dataset_7k/images_selected/
import zipfile
flist = list(memo['img'].str.split('/').str.get(1))
# print(flist)

import os
os.chdir(os.path.join(home, 'memotion_dataset_7k/images/'))
with zipfile.ZipFile('memotion_selected.zip', 'w') as zipMe:        
    for file in flist:
        try: 
            zipMe.write(file, compress_type=zipfile.ZIP_DEFLATED)
        except: pass
!mv memotion_selected.zip ../images_selected/

In [None]:
os.chdir(os.path.join(home, 'memotion_dataset_7k/images_selected/'))
!unzip memotion_selected.zip
# !rm -rf memotion_selected.zip

In [33]:
!rm -rf memotion_selected.zip

In [34]:
# check number of images: 4138
!ls -F | wc -l

4138


### 3. generate jsonl file

In [None]:
print(memo)

In [38]:
data = memo.to_json(orient='records', lines=True)
with open(os.path.join(home, "annotations/label_memotion.jsonl"), "w", encoding='utf-8') as f:
    f.write(data)

# double check jsonl file and selected images

In [8]:
import pandas as pd
import os
import cv2
import numpy as np
from PIL import Image
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

home = '/home/chen_zhang06/HatefulMemes/'
memotion = pd.read_json(os.path.join(home, "annotations/label_memotion.jsonl"), lines=True)
print(memotion)

          id                 img  label  \
0     100001     img/image_1.jpg      0   
1     100002    img/image_2.jpeg      0   
2     100003     img/image_3.jpg      0   
3     100004     img/image_4.png      1   
4     100005     img/image_5.png      1   
...      ...                 ...    ...   
4146  106987  img/image_6987.jpg      0   
4147  106988  img/image_6988.jpg      1   
4148  106989  img/image_6989.jpg      0   
4149  106991  img/image_6991.jpg      0   
4150  106992  img/image_6992.jpg      0   

                                                   text  
0     look there my friend lightyear now all sohalik...  
1     the best of 10 yearchallenge completed in less...  
2     sam thorne strippin follow follow saw everyone...  
3                   10 year challenge sweet dee edition  
4     10 year challenge with no filter 47 hilarious ...  
...                                                 ...  
4146  if you re going on and on and on about your pr...  
4147  tuesday is ma

4151 rows in the memotion, but only 4138 files in the images_selected

In [12]:
fnames = memotion['img']
# print(fnames)
img_names = list(fnames.str.split('/').str.get(1))
# print(img_names)

In [7]:
from os import listdir
from os.path import isfile, join
mypath = os.path.join(home, "memotion_dataset_7k/images_selected/")
onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
print(len(onlyfiles))

4138


In [18]:
diff = [f for f in img_names if f not in onlyfiles]
print(diff, len(diff))

['image_3.jpg', 'image_284.png', 'image_534.jpg', 'image_762.jpg', 'image_763.jpg', 'image_1615.png', 'image_2743.jpg', 'image_4321.jpg', 'image_4631.jpg', 'image_5245.jpg', 'image_5371.jpg', 'image_5493.jpg', 'image_5635.jpg'] 13


In [19]:
# remove those lines in pd: 
memotion.drop(memotion[memotion['img'].str.split('/').str.get(1).isin(diff)].index, inplace = True)
print(memotion)

          id                 img  label  \
0     100001     img/image_1.jpg      0   
1     100002    img/image_2.jpeg      0   
3     100004     img/image_4.png      1   
4     100005     img/image_5.png      1   
5     100008     img/image_8.jpg      0   
...      ...                 ...    ...   
4146  106987  img/image_6987.jpg      0   
4147  106988  img/image_6988.jpg      1   
4148  106989  img/image_6989.jpg      0   
4149  106991  img/image_6991.jpg      0   
4150  106992  img/image_6992.jpg      0   

                                                   text  
0     look there my friend lightyear now all sohalik...  
1     the best of 10 yearchallenge completed in less...  
3                   10 year challenge sweet dee edition  
4     10 year challenge with no filter 47 hilarious ...  
5     10 year challenge emotional edition boredpanda...  
...                                                 ...  
4146  if you re going on and on and on about your pr...  
4147  tuesday is ma

In [29]:
# save to a new jsonl file: 

# data = memotion.to_json(orient='records', lines=True)
# with open(os.path.join(home, "annotations/memotion_selected.jsonl"), "w", encoding='utf-8') as f:
#     f.write(data)
    
# rename image as selected 1
memotion['img'] = memotion['img'].str.split('/').str.get(1)
# print(memotion['img'].str.split('/').str.get(1))
# print(memotion)
data = memotion.to_json(orient='records', lines=True)
with open(os.path.join(home, "annotations/memotion_selected1.jsonl"), "w", encoding='utf-8') as f:
    f.write(data)

In [21]:
# extract all features for those 4k images: 
import os
home = "/home/chen_zhang06/HatefulMemes"
os.chdir(os.path.join(home, "mmf/tools/scripts/features/"))
out_folder = os.path.join(home, "features_memo_4k/")

!python extract_features_vmb.py --config_file "https://dl.fbaipublicfiles.com/pythia/detectron_model/detectron_model_x152.yaml" \
                                --model_name "X-152" \
                                --output_folder $out_folder \
                                --image_dir "/home/chen_zhang06/HatefulMemes/memotion_dataset_7k/images_selected" \
                                --num_features 100 \
                                # --exclude_list "/content/exclude.txt"
                                # --feature_name "fc6" \
                                # --confidence_threshold 0. \

model and config file exists in directory: /home/chen_zhang06/HatefulMemes/mmf/tools/scripts/features
image directory: /home/chen_zhang06/HatefulMemes/memotion_dataset_7k/images_selected
	nonzero()
Consider using one of the following signatures instead:
	nonzero(*, bool as_tuple) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:766.)
  keep = ((ws >= min_size) & (hs >= min_size)).nonzero().squeeze(1)
Processed 200/4137
Processed 400/4137
Processed 600/4137
  "Palette images with Transparency expressed in bytes should be "
Processed 800/4137
Processed 1000/4137
Processed 1200/4137
Processed 1400/4137
Processed 1600/4137
Processed 1800/4137
Processed 2000/4137
Processed 2200/4137
Processed 2400/4137
Processed 2600/4137
Processed 2800/4137
Processed 3000/4137
Processed 3200/4137
Processed 3400/4137
Processed 3600/4137
Processed 3800/4137
Processed 4000/4137


In [None]:
# use the submission1 best model to predict:

os.chdir(home)
# Download the fine-tuned model
!wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1NOX2lJkbK7sKRsg4_y_KUcLamknowsu2' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1NOX2lJkbK7sKRsg4_y_KUcLamknowsu2" -O 'submission#1.zip' && rm -rf /tmp/cookies.txt
# unzip the model
!unzip -qq $home/submission#1.zip -d $home/submission#1
# remove the .zip after unzipping to free the disk
!rm -rf $home/submission#1.zip

In [None]:
# double check if all jsonl files and memotion features matched up .... 3616 is an issue

# from os import listdir
# from os.path import isfile, join
# mypath = os.path.join(home, "memotion_dataset_7k/images_selected/")
# onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
# print(len(onlyfiles))

home = '/home/chen_zhang06/HatefulMemes/'
memo = pd.read_json(os.path.join(home, "annotations/memotion_selected.jsonl"), lines=True)
print(memo)
# f_list = list(memotion['img'].str.split('/').str.get(1))
memo.drop(memo[memo['id'] == 103616].index, inplace = True)
print(memo) # 3616 removed...
# save again

data = memo.to_json(orient='records', lines=True)
with open(os.path.join(home, "annotations/memotion_selected1.jsonl"), "w", encoding='utf-8') as f:
    f.write(data)

In [None]:
"""
Uncomment it if needed
"""

# Validate the model on the dev_unseen data
os.chdir(home)
# where checkpoint is
ckpt_dir = os.path.join(home, "submission#1/best.ckpt")
feats_dir = os.path.join(home, "features/feats_hm")

!mmf_run config="projects/visual_bert/configs/hateful_memes/defaults.yaml"\
    model="visual_bert"\
    dataset=hateful_memes\
    run_type=val\
    checkpoint.resume_file=$ckpt_dir\
    checkpoint.reset.optimizer=True\
    dataset_config.hateful_memes.annotations.val[0]=hateful_memes/defaults/annotations/memotion_selected1.jsonl\
    dataset_config.hateful_memes.annotations.test[0]=hateful_memes/defaults/annotations/test_unseen.jsonl\
    dataset_config.hateful_memes.features.train[0]=$feats_dir\
    dataset_config.hateful_memes.features.val[0]=$feats_dir\
    dataset_config.hateful_memes.features.test[0]=$feats_dir\

2021-05-04T14:40:26 | mmf.trainers.callbacks.logistics: val/hateful_memes/cross_entropy: 1.9090, val/total_loss: 1.9090, val/hateful_memes/accuracy: 0.5857, val/hateful_memes/binary_f1: 0.2606, val/hateful_memes/roc_auc: 0.5084
This means that the memotion dataset is poorly labelled. 

Get test results for those images! 

In [95]:
from subprocess import call
import numpy as np
import os

# home = "/home/jupyter/HatefulMemes"
home = "/home/chen_zhang06/HatefulMemes"
os.chdir(home)

# models = [i for i in os.listdir("./majority_voting_models") if i.endswith(".ckpt")]

# models = ['best-' + str(i) + '.ckpt' for i in range(0, 27)]
# try only 11 models first! 
models = ['best-' + str(i) + '.ckpt' for i in range(0, 27)]
print(models)

print(f"[INFO] Getting predictions for {len(models)} models! This might take long..")
for model in models:
    model = "./majority_voting_models/" + model 
    feats_dir = os.path.join(home, "features/feats_hm")
    # Execute the bash script which gets predictions for 'test_unseen' data
    print(f"Running " + model)
    rc = call(f"./generate_submission.sh {model} {feats_dir}", shell=True)
print("finished!")

['best-0.ckpt', 'best-1.ckpt', 'best-2.ckpt', 'best-3.ckpt', 'best-4.ckpt', 'best-5.ckpt', 'best-6.ckpt', 'best-7.ckpt', 'best-8.ckpt', 'best-9.ckpt', 'best-10.ckpt', 'best-11.ckpt', 'best-12.ckpt', 'best-13.ckpt', 'best-14.ckpt', 'best-15.ckpt', 'best-16.ckpt', 'best-17.ckpt', 'best-18.ckpt', 'best-19.ckpt', 'best-20.ckpt', 'best-21.ckpt', 'best-22.ckpt', 'best-23.ckpt', 'best-24.ckpt', 'best-25.ckpt', 'best-26.ckpt']
[INFO] Getting predictions for 27 models! This might take long..
Running ./majority_voting_models/best-0.ckpt
Running ./majority_voting_models/best-1.ckpt
Running ./majority_voting_models/best-2.ckpt
Running ./majority_voting_models/best-3.ckpt
Running ./majority_voting_models/best-4.ckpt
Running ./majority_voting_models/best-5.ckpt
Running ./majority_voting_models/best-6.ckpt
Running ./majority_voting_models/best-7.ckpt
Running ./majority_voting_models/best-8.ckpt
Running ./majority_voting_models/best-9.ckpt
Running ./majority_voting_models/best-10.ckpt
Running ./majori

In [1]:
import numpy as np
import pandas as pd
import os

# home = "/home/jupyter/HatefulMemes"
home = "/home/chen_zhang06/HatefulMemes"
os.chdir(home)

# Store all the prediction folders
folders = [i for i in os.listdir("save/preds") if i.startswith("hateful_memes")]
# print(folders)
preds = pd.DataFrame()

try:
    for folder in folders:
        pred = [i for i in os.listdir(f"save/preds/{folder}/reports/") if i.endswith(".csv")]
#         print(pred)
        pred = pd.read_csv(f"save/preds/{folder}/reports/{pred[0]}")
        preds = pd.concat([preds, pred], axis=1)
except:
    pass

# print(preds)

# assert len(preds.columns) == 27*3

# Create 
submission = pred
np_df = np.asarray(preds)

for idx, row in enumerate(np_df[:,:]):
    probas = row[1::3]
    labels = row[2::3]

    if sum(labels) > 13:
        submission.loc[idx, 'label']=1
        submission.loc[idx, 'proba']=probas.max()    
    else:
        submission.loc[idx, 'label']=0
        submission.loc[idx, 'proba']=probas.min()

print(submission)
print(submission['label'].value_counts())

          id     proba  label
0     100001  0.000098      0
1     100002  0.000906      0
2     100004  0.000589      0
3     100005  0.000014      0
4     100008  0.000029      0
...      ...       ...    ...
4132  106987  0.000012      0
4133  106988  0.000185      0
4134  106989  0.000022      0
4135  106991  0.000011      0
4136  106992  0.000017      0

[4137 rows x 3 columns]
0    3365
1     772
Name: label, dtype: int64


In [3]:
# write to csv: 
submission.to_csv("save/preds/voting.csv", index = False)

# compare the prediciton from the best model (27 models, majority voting) wiht initial labeling

In [None]:
import pandas as pd
import os
import numpy as np
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

home = '/home/chen_zhang06/HatefulMemes/'
memotion = pd.read_json(os.path.join(home, "annotations/memotion_selected1.jsonl"), lines=True)
# print(memotion)

In [10]:
pred = pd.read_csv(os.path.join(home, "save/preds/voting.csv"))
pred.columns = ["pred_id", "proba", 'pred_label']
# print(pred)

      pred_id     proba  pred_label
0      100001  0.000098           0
1      100002  0.000906           0
2      100004  0.000589           0
3      100005  0.000014           0
4      100008  0.000029           0
...       ...       ...         ...
4132   106987  0.000012           0
4133   106988  0.000185           0
4134   106989  0.000022           0
4135   106991  0.000011           0
4136   106992  0.000017           0

[4137 rows x 3 columns]


In [None]:
result = pd.concat([memotion, pred], axis=1, join="inner")
# print(result)

In [None]:
# remove those lines that the model prediction does not match with orinigal label
result.drop(result[result['label'] != result['pred_label']].index, inplace = True)
# print(result)

In [20]:
columns = ['id', 'img', 'label', 'text']
output = result[columns]
print(output)
print(output['label'].value_counts())

          id                 img  label  \
0     100001     img/image_1.jpg      0   
1     100002    img/image_2.jpeg      0   
4     100008     img/image_8.jpg      0   
7     100013    img/image_13.png      0   
8     100014    img/image_14.png      0   
...      ...                 ...    ...   
4131  106986  img/image_6986.jpg      0   
4132  106987  img/image_6987.jpg      0   
4134  106989  img/image_6989.jpg      0   
4135  106991  img/image_6991.jpg      0   
4136  106992  img/image_6992.jpg      0   

                                                   text  
0     look there my friend lightyear now all sohalik...  
1     the best of 10 yearchallenge completed in less...  
4     10 year challenge emotional edition boredpanda...  
7     i did the facebook 10 year challenge and it wa...  
8     ifidownloada movie in jamaica memes in 2009 am...  
...                                                 ...  
4131  what oils are good por cutting that poul smell...  
4132  if you re goi

In [21]:
# select total 1000 images for training: 
# get the hateful ones: 
hateful_df = output[output['label'] == 1]
print(hateful_df)

          id                 img  label  \
16    100027   img/image_27.jpeg      1   
59    100089    img/image_89.jpg      1   
70    100105   img/image_105.png      1   
100   100153   img/image_153.png      1   
102   100158   img/image_158.jpg      1   
...      ...                 ...    ...   
4065  106880  img/image_6880.jpg      1   
4079  106900  img/image_6900.jpg      1   
4093  106923  img/image_6923.jpg      1   
4113  106955  img/image_6955.jpg      1   
4130  106985  img/image_6985.jpg      1   

                                                   text  
16    i wonder why who s that they call him hawkeye ...  
59         women get over here this instant hmm not bad  
70    to be continued baby godfather challange gone ...  
100   best of barney i only have one rule never scre...  
102   mr mime the barrier pokemon known to make wall...  
...                                                 ...  
4065  silly girl art base 3 6 dec thats not the kitc...  
4079  you ve been h

In [25]:
non_hateful_df = output[output['label'] == 0]
# print(non_hateful_df)
# the training dataset from hateful memes contains 5481 0s and 3019 1s, so I'll pick 424 0s and all 276 1s
# train = pd.read_json(os.path.join(home, "annotations/train.jsonl"), lines=True)
# print(train['label'].value_counts())
non_hateful_selected = non_hateful_df.sample(n = 424, random_state = 1)
# print(non_hateful_selected)

          id                 img  label  \
0     100001     img/image_1.jpg      0   
1     100002    img/image_2.jpeg      0   
4     100008     img/image_8.jpg      0   
7     100013    img/image_13.png      0   
8     100014    img/image_14.png      0   
...      ...                 ...    ...   
4131  106986  img/image_6986.jpg      0   
4132  106987  img/image_6987.jpg      0   
4134  106989  img/image_6989.jpg      0   
4135  106991  img/image_6991.jpg      0   
4136  106992  img/image_6992.jpg      0   

                                                   text  
0     look there my friend lightyear now all sohalik...  
1     the best of 10 yearchallenge completed in less...  
4     10 year challenge emotional edition boredpanda...  
7     i did the facebook 10 year challenge and it wa...  
8     ifidownloada movie in jamaica memes in 2009 am...  
...                                                 ...  
4131  what oils are good por cutting that poul smell...  
4132  if you re goi

In [28]:
# concat dfs
result = pd.concat([hateful_df, non_hateful_selected])
print(result)
results = result.sample(frac=1).reset_index(drop=True) # shuffle
print(results)

          id                 img  label  \
16    100027   img/image_27.jpeg      1   
59    100089    img/image_89.jpg      1   
70    100105   img/image_105.png      1   
100   100153   img/image_153.png      1   
102   100158   img/image_158.jpg      1   
...      ...                 ...    ...   
1982  103425  img/image_3425.jpg      0   
1465  102533  img/image_2533.jpg      0   
2501  104283  img/image_4283.jpg      0   
2588  104444  img/image_4444.jpg      0   
2845  104860  img/image_4860.png      0   

                                                   text  
16    i wonder why who s that they call him hawkeye ...  
59         women get over here this instant hmm not bad  
70    to be continued baby godfather challange gone ...  
100   best of barney i only have one rule never scre...  
102   mr mime the barrier pokemon known to make wall...  
...                                                 ...  
1982  how u feel when u drop a dank meme on the time...  
1465  i don t care 

In [30]:
# write the data to a jsonl file! 
data = results.to_json(orient='records', lines=True)
with open(os.path.join(home, "annotations/memotion_1k.jsonl"), "w", encoding='utf-8') as f:
    f.write(data)

In [35]:
# move all those images... 
flist = list(results['img'].str.split('/').str.get(1))
# print(flist)
# copy these files into another folder: memotion_dataset_7k/images_selected/
import zipfile
import os
os.chdir(os.path.join(home, 'memotion_dataset_7k/images_selected/'))
with zipfile.ZipFile('memotion_selected.zip', 'w') as zipMe:        
    for file in flist:
        zipMe.write(file, compress_type=zipfile.ZIP_DEFLATED)
!mv memotion_selected.zip ../images_1k/

In [36]:
# unzip the file in folder and remove the zip file in terminal

In [52]:
# move all those features! important
# os.chdir(os.path.join(home, "features_memo_4k"))
import shutil
flist1 = list(results['img'].str.split('/').str.get(1).str.split('.').str.get(0))
# print(flist1)
source = os.path.join(home, "features_memo_4k")
dest = os.path.join(home, "features_memo_1k")
for f in flist1: 
    fname = f + '.npy'
    shutil.copyfile(os.path.join(source, fname), os.path.join(dest, fname))
    fname = f + '_info.npy'
    shutil.copyfile(os.path.join(source, fname), os.path.join(dest, fname))

In [None]:
# all done! all 700 image features and the jsonl files are ready