In [1]:
#Imports

import numpy as np
import scipy.io as sio

from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
import matplotlib.pyplot as plt
import os

import torch
from torchvision import transforms
from PIL import Image

import torch.nn as nn

from transformers import SamModel, SamProcessor, CLIPVisionModel

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Extracting image-level features using DINOv2 models.
#
# ToTensor() yields a float tensor in [0, 1]. The pretrained DINOv2 backbones
# expect inputs standardised with the ImageNet channel statistics, so the
# normalisation step is applied here rather than feeding raw [0, 1] pixels.
preprocess = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
])

def make_features(img):
    """Compute an image-level DINOv2 embedding for a single image.

    The image is cropped from its top-left corner so that height and width
    are multiples of 14 (the patch size of the ``dinov2_vit*14`` backbones),
    converted to RGB, passed through the module-level ``preprocess``
    transform and fed to the module-level model ``dinov2_vitb14``.

    Parameters
    ----------
    img : numpy.ndarray
        Image array of shape (H, W) or (H, W, C), in a dtype accepted by
        ``PIL.Image.fromarray``.

    Returns
    -------
    numpy.ndarray
        The model output for a batch containing this one image.

    Raises
    ------
    ValueError
        If either spatial dimension is smaller than one 14-pixel patch.
    """
    patch_size = 14
    height, width = img.shape[:2]

    # Largest multiples of the patch size that fit inside the image.
    cropped_h = (height // patch_size) * patch_size
    cropped_w = (width // patch_size) * patch_size
    if cropped_h == 0 or cropped_w == 0:
        raise ValueError(
            'Image of size {}x{} is smaller than one {}x{} patch'.format(
                height, width, patch_size, patch_size))

    # Slicing only the two leading axes works for grayscale (H, W) arrays as
    # well as for (H, W, C) arrays.
    img_new = img[:cropped_h, :cropped_w]

    # Drops an alpha channel (RGBA -> RGB) and expands grayscale to 3 channels.
    img_rgb = Image.fromarray(img_new).convert('RGB')

    # Tensor of shape (1, 3, H, W): the model expects a batch dimension.
    img_t = preprocess(img_rgb).unsqueeze(0)

    # Pure inference: skip building the autograd graph.
    # For patch-level tokens use
    # dinov2_vitb14.forward_features(img_t)['x_norm_patchtokens'] instead.
    with torch.no_grad():
        features = dinov2_vitb14(img_t)

    return features.numpy()


# Embed every image with each DINOv2 backbone and save one .npy file per
# (image, backbone) pair.

image_path = 'ALL/ImagesToKeep'
feature_path = 'ALL/ImagesToKeep_Features'

# np.save does not create missing directories.
os.makedirs(feature_path, exist_ok=True)

# Skip hidden files (names starting with '.'); sort so the processing order
# does not depend on the order os.listdir happens to return.
desired_files = sorted(file for file in os.listdir(image_path) if not file.startswith('.'))

for ver in ['b','l','g']: #'s',

    # make_features() reads the model from this global name, whatever backbone
    # size is currently loaded.
    dinov2_vitb14 = torch.hub.load('facebookresearch/dinov2', 'dinov2_vit{}14'.format(ver))
    dinov2_vitb14.eval()  # inference only

    for count, filename in enumerate(desired_files):
        # splitext handles extensions of any length (e.g. '.jpeg'), unlike
        # stripping a fixed number of characters.
        f = os.path.splitext(filename)[0]

        # The context manager closes the file handle once the pixels have
        # been copied into the array.
        with Image.open(os.path.join(image_path, filename)) as pil_img:
            img = np.array(pil_img)

        img_features = make_features(img)

        new_filename = os.path.join(feature_path, f + '_dino_' + ver + '.npy')
        np.save(new_filename, img_features)
        print (count)

    print('Done with {}.npy'.format(ver))

Using cache found in /Users/whishei/.cache/torch/hub/facebookresearch_dinov2_main


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64


In [5]:
# Extracting image-level features using CLIP models.
#
# The CLIP processor is not used in this notebook, so the normalisation it
# would apply has to be done here: after ToTensor() scales pixels to [0, 1],
# standardise with the OpenAI CLIP channel mean / std that the pretrained
# vision towers expect.
preprocess_CLIP = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=(0.48145466, 0.4578275, 0.40821073),
        std=(0.26862954, 0.26130258, 0.27577711),
    ),
])

def make_clip_features(img,ver):
    """Compute an image-level CLIP vision embedding for a single image.

    The image is cropped from its top-left corner to the largest square whose
    side is a multiple of the patch size encoded in ``ver``, converted to RGB,
    passed through the module-level ``preprocess_CLIP`` transform and fed to
    the module-level model ``clip_model``.

    Parameters
    ----------
    img : numpy.ndarray
        Image array of shape (H, W) or (H, W, C), in a dtype accepted by
        ``PIL.Image.fromarray``.
    ver : str
        Model tag such as ``'b16'``, ``'b32'`` or ``'l14'``; characters 1-2
        are parsed as the integer patch size.

    Returns
    -------
    numpy.ndarray
        ``pooler_output`` of the model with singleton dimensions removed.

    Raises
    ------
    ValueError
        If the shorter image side is smaller than one patch.
    """
    patch_size = int(ver[1:3])

    # Side of the largest square that fits in the image, rounded down to a
    # multiple of the patch size. (Uses the builtin min instead of shadowing it.)
    shorter_side = min(img.shape[0], img.shape[1])
    new_shape = (shorter_side // patch_size) * patch_size
    if new_shape == 0:
        raise ValueError(
            'Image of size {}x{} is smaller than one {}x{} patch'.format(
                img.shape[0], img.shape[1], patch_size, patch_size))

    # Slicing only the two leading axes also works for grayscale (H, W) input.
    img_new = img[:new_shape, :new_shape]

    img_rgb = Image.fromarray(img_new).convert("RGB")

    # Tensor of shape (1, 3, 224, 224): the model expects a batch dimension.
    img_t = preprocess_CLIP(img_rgb).unsqueeze(0)

    # Pure inference: skip building the autograd graph.
    with torch.no_grad():
        outputs = clip_model(img_t)

    features = outputs.pooler_output.numpy()

    return np.squeeze(features)

In [8]:
# Embed every image with each CLIP vision tower and save one .npy file per
# (image, model) pair.

image_path = 'ALL/ImagesToKeep'
feature_path = 'ALL/ImagesToKeep_Features'

# np.save does not create missing directories.
os.makedirs(feature_path, exist_ok=True)

# Skip hidden files (names starting with '.'); sort so the processing order
# does not depend on the order os.listdir happens to return.
desired_files = sorted(file for file in os.listdir(image_path) if not file.startswith('.'))

for ver in ['b16','b32','l14']:

    # 'b..' tags select the base checkpoint, anything else the large one;
    # the remainder of the tag is the patch size used in the checkpoint name.
    model_size = 'base' if ver[0] == 'b' else 'large'
    # make_clip_features() reads the model from this global name.
    clip_model = CLIPVisionModel.from_pretrained(
        'openai/clip-vit-{}-patch{}'.format(model_size, ver[1:]))

    for count, filename in enumerate(desired_files):
        # splitext handles extensions of any length (e.g. '.jpeg'), unlike
        # stripping a fixed number of characters.
        f = os.path.splitext(filename)[0]

        # The context manager closes the file handle once the pixels have
        # been copied into the array.
        with Image.open(os.path.join(image_path, filename)) as pil_img:
            img = np.array(pil_img)

        img_features = make_clip_features(img, ver)

        new_filename = os.path.join(feature_path, f + '_clip_' + ver + '.npy')
        np.save(new_filename, img_features)
        print (count)

    print('Done with {}.npy'.format(ver))

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
Done with b16.npy
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
Done with 

In [9]:
# Load the SAM ViT-Huge checkpoint and its processor from the Hugging Face hub.

# Prefer the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

processor = SamProcessor.from_pretrained("facebook/sam-vit-huge")

# The model weights are placed on the selected device.
model = SamModel.from_pretrained("facebook/sam-vit-huge")
model = model.to(device)

In [13]:
# Extracting image-level features using SAM models.
#
# The images are fed to the model directly instead of going through the SAM
# processor, so the pixel normalisation the processor would perform is done
# here: after ToTensor() scales pixels to [0, 1], standardise with the
# ImageNet channel mean / std.
preprocess_SAM = transforms.Compose([
    transforms.Resize((1024, 1024)),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
])


def make_sam_features(img):
    """Compute an image-level SAM embedding for a single image.

    The image is cropped from its top-left corner to a square, converted to
    RGB, passed through the module-level ``preprocess_SAM`` transform and
    encoded with ``model.get_image_embeddings``. The spatial embedding map is
    then reduced to one vector by global average pooling.

    Parameters
    ----------
    img : numpy.ndarray
        Image array of shape (H, W) or (H, W, C), in a dtype accepted by
        ``PIL.Image.fromarray``.

    Returns
    -------
    numpy.ndarray
        Globally average-pooled image embedding with singleton dimensions
        removed.

    Raises
    ------
    ValueError
        If the shorter image side is smaller than 14 pixels.
    """
    # NOTE(review): the square side is rounded down to a multiple of 14 and the
    # crop is then resized to 1024x1024 anyway; the 14 looks like a carry-over
    # from the DINOv2 pipeline. Kept as-is so the crop is unchanged.
    crop_multiple = 14

    # Uses the builtin min instead of shadowing it with a local variable.
    shorter_side = min(img.shape[0], img.shape[1])
    new_shape = (shorter_side // crop_multiple) * crop_multiple
    if new_shape == 0:
        raise ValueError(
            'Image of size {}x{} is smaller than {} pixels on its shorter side'.format(
                img.shape[0], img.shape[1], crop_multiple))

    # Slicing only the two leading axes also works for grayscale (H, W) input.
    img_new = img[:new_shape, :new_shape]

    img_rgb = Image.fromarray(img_new).convert("RGB")

    # Add the batch dimension and move the input to wherever the model lives;
    # the model is placed on `device` when it is loaded, so a CPU tensor would
    # raise a device-mismatch error on a CUDA machine.
    img_t = preprocess_SAM(img_rgb).unsqueeze(0).to(model.device)

    image_embeddings = model.get_image_embeddings(img_t)

    # Global average pooling over the two spatial axes.
    gap = nn.AdaptiveAvgPool2d((1, 1))
    pooled_features = gap(image_embeddings)

    # .cpu() is required before .numpy() when the model runs on the GPU.
    new_features = pooled_features.detach().cpu().numpy()

    return np.squeeze(new_features)
    

In [14]:
# Embed every image with SAM and save one .npy file per image.

image_path = 'ALL/ImagesToKeep'
feature_path = 'ALL/ImagesToKeep_Features'

# np.save does not create missing directories.
os.makedirs(feature_path, exist_ok=True)

# Skip hidden files (names starting with '.'); sort so the processing order
# does not depend on the order os.listdir happens to return.
desired_files = sorted(file for file in os.listdir(image_path) if not file.startswith('.'))

for count, filename in enumerate(desired_files):
    # splitext handles extensions of any length (e.g. '.jpeg'), unlike
    # stripping a fixed number of characters.
    f = os.path.splitext(filename)[0]

    # The context manager closes the file handle once the pixels have been
    # copied into the array.
    with Image.open(os.path.join(image_path, filename)) as pil_img:
        img = np.array(pil_img)

    img_features = make_sam_features(img)

    new_filename = os.path.join(feature_path, f + '_sam.npy')
    np.save(new_filename, img_features)
    print (count)

print('Done with sam')

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
Done with sam
