In [None]:
from google.colab import drive
drive.mount('/content/gdrive/')
%cd gdrive/MyDrive/research/CTKG
%pwd

In [None]:
!wget http://wednesday.csail.mit.edu/temporal/release/train.tar
!tar -xf train.tar

In [None]:
# Clone the im2recipe-Pytorch repository into the current working directory.
# NOTE(review): no cell visible in this notebook imports anything from the
# clone — confirm it is still required.
!git clone https://github.com/torralba-lab/im2recipe-Pytorch.git

In [None]:
!pip install pyodbc

In [None]:
import os
import numpy as np

def get_all_files_in_directory(root_dir):
    """Recursively collect the paths of all files below ``root_dir``.

    The traversal is sorted, so repeated calls on an unchanged tree return the
    paths in the same order. This matters because other cells pair positions
    in this list with positions in arrays built from a separate call.

    Args:
        root_dir: Directory to walk.

    Returns:
        list[str]: ``os.path.join(dirpath, filename)`` for every file found.
        Files of a directory come before the files of its subdirectories.
        Empty when the directory holds no files or does not exist.
    """
    all_files = []
    for dirpath, dirnames, filenames in os.walk(root_dir):
        # os.walk reports entries in arbitrary, filesystem-dependent order.
        # Sorting dirnames in place also fixes the order subdirectories are visited in.
        dirnames.sort()
        for filename in sorted(filenames):
            all_files.append(os.path.join(dirpath, filename))
    return all_files

# NOTE(review): the download cell fetches train.tar while this scans "val" —
# confirm that directory exists in the working folder.
root_dir = "val"
files = get_all_files_in_directory(root_dir)

# Report the total and a short sample instead of printing every path, which
# floods the cell output for large directories.
print(f"{len(files)} files found under {root_dir!r}")
for file in files[:10]:
    print(file)


In [None]:
# Number of file paths collected in `files`.
len(files)

In [None]:
import torch
import torchvision
import torchvision.transforms as transforms



In [None]:
from PIL import Image
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms

class CustomImageDataset(Dataset):
    """Unlabelled image dataset built from every file below a directory.

    Each item is the image at the corresponding path, converted to RGB and
    passed through the optional transform. No label is returned.
    """

    def __init__(self, root_dir, transform=None):
        """Collect the file paths to serve.

        Args:
            root_dir: Directory searched recursively with
                ``get_all_files_in_directory``.
            transform: Optional callable applied to each PIL image before it
                is returned.
        """
        # Every file is kept regardless of extension; a non-image file would
        # make Image.open raise when that item is requested.
        self.image_files = get_all_files_in_directory(root_dir)
        self.transform = transform

    def __len__(self):
        """Return the number of collected file paths."""
        return len(self.image_files)

    def __getitem__(self, idx):
        """Load the image at position ``idx`` and return it (transformed if configured)."""
        img_path = self.image_files[idx]
        # convert() returns a new, fully loaded image, so the file can be
        # closed as soon as the with-block ends.
        with Image.open(img_path) as img:
            image = img.convert("RGB")

        if self.transform is not None:
            image = self.transform(image)

        return image



In [None]:
# Preprocessing: fixed 224x224 resize, conversion to a tensor, then per-channel
# (x - 0.5) / 0.5 normalisation.
preprocessing_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
]
transform = transforms.Compose(preprocessing_steps)

dataset = CustomImageDataset(root_dir="val", transform=transform)
# shuffle=True: batches arrive in random order, so a position in anything
# collected from this loader does not correspond to a position in
# dataset.image_files.
dataloader = DataLoader(dataset=dataset, batch_size=32, shuffle=True)

In [None]:
import matplotlib.pyplot as plt
def imshow(img):
    """Display a channels-first image tensor with matplotlib.

    Args:
        img: Tensor indexed (channel, height, width). Values are mapped back
            with ``x / 2 + 0.5`` before plotting, which undoes a
            mean=0.5 / std=0.5 normalisation.
    """
    unnormalized = img / 2 + 0.5
    # matplotlib expects (height, width, channel).
    hwc_array = unnormalized.numpy().transpose((1, 2, 0))
    plt.imshow(hwc_array)
    plt.show()


In [None]:
# Gather the first 16 images the loader yields, for a visual sanity check.
dataiter = iter(dataloader)
images_list = []

# A for-loop ends cleanly when the loader is exhausted; calling next() in a
# while-loop raised StopIteration if the dataset held fewer than 16 images.
for batch_images in dataiter:
    # Extending with a batch tensor appends one tensor per image.
    images_list.extend(batch_images)
    if len(images_list) >= 16:
        break

images_to_display = images_list[:16]

In [None]:

# Stack the selected images into one batch tensor and render them as a single grid.
images_tensor = torch.stack(images_to_display, dim=0)
image_grid = torchvision.utils.make_grid(images_tensor)
imshow(image_grid)

In [None]:
import tqdm
import torchvision.models as models

MAX_BATCHES = 99  # upper bound on the number of batches that get embedded

# Pretrained ResNet-101 with its last child module removed (the classification
# layer in torchvision's ResNet), so the network outputs pooled features.
model = models.resnet101(pretrained=True)
model = torch.nn.Sequential(*(list(model.children())[:-1]))
# Inference mode: batch-norm layers use their stored running statistics rather
# than per-batch statistics, so an image's features do not depend on which
# other images share its batch.
model.eval()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Iterate in dataset order (no shuffling) so that feature row i belongs to
# dataset.image_files[i]; later cells use feature row numbers to index `files`.
feature_loader = DataLoader(
    dataloader.dataset, batch_size=dataloader.batch_size, shuffle=False
)

features = []
with torch.no_grad():
    progress = tqdm.tqdm(feature_loader, total=min(MAX_BATCHES, len(feature_loader)))
    for images in progress:
        output = model(images.to(device))

        # One numpy array per batch; stacking happens in a later cell.
        features.append(output.cpu().numpy())
        if len(features) == MAX_BATCHES:
            break



In [None]:
# Length of `features`; in a top-to-bottom run this is the number of batches
# collected by the extraction loop.
len(features)

In [None]:
# Join the per-batch outputs along the sample axis. Unlike np.array(), this
# also works when the last batch holds fewer images than the others.
# The isinstance guard keeps the cell safe to re-run once `features` is an array.
if isinstance(features, list):
    features = np.concatenate(features, axis=0)

In [None]:
# Inspect the dimensions of the feature array.
features.shape

In [None]:
# Write the feature array to features_wo_fc.npy in the current working
# directory, in whatever shape it has at this point (the flattening cell runs later).
np.save('features_wo_fc.npy', features)

In [None]:
# Flatten to one row per image without hard-coding the batch count and size.
# A 5-D array is (n_batches, batch_size, ...), so the two leading axes are
# merged; otherwise the leading axis is already the sample axis. Re-running on
# an array that is already 2-D leaves it unchanged.
if features.ndim == 5:
    features = features.reshape((features.shape[0] * features.shape[1], -1))
else:
    features = features.reshape((features.shape[0], -1))

In [None]:
# Replace the in-memory array with the contents of features.npy.
# NOTE(review): no visible cell writes features.npy (the save cell writes
# features_wo_fc.npy), so this relies on a file left by an earlier session and
# discards the features computed above — confirm this is intended and that the
# rows of that file are in the same order as `files`.
features = np.load('features.npy')


In [None]:
from sklearn.cluster import KMeans

# Partition the feature vectors into 16 clusters with a fixed seed.
kmeans = KMeans(n_clusters=16, random_state=0)
kmeans.fit(features)
# Per-sample cluster assignment and per-cluster centre, used by the cells below.
labels, centroids = kmeans.labels_, kmeans.cluster_centers_



In [None]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Project the features to 2-D for visual inspection. t-SNE draws on a random
# number generator, so without a fixed seed the layout can differ between runs.
reduced_data_tsne = TSNE(n_components=2, random_state=0).fit_transform(features)

fig, ax = plt.subplots(figsize=(6, 6))

# One point per image, coloured by its k-means cluster label.
ax.scatter(reduced_data_tsne[:, 0], reduced_data_tsne[:, 1], c=labels)
ax.set_title('t-SNE reduced data')

plt.show()

In [None]:
from scipy.spatial import distance

# For every cluster, store the row index into `features` of the member nearest
# to the centroid. The plotting cell uses these values to index `files`.
closest_images_to_centroids = []

for i, centroid in enumerate(centroids):
    member_indices = np.where(labels == i)[0]
    # Euclidean distance from the centroid to each member of this cluster.
    distances = distance.cdist(centroid[np.newaxis, :], features[member_indices])[0]
    # argmin is a position within this cluster's members only; map it back to
    # the global row index, otherwise it would point at an unrelated image.
    closest_image_idx = int(member_indices[np.argmin(distances)])
    closest_images_to_centroids.append(closest_image_idx)



In [None]:
from PIL import Image

# One tile per cluster: the image stored for that cluster, titled with the cluster id.
fig, axs = plt.subplots(4, 4, figsize=(4, 4))
axes = axs.ravel()

# zip stops at the shorter sequence, so a mismatch between the number of tiles
# and the number of stored indices cannot raise IndexError.
for idx, (ax, file_idx) in enumerate(zip(axes, closest_images_to_centroids)):
    # convert() returns a loaded copy, so the file can be closed right away.
    with Image.open(files[file_idx]) as img:
        ax.imshow(img.convert("RGB"))
    ax.set_title(str(idx))

# Hide the axes on every tile, including any left empty.
for ax in axes:
    ax.axis('off')

plt.tight_layout()
plt.show()




In [None]:
from sklearn.cluster import KMeans
import numpy as np


desired_cluster = 0
centroid_of_desired_cluster = kmeans.cluster_centers_[desired_cluster]

# Global row indices of the cluster's members and their distance to its centroid.
indices_of_desired_cluster = np.where(kmeans.labels_ == desired_cluster)[0]
distances = np.linalg.norm(features[indices_of_desired_cluster] - centroid_of_desired_cluster, axis=1)

# The 16 nearest members. The slice starts at 0 because the centroid is not
# itself a data point, so there is no self-match to skip; a [1:17] slice would
# drop the single closest member.
closest_16_indices = indices_of_desired_cluster[np.argsort(distances)[:16]]




In [None]:
from PIL import Image

# 4x4 grid of the images selected in `closest_16_indices`.
fig, axs = plt.subplots(4, 4, figsize=(4, 4))
axes = axs.ravel()

# zip stops at the shorter sequence, so a cluster with fewer than 16 members
# leaves the remaining tiles empty instead of raising IndexError.
for ax, file_idx in zip(axes, closest_16_indices):
    # convert() returns a loaded copy, so the file can be closed right away.
    with Image.open(files[file_idx]) as img:
        ax.imshow(img.convert("RGB"))

# Hide the axes on every tile, including any left empty.
for ax in axes:
    ax.axis('off')

plt.tight_layout()
plt.show()




In [None]:
from sklearn.cluster import KMeans
import numpy as np


desired_cluster = 3
centroid_of_desired_cluster = kmeans.cluster_centers_[desired_cluster]

# Global row indices of the cluster's members and their distance to its centroid.
indices_of_desired_cluster = np.where(kmeans.labels_ == desired_cluster)[0]
distances = np.linalg.norm(features[indices_of_desired_cluster] - centroid_of_desired_cluster, axis=1)

# The 16 nearest members. The slice starts at 0 because the centroid is not
# itself a data point, so there is no self-match to skip; a [1:17] slice would
# drop the single closest member.
closest_16_indices = indices_of_desired_cluster[np.argsort(distances)[:16]]




In [None]:
from PIL import Image

# 4x4 grid of the images selected in `closest_16_indices`.
fig, axs = plt.subplots(4, 4, figsize=(4, 4))
axes = axs.ravel()

# zip stops at the shorter sequence, so a cluster with fewer than 16 members
# leaves the remaining tiles empty instead of raising IndexError.
for ax, file_idx in zip(axes, closest_16_indices):
    # convert() returns a loaded copy, so the file can be closed right away.
    with Image.open(files[file_idx]) as img:
        ax.imshow(img.convert("RGB"))

# Hide the axes on every tile, including any left empty.
for ax in axes:
    ax.axis('off')

plt.tight_layout()
plt.show()




In [None]:
from sklearn.cluster import KMeans
import numpy as np


desired_cluster = 8
centroid_of_desired_cluster = kmeans.cluster_centers_[desired_cluster]

# Global row indices of the cluster's members and their distance to its centroid.
indices_of_desired_cluster = np.where(kmeans.labels_ == desired_cluster)[0]
distances = np.linalg.norm(features[indices_of_desired_cluster] - centroid_of_desired_cluster, axis=1)

# The 16 nearest members. The slice starts at 0 because the centroid is not
# itself a data point, so there is no self-match to skip; a [1:17] slice would
# drop the single closest member.
closest_16_indices = indices_of_desired_cluster[np.argsort(distances)[:16]]




In [None]:
from PIL import Image

# 4x4 grid of the images selected in `closest_16_indices`.
fig, axs = plt.subplots(4, 4, figsize=(4, 4))
axes = axs.ravel()

# zip stops at the shorter sequence, so a cluster with fewer than 16 members
# leaves the remaining tiles empty instead of raising IndexError.
for ax, file_idx in zip(axes, closest_16_indices):
    # convert() returns a loaded copy, so the file can be closed right away.
    with Image.open(files[file_idx]) as img:
        ax.imshow(img.convert("RGB"))

# Hide the axes on every tile, including any left empty.
for ax in axes:
    ax.axis('off')

plt.tight_layout()
plt.show()


