In [2]:
import torch 
from torch import nn
from torch.optim import Adam
from torch.nn import CrossEntropyLoss, Linear, ReLU, Sequential
import cv2
from torch.utils.data import Dataset, DataLoader, ConcatDataset
import pandas as pd
import numpy as np
import os
from torchvision import transforms
import matplotlib.pyplot as plt
import pickle

In [4]:
# Root folder of the dataset; string concatenation below relies on the trailing slash.
# When running on Kaggle use "/kaggle/input/unibuc-ml-202325/" instead.
DATA_PATH = "../data/"

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Tunable training settings, gathered in one place.
hyperparameters = dict(
    batch_size=64,
    learning_rate=0.001,
    epochs=1,
)

In [3]:
class CustomImageDataset(Dataset):
    """Image dataset backed by a directory of images and a CSV index.

    The CSV is read with pandas; column 0 is used as the image file name
    (joined onto ``img_dir``) and column 1 is returned as the label.
    """

    def __init__(self, img_dir, csv_file, transform=None):
        """Read the CSV index and remember where the images live.

        Args:
            img_dir: Directory that the file names from the CSV are joined onto.
            csv_file: Path of the CSV file listing file names and labels.
            transform: Optional callable applied to each loaded RGB image.
        """
        self.img_dir = img_dir
        self.transform = transform
        self.img_labels = pd.read_csv(csv_file)

    def __len__(self):
        """Return the number of rows in the CSV index."""
        return len(self.img_labels)

    def __getitem__(self, index):
        """Load the image and label stored at row ``index`` of the CSV index.

        Returns:
            A tuple ``(image, label)``; ``image`` is the RGB image, passed
            through ``transform`` when one was given.

        Raises:
            FileNotFoundError: If the image file cannot be read.
        """
        img_path = os.path.join(self.img_dir, self.img_labels.iloc[index, 0])
        image = cv2.imread(img_path)
        # cv2.imread signals a missing/unreadable file by returning None rather
        # than raising; fail here with the offending path instead of letting
        # cvtColor crash with an opaque OpenCV assertion.
        if image is None:
            raise FileNotFoundError(f"Could not read image: {img_path}")
        # OpenCV loads images in BGR channel order; convert to RGB.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        label = self.img_labels.iloc[index, 1]
        if self.transform:
            image = self.transform(image)
        return (image, label)

In [4]:
# Preprocessing applied to every image: convert to a PIL image, resize to
# 64x64, then convert to a tensor.
original_transform = transforms.Compose(
    [
        transforms.ToPILImage(),
        transforms.Resize((64, 64)),
        transforms.ToTensor(),
    ]
)

# Training split: images under train_images, index in train.csv.
train_dataset = CustomImageDataset(
    img_dir=f"{DATA_PATH}train_images",
    csv_file=f"{DATA_PATH}train.csv",
    transform=original_transform,
)

# Validation split: images under val_images, index in val.csv.
val_dataset = CustomImageDataset(
    img_dir=f"{DATA_PATH}val_images",
    csv_file=f"{DATA_PATH}val.csv",
    transform=original_transform,
)

# The loaders are left disabled; the cells visible below only inspect the label tables.
# train_loader = DataLoader(train_dataset, batch_size=hyperparameters['batch_size'], shuffle=True)
# val_loader = DataLoader(val_dataset, batch_size=hyperparameters['batch_size'], shuffle=True)

In [42]:
def get_image_id(dataset, index):
    """Return the first-column value (the image file name) of row ``index``.

    ``dataset`` must expose a pandas DataFrame as ``dataset.img_labels``.
    """
    file_name_column = dataset.img_labels.iloc[:, 0]
    return file_name_column.iloc[index]


def get_image_label(dataset, index):
    """Return the second-column value (the label) of row ``index``.

    ``dataset`` must expose a pandas DataFrame as ``dataset.img_labels``.
    """
    label_column = dataset.img_labels.iloc[:, 1]
    return label_column.iloc[index]

In [43]:
# Group the training image file names by label: label -> [file names],
# keeping the row order of the CSV within each label.
train_classes = {}

for row in range(len(train_dataset)):
    image_label = get_image_label(train_dataset, row)
    # setdefault creates the list the first time a label is encountered.
    train_classes.setdefault(image_label, []).append(get_image_id(train_dataset, row))


In [44]:
def print_frequencies(classes):
    """Print, for each label, the distinct image-id suffixes found in that class.

    For every file name, the text before the first "." is parsed as an
    integer and reduced modulo 100; the set of remainders is printed under
    the label.

    Args:
        classes: Mapping from label to a list of image file names whose
            leading component is an integer (e.g. ``"012345.png"``).

    Raises:
        ValueError: If a file name does not start with an integer.
    """
    # Walk the labels that are actually present (in ascending order) instead
    # of a hard-coded range(0, 96), which raised KeyError for a missing label.
    for label in sorted(classes):
        print(f"{label}: ")
        suffixes = {int(image.split(".")[0]) % 100 for image in classes[label]}
        print(suffixes)

In [45]:
# Group the validation image file names by label: label -> [file names],
# keeping the row order of the CSV within each label.
validation_classes = {}

for row in range(len(val_dataset)):
    image_label = get_image_label(val_dataset, row)
    # setdefault creates the list the first time a label is encountered.
    validation_classes.setdefault(image_label, []).append(get_image_id(val_dataset, row))


In [76]:
# For every training label, print the set of file-id suffixes (numeric id % 100) seen for it.
print_frequencies(train_classes)


0: 
{0}
1: 
{1}
2: 
{2}
3: 
{3}
4: 
{4}
5: 
{5}
6: 
{6}
7: 
{7}
8: 
{8}
9: 
{9}
10: 
{10}
11: 
{11}
12: 
{12}
13: 
{13}
14: 
{14}
15: 
{15}
16: 
{16}
17: 
{17}
18: 
{18}
19: 
{19}
20: 
{20}
21: 
{21}
22: 
{22}
23: 
{23}
24: 
{24}
25: 
{25}
26: 
{26}
27: 
{27, 55, 87}
28: 
{28}
29: 
{29}
30: 
{30}
31: 
{31}
32: 
{32}
33: 
{33}
34: 
{34}
35: 
{35}
36: 
{36}
37: 
{37}
38: 
{53, 38}
39: 
{39}
40: 
{40}
41: 
{41}
42: 
{42}
43: 
{43}
44: 
{44}
45: 
{45}
46: 
{46}
47: 
{47}
48: 
{48}
49: 
{49}
50: 
{50}
51: 
{51}
52: 
{52}
53: 
{54}
54: 
{56}
55: 
{57}
56: 
{58}
57: 
{59}
58: 
{60}
59: 
{61}
60: 
{62}
61: 
{63}
62: 
{64}
63: 
{65}
64: 
{66}
65: 
{67}
66: 
{68}
67: 
{69}
68: 
{70}
69: 
{71}
70: 
{72}
71: 
{73}
72: 
{74}
73: 
{75}
74: 
{76}
75: 
{77}
76: 
{78}
77: 
{79}
78: 
{80}
79: 
{81}
80: 
{82}
81: 
{89, 83}
82: 
{84}
83: 
{85}
84: 
{86}
85: 
{88}
86: 
{90}
87: 
{91}
88: 
{92}
89: 
{93}
90: 
{94}
91: 
{95}
92: 
{96}
93: 
{97}
94: 
{98}
95: 
{99}


In [47]:
# For every validation label, print the set of file-id suffixes (numeric id % 100) seen for it.
print_frequencies(validation_classes)

0: 
{0}
1: 
{1}
2: 
{2}
3: 
{3}
4: 
{4}
5: 
{5}
6: 
{6}
7: 
{7}
8: 
{8}
9: 
{9}
10: 
{10}
11: 
{11}
12: 
{12}
13: 
{13}
14: 
{14}
15: 
{15}
16: 
{16}
17: 
{17}
18: 
{18}
19: 
{19}
20: 
{20}
21: 
{21}
22: 
{22}
23: 
{23}
24: 
{24}
25: 
{25}
26: 
{26}
27: 
{27, 87, 55}
28: 
{28}
29: 
{29}
30: 
{30}
31: 
{31}
32: 
{32}
33: 
{33}
34: 
{34}
35: 
{35}
36: 
{36}
37: 
{37}
38: 
{53, 38}
39: 
{39}
40: 
{40}
41: 
{41}
42: 
{42}
43: 
{43}
44: 
{44}
45: 
{45}
46: 
{46}
47: 
{47}
48: 
{48}
49: 
{49}
50: 
{50}
51: 
{51}
52: 
{52}
53: 
{54}
54: 
{56}
55: 
{57}
56: 
{58}
57: 
{59}
58: 
{60}
59: 
{61}
60: 
{62}
61: 
{63}
62: 
{64}
63: 
{65}
64: 
{66}
65: 
{67}
66: 
{68}
67: 
{69}
68: 
{70}
69: 
{71}
70: 
{72}
71: 
{73}
72: 
{74}
73: 
{75}
74: 
{76}
75: 
{77}
76: 
{78}
77: 
{79}
78: 
{80}
79: 
{81}
80: 
{82}
81: 
{89, 83}
82: 
{84}
83: 
{85}
84: 
{86}
85: 
{88}
86: 
{90}
87: 
{91}
88: 
{92}
89: 
{93}
90: 
{94}
91: 
{95}
92: 
{96}
93: 
{97}
94: 
{98}
95: 
{99}


In [85]:
# tester[s]: label of the training images whose numeric id satisfies id % 100 == s
#            (entries for suffixes that never occur stay 0).
# cnt[s]:    number of distinct labels in which suffix s occurs; a value of 1
#            means the suffix identifies a single label.
tester = [0] * 100
cnt = [0] * 100

for label in range(96):
    suffixes = {int(name.split(".")[0]) % 100 for name in train_classes[label]}
    for suffix in suffixes:
        tester[suffix] = label
        cnt[suffix] += 1
    

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [54]:
import pickle

# Persist the suffix -> label table so it can be reloaded without rebuilding it.
# Despite the .txt extension the file holds binary pickle data.
with open("tester.txt", "wb") as out_file:
    pickle.dump(tester, out_file)

In [6]:
# Load the suffix -> label table from tester.txt.
# NOTE(review): pickle.load can execute arbitrary code; only load a
# tester.txt that this notebook produced itself.
with open("tester.txt", "rb") as fp:
    tester = pickle.load(fp)

# Predictions to score: column 0 holds the image file name, column 1 the
# value compared against the table entry.
submission = pd.read_csv("../submissions/submission_svm.csv")

# Number of rows whose column-1 value equals the label the table associates
# with the file name's numeric id modulo 100.
accuracy = 0

for file_name, predicted_label in zip(submission.iloc[:, 0], submission.iloc[:, 1]):
    suffix = int(file_name.split(".")[0]) % 100
    if predicted_label == tester[suffix]:
        accuracy += 1

# An empty submission would otherwise raise ZeroDivisionError.
if len(submission) > 0:
    print(accuracy / len(submission))
else:
    print("submission is empty; accuracy is undefined")

FileNotFoundError: [Errno 2] No such file or directory: '../data/submissions/submission_svm.csv'