## Tutoriel : interagir avec le système de stockage S3 du SSP Cloud (MinIO)

In [4]:
import os

import pandas as pd
import s3fs
import zipfile

### Récupérer les données d'un challenge

In [5]:
# Build the S3 (MinIO) filesystem client. The endpoint host is read from the
# AWS_S3_ENDPOINT environment variable (a KeyError is raised if it is unset).
S3_ENDPOINT_URL = f'https://{os.environ["AWS_S3_ENDPOINT"]}'
s3_client_options = {'endpoint_url': S3_ENDPOINT_URL}
fs = s3fs.S3FileSystem(client_kwargs=s3_client_options)

In [6]:
# List the challenges available under the hackathon prefix
challenges_prefix = "gvimont/diffusion/hackathon-minarm-2024"
fs.ls(challenges_prefix)

['gvimont/diffusion/hackathon-minarm-2024/AIVSAI',
 'gvimont/diffusion/hackathon-minarm-2024/Acoustique',
 'gvimont/diffusion/hackathon-minarm-2024/Similarité']

In [7]:
# List the files that belong to one challenge
challenge_prefix = "gvimont/diffusion/hackathon-minarm-2024/Similarité"
fs.ls(challenge_prefix)

['gvimont/diffusion/hackathon-minarm-2024/Similarité/.keep',
 'gvimont/diffusion/hackathon-minarm-2024/Similarité/archive.zip']

In [8]:
# Download the challenge archive from S3 into the local service storage
PATH_IN = 'gvimont/diffusion/hackathon-minarm-2024/Similarité/archive.zip'
local_archive_path = 'data/archive.zip'
fs.download(PATH_IN, local_archive_path)

[None]

In [9]:
# Unpack the downloaded archive into the data/ directory
with zipfile.ZipFile("data/archive.zip", mode="r") as archive:
    archive.extractall(path="data/")

NB : les données peuvent également être téléchargées directement si besoin, pour être utilisées hors du SSP Cloud.
Exemple pour le fichier `HC3.zip` du challenge AIVSAI (même format de lien pour les autres challenges) :

http://minio.lab.sspcloud.fr/gvimont/diffusion/hackathon-minarm-2024/AIVSAI/HC3.zip

### Charger et explorer les données

In [10]:
# Column names applied to both annotation tables
headers = [
    'Imagefile',
    'Bounding_boxe1',
    'Bounding_boxe2',
    'Bounding_boxe3',
    'Bounding_boxe4',
    'class number',
]

# header=None: no row of the CSV is consumed as a header, so the names are
# assigned explicitly afterwards (train is read first, then test).
df_train, df_test = (
    pd.read_csv(csv_path, header=None)
    for csv_path in (r"./data/anno_train.csv", r"./data/anno_test.csv")
)
df_train.columns = headers
df_test.columns = headers

# Preview the first training annotations
df_train.head()


Unnamed: 0,Imagefile,Bounding_boxe1,Bounding_boxe2,Bounding_boxe3,Bounding_boxe4,class number
0,00001.jpg,39,116,569,375,14
1,00002.jpg,36,116,868,587,3
2,00003.jpg,85,109,601,381,91
3,00004.jpg,621,393,1484,1096,134
4,00005.jpg,14,36,133,99,106


In [12]:
# The names file is read without a header row and given a single "Name" column
headers_names = ["Name"]
df_names = pd.read_csv(r"./data/names.csv", header=None)
df_names.columns = headers_names

# Preview the first entries
df_names.head()

Unnamed: 0,Name
0,AM General Hummer SUV 2000
1,Acura RL Sedan 2012
2,Acura TL Sedan 2012
3,Acura TL Type-S 2008
4,Acura TSX Sedan 2012


In [11]:
# Count missing values per column of the training annotations
df_train.isna().sum()


Imagefile         0
Bounding_boxe1    0
Bounding_boxe2    0
Bounding_boxe3    0
Bounding_boxe4    0
class number      0
dtype: int64

In [35]:
import os
from PIL import Image
import pandas as pd
import torch
from torch.utils.data import Dataset
from torchvision import transforms

import torch.nn as nn
from torchvision.models import resnet18

from torch.utils.data import DataLoader

import torch.optim as optim

In [49]:


class CarsDataset(Dataset):
    """Image-classification dataset backed by an annotation CSV and an image folder.

    Each CSV row holds an image file name, four bounding-box values and a
    class number. Class numbers are shifted down by one at load time so that
    labels are zero-indexed. The bounding-box columns are loaded but are not
    used by ``__getitem__``.
    """

    # Column names assigned to the header-less annotation file, in file order.
    _COLUMN_NAMES = (
        'Imagefile',
        'Bounding_boxe1',
        'Bounding_boxe2',
        'Bounding_boxe3',
        'Bounding_boxe4',
        'class number',
    )

    def __init__(self, annotations_file, img_dir, transform=None):
        """Load the annotation table and remember where the images live.

        Args:
            annotations_file (str): Path to the CSV file holding the annotations.
            img_dir (str): Path of the directory that contains the images.
            transform (callable, optional): Optional transformation applied to
                every image returned by ``__getitem__``.
        """
        frame = pd.read_csv(annotations_file, header=None)
        frame.columns = list(self._COLUMN_NAMES)
        # Shift the class numbers down by one so labels are zero-indexed.
        frame['class number'] = frame['class number'] - 1

        self.img_labels = frame
        self.img_dir = img_dir
        self.transform = transform

    def __len__(self):
        """Return the number of annotated images."""
        return len(self.img_labels)

    def __getitem__(self, idx):
        """Return the ``(image, label)`` pair stored at row ``idx``.

        The image is opened from ``img_dir``, converted to RGB and passed
        through ``transform`` when one was supplied.
        """
        # Column 0 is the image file name, column 5 the zero-indexed class.
        file_name = self.img_labels.iloc[idx, 0]
        label = self.img_labels.iloc[idx, 5]

        image = Image.open(os.path.join(self.img_dir, file_name)).convert('RGB')
        if self.transform:
            image = self.transform(image)
        return image, label


In [43]:
# Image preprocessing pipeline: resize to 256x256, take a random 224x224 crop,
# convert to a tensor, then normalise each channel with the given mean/std.
_transform_steps = [
    transforms.Resize((256, 256)),
    transforms.RandomCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
]
transform = transforms.Compose(_transform_steps)

In [50]:

# Paths to the annotation CSV files and the image directories
train_file = "data/anno_train.csv"
test_file = "data/anno_test.csv"
train_dir = "data/cars_train"
test_dir = "data/cars_test"

# Deterministic preprocessing for evaluation. The shared `transform` pipeline
# includes a RandomCrop, which is a training-time augmentation: applying it to
# the test set makes the reported accuracy vary from one run to the next. The
# evaluation pipeline keeps the same resize and normalisation but takes a
# fixed centre crop instead.
eval_transform = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Build the datasets: the random-crop pipeline is used for training only
train_dataset = CarsDataset(train_file, train_dir, transform)
test_dataset = CarsDataset(test_file, test_dir, eval_transform)

# Wrap the datasets in DataLoaders; only the training data is shuffled
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


In [51]:


# Run on the GPU when one is available, otherwise fall back to the CPU
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

# ResNet-18 with pretrained weights; its final fully connected layer is
# replaced by a new linear layer with 196 outputs (the number of classes).
# NOTE(review): recent torchvision releases deprecate `pretrained=` in favour
# of `weights=` - confirm the installed version before switching.
model = resnet18(pretrained=True)
num_ftrs = model.fc.in_features
classifier_head = nn.Linear(num_ftrs, 196)
model.fc = classifier_head
model = model.to(device)




In [52]:
# Cross-entropy loss for the classification task
criterion = nn.CrossEntropyLoss()

# Adam over every model parameter, learning rate 0.001
trainable_params = model.parameters()
optimizer = optim.Adam(trainable_params, lr=1e-3)

In [54]:
num_epochs = 10

for epoch in range(num_epochs):
    # --- Training pass: one optimisation step per mini-batch ---
    model.train()
    running_loss = 0.0
    for batch_images, batch_labels in train_loader:
        batch_images, batch_labels = batch_images.to(device), batch_labels.to(device)

        optimizer.zero_grad()
        loss = criterion(model(batch_images), batch_labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Report the mean of the per-batch losses for this epoch
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader)}')

    # --- Evaluation pass on the test set, without gradient tracking ---
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in test_loader:
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images)
            # Predicted class = index of the highest score along dimension 1
            predicted = outputs.argmax(dim=1)
            total += labels.shape[0]
            correct += (predicted == labels).sum().item()

    print(f'Accuracy of the model on test images: {100 * correct / total}%')



KeyboardInterrupt: 