In [None]:
from google.colab import drive

**This notebook implements YOLOv5 to predict bounding boxes for the test images. Predictions here are made at the image level.**

In [None]:
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import ast
import os
import shutil
from shutil import copyfile

import numpy as np
import pandas as pd
import torch
import yaml
from tqdm import tqdm

In [None]:
pip install -q --upgrade wandb

In [None]:
import wandb

In [None]:
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
print(f"Setup complete. Using torch {torch.__version__} ({torch.cuda.get_device_properties(0).name if torch.cuda.is_available() else 'CPU'})")

Setup complete. Using torch 1.9.0+cu102 (Tesla K80)


**The first step is to merge the image-level and study-level dataframes to get a combined dataframe together with the labels for the bounding boxes. At the image level there are only two classes available: opacity or none.**

In [None]:
!unzip '/content/drive/MyDrive/train_image_level.csv.zip'

Archive:  /content/drive/MyDrive/train_image_level.csv.zip
  inflating: train_image_level.csv   


In [None]:
train_image_level=pd.read_csv('/content/train_image_level.csv')

In [None]:
train_study_level=pd.read_csv('/content/drive/MyDrive/train_study_level.csv')

In [None]:
# Keep only the text before the first underscore of every study id.
# The vectorised .str accessor replaces a row-by-row apply(axis=1).
train_study_level['id']=train_study_level['id'].str.split('_').str[0]

In [None]:
train_study_level.rename(columns={'id':'StudyInstanceUID'},inplace=True)

In [None]:
# Attach the study-level class columns to every image row.
# The join key is spelled out so the merge cannot silently change meaning
# if the two tables ever end up sharing another column name.
train_df=pd.merge(train_image_level,train_study_level,how='inner',on='StudyInstanceUID')

In [None]:
meta_train=pd.read_csv('/content/drive/MyDrive/meta_train.csv')

In [None]:
meta_train_sort=meta_train.sort_values(by='image_id')
meta_train_sort

Unnamed: 0,image_id,dim0,dim1
2067,000a312787f2,3488,4256
3559,000c3a3f293f,2320,2832
4034,0012ff7358bc,2544,3056
3122,001398f4ff4f,3520,4280
4241,001bd15d1891,2800,3408
...,...,...,...
5186,ffcc6edd9445,4240,3480
1305,ffd91a2c4ca0,2800,3408
3069,ffd9b6cf2961,2388,3050
3191,ffdc682f7680,3488,4256


In [None]:
# Append the '_image' suffix so the ids match the 'id' column of the
# image-level table, then expose the column under that same name.
# Vectorised string concatenation replaces the row-wise apply, and the
# rename returns a new frame instead of mutating in place.
meta_train_sort['image_id']=meta_train_sort['image_id']+'_image'
meta_train_sort=meta_train_sort.rename(columns={'image_id':'id'})

In [None]:
final_train_df=pd.merge(train_df,meta_train_sort,how='inner',on='id')

In [None]:
final_train_df['image_level']=0

In [None]:
# An image is positive (1) when the first space-separated token of its label
# string is 'opacity', and negative (0) otherwise.
# Computed in one vectorised pass: the previous per-row chained assignment
# (df[col][ind] = value) may write to a temporary copy and is not guaranteed
# to update the dataframe.
final_train_df['image_level']=(
    final_train_df['label'].str.split(' ').str[0].eq('opacity').astype(int)
)

In [None]:
# Persist the merged dataframe to the same Drive location the next cell reads
# from, so a fresh "Restart & Run All" reloads exactly what was just built
# instead of a stale (or missing) copy.
final_train_df.to_csv('/content/drive/MyDrive/final_train_df.csv',index=False)

In [None]:
final_train_df=pd.read_csv('/content/drive/MyDrive/final_train_df.csv')

**This is the final dataframe after combining the image-level and study-level training data. The `image_level` column is 1 for opacity and 0 for none.**

In [None]:
final_train_df.head()

Unnamed: 0,id,boxes,label,StudyInstanceUID,Negative for Pneumonia,Typical Appearance,Indeterminate Appearance,Atypical Appearance,dim0,dim1,image_level
0,000a312787f2_image,"[{'x': 789.28836, 'y': 582.43035, 'width': 102...",opacity 1 789.28836 582.43035 1815.94498 2499....,5776db0cec75,0,1,0,0,3488,4256,1
1,000c3a3f293f_image,,none 1 0 0 1 1,ff0879eb20ed,1,0,0,0,2320,2832,0
2,0012ff7358bc_image,"[{'x': 677.42216, 'y': 197.97662, 'width': 867...",opacity 1 677.42216 197.97662 1545.21983 1197....,9d514ce429a7,0,1,0,0,2544,3056,1
3,001398f4ff4f_image,"[{'x': 2729, 'y': 2181.33331, 'width': 948.000...",opacity 1 2729 2181.33331 3677.00012 2785.33331,28dddc8559b2,0,0,0,1,3520,4280,1
4,001bd15d1891_image,"[{'x': 623.23328, 'y': 1050, 'width': 714, 'he...",opacity 1 623.23328 1050 1337.23328 2156 opaci...,dfd9fdd85a3e,0,1,0,0,2800,3408,1


**Using stratified k-fold, we divide the images into train and validation sets and train YOLOv5 on each fold. This gives five sets of weights, one per fold, which we will combine during inference.**

In [None]:
from sklearn.model_selection import StratifiedKFold

In [None]:
final_train_df['folds']=0

In [None]:
# Assign every row to exactly one of five validation folds, stratified on the
# binary image_level target so each fold keeps the same class balance.
fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# split() yields positional indices, so write through .iloc; the previous
# chained assignment (df['folds'][val_idx] = i) may modify a temporary copy.
folds_col=final_train_df.columns.get_loc('folds')
for i,(tr_idx,val_idx) in enumerate(fold.split(final_train_df,final_train_df['image_level'])):
  final_train_df.iloc[val_idx,folds_col]=i

In [None]:
!unzip '/content/drive/MyDrive/my_train.zip'

**Directory Structure**

1) Main_Directory_Name --> Dataset_folds --> Images --> Train/Val

In [None]:
# Build the per-fold image folders: for fold i, the rows whose 'folds' value is
# i form the validation split and all remaining rows form the training split.
for i in range(5):
  train_df=final_train_df[final_train_df['folds']!=i]
  valid_df=final_train_df[final_train_df['folds']==i]

  # The copy routine is identical for both splits; only the source frame and
  # the destination folder differ.
  for split_name,split_df in (('train',train_df),('val',valid_df)):
    target_dir='/content/covid_19/dataset_folds_{}/images/{}'.format(i,split_name)
    # Create the destination once per split instead of once per image.
    os.makedirs(target_dir,exist_ok=True)

    for image_id in tqdm(split_df['id']):
      # Ids carry a suffix after the underscore; the jpg files are named by
      # the part before it.
      filename=image_id.split('_')[0]
      filepath='/content/resized_train_data/' + filename + '.jpg'
      copyfile(filepath,'{}/{}.jpg'.format(target_dir,filename))

100%|██████████| 4843/4843 [00:00<00:00, 5753.37it/s]
100%|██████████| 1211/1211 [00:00<00:00, 5717.66it/s]
100%|██████████| 4843/4843 [00:01<00:00, 4750.78it/s]
100%|██████████| 1211/1211 [00:00<00:00, 4560.72it/s]
100%|██████████| 4843/4843 [00:00<00:00, 5377.37it/s]
100%|██████████| 1211/1211 [00:00<00:00, 5427.44it/s]
100%|██████████| 4843/4843 [00:00<00:00, 5930.67it/s]
100%|██████████| 1211/1211 [00:00<00:00, 6176.40it/s]
100%|██████████| 4844/4844 [00:00<00:00, 5059.68it/s]
100%|██████████| 1210/1210 [00:00<00:00, 5865.45it/s]


**Steps for training yolov5**

1) First we have to create the directory structure as mentioned above.

2) We have to clone the yolov5 repository and move it into the same directory structure. Then we have to install the required dependencies using the requirements.txt file.

3) Now we have to create a yaml file under the same directory structure.

In [None]:
!git clone https://github.com/ultralytics/yolov5

Cloning into 'yolov5'...
remote: Enumerating objects: 9362, done.[K
remote: Counting objects: 100% (10/10), done.[K
remote: Compressing objects: 100% (6/6), done.[K
remote: Total 9362 (delta 4), reused 9 (delta 4), pack-reused 9352[K
Receiving objects: 100% (9362/9362), 9.71 MiB | 6.29 MiB/s, done.
Resolving deltas: 100% (6502/6502), done.


In [None]:
pip install -r 'covid_19/yolov5/requirements.txt'

In [None]:
# Write one YOLOv5 dataset description per fold, pointing at that fold's
# train/val image folders and declaring the two classes.
for fold in range(5):

  yaml_data={'train':'/content/covid_19/dataset_folds_{}/images/train'.format(fold),
  'val':'/content/covid_19/dataset_folds_{}/images/val'.format(fold),
  'nc':2,
  'names':['none','opacity']}

  # The context manager guarantees the file is flushed and closed; the handle
  # was previously left open, so the yaml could be incomplete on disk when
  # training starts.
  with open('/content/covid_19/yolov5/data/data_fold_{}.yaml'.format(fold),'w') as my_file:
    yaml.dump(yaml_data,my_file)

In [None]:
%cat '/content/covid_19/yolov5/data/data_fold_2.yaml'

names: [none, opacity]
nc: 2
train: /content/covid_19/dataset_folds_2/images/train
val: /content/covid_19/dataset_folds_2/images/val


**After completing the setup steps for training YOLOv5, the next step is to create the bounding-box coordinates in YOLOv5 format from the given labels.**

**As we have resized the images to 256x256, we have to scale the labels accordingly. Only images for which labels are present are used.**

In [None]:
# The 'boxes' column comes back from CSV as text; turn each entry into the
# Python list of dicts it represents.
# ast.literal_eval only accepts literals, so unlike eval() it cannot execute
# arbitrary code embedded in the file. Non-string entries (missing values, or
# lists already parsed by an earlier run of this cell) are passed through
# unchanged, which makes the cell safe to re-run.
final_train_df['boxes']=final_train_df['boxes'].apply(
    lambda raw: ast.literal_eval(raw) if isinstance(raw,str) else raw
)

In [None]:
# Rescale every annotated box from the original image resolution to the
# 256-pixel resized image. The result is a list of (boxes, row_index) tuples,
# where boxes is a list of [x, y, width, height] for that row.
ind_boxes=final_train_df[final_train_df['boxes'].notnull()].index
scaled_dim_final_list=[]
for ind in ind_boxes:
  # x and width are divided by dim1/256, y and height by dim0/256.
  x_factor=final_train_df['dim1'][ind]/256
  y_factor=final_train_df['dim0'][ind]/256
  rescaled_boxes=[
      [box['x']/x_factor,box['y']/y_factor,box['width']/x_factor,box['height']/y_factor]
      for box in final_train_df['boxes'][ind]
  ]
  scaled_dim_final_list.append((rescaled_boxes,ind))

**Here we define a function that transforms the coordinates into YOLOv5 format. One thing to keep in mind is that the coordinates must be normalized.**

**The coordinates are x-center, y-center, width, height.**

In [None]:
def get_yolo_format_boxes(ind,list_dim,df_index):
  """Return the YOLO-format boxes belonging to one dataframe row.

  Args:
    ind: iterable of row index labels, parallel to ``list_dim``.
    list_dim: list of ``(boxes, index_label)`` tuples, where ``boxes`` is a
      list of ``[x, y, width, height]`` values already rescaled to the
      256-pixel image.
    df_index: index label of the row whose boxes are wanted.

  Returns:
    A list of ``[x_center, y_center, width, height]`` lists with every value
    divided by 256. A box is dropped when any of its four values is >= 1.
    The list is empty when ``df_index`` is not found.
  """
  yolo_boxes=[]
  for ind_1,(scaled_boxes,entry_index) in zip(ind,list_dim):
    # Both the parallel index and the index stored in the tuple must match.
    if ind_1!=df_index or entry_index!=df_index:
      continue
    for x_min,y_min,w,h in scaled_boxes:
      # Use the exact half extent: truncating it with int() shifted the
      # centre by up to one pixel, which is significant on a 256-px image.
      xc=x_min+w/2
      yc=y_min+h/2
      normalized_coord=[xc/256,yc/256,w/256,h/256]#normalizing the co-ordinates
      if all(value<1 for value in normalized_coord):
        yolo_boxes.append(normalized_coord)

  return yolo_boxes

**We also have to create labels for our train and validation images. Each label is a text file containing the class label followed by the bounding-box coordinates, with one bounding box per line.**

**Directory Structure**

Main_Directory_Name--->Dataset_Folds--->labels-->train/val

In [None]:
# Write the YOLOv5 label files for every fold: rows whose 'folds' value is i
# go to labels/val, all remaining rows go to labels/train.
for i in range(5):
  train_df=final_train_df[final_train_df['folds']!=i]
  valid_df=final_train_df[final_train_df['folds']==i]

  # The label-writing routine is identical for both splits; only the source
  # frame and the destination folder differ.
  for split_name,split_df in (('train',train_df),('val',valid_df)):
    label_dir='/content/covid_19/dataset_folds_{}/labels/{}'.format(i,split_name)
    # Create the destination once per split instead of once per row.
    os.makedirs(label_dir,exist_ok=True)

    for num in tqdm(split_df.index):
      # Only rows with image_level == 1 get a label file; others get none.
      if split_df['image_level'][num]!=1:
        continue

      filename=split_df['id'][num].split('_')[0]
      filepath='{}/{}.txt'.format(label_dir,filename)
      yolo_bb=get_yolo_format_boxes(ind_boxes,scaled_dim_final_list,num)

      with open(filepath,'w') as f:
        for ind_bb_yolo in yolo_bb:
          # One line per box: class id 1 followed by the four coordinates.
          f.write(' '.join(str(j) for j in [1]+ind_bb_yolo))
          f.write('\n')

100%|██████████| 4843/4843 [00:03<00:00, 1339.12it/s]
100%|██████████| 1211/1211 [00:00<00:00, 1226.47it/s]
100%|██████████| 4843/4843 [00:03<00:00, 1321.13it/s]
100%|██████████| 1211/1211 [00:00<00:00, 1262.10it/s]
100%|██████████| 4843/4843 [00:03<00:00, 1318.58it/s]
100%|██████████| 1211/1211 [00:00<00:00, 1361.47it/s]
100%|██████████| 4843/4843 [00:03<00:00, 1327.78it/s]
100%|██████████| 1211/1211 [00:00<00:00, 1322.89it/s]
100%|██████████| 4844/4844 [00:03<00:00, 1279.94it/s]
100%|██████████| 1210/1210 [00:00<00:00, 1303.22it/s]


**Finally after doing all the necessary steps we will start training**

In [None]:
#IMG_SIZE-->256
#BATCH_SIZE-->16
#TRAIN_PATH-->/content/covid_19/yolov5/train.py
#EPOCHS-->10
#DATA-->/content/covid_19/yolov5/data/data_fold_{i}.yaml
#WEIGHTS-->yolov5s.pt
#SAVE_PERIOD-->10
#PROJECT--->COVID_19_DETECTION

In [None]:
# Launch one YOLOv5 training run per fold index 0-4, each with its own
# data_fold_<i>.yaml dataset description.
# Every run is started with --weights yolov5s.pt, image size 256, batch size 16
# and 10 epochs; it is passed --project covid_19_detection and
# --name yolov5_final_folds_<i>.
for i in range(5):
  !python /content/covid_19/yolov5/train.py --img 256 --batch 16 --epochs 10 --data /content/covid_19/yolov5/data/data_fold_{i}.yaml  --weights yolov5s.pt --save_period 10 --project covid_19_detection --name yolov5_final_folds_{i}
  # Visual separator between the logs of consecutive folds.
  print('###################################\n')

Downloading https://ultralytics.com/assets/Arial.ttf to /root/.config/Ultralytics/Arial.ttf...
100% 755k/755k [00:00<00:00, 39.3MB/s]
[34m[1mtrain: [0mweights=yolov5s.pt, cfg=, data=/content/covid_19/yolov5/data/data_fold_0.yaml, hyp=data/hyps/hyp.scratch.yaml, epochs=10, batch_size=16, imgsz=256, rect=False, resume=False, nosave=False, noval=False, noautoanchor=False, evolve=None, bucket=, cache=None, image_weights=False, device=, multi_scale=False, single_cls=False, adam=False, sync_bn=False, workers=8, project=covid_19_detection, entity=None, name=yolov5_final_folds_0, exist_ok=False, quad=False, linear_lr=False, label_smoothing=0.0, upload_dataset=False, bbox_interval=-1, save_period=10, artifact_alias=latest, local_rank=-1, freeze=0, patience=100
[34m[1mgithub: [0mskipping check (not a git repository), for updates see https://github.com/ultralytics/yolov5
YOLOv5 🚀 v5.0-425-g22ee6fb torch 1.9.0+cu102 CUDA:0 (Tesla K80, 11441.1875MB)

[34m[1mhyperparameters: [0mlr0=0.01, lr

In [None]:
#REF-->https://www.kaggle.com/ayuraj/train-yolov5-cross-validation-ensemble-w-b