<a href="https://colab.research.google.com/github/talhaanwarch/mia-covid19/blob/main/ag2D.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# install packages

In [1]:
!nvidia-smi -L

GPU 0: Tesla T4 (UUID: GPU-4bac73b0-d88a-8bb8-7dde-2469a5ea266b)


In [2]:
#restart kernel after installing packages and run it again
%%capture 
!pip install -U  pip
!pip install -U setuptools wheel
!pip install -U "mxnet_cu110<2.0.0"
!pip install  autogluon  # autogluon==0.2.0

# Get data from drive

In [3]:
#connect drive with colab notebook/
from google.colab import drive
drive.mount('/content/drive',force_remount=True)

Mounted at /content/drive


In [4]:
#copy data from drive to colab
!cp /content/drive/MyDrive/covid/train.zip /content/
!cp /content/drive/MyDrive/covid/val.zip /content/

In [5]:
#unzip data
%%capture 
!unzip /content/train.zip
!rm /content/train.zip
!mv /content/resized /content/train
#val data
!unzip /content/val.zip -d val
!rm /content/val.zip


# import packages

In [6]:
import pandas as pd
import numpy as np
from glob import glob
import cv2
import pandas as pd
import autogluon.core as ag
from autogluon.vision import ImagePredictor
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split


# prepare data

In [7]:
glob('/content/train'+'/*/')[1].split('/')[-2]

'non-covid'

In [10]:
def flatten(path,label=None):
  '''
  path: images path in a list
  label: corresponding label in list
  '''
  path = [item for sublist in path for item in sublist]
  label = [item for sublist in label for item in sublist]
  return path, label
  

def prepare(path,split=False,flat=True):
  '''
  Pass the path of main folder in which binary class folders are present
  Split True will split the data and return two sets
  '''
  main_path=glob(path+'/*/')
  pos_path=[glob(folder+'*.jpg') for folder in glob(main_path[0]+'*/')]
  neg_path=[glob(folder+'*.jpg') for folder in glob(main_path[1]+'*/')]
  #get folder name as label
  pos_label=[[main_path[0].split('/')[-2]]*len(i)  for i in pos_path]
  neg_label=[[main_path[1].split('/')[-2]]*len(i)  for i in neg_path]
  path=pos_path+neg_path
  label=pos_label+neg_label
  if flat:
    if split:
      train_path,val_path,train_label,val_label=train_test_split(path,label,test_size=0.2)
      train_path,train_label=flatten(train_path,train_label)
      val_path,val_label=flatten(val_path,val_label)
      return train_path,val_path,train_label,val_label
    else:
      path,label=flatten(path,label)
      return path,label
  else:
    return path,label
  

In [11]:
train_path,val_path,train_label,val_label=prepare('/content/train',split=True)
test_path,test_label=prepare('/content/val',split=False)
print('Train Images and labels',len(train_path),len(train_label))
print('Val Images and labels',len(val_path),len(val_label))
print('Test Images and labels',len(test_path),len(test_label))

Train Images and labels 269082 269082
Val Images and labels 66590 66590
Test Images and labels 75532 75532


In [12]:
train_df=pd.DataFrame(zip(train_path,train_label),columns=['image','label'])
train_df = train_df.sample(frac=1).reset_index(drop=True)
train_df.label=train_df.label.map({'non-covid':0,'covid':1})
train_df.to_csv('/content/drive/MyDrive/covid/train_df.csv',index=False)
train_df.head()

Unnamed: 0,image,label
0,/content/train/covid/ct_scan_119/155.jpg,1
1,/content/train/covid/ct_scan_313/330.jpg,1
2,/content/train/covid/ct_scan_215/287.jpg,1
3,/content/train/non-covid/ct_scan_417/80.jpg,0
4,/content/train/non-covid/ct_scan_261/416.jpg,0


In [13]:
val_df=pd.DataFrame(zip(val_path,val_label),columns=['image','label'])
val_df = val_df.sample(frac=1).reset_index(drop=True)
val_df.label=val_df.label.map({'non-covid':0,'covid':1})
val_df.to_csv('/content/drive/MyDrive/covid/val_df.csv',index=False)
val_df.head()

Unnamed: 0,image,label
0,/content/train/covid/ct_scan_32/194.jpg,1
1,/content/train/covid/ct_scan_394/257.jpg,1
2,/content/train/covid/ct_scan_563/259.jpg,1
3,/content/train/covid/ct_scan_216/234.jpg,1
4,/content/train/non-covid/ct_scan_106/329.jpg,0


In [14]:
test_df=pd.DataFrame(zip(test_path,test_label),columns=['image','label'])
test_df = test_df.sample(frac=1).reset_index(drop=True)
test_df.label=test_df.label.map({'non-covid':0,'covid':1})
test_df.to_csv('/content/drive/MyDrive/covid/test_df.csv',index=False)
test_df.head()

Unnamed: 0,image,label
0,/content/val/non-covid/ct_scan_82/56.jpg,0
1,/content/val/non-covid/ct_scan_2/10.jpg,0
2,/content/val/non-covid/ct_scan_44/141.jpg,0
3,/content/val/covid/ct_scan_160/56.jpg,1
4,/content/val/covid/ct_scan_50/41.jpg,1


# train model

In [15]:
%%time
predictor = ImagePredictor(verbosity=1)
predictor.fit(train_df,tuning_data=val_df, hyperparameters={'epochs': 5}) 

The number of requested GPUs is greater than the number of available GPUs.Reduce the number to 1


Downloading /root/.mxnet/models/resnet50_v1b-0ecdba34.zip from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/resnet50_v1b-0ecdba34.zip...


100%|██████████| 55344/55344 [00:01<00:00, 54404.70KB/s]


CPU times: user 2h 18min 9s, sys: 3min 40s, total: 2h 21min 49s
Wall time: 2h 4min 47s


In [16]:
fit_result = predictor.fit_summary()
print('Top-1 train acc: %.3f, val acc: %.3f' %(fit_result['train_acc'], fit_result['valid_acc']))
fit_result


Top-1 train acc: 0.827, val acc: 0.813


{'best_config': {'batch_size': 16,
  'dist_ip_addrs': None,
  'early_stop_baseline': -inf,
  'early_stop_max_value': inf,
  'early_stop_patience': 10,
  'epochs': 5,
  'estimator': gluoncv.auto.estimators.image_classification.image_classification.ImageClassificationEstimator,
  'final_fit': False,
  'gpus': [0],
  'log_dir': '/content/66e0faff',
  'lr': 0.01,
  'model': 'resnet50_v1b',
  'ngpus_per_trial': 8,
  'nthreads_per_trial': 128,
  'num_trials': 1,
  'num_workers': 2,
  'scheduler': 'local',
  'search_strategy': 'random',
  'searcher': 'random',
  'seed': 509,
  'time_limits': 7200,
  'wall_clock_tick': 1621701840.1969714},
 'fit_history': {'best_config': {'batch_size': 16,
   'dist_ip_addrs': None,
   'early_stop_baseline': -inf,
   'early_stop_max_value': inf,
   'early_stop_patience': 10,
   'epochs': 5,
   'estimator': gluoncv.auto.estimators.image_classification.image_classification.ImageClassificationEstimator,
   'final_fit': False,
   'gpus': [0],
   'log_dir': '/conten

In [20]:
predictor.save('/content/drive/MyDrive/covid/predictor1.ag')


In [17]:
val_df=ImagePredictor.Dataset.from_csv('/content/drive/MyDrive/covid/val_df.csv')
res = predictor.evaluate(val_df)
print('Top-1 test acc: %.3f' % res[0])
val_pred=predictor.predict(test_df)
print(classification_report(test_df.label,val_pred))

Top-1 test acc: 0.813
              precision    recall  f1-score   support

           0       0.78      0.84      0.81     40516
           1       0.79      0.73      0.76     35016

    accuracy                           0.79     75532
   macro avg       0.79      0.78      0.79     75532
weighted avg       0.79      0.79      0.79     75532



In [18]:
test_df=ImagePredictor.Dataset.from_csv('/content/drive/MyDrive/covid/test_df.csv')
res = predictor.evaluate(test_df)
print('Top-1 test acc: %.3f' % res[0])

Top-1 test acc: 0.789


In [19]:
test_pred=predictor.predict(test_df)
print(classification_report(test_df.label,test_pred))

              precision    recall  f1-score   support

           0       0.78      0.84      0.81     40516
           1       0.79      0.73      0.76     35016

    accuracy                           0.79     75532
   macro avg       0.79      0.78      0.79     75532
weighted avg       0.79      0.79      0.79     75532



# final evaluation on validation data

In [21]:
predictor=predictor.load('/content/drive/MyDrive/covid/predictor.ag')


In [23]:
path,label=prepare('/content/val',split=False,flat=False)
len(path),len(label)

(374, 374)

In [24]:
#now loop each folder of each directoy and create a dataframe for each folder
def evaluate(path,label):
  test_pred,test_true=[],[]
  for i, j in zip(path,label):
    test_df=pd.DataFrame(zip(i,j),columns=['image','label'])
    test_df.label=test_df.label.map({'non-covid':0,'covid':1})
    test_df.to_csv('test_df.csv',index=False)
    test_df=ImagePredictor.Dataset.from_csv('/content/test_df.csv')
    pred=list(predictor.predict(test_df))
    #find maximum occurent of element
    test_pred.append(max(pred,key=pred.count))
    test_true.append(test_df.label.mean())
  return test_pred,test_true



In [25]:
test_pred,test_true=evaluate(path,label)

In [26]:
print(classification_report(test_true,test_pred,target_names=['non-covid','covid']))

              precision    recall  f1-score   support

   non-covid       0.83      0.91      0.87       209
       covid       0.87      0.76      0.81       165

    accuracy                           0.84       374
   macro avg       0.85      0.84      0.84       374
weighted avg       0.85      0.84      0.84       374

