<a href="https://colab.research.google.com/github/talhaanwarch/mia-covid19/blob/main/ag2D.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# install packages

In [1]:
!nvidia-smi -L

GPU 0: Tesla P100-PCIE-16GB (UUID: GPU-4d7d703c-3458-861d-337f-5682100cabcc)


In [2]:
#restart kernel after installing packages and run it again
%%capture 
!pip install -U  pip
!pip install -U setuptools wheel
!pip install -U "mxnet_cu110<2.0.0"
!pip install  autogluon  # autogluon==0.2.0

# Get data from drive

In [3]:
#connect drive with colab notebook/
from google.colab import drive
drive.mount('/content/drive',force_remount=True)

Mounted at /content/drive


In [4]:
#copy data from drive to colab
!cp /content/drive/MyDrive/covid/train.zip /content/
!cp /content/drive/MyDrive/covid/val.zip /content/

In [5]:
#unzip data
%%capture 
!unzip /content/train.zip
!rm /content/train.zip
!mv /content/resized /content/train
#val data
!unzip /content/val.zip -d val
!rm /content/val.zip


# import packages

In [6]:
import pandas as pd
import numpy as np
from glob import glob
import cv2
import pandas as pd
import autogluon.core as ag
from autogluon.vision import ImagePredictor
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split


# prepare data

In [16]:
glob('/content/train'+'/*/')[1].split('/')[-2]

['non-covid', 'non-covid']

In [39]:
def flatten(path,label=None):
  '''
  path: images path in a list
  label: corresponding label in list
  '''
  path = [item for sublist in path for item in sublist]
  label = [item for sublist in label for item in sublist]
  return path, label
  

def prepare(path,split=False):
  '''
  Pass the path of main folder in which binary class folders are present
  Split True will split the data and return two sets
  '''
  main_path=glob(path+'/*/')
  pos_path=[glob(folder+'*.jpg') for folder in glob(main_path[0]+'*/')]
  neg_path=[glob(folder+'*.jpg') for folder in glob(main_path[1]+'*/')]
  #get folder name as label
  pos_label=[[main_path[0].split('/')[-2]]*len(i)  for i in pos_path]
  neg_label=[[main_path[1].split('/')[-2]]*len(i)  for i in neg_path]
  path=pos_path+neg_path
  label=pos_label+neg_label
  if split:
    train_path,val_path,train_label,val_label=train_test_split(path,label,test_size=0.2)
    train_path,train_label=flatten(train_path,train_label)
    val_path,val_label=flatten(val_path,val_label)
    return train_path,val_path,train_label,val_label
  else:
    path,label=flatten(path,label)
    return path,label
  

In [44]:
train_path,val_path,train_label,val_label=prepare('/content/train',split=True)
test_path,test_label=prepare('/content/val',split=False)
print('Train Images and labels',len(train_path),len(train_label))
print('Val Images and labels',len(val_path),len(val_label))
print('Test Images and labels',len(test_path),len(test_label))

Train Images and labels 266634 266634
Val Images and labels 69038 69038
Test Images and labels 75532 75532


In [50]:
train_df=pd.DataFrame(zip(train_path,train_label),columns=['image','label'])
train_df = train_df.sample(frac=1).reset_index(drop=True)
train_df.label=train_df.label.map({'non-covid':0,'covid':1})
train_df.to_csv('/content/drive/MyDrive/covid/train_df.csv',index=False)
train_df.head()

Unnamed: 0,image,label
0,/content/train/non-covid/ct_scan_760/110.jpg,0
1,/content/train/non-covid/ct_scan_383/120.jpg,0
2,/content/train/non-covid/ct_scan_450/174.jpg,0
3,/content/train/covid/ct_scan_636/80.jpg,1
4,/content/train/non-covid/ct_scan_324/464.jpg,0


In [51]:
val_df=pd.DataFrame(zip(val_path,val_label),columns=['image','label'])
val_df = val_df.sample(frac=1).reset_index(drop=True)
val_df.label=val_df.label.map({'non-covid':0,'covid':1})
val_df.to_csv('/content/drive/MyDrive/covid/val_df.csv',index=False)
val_df.head()

Unnamed: 0,image,label
0,/content/train/covid/ct_scan_227/1.jpg,1
1,/content/train/covid/ct_scan_380/403.jpg,1
2,/content/train/non-covid/ct_scan_237/332.jpg,0
3,/content/train/covid/ct_scan_494/65.jpg,1
4,/content/train/non-covid/ct_scan_415/16.jpg,0


In [52]:
test_df=pd.DataFrame(zip(test_path,test_label),columns=['image','label'])
test_df = test_df.sample(frac=1).reset_index(drop=True)
test_df.label=test_df.label.map({'non-covid':0,'covid':1})
test_df.to_csv('/content/drive/MyDrive/covid/test_df.csv',index=False)
test_df.head()

Unnamed: 0,image,label
0,/content/val/covid/ct_scan_41/166.jpg,1
1,/content/val/covid/ct_scan_148/678.jpg,1
2,/content/val/covid/ct_scan_86/54.jpg,1
3,/content/val/covid/ct_scan_57/255.jpg,1
4,/content/val/non-covid/ct_scan_42/515.jpg,0


# train model

In [53]:
%%time
predictor = ImagePredictor(verbosity=1)
predictor.fit(train_df,tuning_data=val_df, hyperparameters={'epochs': 2}) 

The number of requested GPUs is greater than the number of available GPUs.Reduce the number to 1


Downloading /root/.mxnet/models/resnet50_v1b-0ecdba34.zip from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/resnet50_v1b-0ecdba34.zip...


100%|██████████| 55344/55344 [00:02<00:00, 23065.25KB/s]


CPU times: user 1h 33min 47s, sys: 2min 57s, total: 1h 36min 44s
Wall time: 1h 28min 33s


In [55]:
fit_result = predictor.fit_summary()
print('Top-1 train acc: %.3f, val acc: %.3f' %(fit_result['train_acc'], fit_result['valid_acc']))
fit_result


Top-1 train acc: 0.813, val acc: 0.788


{'best_config': {'batch_size': 16,
  'dist_ip_addrs': None,
  'early_stop_baseline': -inf,
  'early_stop_max_value': inf,
  'early_stop_patience': 10,
  'epochs': 2,
  'estimator': gluoncv.auto.estimators.image_classification.image_classification.ImageClassificationEstimator,
  'final_fit': False,
  'gpus': [0],
  'log_dir': '/content/5c80ab9c',
  'lr': 0.01,
  'model': 'resnet50_v1b',
  'ngpus_per_trial': 8,
  'nthreads_per_trial': 128,
  'num_trials': 1,
  'num_workers': 2,
  'scheduler': 'local',
  'search_strategy': 'random',
  'searcher': 'random',
  'seed': 667,
  'time_limits': 7200,
  'wall_clock_tick': 1621688961.6387753},
 'fit_history': {'best_config': {'batch_size': 16,
   'dist_ip_addrs': None,
   'early_stop_baseline': -inf,
   'early_stop_max_value': inf,
   'early_stop_patience': 10,
   'epochs': 2,
   'estimator': gluoncv.auto.estimators.image_classification.image_classification.ImageClassificationEstimator,
   'final_fit': False,
   'gpus': [0],
   'log_dir': '/conten

In [56]:
val_df=ImagePredictor.Dataset.from_csv('/content/drive/MyDrive/covid/val_df.csv')
res = predictor.evaluate(val_df)
print('Top-1 test acc: %.3f' % res[0])
val_pred=predictor.predict(test_df)
print(classification_report(test_df.label,val_pred))

Top-1 test acc: 0.788
              precision    recall  f1-score   support

           0       0.78      0.88      0.83     40516
           1       0.84      0.71      0.77     35016

    accuracy                           0.80     75532
   macro avg       0.81      0.80      0.80     75532
weighted avg       0.81      0.80      0.80     75532



In [57]:
test_df=ImagePredictor.Dataset.from_csv('/content/drive/MyDrive/covid/test_df.csv')
res = predictor.evaluate(test_df)
print('Top-1 test acc: %.3f' % res[0])

Top-1 test acc: 0.804


In [62]:
test_pred=predictor.predict(test_df)
print(classification_report(test_df.label,test_pred))

AttributeError: ignored

In [61]:
predictor.save('/content/drive/MyDrive/covid/predictor.ag')


AttributeError: ignored

# final evaluation on validation data

In [33]:
#now loop each folder of each directoy and create a dataframe for each folder
def evaluate(path,label):
  pos_test_pred=[]
  for i, j in zip(path,label):
    test_df=pd.DataFrame(zip(i,j),columns=['image','label'])
    test_df.to_csv('test_df.csv',index=False)
    test_df=ImagePredictor.Dataset.from_csv('/content/test_df.csv')
    pred=list(predictor.predict(test_df))
    #find maximum occurent of element
    pos_test_pred.append(max(pred,key=pred.count))
    return pos_test_pred



165 165


In [34]:
neg_test_pred=[]
for i, j in zip(test_neg_path,test_neg_label):
  test_df=pd.DataFrame(zip(i,j),columns=['image','label'])
  test_df.to_csv('test_df.csv',index=False)
  test_df=ImagePredictor.Dataset.from_csv('/content/test_df.csv')
  pred=list(predictor.predict(test_df))
  #find maximum occurent of element
  neg_test_pred.append(max(pred,key=pred.count))
print(len(test_neg_path),len(neg_test_pred))  

209 209


In [35]:
test_true=test_pos_label+test_neg_label
test_true=[int(np.mean(i)) for i in test_true]
test_pred=pos_test_pred+neg_test_pred
print(len(test_true),len(test_pred))  

374 374


In [36]:
from sklearn.metrics import classification_report
print(classification_report(test_true,test_pred))

              precision    recall  f1-score   support

           0       0.88      0.82      0.85       165
           1       0.87      0.91      0.89       209

    accuracy                           0.87       374
   macro avg       0.87      0.87      0.87       374
weighted avg       0.87      0.87      0.87       374

