<a href="https://colab.research.google.com/github/wilberquito/ds-thesis/blob/main/Code/NN/MelanomaClassifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Melanoma classifier with Pytorch


In [2]:
try:
  import google.colab
  IN_COLAB = True
except:
  IN_COLAB = False

print(f"In colab: {IN_COLAB}")

In colab: True


In [3]:
import requests
from pathlib import Path
import pandas as pd

In [4]:
def pull_kaggle_auth():
  kaggle_json_url = 'https://raw.githubusercontent.com/wilberquito/ds-thesis/main/Code/NN/kaggle.json'
  kaggle_json = '/root/.kaggle/kaggle.json'
  
  if Path(kaggle_json).exists():
    print('Kaggle auth already configured')
  else:
    Path('/root/.kaggle').mkdir(parents=True, exist_ok=True)
    with open(kaggle_json, 'wb') as f:
      print('Configuring kaggle auth...')
      request = requests.get(kaggle_json_url)
      f.write(request.content)
      print(request.content)
      !chmod 600 ~/.kaggle/kaggle.json

pull_kaggle_auth()

Configuring kaggle auth...
b'{"username":"wilberquito","key":"a1cef5be103afee00ae036396afa6cbe"}'


In [5]:
def pull_scripts():
  if not IN_COLAB:
    print('No script will be downloaded as you are working in a local machine...')
    return
    
  dataset_scripts = Path('datasets.py')
  if not dataset_scripts.exists():
    print('Downloading dataset.py script...')
    with open(dataset_scripts, 'wb') as f:
      req = requests.get('https://raw.githubusercontent.com/wilberquito/ds-thesis/main/Code/NN/datasets.py')
      f.write(req.content)
      
pull_scripts()

Downloading dataset.py script...


In [7]:
def pull_data():
  downloader_script_url = 'https://raw.githubusercontent.com/wilberquito/ds-thesis/main/Code/NN/downloader.sh'
  downloader_script = 'downloader.sh'

  if not Path(downloader_script).exists():
    print('Downloading script downloader...')
    with open(downloader_script, 'wb') as f:
      request = requests.get(downloader_script_url)
      f.write(request.content)
    print('Downloading data using the downloader script...')
    !sh downloader.sh

pull_data() 

Downloading script downloader...
Downloading data using the downloader script...
Downloading jpeg-isic2019-512x512.zip to /content/data
 99% 1.37G/1.38G [00:09<00:00, 191MB/s]
100% 1.38G/1.38G [00:09<00:00, 158MB/s]
Downloading jpeg-melanoma-512x512.zip to /content/data
100% 2.63G/2.63G [00:28<00:00, 116MB/s]
100% 2.63G/2.63G [00:29<00:00, 97.3MB/s]
Downloading jpeg-isic2019-768x768.zip to /content/data
100% 2.74G/2.74G [00:19<00:00, 158MB/s]
100% 2.74G/2.74G [00:19<00:00, 153MB/s]
Downloading jpeg-melanoma-768x768.zip to /content/data
100% 5.31G/5.32G [00:36<00:00, 204MB/s]
100% 5.32G/5.32G [00:36<00:00, 157MB/s]
Downloading jpeg-isic2019-1024x1024.zip to /content/data
100% 4.72G/4.72G [00:30<00:00, 173MB/s]
100% 4.72G/4.72G [00:30<00:00, 166MB/s]
Downloading jpeg-melanoma-1024x1024.zip to /content/data
100% 8.84G/8.85G [01:11<00:00, 132MB/s]
100% 8.85G/8.85G [01:11<00:00, 132MB/s]


In [10]:
def pull_weights():

  if (Path('weights').exists()):
    print('models already downloaded')
  else:
    Path('weights').mkdir(parents=True, exist_ok=True)
    m1_url = 'https://github.com/wilberquito/ds-thesis/raw/vicorob_model/weights/8c_b3_768_512_18ep_best_20_fold0.pth'
    m2_url = 'https://github.com/wilberquito/ds-thesis/raw/vicorob_model/weights/8c_b3_768_512_18ep_best_fold0.pth'
    m3_url = 'https://github.com/wilberquito/ds-thesis/raw/vicorob_model/weights/8c_b3_768_512_18ep_final_fold0.pth'

    for url in [m1_url, m2_url, m3_url]:
      filename = url.split('/')[-1]
      save_path = Path('weights') / Path(filename)
      with open(save_path, 'wb') as f:
        print(f'Downloading {url}...')
        req = requests.get(url)
        f.write(req.content)

pull_weights()

Downloading https://github.com/wilberquito/ds-thesis/raw/vicorob_model/weights/8c_b3_768_512_18ep_best_20_fold0.pth...
Downloading https://github.com/wilberquito/ds-thesis/raw/vicorob_model/weights/8c_b3_768_512_18ep_best_fold0.pth...
Downloading https://github.com/wilberquito/ds-thesis/raw/vicorob_model/weights/8c_b3_768_512_18ep_final_fold0.pth...


## Exploratory data analysis

In [6]:
path = Path(f'./data/jpeg-melanoma-{512}x{512}/train.csv')
df = pd.read_csv(path)
df.head()

FileNotFoundError: ignored

In [None]:
df.info()

In [None]:
df.describe()

### Diagnosis

Here I notice that the are a lot of `unknown` diagnosis. 

There are also few examples of `solar lentigo`, `cafe-au-lait macule` and `atypical melanocytic proliferation`. 

In [None]:
diagnosis_freq = df['diagnosis'].value_counts()
diagnosis_freq

In [None]:
_ = sns.barplot(x=diagnosis_freq.values, y=diagnosis_freq.index)

### TfRecord

There are some entries that it's `tfrecord` features are labeled as -1

In [None]:

tfrecord_freq = df['tfrecord'].value_counts()
_ = sns.barplot(y=tfrecord_freq.values, x=tfrecord_freq.index)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.set_theme(style="ticks")

# Initialize the figure with a logarithmic x axis
f, ax = plt.subplots(figsize=(7, 6))
ax.set_xscale("log")

# Plot the orbital period with horizontal boxes
sns.boxplot(x="tfrecord", y="diagnosis", data=df,
            whis=[0, 100], width=.6, palette="vlag")

# # Add in points to show each observation
# sns.stripplot(x="distance", y="method", data=df,
#               size=4, color=".3", linewidth=0)

# Tweak the visual presentation
ax.xaxis.grid(True)
ax.set(ylabel="")
sns.despine(trim=True, left=True)

## Generate dataset for trainning and testing

In [None]:
from datasets import get_df

In [None]:
df_train, df_test, meta_features, n_meta_features, melanoma_id = get_df(8, './data', '512', True)
df_train.head()

In [None]:
# You can notice that from anotom_site_general_challenge it generates onehot encoding of this categorical variable
# and generates the features that start with site_
df_train.columns

In [None]:
diagnosis_freq = df_train['diagnosis'].value_counts()
diagnosis_freq, len(diagnosis_freq)

In [None]:
diagnosis_freq = df_train['target'].value_counts()
diagnosis_freq, len(diagnosis_freq)

In [None]:
melanoma_id