In [1]:
import numpy as np
import pandas as pd
from collections import Counter
from skimage import io, filters, morphology, segmentation, img_as_ubyte, transform, color
import matplotlib.pyplot as plt
from skimage.draw import polygon

from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score, roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
import joblib

import os
import shutil
import pandas as pd

In [12]:
# Seed NumPy's global RNG so that np.random-based steps further down
# (e.g. the np.random.shuffle call in build_datasample_new) are repeatable
# when the notebook is run top to bottom.
np.random.seed(1)


# Data Selection and Balancing for Diagnostic Analysis: Creating a Balanced Dataset with Pandas

1. The make_df function reads the metadata from a CSV file called "metadata.csv" into a pandas DataFrame (df). It then creates a new DataFrame (new_df) by selecting only the columns "patient_id", "img_id", and "diagnostic" from df.
2. In the new_df DataFrame, a new column called "healthy" is created using the np.where function. The "healthy" column is assigned a value of 1 if the corresponding "diagnostic" column value is "NEV" (indicating a healthy diagnosis), and 0 otherwise.
3. The select_data function takes the new_df DataFrame as input. It creates a new DataFrame called final_data by filtering rows where the "healthy" column value is 1, indicating a healthy diagnosis.
4. Another DataFrame called filtered_data is created by filtering rows where the "healthy" column value is 0, indicating a non-healthy diagnosis.
5. The sample function is used on the filtered_data DataFrame to randomly select 244 rows (representing non-healthy diagnoses) using a random state of 42. These randomly selected rows are stored in the random_rows DataFrame.
6. The pd.concat function is used to concatenate (pd.concat([final_data, random_rows])) the final_data DataFrame and the random_rows DataFrame, resulting in a new DataFrame that combines the healthy and randomly selected non-healthy rows.
7. The index of the combined DataFrame is set to "patient_id" using the set_index function.
8. The sample function is used again on the combined DataFrame with frac=1 to shuffle the rows randomly.
9. The resulting shuffled DataFrame is returned as final_data from the select_data function.
10. Finally, the final_data DataFrame is displayed.

The overall purpose of the code is to create a pandas DataFrame (final_data) that represents a selected subset of the original dataset. It ensures that the "healthy" diagnosis ("NEV") is included in the final DataFrame and randomly selects a certain number of non-healthy diagnoses from the remaining data. The resulting DataFrame is shuffled to provide a random order of the data. This process aims to create a balanced dataset for further analysis or modeling purposes, where both healthy and non-healthy data points are included.

In [13]:
def make_df(path=None):
  """Load the image metadata and attach a binary ``healthy`` label.

  Parameters
  ----------
  path : str or path-like, optional
      Location of the metadata CSV. When omitted, ``metadata.csv`` in the
      current working directory is read.

  Returns
  -------
  pandas.DataFrame
      The columns ``patient_id``, ``img_id`` and ``diagnostic`` plus a
      ``healthy`` column that is 1 where ``diagnostic == "NEV"`` and 0
      otherwise.
  """
  if path is None:
    path = os.path.join(os.getcwd(), "metadata.csv")
  df = pd.read_csv(path)

  # Take an explicit copy: adding a column to a column-subset of `df` is what
  # raised pandas' SettingWithCopyWarning.
  new_df = df[["patient_id", "img_id", "diagnostic"]].copy()
  new_df["healthy"] = np.where(new_df["diagnostic"] == "NEV", 1, 0)
  return new_df


def select_data(new_df, n_unhealthy=244, random_state=42):
  """Combine all healthy rows with a random sample of non-healthy rows.

  Parameters
  ----------
  new_df : pandas.DataFrame
      Frame with at least the columns ``patient_id`` and ``healthy``
      (1 = healthy, 0 = not healthy), e.g. the result of ``make_df``.
  n_unhealthy : int, optional
      Number of rows with ``healthy == 0`` to draw. Defaults to 244.
  random_state : int, optional
      Seed passed to ``DataFrame.sample`` when drawing the non-healthy rows.
      Defaults to 42.

  Returns
  -------
  pandas.DataFrame
      Every healthy row plus the sampled non-healthy rows, indexed by
      ``patient_id`` and with the row order shuffled.

  Raises
  ------
  ValueError
      Propagated from ``DataFrame.sample`` when ``n_unhealthy`` cannot be
      drawn from the available non-healthy rows.
  """
  healthy_rows = new_df[new_df["healthy"] == 1]
  unhealthy_rows = new_df[new_df["healthy"] == 0]

  random_rows = unhealthy_rows.sample(n=n_unhealthy, random_state=random_state)
  final_data = pd.concat([healthy_rows, random_rows]).set_index("patient_id")

  # The shuffle is given no random_state, so `random_state` above fixes which
  # rows are selected but not the order in which they are returned.
  return final_data.sample(frac=1)


# Build the labelled metadata frame and draw the balanced, shuffled subset.
# make_datasample reads this notebook-level `final_data` when labelling images.
final_data = select_data(make_df())

# Bare last expression so the notebook renders the frame as the cell output.
final_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df["healthy"] = np.where(new_df["diagnostic"] == "NEV", 1, 0)


Unnamed: 0_level_0,img_id,diagnostic,healthy
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
PAT_520,PAT_520_983_221.png,BCC,0
PAT_1934,PAT_1934_3890_306.png,NEV,1
PAT_244,PAT_244_374_726.png,NEV,1
PAT_119,PAT_119_181_684.png,BCC,0
PAT_972,PAT_972_1843_756.png,MEL,0
...,...,...,...
PAT_1392,PAT_1392_1352_828.png,SEK,0
PAT_621,PAT_621_1182_456.png,NEV,1
PAT_872,PAT_872_1707_638.png,BCC,0
PAT_1216,PAT_1216_759_365.png,NEV,1


In [7]:
def slic_samples(img):
  """Summarise an image as a strip of per-segment mean colours.

  The first three channels of `img` are kept, pixels with any zero channel
  are zeroed out, and the result is split into SLIC segments. The mean colour
  of each segment is computed from the un-zeroed pixels; mean colours with a
  zero component are discarded.

  Returns an array of shape (50, k, 3) in which column `j` is filled with the
  j-th remaining mean colour, `k` being the number of colours kept.
  """
  rgb = img.copy()[:, :, :3]

  # True only where every one of the three channels is non-zero.
  is_foreground = np.all(rgb != [0, 0, 0], axis=-1)
  masked_rgb = rgb * is_foreground[..., np.newaxis]

  segments = segmentation.slic(masked_rgb, n_segments=36, compactness=3)

  # One row per segment found; rows that are never written stay all-zero and
  # are removed by the filter below.
  mean_colours = np.zeros((np.max(segments) + 1, 3))
  for row, segment_label in enumerate(np.unique(segments)):
    mean_colours[row, :] = rgb[segments == segment_label].mean(axis=0)

  colours = mean_colours[np.all(mean_colours, axis=1)]

  palette_height = 50
  color_palette = np.zeros((palette_height, len(colours), 3))
  # Broadcasting repeats each colour down its whole column.
  color_palette[:, :, :] = colours

  return color_palette

In [8]:
def make_datasample(img, name, metadata=None):
  """Turn one image into a fixed-length feature vector and its label.

  Parameters
  ----------
  img : array
      Image handed to ``slic_samples``.
  name : str
      Value to look up in the ``img_id`` column of the metadata.
  metadata : pandas.DataFrame, optional
      Frame with ``img_id`` and ``healthy`` columns. When omitted, the
      notebook-level ``final_data`` is used.

  Returns
  -------
  list
      ``[x, y]`` where ``x`` is the colour palette padded with zeros (or
      clipped) to shape (50, 27, 3) and flattened, and ``y`` is 1 when the
      matching row has ``healthy == 1`` and 0 otherwise.

  Raises
  ------
  ValueError
      If ``name`` does not match exactly one metadata row.
  """
  if metadata is None:
    metadata = final_data

  col = slic_samples(img)
  common_shape = (50, 27, 3)

  # Clip before padding: a palette with more than 27 colours would otherwise
  # hand np.pad a negative width and raise.
  col = col[:common_shape[0], :common_shape[1], :common_shape[2]]
  pad_width = [(0, target - actual)
               for target, actual in zip(common_shape, col.shape)]
  x = np.pad(col, pad_width, mode='constant').ravel()

  # Explicit single-row lookup in place of the deprecated Series.bool().
  labels = metadata.loc[metadata["img_id"] == name, "healthy"]
  if len(labels) != 1:
    raise ValueError(
        f"expected exactly one metadata row for {name!r}, found {len(labels)}")
  y = 1 if labels.iloc[0] == 1 else 0

  return [x, y]

In [16]:
def build_datasample_new():
  """Build a shuffled list of samples from the ``segmented_photos`` folder.

  Every entry of ``segmented_photos`` (under the current working directory)
  is read, resized to 200x200 and passed to ``make_datasample`` together with
  its file name.

  Returns
  -------
  list
      The ``[x, y]`` pairs returned by ``make_datasample``, shuffled in place
      with ``np.random.shuffle``.
  """
  path = os.path.join(os.getcwd(), "segmented_photos")
  arr = []

  # os.listdir gives no ordering guarantee; sorting fixes the pre-shuffle
  # order so a seeded shuffle yields the same result on every machine.
  for i in sorted(os.listdir(path)):
    image = io.imread(os.path.join(path, i))
    image = transform.resize(image, (200, 200), anti_aliasing=True)

    arr.append(make_datasample(image, i))

  np.random.shuffle(arr)
  return arr


# Featurise every image in segmented_photos, then report how many samples
# were built.
arr_col = build_datasample_new()
print(len(arr_col))



279
