In [22]:
# Mount Google Drive into the Colab filesystem at /content/drive so that
# files stored on Drive can be opened with ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [23]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Location of the dataset CSV on the mounted Google Drive
file_path = '/content/drive/MyDrive/data.csv'

# Read the CSV and reduce it to complete, feature-only rows:
#   1. remove the 'id' and 'Unnamed: 32' columns,
#   2. discard every row that still contains a missing value,
#   3. remove the 'diagnosis' column.
# The column removal in step 1 happens before dropna() on purpose: dropna()
# only inspects the columns that are still present at that point.
data = (
    pd.read_csv(file_path)
    .drop(columns=['id', 'Unnamed: 32'])
    .dropna()
    .drop(columns='diagnosis')
)

In [24]:

# Per-feature reference values used for voting in get_class().
# Each entry maps a column name to a two-element list:
#   index 0 -> reference value for class M
#   index 1 -> reference value for class B
ranges = {
    feature: [m_reference, b_reference]
    for feature, m_reference, b_reference in (
        # (column name,          M,     B)
        ('radius_mean',          17.5,  12.5),
        ('perimeter_mean',       117,   79),
        ('area_mean',            950,   500),
        ('compactness_mean',     0.135, 0.075),
        ('concavity_mean',       0.15,  0.05),
        ('concave points_mean',  0.085, 0.025),
        ('radius_worst',         21,    14),
        ('perimeter_worst',      135,   90),
        ('area_worst',           1300,  550),
        ('concavity_worst',      0.4,   0.18),
        ('concave points_worst', 0.18,  0.07),
    )
}

def get_class(datapoint, reference_ranges=None):
    """Classify a datapoint by majority vote over per-feature reference values.

    For every column listed in the reference table, the datapoint's value is
    compared with the class-M reference (index 0) and the class-B reference
    (index 1). The column votes for M only when the value is strictly closer
    to the M reference; an equidistant value counts as a vote for B.

    Args:
        datapoint: Anything indexable by column name (a dict or a pandas
            Series, for example). It must contain every column present in
            the reference table; extra columns are ignored.
        reference_ranges: Optional mapping of column name to a sequence whose
            first two items are the M and B reference values. When omitted,
            the notebook-level ``ranges`` table is used.

    Returns:
        0 when M receives strictly more votes than B, otherwise 1 (so a tied
        vote yields 1).

    Raises:
        KeyError: If ``datapoint`` lacks one of the reference columns.
    """
    if reference_ranges is None:
        # Fall back to the notebook-level table so existing one-argument
        # calls keep working unchanged.
        reference_ranges = ranges

    m_votes = 0
    b_votes = 0
    for column_name, references in reference_ranges.items():
        value = datapoint[column_name]
        m_dist = abs(value - references[0])
        b_dist = abs(value - references[1])
        if m_dist < b_dist:
            m_votes += 1
        else:
            b_votes += 1

    return 0 if m_votes > b_votes else 1

In [25]:
# Part of the evaluation function to get the difference between 2 datapoints
def weighted_difference(data1, data2):
    """Return a weighted absolute-difference score between two datapoints.

    Both datapoints are first classified with ``get_class``. When they land
    in the same class the function returns the sentinel value ``1000``
    without computing any distance (``evaluate`` checks for that exact
    value). Otherwise the score is the sum, over every column of ``data1``,
    of ``weight[column] * abs(data1[column] - data2[column])``.

    Args:
        data1: Datapoint that drives the iteration; must provide ``.items()``
            yielding ``(column, value)`` pairs (a dict or a pandas Series).
        data2: Datapoint indexable by the same column names.

    Returns:
        ``1000`` when both datapoints receive the same class, otherwise the
        accumulated weighted difference.

    Raises:
        KeyError: If a column of ``data1`` is missing from ``data2`` or has
            no entry in the weights table.
        TypeError: If the values of a column cannot be subtracted.
    """
    class1 = get_class(data1)
    class2 = get_class(data2)

    # Same predicted class: signal it with the sentinel and skip the distance.
    if class1 == class2:
        return 1000

    # Hand-picked weight per column (the rationale for the individual values
    # is not recorded in this notebook).
    weights = {
        "radius_mean": 0.05,
        "texture_mean": 0.05,
        "perimeter_mean": 0.01,
        "area_mean": 0.001,
        "smoothness_mean": 10,
        "compactness_mean": 0.03,
        "concavity_mean": 3,
        "concave points_mean": 2,
        "symmetry_mean": 5,
        "fractal_dimension_mean": 3,
        "radius_se": 10,
        "texture_se": 0.5,
        "perimeter_se": 0.25,
        "area_se": .002,
        "smoothness_se": 10,
        "compactness_se": 3,
        "concavity_se": 25,
        "concave points_se": 15,
        "symmetry_se": 30,
        "fractal_dimension_se": 0.33,
        "radius_worst": 0.33,
        "texture_worst": 0.025,
        "perimeter_worst": 0.005,
        "area_worst": 0.00025,
        "smoothness_worst": 5,
        "compactness_worst": 1,
        "concavity_worst": 5,
        "concave points_worst": 3,
        "symmetry_worst": 2,
        "fractal_dimension_worst": 4
    }

    total_weighted_difference = 0
    for column, value1 in data1.items():
        # A missing column fails here, at the lookups, so this is where the
        # error has to be caught to name the offending column.
        try:
            value2 = data2[column]
            weight = weights[column]
        except KeyError as exc:
            raise KeyError(
                f"Column '{column}' is missing from the second datapoint "
                f"or from the weights table."
            ) from exc

        # Non-numeric values fail on the subtraction with TypeError.
        try:
            diff = abs(value1 - value2)
        except TypeError as exc:
            raise TypeError(
                f"Column '{column}' holds values that cannot be subtracted: "
                f"{value1!r} and {value2!r}."
            ) from exc

        total_weighted_difference += weight * diff

    return total_weighted_difference

In [34]:
# Format for Datapoint:
# {'radius_mean': 24.25, 'texture_mean': 20.2, 'perimeter_mean': 166.2, 'area_mean': 1761.0, 'smoothness_mean': 0.1447, 'compactness_mean': 0.2867, 'concavity_mean': 0.4268, 'concave points_mean': 0.2012, 'symmetry_mean': 0.2655, 'fractal_dimension_mean': 0.06877, 'radius_se': 1.509, 'texture_se': 3.12, 'perimeter_se': 9.807, 'area_se': 233.0, 'smoothness_se': 0.02333, 'compactness_se': 0.09806, 'concavity_se': 0.1278, 'concave points_se': 0.01822, 'symmetry_se': 0.04547, 'fractal_dimension_se': 0.009875, 'radius_worst': 26.02, 'texture_worst': 23.99, 'perimeter_worst': 180.9, 'area_worst': 2073.0, 'smoothness_worst': 0.1696, 'compactness_worst': 0.4244, 'concavity_worst': 0.5803, 'concave points_worst': 0.2248, 'symmetry_worst': 0.3222, 'fractal_dimension_worst': 0.08009}

# Datapoints taken straight from the cleaned frame, selected by row position.
data_point1 = data.iloc[122]
data_point2 = data.iloc[291]
data_point3 = data.iloc[346]
data_point4 = data.iloc[453]

# data_point5 is a plain-dict copy of data_point1 with seven features
# overridden by hand. All overridden keys already exist in the copy, so the
# column order stays the same as in data_point1.
data_point5 = {
    **data_point1.to_dict(),
    'radius_mean': 13,
    'perimeter_mean': 90,
    'area_mean': 1000,
    'compactness_mean': .08,
    'concavity_mean': .06,
    'concave points_mean': .025,
    'radius_worst': 15,
}

# Show the class assigned to the edited point, then both points for comparison.
print(get_class(data_point5))
print('5:', data_point5)
print('1:', data_point1.to_dict())

# `key` holds the reference points and `submission` the candidate points;
# evaluate() compares them position by position.
key = [data_point1, data_point2, data_point3, data_point4, data_point5]
submission = [data_point1, data_point1, data_point1, data_point1, data_point1]

1
5: {'radius_mean': 13, 'texture_mean': 20.2, 'perimeter_mean': 90, 'area_mean': 1000, 'smoothness_mean': 0.1447, 'compactness_mean': 0.08, 'concavity_mean': 0.06, 'concave points_mean': 0.025, 'symmetry_mean': 0.2655, 'fractal_dimension_mean': 0.06877, 'radius_se': 1.509, 'texture_se': 3.12, 'perimeter_se': 9.807, 'area_se': 233.0, 'smoothness_se': 0.02333, 'compactness_se': 0.09806, 'concavity_se': 0.1278, 'concave points_se': 0.01822, 'symmetry_se': 0.04547, 'fractal_dimension_se': 0.009875, 'radius_worst': 15, 'texture_worst': 23.99, 'perimeter_worst': 180.9, 'area_worst': 2073.0, 'smoothness_worst': 0.1696, 'compactness_worst': 0.4244, 'concavity_worst': 0.5803, 'concave points_worst': 0.2248, 'symmetry_worst': 0.3222, 'fractal_dimension_worst': 0.08009}
1: {'radius_mean': 24.25, 'texture_mean': 20.2, 'perimeter_mean': 166.2, 'area_mean': 1761.0, 'smoothness_mean': 0.1447, 'compactness_mean': 0.2867, 'concavity_mean': 0.4268, 'concave points_mean': 0.2012, 'symmetry_mean': 0.2655

In [35]:
# Notebook-level state for the evaluation step.
threshold = 5         # largest difference score that still counts as a pass
differences = list()  # difference scores recorded by evaluate()
messages = list()     # not populated by the code in this cell

# Function to evaluate the datapoints
def evaluate(submissionDatapoints):
    """Compare each submitted datapoint with its counterpart in ``key``.

    For every position in ``key`` the weighted difference between the key
    datapoint and the submitted datapoint is computed, stored in the
    notebook-level ``differences`` list and reported with ``print``:

    * a score above ``threshold`` that equals the sentinel ``1000`` is
      reported as a classification failure,
    * any other score above ``threshold`` is reported with its value,
    * every other score is reported as a pass.

    A summary line is printed at the end. Nothing is returned.

    Args:
        submissionDatapoints: Sequence with at least ``len(key)`` datapoints;
            item ``i`` is compared with ``key[i]``. Extra items are ignored.

    Raises:
        IndexError: If fewer datapoints than ``len(key)`` are supplied.
    """
    expected_count = len(key)
    if len(submissionDatapoints) < expected_count:
        # Fail before printing partial results.
        raise IndexError(
            f'Expected {expected_count} submission datapoints, '
            f'got {len(submissionDatapoints)}'
        )

    # Start from an empty record so that differences[i] always belongs to
    # datapoint i of the most recent call, even when the cell is re-run.
    # Cleared in place so existing references to the list stay valid.
    differences.clear()

    all_passed = True
    for datapointIndex in range(expected_count):
        row_number = datapointIndex + 1
        difference = weighted_difference(key[datapointIndex], submissionDatapoints[datapointIndex])
        differences.append(difference)
        if difference > threshold:
            all_passed = False
            if difference == 1000:
                # Sentinel returned by weighted_difference.
                print(f'{row_number}) You are not classifying the datapoint correctly')
            else:
                print(f'{row_number}) Your difference Score is {difference} for row {row_number}, It must be less than {threshold}')
        else:
            print(f'{row_number}) Classifying correctly under the threshold!')

    if all_passed:
        print("All points are classifying correctly under Threshold! Congratulations!")
    else:
        print("\nNot quite there yet, keep trying!")

In [36]:
# Call function to evaluate datapoints: each entry of `submission` is scored
# against the entry of `key` at the same position, and the per-row results
# plus a final summary are printed below.
evaluate(submission)

1) You are not classifying the datapoint correctly
2) Your difference Score is 31.080893899999996 for row 2, It must be less than 5
3) Your difference Score is 35.06313173000001 for row 3, It must be less than 5
4) Your difference Score is 31.868896359999994 for row 4, It must be less than 5
5) Your difference Score is 7.181101 for row 5, It must be less than 5

Not quite there yet, keep trying!
