In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.cluster import DBSCAN

# --- Notebook switches -------------------------------------------------------
# IQR_OPTION and DROP are only referenced by the commented-out IQR cells in
# this notebook: IQR_OPTION selects which IQR variant runs, DROP chooses
# between dropping outlier rows and clamping them to the IQR fences.
IQR_OPTION = 1
DROP = True
# Truthy -> the diagnostic prints/plots guarded by `if OUTPUT:` are executed.
OUTPUT = 0

# --- Load --------------------------------------------------------------------
# Path is relative to the notebook's working directory.
data = pd.read_csv("breast-cancer.csv")
# Handle on the label column, taken before any preprocessing touches `data`.
diagnosis = data['diagnosis']

: 

In [None]:
if OUTPUT:
    # Quick look at the raw table: first rows, schema, summary statistics.
    print(data.head())
    # DataFrame.info() writes its report itself and returns None, so it is
    # called directly; wrapping it in print() appended a stray "None" line.
    data.info()
    print(data.describe())

    # Class balance of the diagnosis column as a bar chart.
    plt.figure()
    class_count = diagnosis.value_counts()
    sns.barplot(x=class_count.index, y=class_count.values)

# Data Preprocessing

### Remove ID column

In [None]:
# Keep every column except the first one (positional slice, not by name).
# NOTE(review): presumably the first column is the ID, per the section heading
# -- confirm against the CSV.
# `data` is reassigned from itself, so running this cell a second time drops
# one more column; re-run from the load cell instead.
data = data.iloc[:, 1:]

### Check if there are any empty values

In [None]:
if OUTPUT:
    # One boolean per column: True when that column holds at least one null.
    columns_with_nulls = data.isna().any()
    print(columns_with_nulls)

### Outlier Detection and Treatment

In [None]:
# Column labels of the current frame, in order.
column_names = data.columns.tolist()
if OUTPUT:
    print(column_names)
# Keep the labels from the third column onwards.
# NOTE(review): the first column was already removed in the cell above, so
# `[2:]` skips two more columns (presumably `diagnosis` plus the first
# measurement column). The commented-out "IQR Option 2" cells use `[1:]` on
# the same column layout -- confirm which offset is intended.
features = column_names[2:]

#### Using IQR

In [None]:
# features = data.iloc[:, 1:]
# if OUTPUT:
#     plt.figure(figsize=(15, 11))
#     for i in range(len(features.columns)):
#         plt.subplot(5, 6, i+1)
#         sns.boxplot(y=features.columns[i], data=features)
#     plt.tight_layout
#     plt.show()

##### IQR Option 1: Detect Outliers Based on All Values

In [None]:
# if IQR_OPTION == 1:
#     index = []

#     for feature in features:
#         values = data[feature].values

#         q1 = np.percentile(values, 25)
#         q2 = np.percentile(values, 50)
#         q3 = np.percentile(values, 75)
#         iqr = q3 - q1
        
#         max = q3 + (1.5 * iqr)
#         min = q1 - (1.5 * iqr)
#         count = 0
#         for i in range(len(values)):
#             if values[i] > max or values[i] < min:
#                 count += 1
#                 index.append(i)
#                 if not DROP:
#                     if values[i] > max:
#                         values[i] = max
#                     else:
#                         values[i] = min
#         if OUTPUT:
#             print(f"For {feature} there are {count} outliers")
#         data[feature] = values
        
#     if DROP:
#         index = list(dict.fromkeys(index))
#         if OUTPUT:
#             print(f"Number of entries in old dataframe: {len(data)}")
#         no_outliers_data = data.drop(index=index)      
#         if OUTPUT:
#             print(f"Number of entries in new dataframe: {len(no_outliers_data)}")
#         data = no_outliers_data.reset_index(drop=True)
#         if OUTPUT:
#             categories = ['M', 'L']
#             plt.figure()
#             class_count = no_outliers_data['diagnosis'].value_counts()
#             print(class_count)
#             sns.barplot(x=class_count.index, y=class_count.values)

##### IQR Option 2: Detect Outliers Based on Diagnosis

In [None]:

# if IQR_OPTION == 2:
#     benign = data[data['diagnosis'] == 'B'].reset_index(drop=True)

#     index = []
#     features = list(benign)
#     features = features[1:]
#     for feature in features:
#         values = benign[feature].values

#         q1 = np.percentile(values, 25)
#         q2 = np.percentile(values, 50)
#         q3 = np.percentile(values, 75)
#         iqr = q3 - q1
        
#         max = q3 + (1.5 * iqr)
#         min = q1 - (1.5 * iqr)
#         count = 0
#         for i in range(len(values)):
#             if values[i] > max or values[i] < min:
#                 count += 1
#                 index.append(i)
#                 if not DROP:
#                     if values[i] > max:
#                         values[i] = max
#                     else:
#                         values[i] = min
#         benign[feature] = values


#     if DROP:
#         if OUTPUT:
#             print(f"Number of benign cases before outlier removal {len(benign)}")
#         index = list(dict.fromkeys(index))
#         no_outliers_benign = benign.drop(index=index)
#         if OUTPUT:
#             print(f"Number of benign cases after outlier removal {len(no_outliers_benign)}")

In [None]:
# if IQR_OPTION == 2:
#     malignant = data[data['diagnosis'] == 'M'].reset_index(drop=True)


#     index = []
#     features = list(malignant)
#     features = features[1:]
#     for feature in features:
#         values = malignant[feature].values

#         q1 = np.percentile(values, 25)
#         q2 = np.percentile(values, 50)
#         q3 = np.percentile(values, 75)
#         iqr = q3 - q1
        
#         max = q3 + (1.5 * iqr)
#         min = q1 - (1.5 * iqr)
#         count = 0
#         for i in range(len(values)):
#             if values[i] > max or values[i] < min:
#                 count += 1
#                 index.append(i)
#                 if not DROP:
#                     if values[i] > max:
#                         values[i] = max
#                     else:
#                         values[i] = min
#         malignant[feature] = values

#     if DROP:
#         if OUTPUT:
#             print(f"Number of malignant cases before outlier removal {len(benign)}")
#         index = list(dict.fromkeys(index))
#         no_outliers_malignant = malignant.drop(index=index)
#         if OUTPUT:
#             print(f"Number of malignant cases after outlier removal {len(no_outliers_malignant)}")

In [None]:
# if IQR_OPTION == 2:
#     if DROP:
#         malignant = no_outliers_malignant
#         benign = no_outliers_benign
#         temp_df = pd.concat([malignant, benign], ignore_index=True)
#         if OUTPUT:
#             plt.figure()
#             class_count = temp_df['diagnosis'].value_counts()
#             print(class_count)
#             sns.barplot(x=class_count.index, y=class_count.values)
    
    
#     new_data = pd.concat([malignant, benign], ignore_index=True)
#     data = new_data.sample(frac=1).reset_index(drop=True)

In [None]:
# Density-based clustering over the columns from the third one onwards;
# DBSCAN marks points that belong to no cluster with the label -1.
# NOTE(review): this cell runs before the normalization cell further down, so
# the input is unscaled. `eps` is a distance in raw feature units -- confirm
# that eps=2 is meaningful for these columns.
# NOTE(review): `iloc[:, 2:]` is positional; verify that it selects exactly the
# intended feature columns now that the first column has been removed.
dbscan = DBSCAN(eps=2, min_samples=3).fit(data.iloc[:, 2:])
# Honour the notebook-wide OUTPUT switch like every other diagnostic print.
if OUTPUT:
    print(dbscan.labels_)

### Dimensionality Reduction

In [None]:
if OUTPUT:
    # Pairwise correlation of the columns from the third one onwards.
    feature_corr = data.iloc[:, 2:].corr()
    plt.figure(figsize=(25, 20))
    sns.heatmap(data=feature_corr, linewidths=10, cmap='crest')

In [None]:
# data = data.drop(columns=['radius_worst', 'perimeter_worst', 'area_worst', 'perimeter_se', 'area_se'])

### Data Encoding

In [None]:
# Replace the diagnosis labels with integer codes. The fitted encoder stays
# available as `labelEncoder` for mapping codes back to the original labels.
labelEncoder = LabelEncoder()
data['diagnosis'] = labelEncoder.fit_transform(data['diagnosis'])
if OUTPUT:
    print(data['diagnosis'])

### Data Normalization

In [None]:
# Rescale every column of `data` linearly into the range [1, 10].
# NOTE(review): the scaler receives the whole frame, so the encoded
# `diagnosis` column is rescaled together with the features -- confirm that
# downstream code expects the label in that range.
# NOTE(review): with scikit-learn's default output settings `transform`
# returns a NumPy array, so `data` is no longer a DataFrame after this cell.
normalizer = MinMaxScaler(feature_range=(1, 10))
normalizer.fit(data)
data = normalizer.transform(data)
if OUTPUT:
    print(data)