In [13]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import pickle
from sklearn.preprocessing import LabelEncoder
from plotly.offline import init_notebook_mode
# Initialise plotly's notebook mode up front, before any figure in this notebook is shown.
init_notebook_mode(connected=True)

In [2]:
# Load the raw dataset from the CSV file sitting next to this notebook.
df = pd.read_csv(filepath_or_buffer="breast_cancer_survival.csv")

In [4]:
# Per-column count of missing values (shown as the cell output).
df.isna().sum()

Age                    0
Gender                 0
Protein1               0
Protein2               0
Protein3               0
Protein4               0
Tumour_Stage           0
Histology              0
ER status              0
PR status              0
HER2 status            0
Surgery_type           0
Date_of_Surgery        0
Date_of_Last_Visit    17
Patient_Status        13
dtype: int64

In [5]:
# Discard every row that has at least one missing value.
df = df.dropna(axis=0, how="any")

In [6]:
# Print the column dtypes and non-null counts of the frame after the dropna step.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 317 entries, 0 to 333
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Age                 317 non-null    int64  
 1   Gender              317 non-null    object 
 2   Protein1            317 non-null    float64
 3   Protein2            317 non-null    float64
 4   Protein3            317 non-null    float64
 5   Protein4            317 non-null    float64
 6   Tumour_Stage        317 non-null    object 
 7   Histology           317 non-null    object 
 8   ER status           317 non-null    object 
 9   PR status           317 non-null    object 
 10  HER2 status         317 non-null    object 
 11  Surgery_type        317 non-null    object 
 12  Date_of_Surgery     317 non-null    object 
 13  Date_of_Last_Visit  317 non-null    object 
 14  Patient_Status      317 non-null    object 
dtypes: float64(4), int64(1), object(10)
memory usage: 39.6+ K

In [7]:
# Count the rows per tumour stage and plot the shares as a donut chart.
stages = df["Tumour_Stage"].value_counts()

values = stages.values
names = stages.index

# The counts are already aggregated, so they are passed on their own rather
# than alongside `df` (whose rows they do not line up with). The title lets
# the figure be understood without reading the code.
fig = px.pie(
    values=values,
    names=names,
    hole=0.5,
    title="Tumour stage distribution",
)
fig.show()

In [8]:
# Count the rows per histology type and plot the shares as a donut chart.
histologies = df["Histology"].value_counts()

values = histologies.values
names = histologies.index

# The counts are already aggregated, so they are passed on their own rather
# than alongside `df` (whose rows they do not line up with). The title lets
# the figure be understood without reading the code.
fig = px.pie(
    values=values,
    names=names,
    hole=0.5,
    title="Histology distribution",
)
fig.show()

In [11]:
# Count the rows per surgery type and plot the shares as a donut chart.
surgeries = df["Surgery_type"].value_counts()

values = surgeries.values
names = surgeries.index

# The counts are already aggregated, so they are passed on their own rather
# than alongside `df` (whose rows they do not line up with). The title lets
# the figure be understood without reading the code.
fig = px.pie(
    values=values,
    names=names,
    hole=0.5,
    title="Surgery type distribution",
)
fig.show()

In [10]:
# (rows, columns) of the current frame, shown as the cell output.
df.shape

(317, 15)

In [14]:
def label_encoder(column):
    """Encode the values of ``column`` with a freshly fitted ``LabelEncoder``.

    The column's name and the classes learned by the encoder are printed so
    the mapping can be read off the cell output.

    Parameters
    ----------
    column
        The values to encode. Its ``.name`` attribute is printed, so a
        pandas Series is expected.

    Returns
    -------
    The result of ``LabelEncoder.transform`` applied to ``column``.
    """
    encoder = LabelEncoder()
    encoder.fit(column)
    print(column.name, encoder.classes_)
    encoded = encoder.transform(column)
    return encoded

In [16]:
# Names of the categorical columns that get label-encoded below.
cols = [
    "Gender",
    "Tumour_Stage",
    "Histology",
    "ER status",
    "PR status",
    "HER2 status",
    "Surgery_type",
]

In [17]:
# Overwrite each listed column with its encoded values; label_encoder also
# prints the classes it found for that column.
for col in cols:
    encoded = label_encoder(df[col])
    df[col] = encoded

Gender ['FEMALE' 'MALE']
Tumour_Stage ['I' 'II' 'III']
Histology ['Infiltrating Ductal Carcinoma' 'Infiltrating Lobular Carcinoma'
 'Mucinous Carcinoma']
ER status ['Positive']
PR status ['Positive']
HER2 status ['Negative' 'Positive']
Surgery_type ['Lumpectomy' 'Modified Radical Mastectomy' 'Other' 'Simple Mastectomy']


In [20]:
# Remove the two date columns from the frame.
df = df.drop(columns=["Date_of_Surgery", "Date_of_Last_Visit"])

In [21]:
# Feature matrix: every column of df except the target.
X = np.array(df.drop(columns=["Patient_Status"]))

# Target vector: the Patient_Status column.
y = np.array(df["Patient_Status"])

In [22]:
# Hold out a test fraction of 0.1; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=4242,
)

In [23]:
# Fit a support vector classifier with its default settings on the training split.
# NOTE(review): no feature-scaling step is visible in this notebook before the
# fit — confirm that passing the raw feature values to SVC is intended.
model = SVC()
model.fit(X_train, y_train)

In [24]:
# Score the fitted model on the held-out test split; the value is shown as the cell output.
model.score(X_test, y_test)

0.78125

In [27]:
# Persist the fitted model to disk. The context manager closes (and therefore
# flushes) the file handle even if pickling raises, instead of leaving an open
# handle behind as a bare open(...) call would.
with open("model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

In [28]:
# Show the columns left in df. X was built from this frame minus Patient_Status,
# so the remaining names, in this order, are the features the model was fitted on.
df.columns

Index(['Age', 'Gender', 'Protein1', 'Protein2', 'Protein3', 'Protein4',
       'Tumour_Stage', 'Histology', 'ER status', 'PR status', 'HER2 status',
       'Surgery_type', 'Patient_Status'],
      dtype='object')