<a href="https://colab.research.google.com/github/yaminigangu/STML/blob/main/STML11.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import pandas as pd

file_path = "/content/drive/MyDrive/breast_cancer_survival.csv"
data = pd.read_csv(file_path)

data.head(), data.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 334 entries, 0 to 333
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Age                 334 non-null    int64  
 1   Gender              334 non-null    object 
 2   Protein1            334 non-null    float64
 3   Protein2            334 non-null    float64
 4   Protein3            334 non-null    float64
 5   Protein4            334 non-null    float64
 6   Tumour_Stage        334 non-null    object 
 7   Histology           334 non-null    object 
 8   ER status           334 non-null    object 
 9   PR status           334 non-null    object 
 10  HER2 status         334 non-null    object 
 11  Surgery_type        334 non-null    object 
 12  Date_of_Surgery     334 non-null    object 
 13  Date_of_Last_Visit  317 non-null    object 
 14  Patient_Status      321 non-null    object 
dtypes: float64(4), int64(1), object(10)
memory usage: 39.3+ K

(   Age  Gender  Protein1  Protein2  Protein3  Protein4 Tumour_Stage  \
 0   42  FEMALE   0.95256   2.15000  0.007972 -0.048340           II   
 1   54  FEMALE   0.00000   1.38020 -0.498030 -0.507320           II   
 2   63  FEMALE  -0.52303   1.76400 -0.370190  0.010815           II   
 3   78  FEMALE  -0.87618   0.12943 -0.370380  0.132190            I   
 4   42  FEMALE   0.22611   1.74910 -0.543970 -0.390210           II   
 
                        Histology ER status PR status HER2 status Surgery_type  \
 0  Infiltrating Ductal Carcinoma  Positive  Positive    Negative        Other   
 1  Infiltrating Ductal Carcinoma  Positive  Positive    Negative        Other   
 2  Infiltrating Ductal Carcinoma  Positive  Positive    Negative   Lumpectomy   
 3  Infiltrating Ductal Carcinoma  Positive  Positive    Negative        Other   
 4  Infiltrating Ductal Carcinoma  Positive  Positive    Positive   Lumpectomy   
 
   Date_of_Surgery Date_of_Last_Visit Patient_Status  
 0       20-May-1

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler


file_path = "/content/drive/MyDrive/breast_cancer_survival.csv"
data = pd.read_csv(file_path)

data_cleaned = data.drop(columns=["Date_of_Surgery", "Date_of_Last_Visit"])
data_cleaned = data_cleaned.dropna(subset=["Patient_Status"])

label_encoders = {}
categorical_columns = ["Gender", "Tumour_Stage", "Histology",
                       "ER status", "PR status", "HER2 status", "Surgery_type", "Patient_Status"]

for column in categorical_columns:
    le = LabelEncoder()
    data_cleaned[column] = le.fit_transform(data_cleaned[column])
    label_encoders[column] = le


X = data_cleaned.drop(columns=["Patient_Status"])
y = data_cleaned["Patient_Status"]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)


In [5]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
svm_model = SVC(random_state=42)
svm_model.fit(X_train, y_train)


svm_predictions = svm_model.predict(X_test)
svm_accuracy = accuracy_score(y_test, svm_predictions)

print(f"SVM Accuracy: {svm_accuracy:.2f}")


SVM Accuracy: 0.75


In [6]:
from sklearn.neighbors import KNeighborsClassifier


knn_model = KNeighborsClassifier()
knn_model.fit(X_train, y_train)

knn_predictions = knn_model.predict(X_test)
knn_accuracy = accuracy_score(y_test, knn_predictions)

print(f"KNN Accuracy: {knn_accuracy:.2f}")


KNN Accuracy: 0.72


In [7]:
from sklearn.linear_model import LogisticRegression

log_reg_model = LogisticRegression(random_state=42, max_iter=1000)
log_reg_model.fit(X_train, y_train)

log_reg_predictions = log_reg_model.predict(X_test)
log_reg_accuracy = accuracy_score(y_test, log_reg_predictions)

print(f"Logistic Regression Accuracy: {log_reg_accuracy:.2f}")


Logistic Regression Accuracy: 0.75


In [8]:
from sklearn.decomposition import PCA

pca = PCA(n_components=5)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

def train_and_evaluate(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    return accuracy

svm_model_pca = SVC(random_state=42)
knn_model_pca = KNeighborsClassifier()
log_reg_model_pca = LogisticRegression(random_state=42, max_iter=1000)


svm_pca_accuracy = train_and_evaluate(svm_model_pca, X_train_pca, X_test_pca, y_train, y_test)
knn_pca_accuracy = train_and_evaluate(knn_model_pca, X_train_pca, X_test_pca, y_train, y_test)
log_reg_pca_accuracy = train_and_evaluate(log_reg_model_pca, X_train_pca, X_test_pca, y_train, y_test)

print(f"SVM with PCA Accuracy: {svm_pca_accuracy:.2f}")
print(f"KNN with PCA Accuracy: {knn_pca_accuracy:.2f}")
print(f"Logistic Regression with PCA Accuracy: {log_reg_pca_accuracy:.2f}")


SVM with PCA Accuracy: 0.75
KNN with PCA Accuracy: 0.72
Logistic Regression with PCA Accuracy: 0.75
