In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [11]:
data = pd.read_csv("BRCA.csv")

In [12]:
data.head()

Unnamed: 0,Patient_ID,Age,Gender,Protein1,Protein2,Protein3,Protein4,Tumour_Stage,Histology,ER status,PR status,HER2 status,Surgery_type,Date_of_Surgery,Date_of_Last_Visit,Patient_Status
0,TCGA-D8-A1XD,36.0,FEMALE,0.080353,0.42638,0.54715,0.27368,III,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Modified Radical Mastectomy,15-Jan-17,19-Jun-17,Alive
1,TCGA-EW-A1OX,43.0,FEMALE,-0.42032,0.57807,0.61447,-0.031505,II,Mucinous Carcinoma,Positive,Positive,Negative,Lumpectomy,26-Apr-17,09-Nov-18,Dead
2,TCGA-A8-A079,69.0,FEMALE,0.21398,1.3114,-0.32747,-0.23426,III,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Other,08-Sep-17,09-Jun-18,Alive
3,TCGA-D8-A1XR,56.0,FEMALE,0.34509,-0.21147,-0.19304,0.12427,II,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Modified Radical Mastectomy,25-Jan-17,12-Jul-17,Alive
4,TCGA-BH-A0BF,56.0,FEMALE,0.22155,1.9068,0.52045,-0.31199,II,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Other,06-May-17,27-Jun-19,Dead


In [28]:
#Removing the columns not needed for classification
df = data.iloc[:, 1:]
df = df.drop(['Date_of_Surgery', 'Date_of_Last_Visit'], axis=1)

In [29]:
df.head()

Unnamed: 0,Age,Gender,Protein1,Protein2,Protein3,Protein4,Tumour_Stage,Histology,ER status,PR status,HER2 status,Surgery_type,Patient_Status
0,36.0,FEMALE,0.080353,0.42638,0.54715,0.27368,III,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Modified Radical Mastectomy,Alive
1,43.0,FEMALE,-0.42032,0.57807,0.61447,-0.031505,II,Mucinous Carcinoma,Positive,Positive,Negative,Lumpectomy,Dead
2,69.0,FEMALE,0.21398,1.3114,-0.32747,-0.23426,III,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Other,Alive
3,56.0,FEMALE,0.34509,-0.21147,-0.19304,0.12427,II,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Modified Radical Mastectomy,Alive
4,56.0,FEMALE,0.22155,1.9068,0.52045,-0.31199,II,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Other,Dead


In [30]:
df.describe()

Unnamed: 0,Age,Protein1,Protein2,Protein3,Protein4
count,334.0,334.0,334.0,334.0,334.0
mean,58.886228,-0.029991,0.946896,-0.090204,0.009819
std,12.961212,0.563588,0.911637,0.585175,0.629055
min,29.0,-2.3409,-0.97873,-1.6274,-2.0255
25%,49.0,-0.358888,0.362173,-0.513748,-0.37709
50%,58.0,0.006129,0.992805,-0.17318,0.041768
75%,68.0,0.343598,1.6279,0.278353,0.42563
max,90.0,1.5936,3.4022,2.1934,1.6299


In [31]:
df.duplicated().sum()

6

In [32]:
df.isnull().sum()

Age                7
Gender             7
Protein1           7
Protein2           7
Protein3           7
Protein4           7
Tumour_Stage       7
Histology          7
ER status          7
PR status          7
HER2 status        7
Surgery_type       7
Patient_Status    20
dtype: int64

In [33]:
df.shape

(341, 13)

In [34]:
from sklearn.preprocessing import OneHotEncoder

# Specify the categorical columns to encode
categorical_columns = ['Gender', 'Tumour_Stage', 'Histology', 'ER status', 'PR status', 'HER2 status', 'Surgery_type']

# Initialize the encoder
encoder = OneHotEncoder(sparse=False)

# Fit and transform the data for the specified columns
encoded_data = encoder.fit_transform(df[categorical_columns])

# Create a DataFrame with the encoded columns
encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(categorical_columns))

# Drop the original categorical columns and concatenate the new one-hot encoded columns
df_encoded = pd.concat([df.drop(categorical_columns, axis=1), encoded_df], axis=1)

# Display the first few rows of the encoded DataFrame
print(df_encoded.head())

    Age  Protein1  Protein2  Protein3  Protein4 Patient_Status  Gender_FEMALE  \
0  36.0  0.080353   0.42638   0.54715  0.273680          Alive            1.0   
1  43.0 -0.420320   0.57807   0.61447 -0.031505           Dead            1.0   
2  69.0  0.213980   1.31140  -0.32747 -0.234260          Alive            1.0   
3  56.0  0.345090  -0.21147  -0.19304  0.124270          Alive            1.0   
4  56.0  0.221550   1.90680   0.52045 -0.311990           Dead            1.0   

   Gender_MALE  Gender_nan  Tumour_Stage_I  ...  PR status_Positive  \
0          0.0         0.0             0.0  ...                 1.0   
1          0.0         0.0             0.0  ...                 1.0   
2          0.0         0.0             0.0  ...                 1.0   
3          0.0         0.0             0.0  ...                 1.0   
4          0.0         0.0             0.0  ...                 1.0   

   PR status_nan  HER2 status_Negative  HER2 status_Positive  HER2 status_nan  \
0    



In [67]:
# Drop rows with any missing values
df_encoded = df_encoded.dropna()

# Verify that missing values have been removed
print(df_encoded.isnull().sum())

Age                                         0
Protein1                                    0
Protein2                                    0
Protein3                                    0
Protein4                                    0
Patient_Status                              0
Gender_FEMALE                               0
Gender_MALE                                 0
Gender_nan                                  0
Tumour_Stage_I                              0
Tumour_Stage_II                             0
Tumour_Stage_III                            0
Tumour_Stage_nan                            0
Histology_Infiltrating Ductal Carcinoma     0
Histology_Infiltrating Lobular Carcinoma    0
Histology_Mucinous Carcinoma                0
Histology_nan                               0
ER status_Positive                          0
ER status_nan                               0
PR status_Positive                          0
PR status_nan                               0
HER2 status_Negative              

In [71]:
from sklearn.model_selection import train_test_split
#from imblearn.over_sampling import RandomOverSampler

# Assuming 'Patient_Status' is your target variable
X = df_encoded.drop('Patient_Status', axis=1)  # Features
y =df_encoded['Patient_Status'].map({'Alive': 1, 'Dead': 0})             # Target

# Split the data into training and testing sets (e.g., 80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)

Training set shape: (256, 28)
Testing set shape: (65, 28)


In [72]:
# Optional: Scale features (logistic regression benefits from scaling)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [73]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, precision_score

# Define the parameter grid
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2']
}

# Initialize the model with a solver that supports both l1 and l2 (e.g., 'liblinear')
model = LogisticRegression(max_iter=1000, solver='liblinear', class_weight='balanced')

# Use precision score as the scoring metric if you want to optimize for precision
grid = GridSearchCV(model, param_grid, scoring=make_scorer(precision_score), cv=5)
grid.fit(X_train_scaled, y_train)

print("Best parameters:", grid.best_params_)
print("Best precision score:", grid.best_score_)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Best parameters: {'C': 0.1, 'penalty': 'l1'}
Best precision score: 0.8092793522267205


In [76]:
best_model = LogisticRegression(C=0.1, penalty='l1', solver='liblinear', max_iter=1000, class_weight='balanced')
best_model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = best_model.predict(X_test_scaled)

# Generate and print the confusion matrix and classification report
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Confusion Matrix:
[[ 6  7]
 [30 22]]

Classification Report:
              precision    recall  f1-score   support

           0       0.17      0.46      0.24        13
           1       0.76      0.42      0.54        52

    accuracy                           0.43        65
   macro avg       0.46      0.44      0.39        65
weighted avg       0.64      0.43      0.48        65

