In [3]:
# pip install torch torchvision

Note: you may need to restart the kernel to use updated packages.


In [63]:
import os
import json
import pandas as pd
from datetime import datetime
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [64]:
folder_path = 'Breast_Cancer_Positive_Sample'

birth_dates = []
genders = []
patient_ids = []

files = os.listdir(folder_path)

for file_name in files:
    file_path = os.path.join(folder_path, file_name)
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    
    patient_id = data['entry'][0]['resource']['id']
    birth_date = data['entry'][0]['resource'].get('birthDate', 'NaN')
    gender = data['entry'][0]['resource'].get('gender', 'NaN')
    
    patient_ids.append(patient_id)
    birth_dates.append(birth_date)
    genders.append(gender)

df_positive = pd.DataFrame({'PatientID':patient_ids, 'Birthdate': birth_dates, 'Gender': genders})
df_positive.dropna(inplace=True)
df_positive['Birthdate'] = pd.to_datetime(df_positive['Birthdate'], format='%Y-%m-%d')
current_year = datetime.now().year
df_positive['Age'] = current_year - df_positive['Birthdate'].dt.year
df_positive['Gender'] = df_positive['Gender'].replace({'male': 1, 'female': 0})
df_positive = df_positive.assign(label=1)

print(df_positive)

                                 PatientID  Birthdate Gender    Age  label
0     ec0e5f5f-16ab-727f-b197-0de19a958fd5 1922-03-29      1  102.0      1
1     09fa88ee-b168-8154-0d14-9112e48da9c3 1942-11-18      0   82.0      1
2     02d7d46c-4547-4cc9-8b92-3be485999a0b 1949-01-10      0   75.0      1
3     f4417827-662f-45a6-9e1e-e6af564866bd 1956-04-23      0   68.0      1
4     0700620f-bdb2-6524-ad51-cc2e3eb0bcc0 1955-08-22      1   69.0      1
...                                    ...        ...    ...    ...    ...
3171  0a32ce42-b5fa-5a46-9206-5b4adf94e53b 1950-06-09      0   74.0      1
3172  b6905f47-cf3b-4fe0-9487-6ce226edf941 1950-08-18      0   74.0      1
3173  0fccd627-fee4-3fe4-9c86-902cd4cd4d2c 1934-06-14      0   90.0      1
3174  35544423-a31f-261d-35c3-bd37f616452f 1970-10-15      0   54.0      1
3175  d0a6bb67-fbe6-4253-b3f8-12e306c1e3fe 1956-03-09      0   68.0      1

[3176 rows x 5 columns]


In [65]:
folder_path = 'Breast_Cancer_Negative_Sample'

birth_dates = []
genders = []
patient_ids = []

files = os.listdir(folder_path)

for file_name in files:
    file_path = os.path.join(folder_path, file_name)
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    
    patient_id = data['entry'][0]['resource']['id']
    birth_date = data['entry'][0]['resource'].get('birthDate', 'NaN')
    gender = data['entry'][0]['resource'].get('gender', 'NaN')
    
    patient_ids.append(patient_id)
    birth_dates.append(birth_date)
    genders.append(gender)

df_negative = pd.DataFrame({'PatientID':patient_ids, 'Birthdate': birth_dates, 'Gender': genders})
df_negative.dropna(inplace=True)
df_negative['Birthdate'] = pd.to_datetime(df_negative['Birthdate'], format='%Y-%m-%d')
current_year = datetime.now().year
df_negative['Age'] = current_year - df_negative['Birthdate'].dt.year
df_negative['Gender'] = df_negative['Gender'].replace({'male': 1, 'female': 0})
df_negative = df_negative.assign(label=0)

print(df_negative)

                                 PatientID  Birthdate  Gender  Age  label
0     9d98f98d-ada6-84e5-f39e-13cd57dfafe7 1932-04-17       1   92      0
1     c03d39ba-ba9b-4fc7-a6cc-db154181f8c0 1954-12-15       1   70      0
2     d2c605fc-fc07-81ed-242b-66330cbdd486 1968-01-02       0   56      0
3     b3ad18dd-9466-e052-9c06-9d1055cdff64 1970-10-12       0   54      0
4     abf45661-0226-820f-a32f-898459db5bd6 1939-01-27       0   85      0
...                                    ...        ...     ...  ...    ...
4328  aac2b95b-9bee-4a85-35d6-b88707a9c368 1941-05-28       0   83      0
4329  08f5a962-838d-27ea-ecd0-2a25c6316b9e 1950-01-17       0   74      0
4330  be87e04e-6639-5342-2571-a27815c6307b 1953-03-16       0   71      0
4331  7245c2d1-34fd-0438-878c-32696a638f57 1942-09-07       0   82      0
4332  2f1d925e-4a13-00b1-814a-7d2d966bf719 1962-08-14       0   62      0

[4333 rows x 5 columns]


In [141]:
df_combined = pd.concat([df_positive, df_negative], ignore_index=True)
df_combined = df_combined.dropna()
X = df_combined[['Gender','Age']]
y = df_combined['label']

In [169]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.01, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1)

In [170]:
class BreastCancerRiskModel(nn.Module):
    def __init__(self, input_size):
        super(BreastCancerRiskModel, self).__init__()
        self.fc1 = nn.Linear(input_size, 128)  
        self.fc2 = nn.Linear(128, 128)  
        self.fc3 = nn.Linear(128, 128)  
        self.fc_final = nn.Linear(128, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.relu(self.fc3(x))
        x = self.sigmoid(self.fc_final(x))
        return x

model = BreastCancerRiskModel(input_size=X_train_tensor.shape[1])

criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

num_epochs = 50
for epoch in range(num_epochs):
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
        
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
        
with torch.no_grad():
    model.eval()
    outputs = model(X_test_tensor)
    # Convert probabilities to binary predictions using a threshold of 0.5
    predicted_labels = (outputs >= 0.5).float()
    accuracy = (predicted_labels == y_test_tensor).float().mean().item()
    print(f'Accuracy: {accuracy:.4f}')


Accuracy: 0.7763


In [None]:
model = BreastCancerRiskModel(input_size=2)

while True:
    try:
        age = float(input("Please enter your age: "))
        gender = input("Please enter your gender (male/female): ")
        gender = 1 if gender.lower() == 'male' else 0

        input_data = torch.tensor([[age, gender]], dtype=torch.float32)
        
        # Step 3: Pass the preprocessed data through the model to get the risk score
        with torch.no_grad():
            model.eval()
            risk_score = model(input_data)

        # Step 4: Interpret the model output
        # Print the risk score
        print(f"The estimated breast cancer risk for the patient based on their age and gender is: {risk_score.item():.4f}")

    except ValueError:
        print("Please enter a valid age.")

Please enter your age: 80
Please enter your gender (male/female): female
The estimated breast cancer risk for the patient based on their age and gender is: 0.8170
Please enter your age: 80
Please enter your gender (male/female): male
The estimated breast cancer risk for the patient based on their age and gender is: 0.8168
Please enter your age: 20
Please enter your gender (male/female): female
The estimated breast cancer risk for the patient based on their age and gender is: 0.5938
Please enter your age: 50
Please enter your gender (male/female): female
The estimated breast cancer risk for the patient based on their age and gender is: 0.7188
