In [None]:
print("Setting up the environment...")

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt

print("Libraries imported successfully.\n")


Setting up the environment...
Libraries imported successfully.



In [None]:
# Mounting Google Drive
# print("Mounting Google Drive...")
# from google.colab import drive
# drive.mount('/content/drive')

# Set the file paths (update paths if necessary)
file1_path = 'data/merging_df4_df5.csv'  # Replace with actual file path
file2_path = 'data/yr_diagno(2010-21).csv'  # Replace with actual file path

# Load datasets
import pandas as pd

print("Loading datasets...")
df1 = pd.read_csv(file1_path)
df2 = pd.read_csv(file2_path)

print(f"Dataset 1 loaded with {df1.shape[0]} rows and {df1.shape[1]} columns.")
print(f"Dataset 2 loaded with {df2.shape[0]} rows and {df2.shape[1]} columns.\n")

Mounting Google Drive...
Mounted at /content/drive
Loading datasets...
Dataset 1 loaded with 1778000 rows and 6 columns.
Dataset 2 loaded with 440 rows and 2 columns.



In [None]:
# Remove entirely duplicate rows from Dataset 1
df1 = df1.drop_duplicates()
print(f"Dataset 1 after removing duplicates: {df1.shape[0]} rows and {df1.shape[1]} columns.")
print("Preview of Dataset 1 after removing duplicates:")
print(df1.head(), "\n")

# Remove entirely duplicate rows from Dataset 2
df2 = df2.drop_duplicates()
print(f"Dataset 2 after removing duplicates: {df2.shape[0]} rows and {df2.shape[1]} columns.")
print("Preview of Dataset 2 after removing duplicates:")
print(df2.head(), "\n")


Dataset 1 after removing duplicates: 11520 rows and 6 columns.
Preview of Dataset 1 after removing duplicates:
  Primary Site - labeled Tumor Size Over Time Recode (1988+)  \
0       C18.9-Colon, NOS                             001-400   
1       C18.9-Colon, NOS                             001-400   
2       C18.9-Colon, NOS                             001-400   
3       C18.9-Colon, NOS                             001-400   
4       C18.9-Colon, NOS                             001-400   

  Behavior code ICD-O-3    Diagnostic Confirmation              Sex  \
0                Benign  Microscopically confirmed  Male and female   
1                Benign  Microscopically confirmed  Male and female   
2                Benign  Microscopically confirmed  Male and female   
3                Benign  Microscopically confirmed  Male and female   
4                Benign  Microscopically confirmed  Male and female   

                   Reason no cancer-directed surgery  
0                     

In [None]:
merged_data = pd.merge(df1, df2, on='Primary Site - labeled', how='inner')
print(f"Merged dataset has {merged_data.shape[0]} rows and {merged_data.shape[1]} columns.")
print("Preview of merged dataset:")
print(merged_data.head())

Merged dataset has 253440 rows and 7 columns.
Preview of merged dataset:
  Primary Site - labeled Tumor Size Over Time Recode (1988+)  \
0       C18.9-Colon, NOS                             001-400   
1       C18.9-Colon, NOS                             001-400   
2       C18.9-Colon, NOS                             001-400   
3       C18.9-Colon, NOS                             001-400   
4       C18.9-Colon, NOS                             001-400   

  Behavior code ICD-O-3    Diagnostic Confirmation              Sex  \
0                Benign  Microscopically confirmed  Male and female   
1                Benign  Microscopically confirmed  Male and female   
2                Benign  Microscopically confirmed  Male and female   
3                Benign  Microscopically confirmed  Male and female   
4                Benign  Microscopically confirmed  Male and female   

  Reason no cancer-directed surgery Year of diagnosis  
0                 Surgery performed         1992-2021  
1  

In [None]:
# Filter data based on 'Year of diagnosis'
print("Filtering data based on 'Year of diagnosis'...")

# Converting 'Year of diagnosis' to numeric and filtering
merged_data['Year of diagnosis'] = pd.to_numeric(merged_data['Year of diagnosis'], errors='coerce')
filtered_data = merged_data[merged_data['Year of diagnosis'] >= 2011]  # Filtering for year >= 1998

print(f"Filtered dataset has {filtered_data.shape[0]} rows and {filtered_data.shape[1]} columns.")
print(f"Columns in the dataset: {list(filtered_data.columns)}\n")


Filtering data based on 'Year of diagnosis'...
Filtered dataset has 126720 rows and 7 columns.
Columns in the dataset: ['Primary Site - labeled', 'Tumor Size Over Time Recode (1988+)', 'Behavior code ICD-O-3', 'Diagnostic Confirmation', 'Sex', 'Reason no cancer-directed surgery', 'Year of diagnosis']



In [None]:
print("Filtered dataset preview:")
print(filtered_data.head(), "\n")

Filtered dataset preview:
   Primary Site - labeled Tumor Size Over Time Recode (1988+)  \
11       C18.9-Colon, NOS                             001-400   
12       C18.9-Colon, NOS                             001-400   
13       C18.9-Colon, NOS                             001-400   
14       C18.9-Colon, NOS                             001-400   
15       C18.9-Colon, NOS                             001-400   

   Behavior code ICD-O-3    Diagnostic Confirmation              Sex  \
11                Benign  Microscopically confirmed  Male and female   
12                Benign  Microscopically confirmed  Male and female   
13                Benign  Microscopically confirmed  Male and female   
14                Benign  Microscopically confirmed  Male and female   
15                Benign  Microscopically confirmed  Male and female   

   Reason no cancer-directed surgery  Year of diagnosis  
11                 Surgery performed             2011.0  
12                 Surgery perform

In [None]:
print("Columns in filtered_data:")
print(filtered_data.columns.tolist())

Columns in filtered_data:
['Primary Site - labeled', 'Tumor Size Over Time Recode (1988+)', 'Behavior code ICD-O-3', 'Diagnostic Confirmation', 'Sex', 'Reason no cancer-directed surgery', 'Year of diagnosis']


In [None]:
print("Preprocessing data...")

# Encoding categorical features
categorical_cols = ['Behavior code ICD-O-3', 'Diagnostic Confirmation', 'Sex', 'Reason no cancer-directed surgery']
filtered_data = pd.get_dummies(filtered_data, columns=categorical_cols)

print(f"Data after encoding categorical columns has {filtered_data.shape[1]} features.")
print("Preview of data after encoding:")
print(filtered_data.head(), "\n")

# Process 'Tumor Size Over Time Recode (1988+)' into numeric values

# Replace invalid values with a default value (e.g., '0-0')
filtered_data['Tumor Size Over Time Recode (1988+)'] = filtered_data[
    'Tumor Size Over Time Recode (1988+)'
].where(
    filtered_data['Tumor Size Over Time Recode (1988+)'].str.contains(r'^\d+-\d+$', na=False),
    '0-0'  # Replace invalid values with '0-0'
)

filtered_data['Tumor Size'] = (
    filtered_data['Tumor Size Over Time Recode (1988+)']
    .str.split('-').str[1].astype(float)
)
filtered_data.drop(columns=['Tumor Size Over Time Recode (1988+)'], inplace=True)

print("Processed 'Tumor Size Over Time Recode (1988+)' into numeric values.")
print("Preview of data after processing 'Tumor Size':")
print(filtered_data.head(), "\n")

# Drop missing values if any
missing_before = filtered_data.isnull().sum().sum()
filtered_data.dropna(inplace=True)
missing_after = filtered_data.isnull().sum().sum()

print(f"Missing values before: {missing_before}, after: {missing_after}.")
print("Final preprocessed data preview:")
print(filtered_data.head(), "\n")


Preprocessing data...
Data after encoding categorical columns has 28 features.
Preview of data after encoding:
   Primary Site - labeled Tumor Size Over Time Recode (1988+)  \
11       C18.9-Colon, NOS                             001-400   
12       C18.9-Colon, NOS                             001-400   
13       C18.9-Colon, NOS                             001-400   
14       C18.9-Colon, NOS                             001-400   
15       C18.9-Colon, NOS                             001-400   

    Year of diagnosis  Behavior code ICD-O-3_Benign  \
11             2011.0                          True   
12             2012.0                          True   
13             2013.0                          True   
14             2014.0                          True   
15             2015.0                          True   

    Behavior code ICD-O-3_Borderline malignancy  \
11                                        False   
12                                        False   
13            

In [None]:
!free -h

               total        used        free      shared  buff/cache   available
Mem:            12Gi       1.4Gi       6.7Gi       1.0Mi       4.5Gi        10Gi
Swap:             0B          0B          0B


In [None]:
from sklearn.feature_selection import SelectKBest, mutual_info_classif

# Define features (X) and target (y)
X = filtered_data.drop(columns=['Primary Site - labeled', 'Year of diagnosis'])  # Drop non-relevant columns
y = (filtered_data['Year of diagnosis'] >= 1990).astype(int)  # Define target based on year of diagnosis

# Select top 10 features
selector = SelectKBest(score_func=mutual_info_classif, k=10)
X_selected = selector.fit_transform(X, y)

# Get selected feature names
selected_features = X.columns[selector.get_support()]
print(f"Selected features: {list(selected_features)}")

# Create a DataFrame for the selected features
X = pd.DataFrame(X_selected, columns=selected_features)
print("Preview of selected features:")
print(X.head())


Selected features: ['Behavior code ICD-O-3_Benign', 'Behavior code ICD-O-3_Borderline malignancy', 'Behavior code ICD-O-3_In situ', 'Behavior code ICD-O-3_Malignant', 'Sex_Female', 'Sex_Male', 'Sex_Male and female', 'Reason no cancer-directed surgery_Not recommended', 'Reason no cancer-directed surgery_Unknown; death certificate; or autopsy only (2003+)', 'Tumor Size']
Preview of selected features:
   Behavior code ICD-O-3_Benign  Behavior code ICD-O-3_Borderline malignancy  \
0                           1.0                                          0.0   
1                           1.0                                          0.0   
2                           1.0                                          0.0   
3                           1.0                                          0.0   
4                           1.0                                          0.0   

   Behavior code ICD-O-3_In situ  Behavior code ICD-O-3_Malignant  Sex_Female  \
0                            0.0    

In [None]:
print("Unique classes in target variable `y`:", y.unique())
print("Class distribution in `y`:\n", y.value_counts())


Unique classes in target variable `y`: [1]
Class distribution in `y`:
 Year of diagnosis
1    126720
Name: count, dtype: int64


In [None]:
# Check the range of 'Year of diagnosis'
print("Range of 'Year of diagnosis':")
print(filtered_data['Year of diagnosis'].min(), "to", filtered_data['Year of diagnosis'].max())

# Adjust the target definition
if filtered_data['Year of diagnosis'].max() >= 2010:
    print("Using a different threshold for target definition...")
    y = (filtered_data['Year of diagnosis'] >= 2010).astype(int)
else:
    print("Switching to a different feature for target variable...")
    y = (filtered_data['Tumor Size'] > 200).astype(int)

# Verify the new target
print("Target variable distribution:")
print(y.value_counts())

# Proceed with SMOTE if both classes exist
if len(y.unique()) > 1:
    print("Proceeding with SMOTE...")
    from imblearn.over_sampling import SMOTE
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X, y)
    print("Class distribution after SMOTE:")
    print(pd.Series(y_resampled).value_counts())
else:
    print("Target variable still has only one class. Adjust your filtering or target definition.")


Range of 'Year of diagnosis':
2011.0 to 2021.0
Using a different threshold for target definition...
Target variable distribution:
Year of diagnosis
1    126720
Name: count, dtype: int64
Target variable still has only one class. Adjust your filtering or target definition.


In [None]:
print("Class distribution in target variable `y`:")
print(y.value_counts())

Class distribution in target variable `y`:
Year of diagnosis
1    126720
Name: count, dtype: int64


In [None]:
# from sklearn.model_selection import train_test_split

# # Assuming 'X' is your features DataFrame and 'y' is your target variable
# print("Splitting the dataset into training and testing sets...")

# # Split data into 70% training and 30% testing
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# print(f"Training set size: {len(X_train)} samples")
# print(f"Testing set size: {len(X_test)} samples")

# Redefine the target variable
print("Redefining the target variable...")
y = (filtered_data['Year of diagnosis'] >= 2015).astype(int)  # Adjust threshold as needed

# Verify the target distribution
print("Target variable distribution after redefining:")
print(y.value_counts())

# Check if both classes exist
if len(y.unique()) < 2:
    print("Error: The target variable still has only one class. Adjust your threshold or feature for classification.")
else:
    # Stratified split to ensure both classes are represented
    from sklearn.model_selection import train_test_split

    print("Splitting the dataset into training and testing sets with stratification...")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )

    # Verify class distribution in train and test sets
    print("Class distribution in `y_train` after stratified splitting:")
    print(y_train.value_counts())
    print("Class distribution in `y_test` after stratified splitting:")
    print(y_test.value_counts())



Redefining the target variable...
Target variable distribution after redefining:
Year of diagnosis
1    80640
0    46080
Name: count, dtype: int64
Splitting the dataset into training and testing sets with stratification...
Class distribution in `y_train` after stratified splitting:
Year of diagnosis
1    56448
0    32256
Name: count, dtype: int64
Class distribution in `y_test` after stratified splitting:
Year of diagnosis
1    24192
0    13824
Name: count, dtype: int64


In [None]:
print("Features being used for training:")
print(X.columns)
print(f"Number of features: {X.shape[1]}")

Features being used for training:
Index(['Behavior code ICD-O-3_Benign',
       'Behavior code ICD-O-3_Borderline malignancy',
       'Behavior code ICD-O-3_In situ', 'Behavior code ICD-O-3_Malignant',
       'Sex_Female', 'Sex_Male', 'Sex_Male and female',
       'Reason no cancer-directed surgery_Not recommended',
       'Reason no cancer-directed surgery_Unknown; death certificate; or autopsy only (2003+)',
       'Tumor Size'],
      dtype='object')
Number of features: 10


In [None]:
from sklearn.ensemble import VotingClassifier, BaggingClassifier, AdaBoostClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.datasets import make_classification

# Generate synthetic data for example
X, y = make_classification(n_samples=1000, n_features=20, n_classes=2, random_state=42, weights=[0.7, 0.3])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define basic classifiers
rf = RandomForestClassifier(random_state=42)
dt = DecisionTreeClassifier(random_state=42)
lr = LogisticRegression(max_iter=1000, random_state=42)

# Meta classifiers
bagging_rf = BaggingClassifier(estimator=rf, n_estimators=10, random_state=42)
adaboost_dt = AdaBoostClassifier(estimator=dt, n_estimators=50, random_state=42)
subspace_rf = BaggingClassifier(estimator=rf, n_estimators=10, max_features=0.5, random_state=42)

# Combine classifiers using ensemble voting
voting_clf = VotingClassifier(
    estimators=[
        ('bagging_rf', bagging_rf),
        ('adaboost_dt', adaboost_dt),
        ('subspace_rf', subspace_rf)
    ],
    voting='soft'  # 'soft' for probability-based voting
)

# Train the models and evaluate
print("Training classifiers...")

# Fit individual meta classifiers
bagging_rf.fit(X_train, y_train)
adaboost_dt.fit(X_train, y_train)
subspace_rf.fit(X_train, y_train)

# Fit the voting classifier
voting_clf.fit(X_train, y_train)

# Evaluate individual meta classifiers
print("\nEvaluating classifiers:")
for name, clf in [('Bagging RF', bagging_rf), ('AdaBoost DT', adaboost_dt), ('Subspace RF', subspace_rf)]:
    y_pred = clf.predict(X_test)
    y_proba = clf.predict_proba(X_test)[:, 1]
    print(f"{name} - Accuracy: {accuracy_score(y_test, y_pred):.4f}, AUC: {roc_auc_score(y_test, y_proba):.4f}")

# Evaluate the voting classifier
y_pred_voting = voting_clf.predict(X_test)
y_proba_voting = voting_clf.predict_proba(X_test)[:, 1]
print(f"\nVoting Classifier - Accuracy: {accuracy_score(y_test, y_pred_voting):.4f}, AUC: {roc_auc_score(y_test, y_proba_voting):.4f}")

print("Model training and evaluation complete.")


Training classifiers...





Evaluating classifiers:
Bagging RF - Accuracy: 0.8533, AUC: 0.9026
AdaBoost DT - Accuracy: 0.8567, AUC: 0.8255
Subspace RF - Accuracy: 0.8467, AUC: 0.8957

Voting Classifier - Accuracy: 0.8900, AUC: 0.9050
Model training and evaluation complete.


In [None]:
# from sklearn.ensemble import VotingClassifier
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.linear_model import LogisticRegression

# print("Training an ensemble VotingClassifier...")

# # Define individual models
# rf = RandomForestClassifier(random_state=42)
# dt = DecisionTreeClassifier(random_state=42)
# lr = LogisticRegression(max_iter=1000, random_state=42)

# # Create a VotingClassifier
# voting_clf = VotingClassifier(estimators=[('rf', rf), ('dt', dt), ('lr', lr)], voting='soft')

# # Train the model
# voting_clf.fit(X_train, y_train)

# print("Model training complete.")


In [None]:
print("Class distribution in `y_train`:")
print(y_train.value_counts())

Class distribution in `y_train`:


AttributeError: 'numpy.ndarray' object has no attribute 'value_counts'

In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, roc_curve
import matplotlib.pyplot as plt

# Predict on the test data
print("Evaluating the model...")
y_pred = voting_clf.predict(X_test)
y_prob = voting_clf.predict_proba(X_test)[:, 1]  # Predicted probabilities for ROC-AUC

# Compute metrics
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_prob)

# Print metrics
print(f"Accuracy: {accuracy:.2f}")
print(f"ROC-AUC: {roc_auc:.2f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Plot the ROC Curve
fpr, tpr, _ = roc_curve(y_test, y_prob)
plt.figure()
plt.plot(fpr, tpr, label=f"ROC curve (area = {roc_auc:.2f})")
plt.plot([0, 1], [0, 1], 'k--')  # Diagonal line for random guessing
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend(loc="lower right")
plt.show()


In [None]:
from imblearn.over_sampling import SMOTE

# Apply SMOTE to balance the dataset
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

print("Class distribution after SMOTE:")
print(pd.Series(y_resampled).value_counts())

# Train the model again with the resampled data
voting_clf.fit(X_resampled, y_resampled)


In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, roc_curve
import matplotlib.pyplot as plt

# Predict on the test data
print("Evaluating the model...")
y_pred = voting_clf.predict(X_test)
y_prob = voting_clf.predict_proba(X_test)[:, 1]  # Predicted probabilities for ROC-AUC

# Compute metrics
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_prob)

# Print metrics
print(f"Accuracy: {accuracy:.2f}")
print(f"ROC-AUC: {roc_auc:.2f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Plot the ROC Curve
fpr, tpr, _ = roc_curve(y_test, y_prob)
plt.figure()
plt.plot(fpr, tpr, label=f"ROC curve (area = {roc_auc:.2f})")
plt.plot([0, 1], [0, 1], 'k--')  # Diagonal line for random guessing
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend(loc="lower right")
plt.show()


Evaluating the model...


NameError: name 'voting_clf' is not defined

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb

# Initialize individual models with class weights
rf = RandomForestClassifier(random_state=42, class_weight={0: 3, 1: 1})
dt = DecisionTreeClassifier(random_state=42, class_weight={0: 3, 1: 1})  # Apply class weights to DecisionTree
lr = LogisticRegression(max_iter=1000, random_state=42)

xgb_model = xgb.XGBClassifier(random_state=42, scale_pos_weight=3)  # You can adjust the scale_pos_weight based on your dataset

# Create the VotingClassifier with the additional XGBoost model
voting_clf = VotingClassifier(estimators=[('rf', rf), ('dt', dt), ('lr', lr), ('xgb', xgb_model)], voting='soft')


# Train the VotingClassifier
print("Training the VotingClassifier with class weights...")
voting_clf.fit(X_train, y_train)

# Model is trained
print("Model training complete.")


In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, roc_curve
import matplotlib.pyplot as plt

# Predict on the test data
print("Evaluating the model...")
y_pred = voting_clf.predict(X_test)
y_prob = voting_clf.predict_proba(X_test)[:, 1]  # Predicted probabilities for ROC-AUC

# Compute metrics
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_prob)

# Print metrics
print(f"Accuracy: {accuracy:.2f}")
print(f"ROC-AUC: {roc_auc:.2f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Plot the ROC Curve
fpr, tpr, _ = roc_curve(y_test, y_prob)
plt.figure()
plt.plot(fpr, tpr, label=f"ROC curve (area = {roc_auc:.2f})")
plt.plot([0, 1], [0, 1], 'k--')  # Diagonal line for random guessing
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend(loc="lower right")
plt.show()


In [None]:
from sklearn.model_selection import cross_val_score

# Perform cross-validation
cv_scores = cross_val_score(voting_clf, X, y, cv=5, scoring='roc_auc')  # You can also use accuracy or other metrics
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean cross-validation ROC-AUC: {cv_scores.mean():.2f}")


In [None]:
# Remove any special characters or spaces from column names
X_train.columns = X_train.columns.str.replace('[^A-Za-z0-9_]+', '_', regex=True)
X_test.columns = X_test.columns.str.replace('[^A-Za-z0-9_]+', '_', regex=True)

# Now train the LightGBM model
import lightgbm as lgb
from sklearn.metrics import accuracy_score, roc_auc_score

# Use LightGBM for classification
lgb_model = lgb.LGBMClassifier(class_weight='balanced', random_state=42)
lgb_model.fit(X_train, y_train)

# Evaluate performance
y_pred_lgb = lgb_model.predict(X_test)
print(f"Accuracy of LightGBM: {accuracy_score(y_test, y_pred_lgb):.2f}")
print(f"ROC-AUC of LightGBM: {roc_auc_score(y_test, lgb_model.predict_proba(X_test)[:, 1]):.2f}")


In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score

# Initialize individual models
rf = RandomForestClassifier(random_state=42, class_weight='balanced')
dt = DecisionTreeClassifier(random_state=42, class_weight='balanced')
lr = LogisticRegression(max_iter=1000, random_state=42)

# Create VotingClassifier
voting_clf = VotingClassifier(estimators=[('rf', rf), ('dt', dt), ('lr', lr)], voting='soft')

# Train the model
voting_clf.fit(X_train, y_train)

# Make predictions
y_pred = voting_clf.predict(X_test)
y_prob = voting_clf.predict_proba(X_test)[:, 1]  # Probabilities for AUC

# Evaluate model
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_prob)

print(f"Accuracy: {accuracy:.2f}")
print(f"ROC-AUC: {roc_auc:.2f}")


In [None]:
from imblearn.over_sampling import SMOTE

# Apply SMOTE to balance the dataset
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Verify the class distribution after SMOTE
print(f"Class distribution after SMOTE:\n{pd.Series(y_train_resampled).value_counts()}")


In [None]:
import xgboost as xgb
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

# Initialize individual models
rf = RandomForestClassifier(random_state=42, class_weight='balanced')
dt = DecisionTreeClassifier(random_state=42, class_weight='balanced')
lr = LogisticRegression(max_iter=1000, random_state=42)

# Define XGBoost model
xgb_model = xgb.XGBClassifier(random_state=42, scale_pos_weight=3)  # Use scale_pos_weight for imbalanced data

# Create the VotingClassifier with XGBoost
voting_clf = VotingClassifier(estimators=[('rf', rf), ('dt', dt), ('lr', lr), ('xgb', xgb_model)], voting='soft')

# Train the VotingClassifier on the resampled data
voting_clf.fit(X_train_resampled, y_train_resampled)

# Model is trained
print("Model training complete.")


In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Define parameter grid for RandomForest
param_dist_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Randomized Search for RandomForest
random_search_rf = RandomizedSearchCV(estimator=RandomForestClassifier(random_state=42),
                                      param_distributions=param_dist_rf,
                                      n_iter=10, cv=5,
                                      scoring='roc_auc',
                                      n_jobs=-1, random_state=42, verbose=1)

# Fit the RandomizedSearchCV model
random_search_rf.fit(X_train_resampled, y_train_resampled)

# Get the best estimator
best_rf_model = random_search_rf.best_estimator_

# Add the tuned model to VotingClassifier
voting_clf = VotingClassifier(estimators=[('rf', best_rf_model), ('dt', dt), ('lr', lr), ('xgb', xgb_model)], voting='soft')

# Train the updated VotingClassifier
voting_clf.fit(X_train_resampled, y_train_resampled)


In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, roc_curve
import matplotlib.pyplot as plt

# Predict on the test data
y_pred = voting_clf.predict(X_test)
y_prob = voting_clf.predict_proba(X_test)[:, 1]  # Probabilities for ROC-AUC

# Compute metrics
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_prob)

# Print metrics
print(f"Accuracy: {accuracy:.2f}")
print(f"ROC-AUC: {roc_auc:.2f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Plot the ROC Curve
fpr, tpr, _ = roc_curve(y_test, y_prob)
plt.figure()
plt.plot(fpr, tpr, label=f"ROC curve (area = {roc_auc:.2f})")
plt.plot([0, 1], [0, 1], 'k--')  # Diagonal line for random guessing
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend(loc="lower right")
plt.show()


In [None]:
import lightgbm as lgb

# Initialize LightGBM model
lgb_model = lgb.LGBMClassifier(class_weight='balanced', random_state=42)

# Train the model
lgb_model.fit(X_train_resampled, y_train_resampled)

# Evaluate performance
y_pred_lgb = lgb_model.predict(X_test)
print(f"Accuracy of LightGBM: {accuracy_score(y_test, y_pred_lgb):.2f}")
print(f"ROC-AUC of LightGBM: {roc_auc_score(y_test, lgb_model.predict_proba(X_test)[:, 1]):.2f}")


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Initialize RandomForest model (you can also adjust parameters as needed)
rf_model = RandomForestClassifier(random_state=42, class_weight='balanced')

# Train the model on the resampled data (if you're using resampling like SMOTE)
rf_model.fit(X_train_resampled, y_train_resampled)

# Alternatively, if you haven't used resampling, use X_train and y_train
# rf_model.fit(X_train, y_train)

import matplotlib.pyplot as plt
import seaborn as sns

# Get feature importances from the trained model
importances = rf_model.feature_importances_

# Get the feature names from the training data
features = X_train_resampled.columns  # or X_train.columns if you haven't resampled

# Create a DataFrame with feature names and importance scores
feature_importance = pd.DataFrame({'Feature': features, 'Importance': importances})

# Sort the DataFrame by importance
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)

# Plot the feature importance
plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=feature_importance)
plt.title('Feature Importance (Random Forest)')
plt.show()


In [None]:
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Initialize RandomForest model (you can also adjust parameters as needed)
rf_model = RandomForestClassifier(random_state=42, class_weight='balanced')

# Train the model on the resampled data (if you're using resampling like SMOTE)
rf_model.fit(X_train_resampled, y_train_resampled)

# Alternatively, if you haven't used resampling, use X_train and y_train
# rf_model.fit(X_train, y_train)

importances = rf_model.feature_importances_

# Check if X_train_resampled is a NumPy array, if so, manually define the feature names
if isinstance(X_train_resampled, np.ndarray):
    # Define the feature names manually if X_train_resampled is a NumPy array
    features = ['Behavior code ICD-O-3_Benign',
                'Behavior code ICD-O-3_Borderline malignancy',
                'Behavior code ICD-O-3_In situ',
                'Behavior code ICD-O-3_Malignant',
                'Sex_Female',
                'Sex_Male',
                'Sex_Male and female',
                'Reason no cancer-directed surgery_Recommended but not performed, unknown reason',
                'Reason no cancer-directed surgery_Surgery performed',
                'Tumor Size']  # Make sure these match the columns used during training
else:
    # If X_train_resampled is a DataFrame, use its columns directly
    features = X_train_resampled.columns

# Create a DataFrame with feature names and importance scores
feature_importance = pd.DataFrame({'Feature': features, 'Importance': importances})

# Sort the DataFrame by importance
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)

# Plot the feature importance
plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=feature_importance)
plt.title('Feature Importance (Random Forest)')
plt.show()


In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Generate confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Plot confusion matrix as heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Predicted Survived', 'Predicted Not Survived'],
            yticklabels=['Actual Survived', 'Actual Not Survived'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()


In [None]:
from sklearn.metrics import roc_curve, auc

# Compute ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_prob)

# Plot the ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], 'k--')  # Diagonal line for random guessing
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic Curve (ROC)')
plt.legend(loc="lower right")
plt.show()


In [None]:
from sklearn.metrics import precision_recall_curve, average_precision_score

# Calculate precision-recall curve
precision, recall, _ = precision_recall_curve(y_test, y_prob)

# Calculate average precision score (AUC-PR)
auc_pr = average_precision_score(y_test, y_prob)

# Plot Precision-Recall curve
plt.figure(figsize=(8, 6))
plt.plot(recall, precision, label=f'Precision-Recall curve (AUC = {auc_pr:.2f})', color='blue')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend(loc="lower left")
plt.show()

print(f"AUC-PR: {auc_pr:.2f}")


In [None]:
# Check the column names in the training data
print(X_train.columns)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Get feature importances from the trained RandomForest model
importances = rf_model.feature_importances_

# Use correct column names from your dataset
features = X_train.columns

# Create a DataFrame with feature names and importance scores
feature_importance = pd.DataFrame({'Feature': features, 'Importance': importances})

# Sort the DataFrame by importance
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)

# Plot the feature importance
plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=feature_importance)
plt.title('Feature Importance (Random Forest)')
plt.show()


In [None]:
# Plot the distribution of 'Tumor Size' and 'Age at Diagnosis'
plt.figure(figsize=(12, 6))

plt.subplot(1, 2, 1)
sns.histplot(X_train['Tumor_Size'], kde=True, bins=30, color='skyblue')
plt.title('Tumor Size Distribution')

plt.tight_layout()
plt.show()


In [None]:
# Calculate correlation matrix
corr_matrix = X_train.corr()

# Plot heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm', center=0)
plt.title('Correlation Heatmap')
plt.show()


In [None]:
# Plot the gender distribution (Male vs Female)
plt.figure(figsize=(8, 6))
sns.countplot(x='Sex_Male', data=X_train)  # Adjust column name based on how your data is encoded
plt.title('Gender Distribution (Male vs Female)')
plt.xlabel('Gender')
plt.ylabel('Count')
plt.xticks(ticks=[0, 1], labels=['Female', 'Male'])
plt.show()


In [None]:
# Assuming that `y_pred` contains the predicted labels and `y_test` contains actual survival outcomes

# Add gender to the predictions data
predictions_df = pd.DataFrame({
    'Gender': X_test['Sex_Male'],  # Adjust this column name as needed
    'Predicted_Survival': y_pred
})

# Map gender values for clarity
predictions_df['Gender'] = predictions_df['Gender'].map({0: 'Female', 1: 'Male'})

# Plot the predicted survival rates for Male vs Female
plt.figure(figsize=(8, 6))
sns.countplot(x='Gender', hue='Predicted_Survival', data=predictions_df, palette='Set1')
plt.title('Predicted Survival by Gender (Male vs Female)')
plt.xlabel('Gender')
plt.ylabel('Count')
plt.xticks(ticks=[0, 1], labels=['Female', 'Male'])
plt.legend(title='Survived', loc='upper right', labels=['Not Survived', 'Survived'])
plt.show()


In [None]:
# Plot Tumor Size distribution by Gender (Male vs Female)
plt.figure(figsize=(8, 6))
sns.boxplot(x='Sex_Male', y='Tumor_Size', data=X_train, palette='Set1')
plt.title('Tumor Size Distribution by Gender')
plt.xlabel('Gender')
plt.ylabel('Tumor Size (in cm)')
plt.xticks(ticks=[0, 1], labels=['Female', 'Male'])
plt.show()


In [None]:
print(X_train.columns)



In [None]:
print(merged_data.columns)

In [None]:
print(merged_data[['Year of diagnosis']].head())

In [None]:
# Group the data by year of diagnosis and count the occurrences
diagnoses_by_year = merged_data.groupby('Year of diagnosis').size()

# Plot the number of diagnoses per year
plt.figure(figsize=(10, 6))
diagnoses_by_year.plot(kind='line', marker='o', color='blue')
plt.title('Number of Diagnoses Over the Years')
plt.xlabel('Year of Diagnosis')
plt.ylabel('Number of Diagnoses')
plt.grid(True)
plt.show()


In [None]:
unique_years = merged_data['Year of diagnosis'].unique()
print("Unique years in the dataset:", unique_years)

In [None]:
missing_years = merged_data['Year of diagnosis'].isnull().sum()
print(f"Missing values in 'Year of diagnosis': {missing_years}")

In [None]:
# Check the distribution of diagnoses by year
diagnoses_by_year = merged_data['Year of diagnosis'].value_counts().sort_index()

# Print the distribution
print(diagnoses_by_year)

# Plot the distribution of diagnoses by year
plt.figure(figsize=(10, 6))
diagnoses_by_year.plot(kind='bar', color='skyblue')
plt.title('Number of Diagnoses by Year of Diagnosis')
plt.xlabel('Year of Diagnosis')
plt.ylabel('Number of Diagnoses')
plt.xticks(rotation=45)
plt.show()


In [None]:
# Check the count of diagnoses per year (if you suspect the data is grouped)
diagnoses_by_year = merged_data.groupby('Year of diagnosis').size()

# Print the counts of diagnoses for each year
print(diagnoses_by_year)


In [None]:
# Check the minimum and maximum years in the dataset
min_year = merged_data['Year of diagnosis'].min()
max_year = merged_data['Year of diagnosis'].max()

print(f"Year range in the dataset: {min_year} - {max_year}")
