In [None]:
# General Libraries
import numpy as np  # For numerical computations
import pandas as pd  # For data manipulation and analysis
import seaborn as sns  # For statistical data visualization
from sklearn.preprocessing import StandardScaler, MinMaxScaler  # For scaling features
from sklearn.impute import SimpleImputer  # For handling missing values
# Model Selection and Training
from sklearn.linear_model import LinearRegression, LogisticRegression  # Example models
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.svm import SVC  # Support Vector Classifier
# Hyperparameter Tuning
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV  # For tuning
# Evaluation Metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import mean_squared_error, r2_score  # For regression models

In [None]:
dataset= pd.read_csv('train.csv')

In [322]:
dataset

Unnamed: 0,id,cancer_stage,age,gender,treatment_type,treatment_duration,bmi,survived
0,1388370,3.0,0.302885,0.0,3.0,0.360146,0.061721,0
1,2271027,1.0,0.298077,0.0,3.0,0.681901,0.040882,0
2,2064058,3.0,0.346154,1.0,3.0,0.691042,0.125071,0
3,1961956,1.0,0.254808,1.0,2.0,0.025594,0.102312,1
4,491759,4.0,0.322115,0.0,2.0,0.407678,0.127523,0
...,...,...,...,...,...,...,...,...
431995,1333116,2.0,0.274038,1.0,3.0,0.884826,0.122749,0
431996,2027790,3.0,0.298077,1.0,1.0,0.254113,0.123213,0
431997,1264819,3.0,0.341346,0.0,2.0,0.521024,0.096373,0
431998,3234807,2.0,0.250000,1.0,1.0,0.446069,0.069056,0


In [None]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 432000 entries, 0 to 431999
Data columns (total 22 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   id                  432000 non-null  int64  
 1   age                 421222 non-null  float64
 2   gender              413617 non-null  object 
 3   country             417551 non-null  object 
 4   diagnosis_date      432000 non-null  object 
 5   cancer_stage        408665 non-null  object 
 6   family_history      432000 non-null  object 
 7   smoking_status      406741 non-null  object 
 8   bmi                 411640 non-null  float64
 9   cholesterol_level   425898 non-null  float64
 10  hypertension        432000 non-null  int64  
 11  asthma              432000 non-null  int64  
 12  cirrhosis           432000 non-null  object 
 13  other_cancer        432000 non-null  object 
 14  treatment_type      413949 non-null  object 
 15  end_treatment_date  432000 non-nul

In [None]:
numerical_data = dataset.select_dtypes(include=['float64', 'int64'])
correlation = numerical_data.corr()
print(correlation["survived"])
dataset['gender'] = dataset['gender'].map({'Male': 1, 'Female': 0}) #it has two status

id                   0.001238
age                 -0.173306
bmi                 -0.023874
cholesterol_level   -0.063192
hypertension         0.015853
asthma              -0.173575
comorbidity_score   -0.200493
survived             1.000000
Name: survived, dtype: float64


In [None]:
cancer_stage_mapping = {
    'Stage I': 1,
    'Stage II': 2,
    'Stage III': 3,
    'Stage IV': 4,
    'Stage 1': 1,
    'Stage 2': 2,
    'Stage 3': 3,
    'Stage 4': 4,
     'Stage one': 1,
    'Stage two': 2,
    'Stage three': 3,
    'Stage four': 4,
}
dataset['cancer_stage'] = dataset['cancer_stage'].map(cancer_stage_mapping)


In [None]:
# Define the custom mapping for smoking status
smoking_status_mapping = {
    'Never Smoked': 0,
    'Light Smoker': 1,
    'Moderate Smoker': 2,
    'Occasional Smoker': 3,
    'Heavy Smoker': 4,
    'Former Smoker': 5
}

# Apply the mapping to the smoking_status column
dataset['smoking_status'] = dataset['smoking_status'].map(smoking_status_mapping)



In [None]:

treatment_type_mapping = {
    'Chemotherapy': 1,
    'Radiation': 2,
    'Surgery': 3,
    'Targeted Therapy': 4,
    'Immunotherapy': 5,
    'Hormone Therapy': 6,
    'Palliative Care': 7,
}

dataset['treatment_type'] = dataset['treatment_type'].map(treatment_type_mapping)

In [None]:

diet_type_mapping = {
    'Vegetarian':1,
    'Pescatarian':2,
    'Omnivore':3,
    'Vegan':4
}
dataset['diet_type'] = dataset['diet_type'].map(diet_type_mapping)


In [None]:
dataset = dataset.drop(['country', 'occupation_type', 'marital_status','education_level'], axis=1)
dataset

Unnamed: 0,id,age,gender,diagnosis_date,cancer_stage,family_history,smoking_status,bmi,cholesterol_level,hypertension,asthma,cirrhosis,other_cancer,treatment_type,end_treatment_date,comorbidity_score,diet_type,survived
0,1388370,54.0,0.0,2019-03-13,,1,0.0,18.308815,188.738536,1,0,1,0,3.0,2020-03-27,3.565785,1.0,0
1,2271027,53.0,0.0,2022-11-22,1.0,1,,11.948707,246.918596,0,1,1,NON,3.0,2024-05-31,3.245782,2.0,0
2,2064058,63.0,1.0,2014-09-09,3.0,1,,37.643908,266.184808,1,0,0,no,3.0,2016-03-23,1.622766,1.0,0
3,1961956,44.0,1.0,2014-08-25,1.0,0,3.0,,227.632301,0,0,1,0,2.0,2015-03-10,,1.0,1
4,491759,58.0,,2020-01-11,4.0,0,,38.392270,263.429276,1,1,0,0,2.0,2021-02-20,2.788087,3.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
431995,1333116,48.0,1.0,2017-01-04,2.0,0,5.0,36.935237,180.286197,1,1,0,1,3.0,2018-11-02,5.193529,3.0,0
431996,2027790,53.0,1.0,2022-06-21,3.0,1,3.0,37.076944,273.780148,1,1,yes,0,1.0,2023-05-09,4.659336,3.0,0
431997,1264819,62.0,0.0,2021-01-17,,1,,28.885104,291.515584,1,0,1,NON,2.0,2022-04-30,3.584620,3.0,0
431998,3234807,43.0,1.0,2019-06-18,2.0,1,,20.547698,271.589043,1,1,0,0,,2020-08-18,2.699991,3.0,0


In [None]:
# Convert the columns to datetime format if they're not already
dataset['diagnosis_date'] = pd.to_datetime(dataset['diagnosis_date'])
dataset['end_treatment_date'] = pd.to_datetime(dataset['end_treatment_date'])

# Calculate the difference between end_treatment_date and diagnosis_date
dataset['treatment_duration'] = (dataset['end_treatment_date'] - dataset['diagnosis_date']).dt.days #treatement duration with days

In [None]:
dataset = dataset.drop(['diagnosis_date', 'end_treatment_date'], axis=1)

In [None]:

family_history_mapping = {
    '1':1,
    '0':0,
   'true':1,
    'False':0,
    'No':0,
    'no':0,
    'yes':1,
    'Yes':1
}

dataset['family_history'] = dataset['family_history'].map(family_history_mapping)

cirrhosis_mapping = {
    '1':1,
    '0':0,
    'No':0,
    'no':0,
    'yes':1,
    'Yes':1
}

dataset['cirrhosis'] = dataset['cirrhosis'].map(cirrhosis_mapping)

other_cancer_mapping= {
    '1':1,
    '0':0,
    'No':0,
    'no':0,
    'yes':1,
    'Yes':1,
    'NON':0
}

dataset['other_cancer'] = dataset['other_cancer'].map(other_cancer_mapping)


In [None]:
dataset.corr()['survived'] #to see the corrolation

Unnamed: 0,survived
id,0.001238
age,-0.173306
gender,-0.145994
cancer_stage,-0.467956
family_history,-0.232365
smoking_status,-0.073482
bmi,-0.023874
cholesterol_level,-0.063192
hypertension,0.015853
asthma,-0.173575


In [None]:
dataset = dataset[['id','cancer_stage','age','gender','treatment_type','treatment_duration','bmi','survived']]
dataset

Unnamed: 0,id,cancer_stage,age,gender,treatment_type,treatment_duration,bmi,survived
0,1388370,,54.0,0.0,3.0,380,18.308815,0
1,2271027,1.0,53.0,0.0,3.0,556,11.948707,0
2,2064058,3.0,63.0,1.0,3.0,561,37.643908,0
3,1961956,1.0,44.0,1.0,2.0,197,,1
4,491759,4.0,58.0,,2.0,406,38.392270,0
...,...,...,...,...,...,...,...,...
431995,1333116,2.0,48.0,1.0,3.0,667,36.935237,0
431996,2027790,3.0,53.0,1.0,1.0,322,37.076944,0
431997,1264819,,62.0,0.0,2.0,468,28.885104,0
431998,3234807,2.0,43.0,1.0,,427,20.547698,0


In [None]:
dataset.isnull().sum()

Unnamed: 0,0
id,0
cancer_stage,50560
age,10778
gender,89179
treatment_type,161809
treatment_duration,0
bmi,20360
survived,0


In [None]:
# Exclude the 'id' column
null_counts = dataset.drop('id', axis=1).isnull().sum(axis=1)

# Count how many rows have at least one null value
rows_with_nulls = (null_counts > 0).sum()
print(f"Number of rows with at least one null value (excluding 'id' column): {rows_with_nulls}")

Number of rows with at least one null value (excluding 'id' column): 252295


In [None]:
dataset.loc[:, 'cancer_stage'] = dataset['cancer_stage'].fillna(dataset['cancer_stage'].mode()[0])

In [None]:
# 2. 'age' - Impute with the median
dataset.loc[:, 'age'] = dataset['age'].fillna(dataset['age'].median())

# 3. 'gender' - Impute with the mode
dataset.loc[:, 'gender'] = dataset['gender'].fillna(dataset['gender'].mode()[0])

# 4. 'treatment_type' - Impute with the mode
dataset.loc[:, 'treatment_type'] = dataset['treatment_type'].fillna(dataset['treatment_type'].mode()[0])

# 5. 'bmi' - Impute with the median
dataset.loc[:, 'bmi'] = dataset['bmi'].fillna(dataset['bmi'].median())

# Verify there are no missing values remaining
print(dataset.isnull().sum())


id                    0
cancer_stage          0
age                   0
gender                0
treatment_type        0
treatment_duration    0
bmi                   0
survived              0
dtype: int64


In [None]:

numerical_features = ['age', 'bmi', 'treatment_duration']

# Initialize the scaler
scaler = StandardScaler()

# Scale the numerical features and assign them back explicitly
dataset.loc[:, numerical_features] = scaler.fit_transform(dataset[numerical_features])



# Define the numerical features to scale
numerical_features = ['age', 'bmi', 'treatment_duration']

# Initialize the scaler
scaler = MinMaxScaler()

# Apply the scaler to the numerical features

dataset.loc[:, numerical_features] = scaler.fit_transform(dataset[numerical_features]).astype('float64')


# Verify the scaling
print(dataset.head())



        id  cancer_stage       age  gender  treatment_type  \
0  1388370           3.0  0.302885     0.0             3.0   
1  2271027           1.0  0.298077     0.0             3.0   
2  2064058           3.0  0.346154     1.0             3.0   
3  1961956           1.0  0.254808     1.0             2.0   
4   491759           4.0  0.322115     0.0             2.0   

   treatment_duration       bmi  survived  
0            0.360146  0.061721         0  
1            0.681901  0.040882         0  
2            0.691042  0.125071         0  
3            0.025594  0.102312         1  
4            0.407678  0.127523         0  


In [354]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from scipy.stats import randint

# Assuming dataset is already loaded as a DataFrame
# Features and target
X = dataset.drop(['survived', 'id'], axis=1)  # Drop 'survived' (target) and 'id' (non-informative feature)
y = dataset['survived']  # Target feature

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the XGBoost Classifier
model = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')

# Hyperparameter space
param_dist = {
    'n_estimators': randint(50, 150),  # Number of trees
    'max_depth': randint(5, 20),       # Maximum tree depth
    'learning_rate': [0.01, 0.05, 0.1, 0.2],  # Learning rate
    'min_child_weight': randint(1, 5), # Minimum sum of instance weight needed in a child
    'subsample': [0.6, 0.8, 1.0],      # Fraction of samples used for training each tree
    'colsample_bytree': [0.6, 0.8, 1.0] # Fraction of features used for each tree
}

# RandomizedSearchCV for XGBoost
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=10,        # Number of parameter settings to sample
    cv=2,             # Cross-validation with 2 folds
    n_jobs=-1,        # Use all CPU cores
    random_state=42,  # For reproducibility
    verbose=1         # Print progress
)

# Train the model
random_search.fit(X_train, y_train)

# Generate predictions using the best model found
y_pred = random_search.best_estimator_.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

# Prepare the final output with 'id' and predictions
final_output = dataset.loc[X_test.index, ['id']].copy()  # Retrieve 'id' column for the test set
final_output['survived'] = y_pred  # Add predictions

# Save or display the final output
print(final_output.head())
final_output.to_csv('predictions.csv', index=False)


Fitting 2 folds for each of 10 candidates, totalling 20 fits


Parameters: { "use_label_encoder" } are not used.



Model Accuracy: 0.74
             id  survived
413297  2701383         0
341194   413433         0
382540  1822544         0
329718  2337185         1
150554  2558575         0


In [356]:
#the testing section


test_data = pd.read_csv('test.csv')
numerical_data = test_data.select_dtypes(include=['float64', 'int64'])
correlation = numerical_data.corr()
# Step 2: Preprocess the test data

dataset['gender'] = dataset['gender'].map({'Male': 1, 'Female': 0}) #it has two status


In [357]:
cancer_stage_mapping = {
    'Stage I': 1,
    'Stage II': 2,
    'Stage III': 3,
    'Stage IV': 4,
    'Stage 1': 1,
    'Stage 2': 2,
    'Stage 3': 3,
    'Stage 4': 4,
     'Stage one': 1,
    'Stage two': 2,
    'Stage three': 3,
    'Stage four': 4,
}
test_data['cancer_stage'] =test_data['cancer_stage'].map(cancer_stage_mapping)

smoking_status_mapping = {
    'Never Smoked': 0,
    'Light Smoker': 1,
    'Moderate Smoker': 2,
    'Occasional Smoker': 3,
    'Heavy Smoker': 4,
    'Former Smoker': 5
}
test_data['smoking_status'] = test_data['smoking_status'].map(smoking_status_mapping)
treatment_type_mapping = {
    'Chemotherapy': 1,
    'Radiation': 2,
    'Surgery': 3,
    'Targeted Therapy': 4,
    'Immunotherapy': 5,
    'Hormone Therapy': 6,
    'Palliative Care': 7,
}

test_data['treatment_type'] = test_data['treatment_type'].map(treatment_type_mapping)
diet_type_mapping = {
    'Vegetarian':1,
    'Pescatarian':2,
    'Omnivore':3,
    'Vegan':4
}
test_data['diet_type'] =test_data['diet_type'].map(diet_type_mapping)
# Convert the columns to datetime format if they're not already
test_data['diagnosis_date'] = pd.to_datetime(test_data['diagnosis_date'])
test_data['end_treatment_date'] = pd.to_datetime(test_data['end_treatment_date'])

# Calculate the difference between end_treatment_date and diagnosis_date
test_data['treatment_duration'] = (test_data['end_treatment_date'] - test_data['diagnosis_date']).dt.days
family_history_mapping = {
    '1':1,
    '0':0,
   'true':1,
    'False':0,
    'No':0,
    'no':0,
    'yes':1,
    'Yes':1
}

test_data['family_history'] = test_data['family_history'].map(family_history_mapping)

cirrhosis_mapping = {
    '1':1,
    '0':0,
    'No':0,
    'no':0,
    'yes':1,
    'Yes':1
}

test_data['cirrhosis'] = test_data['cirrhosis'].map(cirrhosis_mapping)

other_cancer_mapping= {
    '1':1,
    '0':0,
    'No':0,
    'no':0,
    'yes':1,
    'Yes':1,
    'NON':0
}

test_data['other_cancer'] = test_data['other_cancer'].map(other_cancer_mapping)


In [358]:
test_data = test_data[['id','cancer_stage','age','gender','treatment_type','treatment_duration','bmi']]
dataset['gender'] = dataset['gender'].map({'Male': 1, 'Female': 0}) #it has two status
gender_mapping ={
    'Male':0,
    'Female':1
}
test_data['gender'] =test_data['gender'].map(gender_mapping)
test_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['gender'] =test_data['gender'].map(gender_mapping)


Unnamed: 0,id,cancer_stage,age,gender,treatment_type,treatment_duration,bmi
0,2141894,1.0,53.0,1.0,1.0,244,33.663288
1,50270,2.0,50.0,1.0,3.0,522,45.021176
2,1273289,1.0,46.0,,3.0,226,41.501656
3,2589854,1.0,,,1.0,369,27.102252
4,753728,2.0,41.0,0.0,2.0,320,
...,...,...,...,...,...,...,...
107995,1791212,4.0,62.0,0.0,,434,
107996,157790,3.0,54.0,,1.0,227,19.030969
107997,922153,1.0,58.0,0.0,3.0,520,32.883689
107998,1655262,2.0,57.0,1.0,,214,35.302971


In [359]:
# 2. 'age' - Impute with the median
test_data.loc[:, 'age'] = test_data['age'].fillna(test_data['age'].median())

# 3. 'gender' - Impute with the mode
test_data.loc[:, 'gender'] = test_data['gender'].fillna(test_data['gender'].mode()[0])

# 4. 'treatment_type' - Impute with the mode
test_data.loc[:, 'treatment_type'] = test_data['treatment_type'].fillna(test_data['treatment_type'].mode()[0])

# 5. 'bmi' - Impute with the median
test_data.loc[:, 'bmi'] = test_data['bmi'].fillna(test_data['bmi'].median())

# Verify there are no missing values remaining
print(test_data.isnull().sum())

id                        0
cancer_stage          12531
age                       0
gender                    0
treatment_type            0
treatment_duration        0
bmi                       0
dtype: int64


In [360]:
# 2. 'age' - Impute with the median
test_data.loc[:, 'age'] = test_data['age'].fillna(test_data['age'].median())

# 3. 'gender' - Impute with the mode
test_data.loc[:, 'gender'] = dataset['gender'].fillna(test_data['gender'].mode()[0])

# 4. 'treatment_type' - Impute with the mode
test_data.loc[:, 'treatment_type'] = test_data['treatment_type'].fillna(test_data['treatment_type'].mode()[0])

# 5. 'bmi' - Impute with the median
test_data.loc[:, 'bmi'] = dataset['bmi'].fillna(test_data['bmi'].median())

# Verify there are no missing values remaining
print(dataset.isnull().sum())

test_data.loc[:, 'cancer_stage'] = test_data['cancer_stage'].fillna(test_data['cancer_stage'].mode()[0])

id                         0
cancer_stage               0
age                        0
gender                432000
treatment_type             0
treatment_duration         0
bmi                        0
survived                   0
dtype: int64


In [361]:
numerical_features = ['age', 'bmi', 'treatment_duration']
scaler = StandardScaler()
test_data.loc[:, numerical_features] = scaler.fit_transform(test_data[numerical_features])
numerical_features = ['age', 'bmi', 'treatment_duration']
scaler = MinMaxScaler()

# Apply the scaler to the numerical features

test_data.loc[:, numerical_features] = scaler.fit_transform(test_data[numerical_features]).astype('float64')

 -0.96734569]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  test_data.loc[:, numerical_features] = scaler.fit_transform(test_data[numerical_features])


In [363]:
# Drop 'id' (if it exists in the test data) and ensure the same structure
X_test_final = test_data.drop(['id'], axis=1)

# Step 4: Generate predictions using the trained RandomizedSearchCV model
y_pred_test = random_search.predict(X_test_final)

# Step 5: Prepare the final output with 'id' and 'survived'
final_output_test = test_data[['id']].copy()  # Keep 'id' column
final_output_test['survived'] = y_pred_test  # Add predictions

# Step 6: Save the results to a CSV or display
final_output_test.to_csv('submission_example.csv', index=False)

# Alternatively, print the first few rows of the result
print(final_output_test.head())

        id  survived
0  2141894         1
1    50270         0
2  1273289         1
3  2589854         1
4   753728         1
