# SECB3203 - Programming for Bioinformatics
## Group 10 - Pancreatic Cancer Prediction
###  Group Members
- WELSON WOONG LU BIN (A23CS0196)
- RAVINESH A/L MARAN (A23CS0175)
- BERNICE LOU MIN YUN (A23CS0056)


---
## Import Libraries (Machine Learning and Prediction)

In [1]:
# For EDA
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, RobustScaler

# For Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression, BayesianRidge
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.exceptions import FitFailedWarning

---

## Progress 2 
### Importing Data Set
  - Understanding the data
  - Importing and exporting data in Python
  - Getting started analyzing data in Python
  - Python packages for Data Science




In [2]:
# import dataset from csv to python 
df = pd.read_csv(r"C:\bioinfo2\p4b\project\Pancrease_Cancer_Prediction\pancreatic_cancer_prediction_sample.csv")
df.head()

Unnamed: 0,Country,Age,Gender,Smoking_History,Obesity,Diabetes,Chronic_Pancreatitis,Family_History,Hereditary_Condition,Jaundice,...,Stage_at_Diagnosis,Survival_Time_Months,Treatment_Type,Survival_Status,Alcohol_Consumption,Physical_Activity_Level,Diet_Processed_Food,Access_to_Healthcare,Urban_vs_Rural,Economic_Status
0,Canada,64,Female,0,0,0,0,0,0,0,...,Stage III,13,Surgery,0,0,Medium,Low,High,Urban,Low
1,South Africa,77,Male,1,1,0,0,0,0,0,...,Stage III,13,Chemotherapy,0,1,Medium,Medium,Medium,Urban,Low
2,India,71,Female,0,0,0,0,0,0,0,...,Stage IV,3,Chemotherapy,1,0,Medium,High,Low,Rural,Middle
3,Germany,56,Male,0,0,0,0,1,0,1,...,Stage IV,6,Radiation,0,1,Low,Low,Medium,Rural,Middle
4,United States,82,Female,0,0,0,0,1,0,0,...,Stage IV,9,Chemotherapy,1,0,Low,Medium,Medium,Rural,Low


In [3]:
# export dataset 
df.to_csv("pancreatic_cancer_data_processed.csv", index=False)


In [4]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,50000.0,64.54094,9.973847,30.0,58.0,65.0,71.0,90.0
Smoking_History,50000.0,0.29954,0.458061,0.0,0.0,0.0,1.0,1.0
Obesity,50000.0,0.24826,0.432008,0.0,0.0,0.0,0.0,1.0
Diabetes,50000.0,0.19998,0.399989,0.0,0.0,0.0,0.0,1.0
Chronic_Pancreatitis,50000.0,0.0993,0.299067,0.0,0.0,0.0,0.0,1.0
Family_History,50000.0,0.15168,0.358714,0.0,0.0,0.0,0.0,1.0
Hereditary_Condition,50000.0,0.04944,0.216787,0.0,0.0,0.0,0.0,1.0
Jaundice,50000.0,0.19922,0.399418,0.0,0.0,0.0,0.0,1.0
Abdominal_Discomfort,50000.0,0.2965,0.456719,0.0,0.0,0.0,1.0,1.0
Back_Pain,50000.0,0.25286,0.434656,0.0,0.0,0.0,1.0,1.0


In [5]:
df.describe(include='object').T

Unnamed: 0,count,unique,top,freq
Country,50000,9,United States,17608
Gender,50000,2,Male,25962
Stage_at_Diagnosis,50000,4,Stage IV,19922
Treatment_Type,50000,3,Chemotherapy,24910
Physical_Activity_Level,50000,3,Medium,20038
Diet_Processed_Food,50000,3,Medium,20122
Access_to_Healthcare,50000,3,Medium,25268
Urban_vs_Rural,50000,2,Urban,35003
Economic_Status,50000,3,Middle,24881


### Data Wrangling (Pandas / Numpy)
  - Identifying and handling missing values
  - Data formatting
  - Data normalization (centering/scaling)
  - Binning
  - Indicator variables

In [6]:
# missing value and duplicate value check
print('Missing Value (%)')
print(df.isna().mean()*100)
print('\nDuplicate Row (%)')
print(df.duplicated().mean())

Missing Value (%)
Country                          0.0
Age                              0.0
Gender                           0.0
Smoking_History                  0.0
Obesity                          0.0
Diabetes                         0.0
Chronic_Pancreatitis             0.0
Family_History                   0.0
Hereditary_Condition             0.0
Jaundice                         0.0
Abdominal_Discomfort             0.0
Back_Pain                        0.0
Weight_Loss                      0.0
Development_of_Type2_Diabetes    0.0
Stage_at_Diagnosis               0.0
Survival_Time_Months             0.0
Treatment_Type                   0.0
Survival_Status                  0.0
Alcohol_Consumption              0.0
Physical_Activity_Level          0.0
Diet_Processed_Food              0.0
Access_to_Healthcare             0.0
Urban_vs_Rural                   0.0
Economic_Status                  0.0
dtype: float64

Duplicate Row (%)
8e-05


In [7]:
print("Number of duplicate rows BEFORE removal:", df.duplicated().sum())
df.drop_duplicates(inplace=True)
print("Number of duplicate rows AFTER removal:", df.duplicated().sum())



Number of duplicate rows BEFORE removal: 4
Number of duplicate rows AFTER removal: 0


In [8]:
# standardise column names and text values for stage_at_diagnosis
df.columns = df.columns.str.lower()
df['gender'] = df['gender'].str.lower()
df['stage_at_diagnosis'] = df['stage_at_diagnosis'].str.replace(' ', '_')

print(df.columns)
print(df['gender'].unique())
print(df['stage_at_diagnosis'].unique())



Index(['country', 'age', 'gender', 'smoking_history', 'obesity', 'diabetes',
       'chronic_pancreatitis', 'family_history', 'hereditary_condition',
       'jaundice', 'abdominal_discomfort', 'back_pain', 'weight_loss',
       'development_of_type2_diabetes', 'stage_at_diagnosis',
       'survival_time_months', 'treatment_type', 'survival_status',
       'alcohol_consumption', 'physical_activity_level', 'diet_processed_food',
       'access_to_healthcare', 'urban_vs_rural', 'economic_status'],
      dtype='object')
['female' 'male']
['Stage_III' 'Stage_IV' 'Stage_II' 'Stage_I']


In [15]:
# data normalization
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df[['age', 'survival_time_months']] = scaler.fit_transform(
    df[['age', 'survival_time_months']]
)

# actual
print(df[['age', 'survival_time_months']].describe())

# rounded
desc = df[['age', 'survival_time_months']].describe()
desc.loc['count'] = desc.loc['count'].astype(int)
desc.round(10)

                age  survival_time_months
count  4.999600e+04          4.999600e+04
mean  -1.449623e-17         -1.705439e-17
std    1.000010e+00          1.000010e+00
min   -3.463106e+00         -1.144306e+00
25%   -6.558025e-01         -7.007351e-01
50%    4.602341e-02         -3.458782e-01
75%    6.475885e-01          4.525496e-01
max    2.552545e+00          4.001118e+00


Unnamed: 0,age,survival_time_months
count,49996.0,49996.0
mean,-0.0,-0.0
std,1.00001,1.00001
min,-3.463106,-1.144306
25%,-0.655803,-0.700735
50%,0.046023,-0.345878
75%,0.647588,0.45255
max,2.552545,4.001118


In [10]:
# binning age column
df['age_group'] = pd.cut(
    df['age'],
    bins=[-3, -1, 0, 3],   # scaled values after StandardScaler
    labels=['Young', 'Middle-aged', 'Elderly']
)

print(df[['age', 'age_group']].head(10))
print(df['age_group'].value_counts())


        age    age_group
0 -0.054237  Middle-aged
1  1.249154      Elderly
2  0.647588      Elderly
3 -0.856324  Middle-aged
4  1.750458      Elderly
5 -1.558150        Young
6  0.246545      Elderly
7 -0.856324  Middle-aged
8 -1.457889        Young
9 -1.157107        Young
age_group
Elderly        25158
Middle-aged    16994
Young           7776
Name: count, dtype: int64


In [11]:
# indicator variable 
df_encoded = pd.get_dummies(
    df,
    columns=['gender', 'treatment_type', 'stage_at_diagnosis', 'urban_vs_rural'],
    drop_first=True
)

print(df_encoded.columns)
print(df_encoded.head())



Index(['country', 'age', 'smoking_history', 'obesity', 'diabetes',
       'chronic_pancreatitis', 'family_history', 'hereditary_condition',
       'jaundice', 'abdominal_discomfort', 'back_pain', 'weight_loss',
       'development_of_type2_diabetes', 'survival_time_months',
       'survival_status', 'alcohol_consumption', 'physical_activity_level',
       'diet_processed_food', 'access_to_healthcare', 'economic_status',
       'age_group', 'gender_male', 'treatment_type_Radiation',
       'treatment_type_Surgery', 'stage_at_diagnosis_Stage_II',
       'stage_at_diagnosis_Stage_III', 'stage_at_diagnosis_Stage_IV',
       'urban_vs_rural_Urban'],
      dtype='object')
         country       age  smoking_history  obesity  diabetes  \
0         Canada -0.054237                0        0         0   
1   South Africa  1.249154                1        1         0   
2          India  0.647588                0        0         0   
3        Germany -0.856324                0        0         