## Loading the required Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, classification_report
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

import warnings
warnings.filterwarnings('ignore')

## Importing the dataset

In [2]:
gall = pd.read_csv("E:/data sets/Gall bladder cancer/gallbladder.csv")

In [None]:
gall.head()

In [None]:
# Checking the datatypes
gall.dtypes

In [None]:
# checking for missing values
gall.isnull().sum()

In [None]:
# checking for duplicates
gall.duplicated().sum()

In [None]:
gall.describe()

In [3]:
gall = gall.drop(columns=['Survival_Months'],axis=1)
gallnumerics = gall.select_dtypes(exclude = 'object')

In [None]:
sns.set(style="whitegrid")
for i in range(len(gallnumerics)):
    plt.subplot(5,3,i+1)
    plt.title(f"Histogram of {gallnumerics.columns[i]}")
    sns.histplot(gallnumerics, x=gallnumerics.columns[i],kde = True)
plt.tight_layout()    
plt.show()

In [4]:
gall = gall.drop(columns=['Patient_ID'],axis=1)
categories = gall.select_dtypes(include = 'object')


In [None]:
for col in categories:
    sns.countplot(x = col, data = gall, palette = 'magma')
    plt.title(f"Distribution of {col}")
    plt.show()
              

## Correlation analysis

In [None]:
corr_matrix = gallnumerics.corr()
sns.heatmap(gallnumerics.corr(numeric_only = True),cmap ='coolwarm', annot = True,fmt='.2f',vmin=-1, vmax=1)
plt.title('Correlologram of Gall Bladder Cancer')

In [None]:
## Use of nested loop to iterate over all the possible pairs
for i in range(len(gallnumerics.columns)):
    for j in range(i + 1, len(gallnumerics.columns)):
        ## Get the feature names of the pair
        feature_1 = gallnumerics.columns[i]
        feature_2 = gallnumerics.columns[j]
        correlation_value = gallnumerics[feature_1].corr(gallnumerics[feature_2])
        ## conditional statement of correlation
        if correlation_value > 0.7:
            interpretation = 'Strong positive correlation'
        elif 0.3 < correlation_value <= 0.7:
            interpretation = 'Moderate positive Correlation'
        elif -0.3 <= correlation_value <= -1:
            interpretation = 'Weak negative Correlation'
        elif 0 < correlation_value <= 0.3:
            interpretation = 'Weak positive correlation'
        elif -0.7 <= correlation_value <-0.3:
            interpretation = 'Moderate negative correlation'
        else:
            interpretation = 'weak negative Correlation'
        print(f"Correlation between {feature_1} and {feature_2}: {correlation_value:.2f} : {interpretation}")    
    
   
       

## Boxplot

In [None]:
for i in range(len(gallnumerics)):
    plt.subplot(5,3,i+1)
    plt.title(f"Boxplot of {gallnumerics.columns[i]}")
    sns.boxplot(gallnumerics, x=gallnumerics.columns[i])
plt.tight_layout()    
plt.show()

In [None]:
gall['Outcome'].value_counts()

In [None]:
gall['Outcome'].value_counts()/len(gall)

In [5]:
le = LabelEncoder()

for col in gall.select_dtypes(include=['object','category']).columns:
    gall[col] = le.fit_transform(gall[col])

In [6]:
gall = gall.replace({True: 1, False:0})

In [7]:
## Independent varibles and the target variable(dependent)
X = gall[['Age','Gender','Stage','Ethnicity','Smoking_History','Alcohol_Consumption','Family_History','Diabetes','Gallstones','Jaundice','CEA_Level']]
y = gall['Outcome']

In [8]:
# Splitting the data
X_train, X_test, y_train,y_test = train_test_split(X,y,test_size=0.2, random_state=2)

In [9]:
model = LogisticRegression()

In [10]:
#Training the Logistic Regression model
model.fit(X_train,y_train)

In [11]:
#Accuracy score
X_test_prediction = model.predict(X_test)
testing_data_accuracy = accuracy_score(X_test_prediction,y_test)
class_report = classification_report(X_test_prediction, y_test)
print(class_report)

              precision    recall  f1-score   support

           0       0.09      0.53      0.16        15
           1       0.94      0.57      0.71       185

    accuracy                           0.56       200
   macro avg       0.51      0.55      0.43       200
weighted avg       0.87      0.56      0.67       200



In [12]:
#Saving the model
import pickle
filename = 'gall_cancer_model.sav'
pickle.dump(model,open(filename, 'wb'))

In [13]:
#Loading the saved model
loaded_model = pickle.load(open('gall_cancer_model.sav','rb'))

In [None]:
#model = RandomForestClassifier(random_state = 2)

In [None]:
# Hyperparameter grid for tuning
#param = {
 #   'n_estimators': [400,800,1200],
  #  'max_depth': [30,50,70],
   # 'min_samples_split': [5,10],
    #'min_samples_leaf': [2,4]
#} 

In [None]:
# using gridsearchCV
#grid_search = GridSearchCV(
 #   estimator = model,
  #  param_grid = param,
   # cv = 10,
    #n_jobs = -1,
    #verbose = 5,
    #scoring = 'accuracy'
#)
#grid_search.fit(X_train,y_train)

In [None]:
#print("Best parameters:",
 #    grid_search.best_params_)
#best_model = grid_search.best_estimator_

In [None]:
#y_pred =best_model.predict(X_test)

In [None]:
## Predict on the test set
#y_pred = best_model.predict_proba(X_test)[:,1]

In [None]:
#accuracy = accuracy_score(y_test,y_pred)
#print(f"Accuracy: {accuracy:.4f}")

In [None]:
#X_test_prediction = best_model.predict(X_test)
#testing_data_accuracy = accuracy_score(X_test_prediction,y_test)
#class_report = classification_report(X_test_prediction, y_test)
#print(class_report)

In [None]:
# ROC AUC score 
#roc = roc_auc_score(y_test,y_pred)
#print(f"ROC AUC: {roc:.4f}")

In [None]:
# saving the model
#import pickle
#filename = 'galled_model.sav'
#pickle.dump(model,open(filename,'wb'))


In [None]:
#loading the model
#loaded_model = pickle.load(open('galled_model.sav','rb'))