# Lung Cancer Analysis
Objective: Perform EDA and modeling

# Import necessary packages

In [3]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler

# Load Data

In [4]:
# The default is the Kaggle input mount. Set the LUNG_CANCER_DATA_PATH
# environment variable to a local copy of the CSV to run the notebook elsewhere.
DATA_PATH = os.environ.get(
    "LUNG_CANCER_DATA_PATH",
    "/kaggle/input/lung-cancer-risk-in-25-countries/lung_cancer_prediction_dataset.csv",
)
data = pd.read_csv(DATA_PATH)
# Work on a copy so the frame as loaded stays available for reference.
df = data.copy()
df.head()

FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/lung-cancer-risk-in-25-countries/lung_cancer_prediction_dataset.csv'

# Exploratory Data Analysis

In [None]:
def basic_data_insights(df):
    """Print a quick structural overview of a DataFrame.

    Reports the shape, the column names, the ``df.info()`` summary, the
    percentage of missing values per column, the number of duplicated rows
    and the output of ``df.describe()``.

    Args:
        df: pandas DataFrame to summarise.

    Returns:
        None. Everything is written to standard output.
    """
    print("\n Dimension of data: ", df.shape)
    print("\n Columns of data: \n", df.columns.to_list())
    # DataFrame.info() prints its own report and returns None, so wrapping it
    # in print() would add a stray "None" line to the output.
    df.info()
    print("\n Percentage of null values in each column: \n",
          df.isna().mean() * 100)
    print("\n Total duplicate records: ", df.duplicated().sum())
    print("\n Basic Statistical summary: ", df.describe())


# Print the overview for the working copy of the dataset.
basic_data_insights(df)
    

The dataset has 220,632 records and 24 columns. `Lung_Cancer_Diagnosis` appears to be the target variable.
There are 15 categorical features and 9 numerical features.
The `Cancer_Stage` and `Treatment_Type` columns have more than 90% null values. Columns this sparse would normally be dropped; in the next step they are instead imputed with the column mode.
The basic statistical summary is shown above.

In [None]:
# Handling Missing Values
# Imputing missing values with mode.

def impute_with_mode(df, columns):
    """Fill missing values in the given columns with each column's mode.

    Args:
        df: pandas DataFrame to impute. It is modified in place.
        columns: iterable of column names to impute. Names that are not
            present in ``df`` are reported on stdout and skipped.

    Returns:
        The same DataFrame object, with the requested columns imputed.
    """
    for col in columns:
        if col not in df.columns:
            print(f"Column {col} not found in data")
            continue

        modes = df[col].mode()
        if modes.empty:
            # An all-null column has no mode; indexing the empty result
            # would raise, so report it and leave the column as it is.
            print(f"Column {col} has no non-null values; left unchanged")
            continue

        # Assign the filled column back instead of calling
        # fillna(inplace=True) on the df[col] selection: the chained in-place
        # form is deprecated and does not update `df` once pandas
        # copy-on-write is active.
        df[col] = df[col].fillna(modes.iloc[0])

    return df

# Impute the two columns reported above as mostly null.
columns = ['Cancer_Stage','Treatment_Type']
df = impute_with_mode(df,columns)

# Validation: both null counts should be 0 after imputation.
print(df[['Cancer_Stage','Treatment_Type']].isna().sum())

There are no missing values in data now.

In [None]:
# Visualizing categorical columns
def plot_cat_distribution(df,cat_cols):
    """Draw one count plot per categorical column.

    Args:
        df: DataFrame holding the columns to plot.
        cat_cols: iterable of column names; each gets its own figure.

    FutureWarning messages are silenced (process-wide) before plotting.
    """
    def _draw_counts(column_name):
        # One 20x9 figure per column, titled with the column name.
        plt.figure(figsize=(20, 9))
        sns.countplot(data=df, x=column_name)
        plt.title(f"Distribution of {column_name}")
        plt.show()
        print("\n")

    warnings.simplefilter(action='ignore', category=FutureWarning)
    for column_name in cat_cols:
        _draw_counts(column_name)
        print("\n")

# Treat every object-dtype column as categorical and plot each one.
cat_cols = df.select_dtypes(include=['object']).columns
plot_cat_distribution(df, cat_cols)
    

In [None]:
# Visualize numerical column's distribution
def plot_num_distribution(df, num_cols):
    """Plot a histogram with a KDE overlay for each numerical column.

    Args:
        df: DataFrame holding the columns to plot.
        num_cols: iterable of column names; each gets its own figure.

    FutureWarning messages are silenced (process-wide) before plotting.
    """
    warnings.simplefilter(action='ignore', category=FutureWarning)
    for feature in num_cols:
        heading = f"Distribution of {feature}"
        plt.figure(figsize=(17, 9))
        sns.histplot(df[feature], kde=True)
        plt.title(heading)
        plt.show()
        print("\n")
        print("\n")

# Treat the int64/float64 columns as numerical features and plot each one.
num_cols = df.select_dtypes(include=['int64','float64']).columns
plot_num_distribution(df, num_cols)

In [None]:
# Correlation between numerical features
def plot_correlation_heatmap(df):
    """Plot an annotated heatmap of pairwise correlations between numeric columns.

    Non-numeric columns are ignored, so a whole DataFrame can be passed
    without pre-selecting its numerical features.

    Args:
        df: pandas DataFrame containing the features to correlate.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Keep only numeric/bool columns before correlating: DataFrame.corr()
    # raises on string columns in pandas >= 2.0.
    corr_matrix = df.select_dtypes(include=['number', 'bool']).corr()

    plt.figure(figsize=(12, 8))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
    plt.title('Correlation Heatmap')
    plt.show()

# Plot correlations for the columns selected in num_cols.
plot_correlation_heatmap(df[num_cols])

A few numerical features are correlated with each other:
- Mortality rate and survival years.
- Years of smoking and cigarettes per day.
- Population size and annual lung cancer deaths.

In [None]:
# Relation between categorical features and target variable ie Lung_Cancer_Diagnosis
def plot_target_cat(df, cat_cols):
    """Show how each categorical feature relates to the lung cancer diagnosis.

    For every column in ``cat_cols`` except ``Lung_Cancer_Diagnosis`` (the
    target itself) and ``Country``, prints a cross-tabulation against the
    target (with margins) and draws a count plot of the feature split by
    diagnosis.

    Args:
        df: DataFrame containing ``Lung_Cancer_Diagnosis`` and the columns
            named in ``cat_cols``.
        cat_cols: iterable of categorical column names.

    FutureWarning messages are silenced (process-wide) before plotting.
    """
    warnings.simplefilter(action='ignore', category=FutureWarning)
    skipped = ('Lung_Cancer_Diagnosis', 'Country')
    for col in cat_cols:
        if col in skipped:
            continue

        print(f"\n Relation between Lung Cancer and {col}")
        crstab = pd.crosstab(df['Lung_Cancer_Diagnosis'], df[col], margins=True)
        print(crstab)
        print("\n")

        plt.figure(figsize=(12, 6))
        # Pass the frame by keyword, as plot_cat_distribution does: this works
        # wherever `data` sits in countplot's signature.
        sns.countplot(data=df, x=col, hue='Lung_Cancer_Diagnosis')
        plt.title(f"Relation between Lung Cancer and {col}")
        plt.show()
        print("\n")

# Compare every categorical column with the target.
plot_target_cat(df,cat_cols)


We can see how Lung cancer is related to other categorical features.

In [None]:
# Chi square test to check significant association of categorical columns with Lung Cancer
# it will give p value
from scipy.stats import chi2_contingency

for col in cat_cols:
    # Testing the target against itself is meaningless (it is always
    # "significant"), so skip it.
    if col == 'Lung_Cancer_Diagnosis':
        continue

    print(f"Chi-Square Test for {col}:")

    # Contingency table between the categorical column and the target, both
    # taken from the same working frame `df`.
    contingency_table = pd.crosstab(df[col], df['Lung_Cancer_Diagnosis'])

    # Chi-square test of independence; only the p-value and the expected
    # counts are reported below.
    chi2, p, dof, expected = chi2_contingency(contingency_table)

    print(f"p-value: {p}")

    # Expected counts for each cell under the independence hypothesis.
    print("Expected frequencies:")
    print(expected)

    # A p-value below 0.05 is treated as a statistically significant association.
    if p < 0.05:
        print(f"Conclusion: There is a statistically significant relationship between {col} and Lung Cancer.")
    else:
        print(f"Conclusion: There is no statistically significant relationship between {col} and Lung Cancer.")

    print("\n")

The following categorical columns have a statistically significant relationship with lung cancer:
1. Treatment Type
2. Adenocarcinoma_Type
3. Cancer Stage
4. Gender
5. Smoker

In [None]:
# Distribution of target variable using piechart

def plot_target_pie_chart(df, target_column):
    """Draw a pie chart of how often each value of ``target_column`` occurs.

    Args:
        df: DataFrame containing ``target_column``.
        target_column: name of the column whose value counts are plotted.
    """
    class_counts = df[target_column].value_counts()
    slice_colors = ['#66b3ff', '#99ff99']

    plt.figure(figsize=(14, 8))
    class_counts.plot.pie(autopct='%1.1f%%', startangle=90, colors=slice_colors)
    plt.title(f'Distribution of {target_column}')
    # Blank out the y-axis label on the pie.
    plt.ylabel('')
    plt.show()

# Show the class balance of the target column.
plot_target_pie_chart(df, 'Lung_Cancer_Diagnosis')


4.1% of the people in the dataset have been diagnosed with lung cancer.

# Machine Learning

# Data Preprocessing and Feature Engineering

In [None]:
# Import additional libraries
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import  GradientBoostingClassifier, AdaBoostClassifier

In [None]:
# Separating features and target.
# Build the modelling matrices from the cleaned frame `df`, not the raw `data`
# copy, so the missing-value handling done during EDA carries into the models.
# NOTE(review): Cancer_Stage and Treatment_Type were >90% null before
# imputation - presumably they are only recorded for diagnosed patients, which
# would leak the target. Confirm before trusting the scores.
X = df.drop(columns=['Lung_Cancer_Diagnosis','ID'])
y = df['Lung_Cancer_Diagnosis']
print("\n Shape of X: ", X.shape)
print("\n Shape of y: ", y.shape)

In [None]:
# Identify categorical columns: every object-dtype feature in X.
# These are the columns handed to the one-hot encoder in the preprocessing step.
categorical_features = X.select_dtypes(include=['object']).columns
print(categorical_features)

In [None]:
# Creating the preprocessing pipeline.
# Categorical columns are one-hot encoded (dropping the first level of each);
# an ordinal encoding would be an alternative if the categories turn out to
# be truly ordered.
# Numeric columns are standardised, which matters for models such as
# Logistic Regression.
numeric_features = X.select_dtypes(exclude=['object']).columns

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to
# `sparse_output`, and 1.4 removed the old name. Try the new spelling first
# and fall back for older installations.
try:
    onehot_encoder = OneHotEncoder(drop='first', sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(drop='first', sparse=False)

preprocessor = ColumnTransformer(transformers=[
    ('categorical', onehot_encoder, categorical_features),
    ('scaling', StandardScaler(), numeric_features)
])


In [None]:
# Initialise classifiers.
# A fixed random_state on the stochastic models keeps the scores reproducible
# between runs (same seed as the train/test split).
RANDOM_STATE = 42
classifiers = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(random_state=RANDOM_STATE),
    'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE),
    'Gradient Boosting': GradientBoostingClassifier(random_state=RANDOM_STATE),
    'Ada Boost': AdaBoostClassifier(random_state=RANDOM_STATE),
   }

In [None]:
# Function to train and evaluate models
def train_and_evaluate(classifiers, X, y):
    """Fit each classifier on a train split and report hold-out metrics.

    The data is split 80/20 (stratified on ``y``, seed 42). Each model is
    wrapped in a Pipeline behind the notebook-level ``preprocessor`` and
    fitted on the training part. Accuracy, a per-class report and the
    confusion matrix are printed for the test part.

    Args:
        classifiers: mapping of display name -> unfitted scikit-learn estimator.
        X: feature DataFrame with the columns ``preprocessor`` was built for.
        y: target labels aligned with ``X``.

    Returns:
        dict mapping each model name to its hold-out accuracy.
    """
    # Stratify so the train and test parts keep the same class ratio; the
    # positive class is only a few percent of the records.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    results = {}
    for model_name, model in classifiers.items():
        print(f"Training {model_name}:")

        # Preprocessing lives inside the pipeline, so it is fitted on the
        # training part only.
        pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('classifiers', model)
        ])
        pipeline.fit(X_train, y_train)

        y_pred = pipeline.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        results[model_name] = accuracy
        print(f"{model_name} Accuracy: {accuracy:.4f}\n")

        # Accuracy alone is misleading on an imbalanced target, so also show
        # per-class precision/recall and the confusion matrix.
        print(classification_report(y_test, y_pred, zero_division=0))
        print(confusion_matrix(y_test, y_pred))
        print("\n")

    return results



In [None]:
# Train every model in `classifiers` and print its hold-out accuracy.
train_and_evaluate(classifiers, X, y)