# Lung Cancer Prediction Project

## Loading and Exploring the data

In [53]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

In [54]:
# Forward slashes keep this relative path portable across operating systems and
# avoid the invalid "\D" / "\l" escape sequences of the backslash spelling.
data = pd.read_csv("../Datasets/lung_cancer.csv")

In [None]:
data.head()

So we can see our various features and our target variable which is the column **'LUNG_CANCER'**

In [None]:
print("Shape of data: ", data.shape)

We can see that the dataset contains 309 rows and 16 columns

In [None]:
data.info()

### Datatype Conversion
We can see that the LUNG_CANCER column is an object data type so we'll convert the data into numerical data type by converting **YES to 1** and **NO to 0**.

We will also recode the other categorical columns as binary values (0 and 1), which makes them easier to use when we build the model.

In [None]:
# Encode GENDER as an integer flag: 'M' -> 1, 'F' -> 0.
gender_codes = {'M': 1, 'F': 0}
data['GENDER'] = data['GENDER'].replace(gender_codes).astype('int')
data.head()


In [None]:
# Encode the target as an integer flag: 'YES' -> 1, 'NO' -> 0.
target_codes = {'YES': 1, 'NO': 0}
data['LUNG_CANCER'] = data['LUNG_CANCER'].replace(target_codes).astype('int')
data.head()

We have now changed the Gender and Lung_cancer columns to numerical values 1 and 0 where Male and having lung_cancer = 1 and Female and Not having Lung_Cancer = 0

In [None]:
data.info()

In [61]:
# Columns stored as 1/2 in the raw file; they are remapped to 0/1 (1 -> 0, 2 -> 1).
# "FATIGUE " and "ALLERGY " are spelled with a trailing space on purpose: that is
# how those columns are named in the loaded frame.
two_level_cols = [
    "SMOKING",
    "YELLOW_FINGERS",
    "ANXIETY",
    "PEER_PRESSURE",
    "CHRONIC DISEASE",
    "FATIGUE ",
    "ALLERGY ",
    "WHEEZING",
    "ALCOHOL CONSUMING",
    "COUGHING",
    "SHORTNESS OF BREATH",
    "SWALLOWING DIFFICULTY",
    "CHEST PAIN",
]

# Assign the result back instead of calling `replace(..., inplace=True)` on a
# column selection: that chained in-place form acts on a temporary object under
# pandas Copy-on-Write and would leave `data` unchanged.
# Run this cell once per load: a second pass would map the freshly written 1s to 0.
data[two_level_cols] = data[two_level_cols].replace({2: 1, 1: 0})

In [None]:
data.sample(5)

In [63]:
# Swap the spaces inside multi-word column names for underscores
# (ALCOHOL CONSUMING additionally becomes ALCOHOL_CONSUMPTION).
column_renames = {
    "CHRONIC DISEASE": "CHRONIC_DISEASE",
    "ALCOHOL CONSUMING": 'ALCOHOL_CONSUMPTION',
    "SHORTNESS OF BREATH": "SHORTNESS_OF_BREATH",
    "CHEST PAIN": "CHEST_PAIN",
    "SWALLOWING DIFFICULTY": "SWALLOWING_DIFFICULTY",
}
# errors='raise' makes a missing source column fail loudly instead of being skipped.
data = data.rename(columns=column_renames, errors='raise')

In [None]:
data.head()

In [65]:
# Persist the cleaned table next to the raw dataset.
# NOTE(review): with the default index=True the row index is written as an
# unnamed first column — confirm that downstream readers expect it.
data.to_csv("../Datasets/processed_lung_cancer.csv")

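We also renamed some columns, replacing the white space inside their names with underscores. Note that `FATIGUE ` and `ALLERGY ` still carry a trailing space in their names.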

We have now converted the variables from "2" and "1" to "1" and "0".

In [None]:
data.isnull().sum()

We can see that the data does not contain any missing values.

In [None]:
data.describe().T

## Key Insights

A large portion of individuals in the dataset are diagnosed with lung cancer (LUNG_CANCER mean = 0.874).

Variables like FATIGUE and SHORTNESS_OF_BREATH have high means (0.673 and 0.641), suggesting frequent occurrence.

AGE has a significant range (21–87), with an older average population.

## Exploratory Data Analysis

In [None]:
# Class balance of the target: count each outcome, report it, and draw a pie chart.
target = data['LUNG_CANCER']
len_lung_cancer = int((target == 1).sum())
len_not_lung_cancer = int((target == 0).sum())

labels = ['Has Lung Cancer', "Doesn't have Lung Cancer"]
arr = np.array([len_lung_cancer, len_not_lung_cancer])

print('Total No. of Non-Lung Cancer Cases: ', len_not_lung_cancer)
print('Total No. of Lung Cancer cases: ', len_lung_cancer)

# The first wedge (lung-cancer cases) is pulled away from the centre for emphasis.
plt.pie(
    arr,
    labels=labels,
    explode=[0.3, 0.0],
    shadow=True,
    autopct='%1.1f%%',
)
plt.savefig("../Charts/Cancer Analysis.png", bbox_inches='tight')
plt.show()

The Pie Chart helps us to see clearly the ratio of people with lung cancer and people who do not have lung cancer. 


In [None]:
# Correlation heatmap of every column, followed by ranked tables of feature pairs.
fig, ax = plt.subplots(figsize=(18, 18))
sns.set_context('poster')
corr = data.corr()
sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns, cmap='Blues',
            annot=True, square=True, annot_kws={"size": 15}, ax=ax)
ax.set_title('Collinearity of Lung Cancer Attributes\n')

# Re-apply the tick labels with a small explicit font size.
ax.set_xticklabels(ax.get_xticklabels(), fontsize=11)
ax.set_yticklabels(ax.get_yticklabels(), fontsize=11)

# Long-form table: one row per ordered (Feature 1, Feature 2) pair, self-pairs removed,
# sorted from the highest to the lowest correlation.
corr_table = corr.unstack().reset_index()
corr_table.columns = ['Feature 1', 'Feature 2', 'Correlation']
corr_table = corr_table[corr_table['Feature 1'] != corr_table['Feature 2']]
corr_table = corr_table.sort_values(by='Correlation', ascending=False)

# The matrix is symmetric, so every pair appears twice (A-B and B-A). Keep a single
# orientation per pair so the rankings below do not list the same pair twice.
unique_pairs = corr_table[corr_table['Feature 1'] < corr_table['Feature 2']]

# print the 3 highest correlations between distinct features
print(unique_pairs.head(3))
# print the 3 lowest correlations between distinct features
print(unique_pairs.tail(3))
# print the 10 features most positively correlated with LUNG_CANCER
print(corr_table[corr_table['Feature 1'] == 'LUNG_CANCER'].sort_values(by='Correlation', ascending=False).head(10))
plt.savefig('../Charts/HeatMap1.png', bbox_inches='tight')

## Insights from the Heatmap

Strong Positive Correlations:
YELLOW_FINGERS and ANXIETY have the highest positive correlation (0.5658), suggesting a notable link between the two.

Moderate Positive Correlations with LUNG_CANCER:
ALLERGY (0.3278) and ALCOHOL_CONSUMPTION (0.2885) show moderate positive correlations with lung cancer.

Weak Positive Correlations with LUNG_CANCER:
SWALLOWING_DIFFICULTY (0.2597) and WHEEZING (0.2493) also show weak but relevant associations.

Lowest Correlation with LUNG_CANCER:
ANXIETY (0.1449) and FATIGUE (0.1507) have the weakest association.

In [None]:
# Bar plots of two features against the LUNG_CANCER class (the pair singled out
# in the heatmap insights). Each entry is (column name, y-axis label, title);
# the title doubles as the file name of the saved chart.
bar_specs = [
    ('ALLERGY ', "ALLERGY", "Relationship Between Allergy and Lung Cancer"),
    ('ALCOHOL_CONSUMPTION', "ALCOHOL_CONSUMPTION",
     "Relationship Between Alcohol Consumption and Lung Cancer"),
]

for column, y_label, title in bar_specs:
    # An explicit figure/axes pair per chart keeps the two plots independent.
    fig, ax = plt.subplots()
    sns.barplot(x=data['LUNG_CANCER'], y=data[column], ax=ax)
    ax.set_xlabel("LUNG_CANCER", fontsize=14)
    ax.set_ylabel(y_label, fontsize=14)
    ax.set_title(title, fontsize=14)
    fig.savefig(f'../Charts/{title}.png', bbox_inches='tight')
    plt.show()



In [None]:
# age distribution plot
# Histogram of the AGE column, drawn on an explicit axes and saved to the Charts folder.
fig, ax = plt.subplots()
sns.histplot(data['AGE'], ax=ax)
fig.savefig('../Charts/ageplot.png', bbox_inches='tight')
plt.show()

## Splitting the Data

In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except the target; the target is LUNG_CANCER.
y = data["LUNG_CANCER"]
X = data.drop(columns=["LUNG_CANCER"])

for label, part in (("X", X), ("y", y)):
    print(f"Shape of {label}: ", part.shape)

In [73]:
# Hold out 20% of the rows as a test set; random_state=0 fixes the shuffle.
# NOTE(review): the target is imbalanced (the notebook reports a LUNG_CANCER mean
# of 0.874); consider stratify=y so both splits keep that ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

## Feature Engineering

In [74]:
def add_interaction(X):
    """Return a copy of ``X`` extended with all pairwise product columns.

    For every unordered pair of columns ``(a, b)`` of ``X``, taken in column
    order, a new column named ``"a_x_b"`` holding the element-wise product
    ``X[a] * X[b]`` is appended after the original columns.

    Parameters
    ----------
    X : pandas.DataFrame
        Input features with string column names. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The original columns followed by one product column per pair, on the
        same index as ``X``.
    """
    from itertools import combinations

    products = [
        (X[first] * X[second]).rename(first + "_x_" + second)
        for first, second in combinations(X.columns, 2)
    ]
    # A single concat avoids inserting the product columns one by one, which
    # fragments the frame and triggers pandas' PerformanceWarning.
    return pd.concat([X, *products], axis=1)

# Build interaction-augmented copies of both splits.
# NOTE(review): the models fitted further down use X_train / X_test, not these
# augmented frames — confirm whether that is intended.
x_train_mod = add_interaction(X_train)
x_test_mod = add_interaction(X_test)

The function above creates additional feature columns: for every pair of columns it multiplies the two together and stores the product in a new column.

In [None]:
x_train_mod.head()

We can now see that we have 120 columns for the modified training set after applying the function.

## Model Building
The evaluate_model function below will be used to evaluate the accuracy of our model and also give the confusion matrix of our model.

In [105]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score

def evaluate_model(y_test, y_pred):
    """Print classification metrics and plot/save the confusion matrix.

    Prints the accuracy, precision, recall and raw confusion matrix, then draws
    a Predicted-vs-Actual cross-tabulation as an annotated heatmap and saves it
    to ``../Charts/confusion Matriz.png`` (every call overwrites that file).

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels, in the same order as ``y_test``.

    Notes
    -----
    ``precision_score`` and ``recall_score`` are called with their default
    averaging, so this helper is intended for binary targets.
    """
    print("Accuracy Score: ", accuracy_score(y_test, y_pred))
    print("Precision Score: ", precision_score(y_test, y_pred))
    print("Recall Score: ", recall_score(y_test, y_pred))
    print("Confusion Matrix: ", confusion_matrix(y_test, y_pred))

    # Rows are predicted labels, columns are actual labels.
    clf_confusion_matrix = pd.crosstab(y_pred, y_test, rownames=['Predicted'], colnames=['Actual'])

    # Draw on a dedicated axes; fmt='d' renders the counts as plain integers
    # instead of the default format, which switches to scientific notation.
    fig, ax = plt.subplots()
    sns.heatmap(clf_confusion_matrix, annot=True, fmt='d', ax=ax)
    fig.savefig('../Charts/confusion Matriz.png', bbox_inches='tight')
    

### Logistic Regression
We will be using the Logistic Regression model as a baseline model.

In [77]:
from sklearn.linear_model import LogisticRegression

# Baseline: logistic regression fitted on the unscaled training features,
# with max_iter raised to 1000.
lr_clf = LogisticRegression(max_iter = 1000)
lr_clf.fit(X_train, y_train)

# NOTE(review): lr_clf_pred is not referenced again in the visible notebook; the
# next cell recomputes the same predictions as y_pred.
lr_clf_pred = lr_clf.predict(X_test)

In [None]:
# Score the baseline model on the held-out test set.
y_pred = lr_clf.predict(X_test)
evaluate_model(y_test, y_pred)

Building Logistic Regression model with standardiser

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Chain standardisation and logistic regression into one estimator, fitted on
# the training split only.
# NOTE(review): this LogisticRegression uses the default max_iter, unlike the
# baseline above, which sets max_iter=1000.
lr_clf_pipe = make_pipeline(StandardScaler(), LogisticRegression())
lr_clf_pipe.fit(X_train, y_train)

# Evaluate the scaled model on the same test split as the baseline.
y_pred1 = lr_clf_pipe.predict(X_test)
evaluate_model(y_test, y_pred1)

# Importing Data for Finding Cancer Stage

In [80]:
# Second dataset, used for the cancer-stage analysis in the rest of the notebook.
df=pd.read_csv('../Datasets/lung_cancer_for_stage.csv')

# Data outline and preprocessing

In [None]:
df.info()

In [None]:
df.head()

In [None]:
df['stage_of_cancer'].unique()

**Convert 'stage_of_cancer' to 0, 1, 2, 3 and 4**

In [84]:
# Collapse the A/B sub-stages into one numeric stage per roman numeral,
# and encode smoker status as Current -> 1, Former -> 0.
stage_codes = {'IA': 1, 'IB': 1, 'IIA': 2, 'IIB': 2, 'IIIA': 3, 'IIIB': 3, 'IV': 4}
smoker_codes = {'Current': 1, 'Former': 0}

df['stage_of_cancer'] = df['stage_of_cancer'].replace(stage_codes)
df['smoker'] = df['smoker'].replace(smoker_codes)

In [85]:
# Fill gaps per column: a missing stage becomes 0, a missing days_to_cancer
# becomes 0, and a missing race becomes 'Others'.
# NOTE(review): a filled days_to_cancer of 0 is a placeholder, not a measured
# value — keep that in mind wherever this column is used as a model target.
df=df.fillna({'stage_of_cancer': 0, 'days_to_cancer': 0, 'race': 'Others'})

In [86]:
# With the gaps filled, store both columns as integers.
df = df.astype({'stage_of_cancer': 'int', 'days_to_cancer': 'int'})

In [None]:
df.info()

**Pairplot of features**

In [None]:
sns.set(style="whitegrid", palette="muted")

# Pairwise scatter plots (KDE on the diagonal) of df without 'pid' and 'smoker'.
pairplot_input = df.drop(columns=['pid', 'smoker'])
scatter_style = {'alpha': 0.6, 's': 20, 'edgecolor': 'k'}
pairplot = sns.pairplot(
    pairplot_input,
    diag_kind="kde",
    plot_kws=scatter_style,
    height=2.5,
)
# y=1.02 places the title just above the top row of panels.
pairplot.fig.suptitle("Pairplot of Features", y=1.02)
plt.savefig('../Charts/pairplot.png', bbox_inches='tight')
plt.show()

**Heatmap of correlation**

In [None]:
# Drop the id plus race and gender, then plot the correlations of what remains.
stage_corr = df.drop(columns=['pid', 'race', 'gender']).corr()
sns.heatmap(
    stage_corr,
    cbar=False,
    cmap='Blues',
    fmt='.1f',
    annot=True,
    square=True,
    annot_kws={"size": 15},
)
plt.title("Correlation Matrix")
plt.savefig('../Charts/Correlation Matrix.png', bbox_inches='tight')

**Number of cases in each stage**

In [None]:
df.groupby('stage_of_cancer')['stage_of_cancer'].count()

In [None]:
# Horizontal bar chart of how many rows fall into each stage.
stage_counts = df.groupby('stage_of_cancer')['stage_of_cancer'].count()
stage_counts.plot.barh()
plt.title('Distribution of Stage of Cancer')
plt.xlabel('Count')
plt.ylabel('Stage of Cancer')
plt.savefig('../Charts/Distribution of Stage of Cancer.png', bbox_inches='tight')
plt.show()

In [None]:
col = ['age', 'gender', 'race', 'smoker', 'days_to_cancer']
fig = plt.figure(figsize=(15, 15))

# One panel per feature on a 3x2 grid, coloured by cancer stage.
for position, feature in enumerate(col, start=1):
    plt.subplot(3, 2, position)
    plt.title(feature)
    sns.histplot(data=df, y=df[feature], hue='stage_of_cancer')

plt.tight_layout()
plt.savefig('../Charts/Distribution of Features.png', bbox_inches='tight')
plt.show()

**The number of non-cancer cases (stage 0) is very large, so I should drop stage 0 to analyze the features of each stage Ⅰ to Ⅳ.**

# Stage Ⅰ to Ⅳ Analysis

**Dataset which does not include "non cancer"**

In [93]:
# Keep only the diagnosed cases (stages 1-4). Stage 0 marks the "non cancer" rows,
# which this section is meant to exclude. `.copy()` makes df1 an independent
# frame, so later changes to df1 cannot leak back into df.
df1 = df[df['stage_of_cancer'] != 0].copy()

**Pairplot of features**

In [None]:
# Pairplot of df1 without 'pid' and 'smoker', using seaborn's default plot options.
sns.pairplot(df1.drop(columns=['pid','smoker']))
plt.savefig('../Charts/pairplot1.png', bbox_inches='tight')

**Histplot of features by stage**

In [None]:
col = ['age', 'gender', 'race', 'smoker', 'days_to_cancer']
fig = plt.figure(figsize=(15, 15))

# Same 3x2 grid of per-feature histograms as before, this time drawn from df1.
for slot, name in zip(range(1, len(col) + 1), col):
    plt.subplot(3, 2, slot)
    plt.title(name)
    sns.histplot(data=df1, y=df1[name], hue='stage_of_cancer')

plt.tight_layout()
plt.savefig('../Charts/Distribution of Features1.png', bbox_inches='tight')
plt.show()

**Higher stages are found more often among males, current smokers, and patients with longer days to cancer.**

# Smoker Lung Cancer Stage Classification Model

**Convert categorical variable into dummy/indicator variables**

In [None]:
# One-hot encode the categorical columns and renumber the rows from 0.
df2 = pd.get_dummies(df1).reset_index(drop=True)
df2.head()

In [None]:
#replace all True values with 1 and False values with 0
df2=df2.replace({True:1,False:0})
df2.head()

**Split data to X and y**

In [98]:
# Persist the encoded stage dataset.
# NOTE(review): with the default index=True the row index is written as an
# unnamed first column — confirm that downstream readers expect it.
df2.to_csv('../Datasets/processed_lung_cancer_stage.csv')

In [None]:
# The label is the cancer stage; the features are everything except the row id and the label.
y = df2['stage_of_cancer']
X = df2.drop(columns=['pid', 'stage_of_cancer'])
print("Shape of X: ", X.shape)
print("Shape of y: ", y.shape)

**Split X,y to train and test**

In [100]:
# 80/20 train/test split with a fixed seed. This overwrites the X_train / X_test /
# y_train / y_test variables created for the first dataset.
X_train,X_test,y_train,y_test= train_test_split(X,y,random_state=0,test_size=0.2)

**Standardization**

In [101]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training split only, then apply the same transform to
# the test split.
scaler= StandardScaler()
X_train_sc= scaler.fit_transform(X_train)
X_test_sc = scaler.transform(X_test)

**Random Forest**

In [None]:
from sklearn.ensemble import RandomForestClassifier
# precision_score is imported here as well, so this cell no longer depends on
# the import made in the earlier evaluation cell.
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score

# Random forest on the standardised features; random_state=0 makes the fit repeatable.
clf = RandomForestClassifier(random_state=0)
clf.fit(X_train_sc, y_train)
y_pred = clf.predict(X_test_sc)

print("Accuracy Score: ", accuracy_score(y_test, y_pred))
# The stage target has more than two classes, so precision is averaged across
# classes, weighted by each class's support.
print("Precision Score: ", precision_score(y_test, y_pred, average='weighted'))
print("Confusion Matrix: ", confusion_matrix(y_test, y_pred))
print("Classification Report: ", classification_report(y_test, y_pred))




## Risk for cancer

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Regression target: days_to_cancer. Every other column except the row id is a feature.
X= df2.drop(['pid','days_to_cancer'],axis=1)
y= df2['days_to_cancer']
print("Shape of X: ", X.shape)
print("Shape of y: ", y.shape)

X_train,X_test,y_train,y_test= train_test_split(X,y,random_state=0,test_size=0.2)

# Fit the scaler on the training split only, then reuse it for the test split.
scaler= StandardScaler()
X_train_sc= scaler.fit_transform(X_train)
X_test_sc = scaler.transform(X_test)

lr = LinearRegression()
lr.fit(X_train_sc, y_train)
y_pred = lr.predict(X_test_sc)

# A regressor has no accuracy or precision. lr.score() returns the same R^2 as
# r2_score(), so it is reported once under its real name, together with the
# error magnitude from the already-imported mean_squared_error.
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("R^2 Score: ", r2)
print("RMSE: ", rmse)

plt.scatter(y_test, y_pred)
plt.xlabel("Actual Days to Cancer")
plt.ylabel("Predicted Days to Cancer")
plt.title("Actual vs Predicted Days to Cancer")
plt.savefig('../Charts/Actual vs Predicted Days to Cancer.png', bbox_inches='tight')
plt.show()

