In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
import seaborn as sns
from sklearn.model_selection import train_test_split


In [4]:

# Load the raw lung-cancer survey data.
df = pd.read_csv('dataset/lung.csv')

# The CSV headers are lower-case (e.g. 'age', 'chronic disease'), while every
# later cell addresses the columns in upper-case ('AGE', 'CHRONIC DISEASE').
# Normalise the labels once here (trim whitespace, upper-case) so those
# lookups do not raise KeyError. This is a no-op if they are already upper-case.
df.columns = df.columns.str.strip().str.upper()

print(df.head())
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
df.info()
print(df.describe())

  gender  age  smoking  yellow_fingers  anxiety  peer_pressure  \
0      M   69        1               2        2              1   
1      M   74        2               1        1              1   
2      F   59        1               1        1              2   
3      M   63        2               2        2              1   
4      F   63        1               2        1              1   

   chronic disease  fatigue  allergy  wheezing  alcohol consuming  coughing  \
0                1        2        1         2                  2         2   
1                2        2        2         1                  1         1   
2                1        2        1         2                  1         2   
3                1        1        1         1                  2         1   
4                1        1        1         2                  1         2   

   shortness of breath  swallowing difficulty  chest pain lung_cancer  
0                    2                      2           

In [None]:
# Inspect each column's dtype and count its missing entries
print(df.dtypes)
missing_value = df.isna().sum()
print('Missing values:', missing_value, sep='\n')



In [None]:
# Heatmap of the null mask to visualise where values are missing
# (seaborn and matplotlib are already imported by the notebook's first cell)
null_mask = df.isnull()
fig, ax = plt.subplots()
sns.heatmap(null_mask, cbar=False, ax=ax)
plt.show()


In [None]:
# Data preprocessing: handle missing data.
# Each entry pairs a SimpleImputer strategy with the columns it is applied to:
# the column mean for AGE, the most frequent value for the listed survey columns.
imputation_plan = [
    ('mean', ['AGE']),
    ('most_frequent', ['GENDER','YELLOW_FINGERS','ANXIETY','WHEEZING','ALCOHOL CONSUMING','PEER_PRESSURE','FATIGUE','ALLERGY','COUGHING','SHORTNESS OF BREATH','CHEST PAIN','SWALLOWING DIFFICULTY']),
]
for strategy, missing_col in imputation_plan:
    simple_imputer = SimpleImputer(strategy=strategy)
    df[missing_col] = simple_imputer.fit_transform(df[missing_col])

# Report the null counts that remain, numerically and as a heatmap.
print("Missing Values Now:")
remaining_nulls = df.isnull()
print(remaining_nulls.sum())

sns.heatmap(remaining_nulls, cbar=False)
plt.show()

In [None]:
# Encode the categorical columns as integer codes.
# A separate LabelEncoder is fitted for every column and kept in
# `label_encoders`, so the code <-> original-label mapping of each column
# stays available after this cell (e.g. to encode new inputs for the saved
# model the same way). Re-fitting one shared encoder in the loop would keep
# only the mapping of the last column.
cols_to_encode = ['GENDER','SMOKING','YELLOW_FINGERS','ANXIETY','PEER_PRESSURE','CHRONIC DISEASE','FATIGUE','ALLERGY','WHEEZING','ALCOHOL CONSUMING','COUGHING','SHORTNESS OF BREATH','SWALLOWING DIFFICULTY','CHEST PAIN','LUNG_CANCER']

label_encoders = {}
for col in cols_to_encode:
    label_encoders[col] = LabelEncoder()
    df[col] = label_encoders[col].fit_transform(df[col])

# Backward compatibility: this name still ends up bound to the encoder that
# was fitted on the last column of `cols_to_encode`.
label_Encoder = label_encoders[cols_to_encode[-1]]

df

In [None]:
# IQR rule for AGE: take the first and third quartiles and their spread
Q1, Q3 = (df['AGE'].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1

# Calculate the lower and upper bounds: 1.5 * IQR beyond each quartile
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
print(lower_bound, upper_bound, sep='\n')

# Identify outliers: rows whose AGE lies strictly outside the two bounds
age_values = df['AGE']
outliers = df[age_values.lt(lower_bound) | age_values.gt(upper_bound)]

print("Outliers detected", outliers, sep='\n')

In [None]:
# Box plot of AGE on an 8x6 figure, showing the column as it is at this point
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(x=df['AGE'], ax=ax)
ax.set_title("Boxplot for Age before removing outliers")
plt.show()

In [None]:
# Limit AGE to [lower_bound, upper_bound]: values outside that range are
# replaced by the nearest bound; no rows are dropped.
age_bounds = {'lower': lower_bound, 'upper': upper_bound}
df['AGE'] = df['AGE'].clip(**age_bounds)

# Summary statistics of the capped column
capped_stats = df['AGE'].describe()
print(capped_stats)


In [None]:
# Box plot of the AGE column on an 8x6 figure, drawn from its current values
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(x=df['AGE'], ax=ax)
ax.set_title("Boxplot for Age after removing outlier")
plt.show()


In [None]:
# Feature scaling: rescale the listed columns with a MinMaxScaler that uses
# its default settings. Fitting and transforming are done as two explicit steps.
FeatureScal_col = ['AGE']
Minmax = MinMaxScaler()
Minmax.fit(df[FeatureScal_col])
df[FeatureScal_col] = Minmax.transform(df[FeatureScal_col])

# Show the first rows of the rescaled columns, then the whole frame
print("Normalised Data:", df[FeatureScal_col].head(), sep="\n")
df

In [None]:
# List the distinct values of every column: a header line, the values,
# then an empty line as separator.
for col in df.columns:
    distinct_values = df[col].unique()
    print(f"Column: {col}", distinct_values, "", sep="\n")

In [None]:
# Split into train and test sets.
# Every column except the last is a feature; the last column is the target.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

# Hold out 20% of the rows; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

for label, features in (("Training", X_train), ("Test", X_test)):
    print(f"{label} set size: {features.shape}")

In [None]:
# Save the preprocessed frame, without the index column, under a relative path
preprocessed_path = 'lungCancer_data_preprocessed.csv'
df.to_csv(preprocessed_path, index=False)


In [None]:
# Five-number summary (minimum, Q1, median, Q3, maximum) of one column

column_name = 'AGE'
selected = df[column_name]
summary = {
    'Minimum': selected.min(),
    'Q1': selected.quantile(0.25),
    'Median': selected.median(),
    'Q3': selected.quantile(0.75),
    'Maximum': selected.max(),
}

# Print a heading, then one "name:value" line per statistic
print(f"Five-number Summary for '{column_name}':")
print(*(f"{key}:{value}" for key, value in summary.items()), sep="\n")


In [None]:
# Box plot accompanying the five-number summary
# (seaborn is already imported by the notebook's first cell)

# Draw the 'AGE' column on an 8x6 figure
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(x=df['AGE'], color='lightblue', ax=ax)

# Set the title and the x-axis label; both come from `column_name`
ax.set_title(f"boxplot for {column_name}")
ax.set_xlabel(column_name)

# Show the plot
plt.show()


In [None]:
#training the model using Random Forest

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score,recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Tree Visualization
from sklearn.tree import export_graphviz
from IPython.display import Image


In [None]:
# Baseline random forest with default hyperparameters.
# random_state is fixed (same seed as the train/test split) so the reported
# accuracy is the same on every Restart & Run All.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# Score the baseline on the held-out test set
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


In [None]:
# Hyperparameter tuning with a randomized search.
# Search space: integer distributions for the number of trees and tree depth.
param_dist = {'n_estimators' : randint(50,500),
              'max_depth': randint(1,20)}

# Create a random forest classifier; seeded so each candidate fit is repeatable
rf = RandomForestClassifier(random_state=42)

# 5 sampled candidates, each scored with 5-fold cross-validation.
# random_state seeds the sampling from `param_dist`, so the same candidates
# (and therefore the same best model) are produced on every run.
rand_search = RandomizedSearchCV(rf,
                                 param_distributions = param_dist,
                                 n_iter = 5,
                                 cv = 5,
                                 random_state = 42)
# Fit the random search object to the training data
rand_search.fit(X_train, y_train)

In [None]:
# Report the best hyperparameter combination found by the search
print('best hyperparameters:', rand_search.best_params_)

# Keep the corresponding estimator for prediction and saving
best_rf = rand_search.best_estimator_

In [None]:
# Generate predictions on the test features with the best model
y_pred = best_rf.predict(X_test)

# Build the confusion matrix (true labels first, predictions second) and draw it
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm_display = ConfusionMatrixDisplay(confusion_matrix=cm)
cm_display.plot();

In [None]:
import pickle
from pathlib import Path

# Persist the tuned model with pickle.
filename = 'model/lung_cancer_model.sav'

# Make sure the target folder exists; open() does not create directories.
Path(filename).parent.mkdir(parents=True, exist_ok=True)

# The with-block closes the file handle (and flushes the data) even if
# pickling raises; a bare open() passed to dump() is never closed explicitly.
with open(filename, 'wb') as model_file:
    pickle.dump(best_rf, model_file)

In [None]:
from pathlib import Path

# Write the preprocessed frame (without the index column) to
# data_preprocessed/lung.csv. The folder is created first because to_csv
# fails when the target directory does not exist.
output_path = Path('data_preprocessed/lung.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)