In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import seaborn as sns



In [None]:
# Read the survey CSV into `df` and preview its first rows.
# NOTE(review): absolute path — presumably a Colab upload location; adjust when running elsewhere.
DATA_PATH = '/content/survey lung cancer.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
0,M,69,1,2,2,1,1,2,1,2,2,2,2,2,2,YES
1,M,74,2,1,1,1,2,2,2,1,1,1,2,2,2,YES
2,F,59,1,1,1,2,1,2,1,2,1,2,2,1,2,NO
3,M,63,2,2,2,1,1,1,1,1,2,1,1,2,2,NO
4,F,63,1,2,1,1,1,1,1,2,1,2,2,1,1,NO


In [None]:
# Print a summary of `df`: one line per column with its non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 309 entries, 0 to 308
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   GENDER                 309 non-null    object
 1   AGE                    309 non-null    int64 
 2   SMOKING                309 non-null    int64 
 3   YELLOW_FINGERS         309 non-null    int64 
 4   ANXIETY                309 non-null    int64 
 5   PEER_PRESSURE          309 non-null    int64 
 6   CHRONIC DISEASE        309 non-null    int64 
 7   FATIGUE                309 non-null    int64 
 8   ALLERGY                309 non-null    int64 
 9   WHEEZING               309 non-null    int64 
 10  ALCOHOL CONSUMING      309 non-null    int64 
 11  COUGHING               309 non-null    int64 
 12  SHORTNESS OF BREATH    309 non-null    int64 
 13  SWALLOWING DIFFICULTY  309 non-null    int64 
 14  CHEST PAIN             309 non-null    int64 
 15  LUNG_CANCER            

In [None]:
# Preprocessing: drop rows that exactly duplicate an earlier row.
# `df` is rebound instead of mutated with inplace=True, and reset_index(drop=True)
# renumbers the surviving rows 0..n-1 so the index has no gaps left by removed rows.
df = df.drop_duplicates().reset_index(drop=True)

In [None]:
# Share of each GENDER value, expressed as a percentage of all rows.
gender_counts = df["GENDER"].value_counts()
gender_counts / len(df) * 100

Unnamed: 0_level_0,count
GENDER,Unnamed: 1_level_1
M,51.449275
F,48.550725


In [None]:
# Share of each LUNG_CANCER value, expressed as a percentage of all rows.
lung_cancer_counts = df["LUNG_CANCER"].value_counts()
lung_cancer_counts / len(df) * 100

Unnamed: 0_level_0,count
LUNG_CANCER,Unnamed: 1_level_1
YES,86.231884
NO,13.768116


In [None]:
# Preprocessing: convert the two text columns of `df` into integer codes.
def _encode_binary(series, codes):
    """Return `series` with its labels replaced by the integers in `codes`.

    Parameters:
        series: pandas Series holding the raw labels (the keys of `codes`).
        codes: dict mapping each raw label to its integer code.

    Returns:
        An int64 Series of codes. If `series` already contains nothing but
        the code values, it is returned unchanged, so re-running the cell is
        harmless (a plain `.map` would turn every value into NaN on a second run).

    Raises:
        ValueError: if a value is neither a known label nor an existing code.
    """
    if set(series.unique()) <= set(codes.values()):
        return series  # already encoded by a previous run of this cell
    encoded = series.map(codes)
    unmapped = encoded.isna()
    if unmapped.any():
        # .map yields NaN for anything missing from `codes`; fail loudly instead.
        raise ValueError(
            f"Unexpected values in column {series.name!r}: "
            f"{series[unmapped].unique().tolist()}"
        )
    return encoded.astype("int64")


df["LUNG_CANCER"] = _encode_binary(df["LUNG_CANCER"], {"YES": 1, "NO": 0})
df["GENDER"] = _encode_binary(df["GENDER"], {"M": 1, "F": 0})

In [None]:
# Split the frame by position: every column except the last goes into the feature
# array `x`; the last column becomes the target array `y`.
x = df.iloc[:, :-1].to_numpy()
y = df.iloc[:, -1].to_numpy()

In [None]:
# Dimensions of the feature array as (rows, columns).
x.shape

(276, 15)

In [None]:
# Dimensions of the target array; its length should equal the row count of x.
y.shape

(276,)

In [None]:
# Hold out 20% of the rows for testing; the remaining 80% are used for training.
# The model learns the relationship between x_train (inputs) and y_train (labels) and is
# then scored on x_test / y_test, rows it has not seen, to check that it generalizes.
# stratify=y keeps the class proportions of y the same in both subsets; without it a
# random split of an imbalanced target can leave the test set with very few minority rows.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=7,
    stratify=y,
)


In [None]:
# Shapes of the training features and training labels, shown as a tuple.
x_train.shape, y_train.shape

((220, 15), (220,))

In [None]:
# Build the classifier. A larger tree can model more complex data but risks overfitting,
# so its size is capped. A binary tree with at most 10 leaves cannot be deeper than
# 9 levels, so max_leaf_nodes is the limit that actually constrains this tree; max_depth=10
# only acts as a backstop.
dr = DecisionTreeClassifier(
    max_leaf_nodes=10,
    max_depth=10,
    random_state=7,
)

# Fit the tree on the training rows only.
dr.fit(x_train, y_train)

# Predict on both subsets so training and testing accuracy can be compared.
train_pred = dr.predict(x_train)
y_pred = dr.predict(x_test)

# accuracy_score takes (actual, predicted).
train_accuracy = accuracy_score(y_train, train_pred)
test_accuracy = accuracy_score(y_test, y_pred)
print("Training Accuracy:", train_accuracy)
print("Testing Accuracy", test_accuracy)




Training Accuracy: 0.9363636363636364
Testing Accuracy 0.8928571428571429


In [None]:
# Per-class precision, recall and F1 alongside the overall accuracy.
# Useful for an imbalanced target: the per-class rows show how each class is handled,
# which a single accuracy figure cannot.
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.75      0.60      0.67        10
           1       0.92      0.96      0.94        46

    accuracy                           0.89        56
   macro avg       0.83      0.78      0.80        56
weighted avg       0.89      0.89      0.89        56

