# Lung Cancer

In [None]:
!pip install pandas matplotlib

In [1]:
## Import needed libs
import pandas as pd               # CSV loading and tabular summaries
import matplotlib.pyplot as plt   # figure / axis level plotting calls
import seaborn as sns             # statistical plots (boxplot, countplot, pairplot)

#### Load Dataset
- Provide the path to the dataset's `.csv` file
- Read the contents of the dataset using the `read_csv()` function provided by the `pandas` library

In [3]:
# Path of the survey CSV (a relative path, resolved against the current
# working directory). Replace this with the actual file path.
file_path = 'Data/survey lung cancer.csv'

# Parse the CSV into `data`, then build the frame `df` used by the cells below.
# NOTE(review): read_csv already returns a DataFrame, so the extra
# pd.DataFrame(...) wrap looks redundant -- kept to preserve behaviour.
data = pd.read_csv(filepath_or_buffer=file_path)
df = pd.DataFrame(data=data)

### Exploring

The dataset is now loaded, so we can explore it and look at some basic information that will help us extract useful insights from it.

<br>

#### Display first 5 Rows
- We will start by showing the `head` of the dataset (5 rows only) to see what the data looks like:

In [4]:
# Preview the top of the dataset to understand its structure:
# a header line first, then the rows returned by head() on the next line.
print("First 5 rows of the dataset:", df.head(), sep="\n")

First 5 rows of the dataset:
  GENDER  AGE  SMOKING  YELLOW_FINGERS  ANXIETY  PEER_PRESSURE  \
0      M   69        1               2        2              1   
1      M   74        2               1        1              1   
2      F   59        1               1        1              2   
3      M   63        2               2        2              1   
4      F   63        1               2        1              1   

   CHRONIC DISEASE  FATIGUE   ALLERGY   WHEEZING  ALCOHOL CONSUMING  COUGHING  \
0                1         2         1         2                  2         2   
1                2         2         2         1                  1         1   
2                1         2         1         2                  1         2   
3                1         1         1         1                  2         1   
4                1         1         1         2                  1         2   

   SHORTNESS OF BREATH  SWALLOWING DIFFICULTY  CHEST PAIN LUNG_CANCER  
0              

Based on the first 5 rows, we can see that most of the columns record responses as (YES=2, NO=1), while the dataset also includes each respondent's Age and Gender.

<br>

#### Dataset Info
- We can now display all the columns and their `Dtype` (Data Type) so we know what type of values we are dealing with later on.
- We can also see how many entries the dataset contains.

`df.info()` will give us this information:

In [5]:
# Get information about the dataset including column names, data types, and
# non-null counts.
# DataFrame.info() writes its report directly to stdout and returns None, so it
# must not be wrapped in print() -- doing so appends a stray "None" line.
print("\nDataset information:")
df.info()


Dataset information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 309 entries, 0 to 308
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   GENDER                 309 non-null    object
 1   AGE                    309 non-null    int64 
 2   SMOKING                309 non-null    int64 
 3   YELLOW_FINGERS         309 non-null    int64 
 4   ANXIETY                309 non-null    int64 
 5   PEER_PRESSURE          309 non-null    int64 
 6   CHRONIC DISEASE        309 non-null    int64 
 7   FATIGUE                309 non-null    int64 
 8   ALLERGY                309 non-null    int64 
 9   WHEEZING               309 non-null    int64 
 10  ALCOHOL CONSUMING      309 non-null    int64 
 11  COUGHING               309 non-null    int64 
 12  SHORTNESS OF BREATH    309 non-null    int64 
 13  SWALLOWING DIFFICULTY  309 non-null    int64 
 14  CHEST PAIN             309 non-null    int64 
 15  L

In [6]:
# Count the missing entries in every column (isna is the same check as isnull)
missing_per_column = df.isna().sum()

print("\nMissing values in the dataset:")
print(missing_per_column)


Missing values in the dataset:
GENDER                   0
AGE                      0
SMOKING                  0
YELLOW_FINGERS           0
ANXIETY                  0
PEER_PRESSURE            0
CHRONIC DISEASE          0
FATIGUE                  0
ALLERGY                  0
WHEEZING                 0
ALCOHOL CONSUMING        0
COUGHING                 0
SHORTNESS OF BREATH      0
SWALLOWING DIFFICULTY    0
CHEST PAIN               0
LUNG_CANCER              0
dtype: int64


In [7]:
# describe() summarises the numerical columns
# (count, mean, std, min, quartiles and max)
summary_stats = df.describe()

# Header line first, then the summary table on the following line
print("\nSummary statistics of numerical columns:", summary_stats, sep="\n")


Summary statistics of numerical columns:
              AGE     SMOKING  YELLOW_FINGERS     ANXIETY  PEER_PRESSURE  \
count  309.000000  309.000000      309.000000  309.000000     309.000000   
mean    62.673139    1.563107        1.569579    1.498382       1.501618   
std      8.210301    0.496806        0.495938    0.500808       0.500808   
min     21.000000    1.000000        1.000000    1.000000       1.000000   
25%     57.000000    1.000000        1.000000    1.000000       1.000000   
50%     62.000000    2.000000        2.000000    1.000000       2.000000   
75%     69.000000    2.000000        2.000000    2.000000       2.000000   
max     87.000000    2.000000        2.000000    2.000000       2.000000   

       CHRONIC DISEASE    FATIGUE     ALLERGY     WHEEZING  ALCOHOL CONSUMING  \
count       309.000000  309.000000  309.000000  309.000000         309.000000   
mean          1.504854    1.673139    1.556634    1.556634           1.556634   
std           0.500787    0.46

In [None]:
# Histogram of the AGE column, split into 20 bins
plt.hist(x=df['AGE'], bins=20, color='skyblue')

# Title and axis labels so the figure can be read on its own
plt.title(label='Distribution of Ages')
plt.ylabel(ylabel='Frequency')
plt.xlabel(xlabel='AGE')

plt.show()

In [None]:
# Compare the age distribution of male and female respondents.

# Filter data by gender. The GENDER column stores the codes 'M' and 'F'
# (not 'Male' / 'Female'), so the filters must match those codes; comparing
# against the long names would leave both subsets empty.
males_data = df[df['GENDER'] == 'M']
females_data = df[df['GENDER'] == 'F']

# Boxplot comparison of ages between males and females.
# The plot is drawn from the full frame; seaborn groups the boxes by GENDER.
plt.figure(figsize=(8, 6))
sns.boxplot(x='GENDER', y='AGE', data=df)
plt.title('Age Distribution between Males and Females')
plt.xlabel('Gender')
plt.ylabel('Age')
plt.show()

In [None]:
# Bar plot comparing smoking status between genders.
# The notebook treats SMOKING as coded 1 = NO and 2 = YES. The bar order is
# fixed explicitly so the hard-coded tick labels below always line up with the
# right bars, instead of relying on the order seaborn infers from the data.
plt.figure(figsize=(10, 6))
sns.countplot(x='SMOKING', hue='GENDER', data=df, order=[1, 2])
plt.title('Smoking Status by Gender')
plt.xlabel('Smoking Status')
# Tick position 0 is the first bar group (code 1 -> NO), position 1 the second (code 2 -> YES)
plt.xticks(ticks=[0, 1], labels=['NO', 'YES'])
plt.ylabel('Count')
plt.legend(title='Gender')
plt.show()

In [None]:
# Uses the 'LUNG_CANCER' and 'GENDER' columns of the dataset.

# Count rows per lung-cancer status, with a separate bar for each gender
plt.figure(figsize=(8, 6))
sns.countplot(data=df, x='LUNG_CANCER', hue='GENDER')

# Legend, axis labels and title
plt.legend(title='Gender')
plt.ylabel('Count')
plt.xlabel('Cancer Status')
plt.title('Distribution of Cancer Status by Gender')

plt.show()

In [None]:
# Grid of pairwise plots coloured by lung-cancer status,
# with histograms on the diagonal
sns.pairplot(data=df, hue='LUNG_CANCER', diag_kind='hist')
plt.show()

In [None]:
# One box of AGE values per lung-cancer status
plt.figure(figsize=(8, 6))
sns.boxplot(data=df, y='AGE', x='LUNG_CANCER')
plt.title(label='Age distribution by Lung Cancer')
plt.show()

In [None]:
# Columns to compare against the lung-cancer status, one figure each
categorical_columns = [
    'GENDER',
    'SMOKING',
    'YELLOW_FINGERS',
    'ANXIETY',
    'PEER_PRESSURE',
]

for col in categorical_columns:
    # New figure per column; bars are split by LUNG_CANCER via `hue`
    plt.figure(figsize=(8, 6))
    sns.countplot(data=df, x=col, hue='LUNG_CANCER')
    plt.title('{} count by Lung Cancer'.format(col))
    plt.show()