In [7]:
# importing pandas here
import pandas as pd
# Load the lung-cancer dataset (path is relative to the working directory)
# and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='Lung_Cancer_dataset.csv')
print(df.head(n=5))


     Name      Surname  Age  Smokes  AreaQ  Alkhol  Result
0    John         Wick   35       3      5       4       1
1    John  Constantine   27      20      2       5       1
2  Camela     Anderson   30       0      5       2       0
3    Alex       Telles   28       0      8       1       0
4   Diego     Maradona   68       4      5       6       1


In [8]:
# Remove the identifier columns; only the remaining numeric columns are used
# as features/target further down.
# Rebinding `df` (instead of dropping in place) together with errors='ignore'
# keeps this cell idempotent: running it a second time no longer raises a
# KeyError for the already-removed columns.
df = df.drop(columns=['Name', 'Surname'], errors='ignore')
print(df.head())

   Age  Smokes  AreaQ  Alkhol  Result
0   35       3      5       4       1
1   27      20      2       5       1
2   30       0      5       2       0
3   28       0      8       1       0
4   68       4      5       6       1


In [9]:
# Count the missing values in each column.
print(df.isna().sum())

Age       0
Smokes    0
AreaQ     0
Alkhol    0
Result    0
dtype: int64


In [15]:
from sklearn.model_selection import train_test_split
# Separate the predictors from the label column.
X = df.drop(columns='Result')  # features
y = df['Result']  # target

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [16]:
from sklearn.linear_model import LogisticRegression
# Fit a logistic-regression classifier (library default settings) on the
# training split.
model = LogisticRegression()
model.fit(X=X_train, y=y_train)

In [38]:
# Predict the class label for every row of the held-out test split.
y_pred = model.predict(X=X_test)

In [18]:
from sklearn.metrics import accuracy_score, confusion_matrix
# Score the held-out predictions: confusion matrix and overall accuracy.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print(f"Accuracy: {accuracy}")
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.9166666666666666
Confusion Matrix:
 [[8 0]
 [1 3]]


In [34]:
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA

def _print_performance(heading, y_true, y_hat):
    """Print a heading, the accuracy (2 decimals) and the classification report.

    heading -- full title line to print first
    y_true  -- ground-truth labels
    y_hat   -- labels predicted by the model being evaluated
    """
    print(heading)
    print(f'Accuracy: {accuracy_score(y_true, y_hat):.2f}')
    print('Classification Report:')
    print(classification_report(y_true, y_hat))


# Reload the raw CSV. This rebinds `df`, so the identifier columns are
# present again and have to be excluded from the features below.
df = pd.read_csv('Lung_Cancer_dataset.csv')

# Features are every column except the identifiers and the label.
y = df['Result']
X = df.drop(columns=['Name', 'Surname', 'Result'])

# 80/20 train/test split with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# --- Baseline: decision tree trained on the original features ---
dt_baseline = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred_baseline = dt_baseline.predict(X_test)
_print_performance(
    "--- Baseline Decision Tree performance ---", y_test, y_pred_baseline
)
print("\n" + "="*50 + "\n")

# --- Variant: decision tree trained on PCA-projected features ---
# n_components=0.95 keeps as many components as needed to explain 95% of the
# variance. The projection is fitted on the training split only and then
# applied unchanged to the test split.
# NOTE(review): the features are not standardized before PCA, so columns with
# a larger numeric spread will dominate the components -- consider scaling
# first and confirm whether that was intended.
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

dt_pca = DecisionTreeClassifier(random_state=42).fit(X_train_pca, y_train)
y_pred_pca = dt_pca.predict(X_test_pca)
_print_performance("--- PCA Decision Tree performance ---", y_test, y_pred_pca)

--- Baseline Decision Tree performance ---
Accuracy: 0.92
Classification Report:
              precision    recall  f1-score   support

           0       0.89      1.00      0.94         8
           1       1.00      0.75      0.86         4

    accuracy                           0.92        12
   macro avg       0.94      0.88      0.90        12
weighted avg       0.93      0.92      0.91        12



--- PCA Decision Tree performance ---
Accuracy: 0.92
Classification Report:
              precision    recall  f1-score   support

           0       0.89      1.00      0.94         8
           1       1.00      0.75      0.86         4

    accuracy                           0.92        12
   macro avg       0.94      0.88      0.90        12
weighted avg       0.93      0.92      0.91        12



In [37]:
# Pair each feature column with the importance score learned by the baseline
# decision tree fitted in the earlier cell.
feature_names = X.columns
feature_importance = dt_baseline.feature_importances_

# Build the table and rank it from most to least important.
importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
    .sort_values(by='Importance', ascending=False)
)

print("--- Feature Importance ---")
print(importance_df)

--- Feature Importance ---
  Feature  Importance
3  Alkhol    0.900664
0     Age    0.099336
1  Smokes    0.000000
2   AreaQ    0.000000
