<a href="https://colab.research.google.com/github/swarupneuro/desktop-tutorial/blob/main/Copy_of_Lung_cancer_prediction_large_file.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import random
from sklearn.linear_model import LogisticRegression

# Step 1: Generate synthetic lung cancer dataset
def generate_lung_cancer_data(num_records=100, seed=None):
    """Build a synthetic lung-cancer dataset.

    Every record gets a random ``Age`` in the inclusive range 25-80 and eleven
    random 0/1 indicator columns. The ``Lung_Cancer`` label is derived
    deterministically from four of them: it is 1 when more than two of
    ``Smoking``, ``Coughing``, ``Chronic_Disease`` and ``Wheezing`` are set,
    otherwise 0.

    Args:
        num_records: Number of rows to generate.
        seed: Optional seed. When given, values are drawn from a private
            ``random.Random(seed)`` so the result is reproducible and the
            global ``random`` state is left untouched. When ``None`` (the
            default) the module-level ``random`` functions are used, so a
            prior ``random.seed(...)`` call still controls the output.

    Returns:
        A ``pandas.DataFrame`` with ``num_records`` rows and the columns
        ``Age``, the eleven indicator columns, and ``Lung_Cancer``.
    """
    # The `random` module exposes the same randint/choice API as a Random
    # instance, so it can stand in as the generator when no seed is supplied.
    rng = random if seed is None else random.Random(seed)

    binary_columns = (
        "Gender",
        "Smoking",
        "Yellow_Fingers",
        "Anxiety",
        "Peer_Pressure",
        "Chronic_Disease",
        "Fatigue",
        "Allergy",
        "Wheezing",
        "Alcohol",
        "Coughing",
    )
    risk_columns = ("Smoking", "Coughing", "Chronic_Disease", "Wheezing")

    # Columns are filled one at a time (all ages first, then each indicator in
    # the order above) so the sequence of random draws is stable for a seed.
    data = {"Age": [rng.randint(25, 80) for _ in range(num_records)]}
    for column in binary_columns:
        data[column] = [rng.choice([0, 1]) for _ in range(num_records)]

    # Basic rule: high risk if sum of risk features > 2
    data["Lung_Cancer"] = [
        1 if sum(risk_flags) > 2 else 0
        for risk_flags in zip(*(data[column] for column in risk_columns))
    ]

    return pd.DataFrame(data)

# Step 2: Create dataset
# Seed the global generator first: the data is drawn with the `random` module,
# so without a fixed seed every Restart & Run All would produce different
# records, a different model and a different CSV.
SEED = 42
random.seed(SEED)
df = generate_lung_cancer_data(100)

# Step 3: Train logistic regression model
# Features are every column except the label.
X = df.drop(columns="Lung_Cancer")
y = df["Lung_Cancer"]

model = LogisticRegression(max_iter=1000)
model.fit(X, y)

# Step 4: Predict
# NOTE: the model is scored on the same rows it was fitted on, so these are
# in-sample predictions and say nothing about performance on unseen data.
# X was extracted above, so adding columns to df here does not leak them
# into the feature matrix.
df["Prediction"] = model.predict(X)
df["Prediction_Prob"] = model.predict_proba(X)[:, 1]  # column 1 = P(class 1)

# Step 5: Save to CSV
# Single source of truth for the output path, used for both writing and the
# confirmation message.
OUTPUT_CSV = "lung_cancer_predictions_large.csv"
df.to_csv(OUTPUT_CSV, index=False)
print(f"Saved as: {OUTPUT_CSV}")
print(df.head())


Saved as: lung_cancer_predictions_large.csv
   Age  Gender  Smoking  Yellow_Fingers  Anxiety  Peer_Pressure  \
0   39       1        0               0        0              0   
1   64       1        0               1        1              0   
2   36       0        0               1        1              1   
3   27       0        0               1        0              1   
4   79       0        0               1        1              0   

   Chronic_Disease  Fatigue  Allergy  Wheezing  Alcohol  Coughing  \
0                1        0        0         0        0         0   
1                0        1        1         0        1         0   
2                0        0        0         1        0         0   
3                1        1        1         1        0         1   
4                1        1        1         1        0         1   

   Lung_Cancer  Prediction  Prediction_Prob  
0            0           0         0.038969  
1            0           0         0.005114  


In [2]:
# `google.colab` only exists inside a Google Colab runtime. Guard the import so
# the notebook still runs end-to-end (Restart & Run All) in other environments,
# where the CSV simply stays on local disk.
try:
    from google.colab import files
except ImportError:
    files = None

# Change the filename to match the one used when saving the file
if files is not None:
    files.download("lung_cancer_predictions_large.csv")
else:
    print("Not running in Google Colab; file is available locally: lung_cancer_predictions_large.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>