# In [None]:
# 1. Data Loading and Preprocessing
#
# Reads "smoking.csv", forward-fills missing values, derives an income-quartile
# column, encodes the `smoke` column as 1/0 and writes the result to
# "cleaned_smoking_data.csv". The resulting DataFrame stays bound to `data`.

import warnings

import pandas as pd
import numpy as np

# Load the dataset
data = pd.read_csv("smoking.csv")

# Display the first few rows of the dataset
print("Dataset Overview:")
print(data.head())

# Check for missing values
print("\nMissing Values:")
print(data.isnull().sum())

# Replace missing values (if any) by carrying the previous row's value forward.
# DataFrame.ffill() is the supported spelling of the deprecated
# fillna(method='ffill') and fills exactly the same cells.
# NOTE(review): forward fill copies values from whichever row happens to come
# before and cannot fill NaNs in leading rows -- confirm this is the intended
# imputation for every column (e.g. the amt_* columns).
data = data.ffill()

# Create a new column for income groups using quartiles of gross_income.
# NOTE(review): assumes gross_income is numeric with enough distinct values to
# yield four unique bin edges; pd.qcut raises otherwise -- confirm against the
# actual file.
data['income_group'] = pd.qcut(
    data['gross_income'],
    q=4,
    labels=["Low", "Lower-Middle", "Upper-Middle", "High"],
)

# Encode the smoking flag as 1 ('Yes') / 0 ('No'). Series.map turns every label
# that is not a key of the dict into NaN, so surface those labels instead of
# letting them slip silently into the saved file and the later steps. The
# encoded values themselves are the same as a plain .map() call would give.
smoke_raw = data['smoke']
smoke_encoded = smoke_raw.map({'Yes': 1, 'No': 0})
unmapped_labels = smoke_raw[smoke_encoded.isna() & smoke_raw.notna()].unique()
if len(unmapped_labels) > 0:
    warnings.warn(
        "'smoke' contains labels other than 'Yes'/'No'; they were encoded "
        f"as NaN: {list(unmapped_labels)}"
    )
data['smoke'] = smoke_encoded

# Save the cleaned dataset
data.to_csv("cleaned_smoking_data.csv", index=False)
print("\nPreprocessed data saved as 'cleaned_smoking_data.csv'")

# 2. EDA and Visualizations
#
# Draws four figures from `data`: smoker share by gender, smoker share by
# income group, and one boxplot each for the weekday and weekend amounts
# split by gender. Every figure is shown immediately with plt.show().

# Plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns

# --- Smoking prevalence by gender ---
# The mean of `smoke` per gender is plotted as the proportion of smokers.
gender_smoke = data.groupby("gender")["smoke"].mean()
fig, ax = plt.subplots(figsize=(6, 4))
gender_smoke.plot(kind='bar', color=['skyblue', 'orange'], ax=ax)
ax.set_title("Smoking Prevalence by Gender")
ax.set_ylabel("Proportion of Smokers")
ax.set_xlabel("Gender")
plt.show()

# --- Smoking prevalence by income group ---
# seaborn's barplot aggregates `smoke` per income group itself.
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(data=data, x='income_group', y='smoke', palette='viridis', ax=ax)
ax.set_title("Smoking Prevalence by Income Group")
ax.set_ylabel("Proportion of Smokers")
ax.set_xlabel("Income Group")
plt.show()

# --- Smoking frequency: weekdays vs weekends by gender ---
# Both boxplots share layout and axis labels; only the plotted column, the
# palette and the title differ, so they are driven from one small table.
weekly_boxplots = (
    ('amt_weekdays', 'Set2', "Smoking Frequency on Weekdays by Gender"),
    ('amt_weekends', 'Set3', "Smoking Frequency on Weekends by Gender"),
)
for amount_col, palette_name, plot_title in weekly_boxplots:
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.boxplot(data=data, x='gender', y=amount_col, palette=palette_name, ax=ax)
    ax.set_title(plot_title)
    ax.set_ylabel("Smoking Amount")
    ax.set_xlabel("Gender")
    plt.show()

# 3. Predictive Modeling
#
# Fits a random forest that predicts the `smoke` column of `data` from four
# columns, evaluates it on a 30% hold-out split (accuracy, ROC AUC,
# classification report) and plots the ROC curve.

# Import libraries for machine learning
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, roc_curve
from sklearn.preprocessing import StandardScaler

# Select features and target variable
# NOTE(review): amt_weekdays / amt_weekends are labelled "Smoking Amount" in
# the EDA plots; if they record how much a respondent smokes they leak the
# target into the features and the scores below will look better than a
# model of real use -- confirm, and consider dropping them.
features = ['age', 'gross_income', 'amt_weekdays', 'amt_weekends']
target = 'smoke'

# Keep only rows that are complete in every modelling column. Forward fill
# cannot fill NaNs in leading rows and labels outside 'Yes'/'No' are encoded
# as NaN; a NaN target makes fit() raise, and NaN features are rejected by
# older scikit-learn releases. With complete data this selects every row, so
# the split below is unchanged.
model_data = data[features + [target]].dropna()
dropped_rows = len(data) - len(model_data)
if dropped_rows:
    print(f"Dropped {dropped_rows} rows with missing feature/target values before modeling")

X = model_data[features]
# A column that contained NaN is stored as float; cast back to integer labels.
y = model_data[target].astype(int)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize the features (statistics are fitted on the training split only,
# then applied to the test split)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train a Random Forest Classifier
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions
y_pred = rf_model.predict(X_test)
# predict_proba orders its columns like rf_model.classes_; look up the column
# of the positive class (1) instead of assuming it sits at index 1.
positive_col = list(rf_model.classes_).index(1)
y_pred_prob = rf_model.predict_proba(X_test)[:, positive_col]

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_prob)

print("Model Accuracy:", accuracy)
print("ROC AUC Score:", roc_auc)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Plot ROC Curve (the dashed diagonal is the chance-level reference line)
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
plt.figure(figsize=(8, 5))
plt.plot(fpr, tpr, label=f"ROC Curve (AUC = {roc_auc:.2f})", color='darkorange')
plt.plot([0, 1], [0, 1], linestyle='--', color='grey')
plt.title("ROC Curve")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend()
plt.show()
