In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Asthma-progression pipeline: load the CSV, clean and encode it, train a
# logistic-regression classifier on demographic features, and report how it
# performs on a held-out test split.

# Location of the input CSV, relative to the current working directory.
DATA_PATH = 'asthma_data.csv'

# Integer codes assigned to each categorical predictor. Any label that is not
# listed here cannot be encoded and the row carrying it is discarded.
# NOTE(review): recent pandas versions appear to treat the literal string
# 'None' as a missing value when reading CSVs; if so, the 'None' education
# level never reaches this mapping. Confirm against the installed pandas and
# pass keep_default_na / na_values to read_csv if that level must be kept.
CATEGORY_CODES = {
    'gender': {'Male': 0, 'Female': 1},
    'ethnicity': {'Caucasian': 0, 'African American': 1, 'Asian': 2, 'Other': 3},
    'education_level': {'None': 0, 'High School': 1, 'Bachelor\'s': 2, 'Higher': 3},
}

# Model inputs and the label column (assumed binary; verify against the data).
FEATURE_COLUMNS = ['age', 'gender', 'ethnicity', 'education_level']
TARGET_COLUMN = 'asthma_progression'


def load_dataset(path=DATA_PATH):
    """Read the asthma dataset from *path* into a DataFrame.

    Args:
        path: Location of the CSV file. Defaults to ``DATA_PATH``.

    Returns:
        The raw, uncleaned DataFrame produced by ``pd.read_csv``.

    Raises:
        FileNotFoundError: If no file exists at *path*. The message names the
            path that was tried so the fix (move the file or change the path)
            is obvious.
    """
    try:
        return pd.read_csv(path)
    except FileNotFoundError as err:
        raise FileNotFoundError(
            f"Dataset not found at {path!r}; place the CSV there or pass the "
            f"correct path to load_dataset()."
        ) from err


def clean_and_encode(raw):
    """Return a de-duplicated, numerically encoded, NaN-free copy of *raw*.

    Steps, in order:
      1. Keep the first row for each ``patient_id``.
      2. Replace the labels of every column in ``CATEGORY_CODES`` with their
         integer codes.
      3. Drop rows that are missing any feature or the target.

    Encoding happens *before* the missing-value filter on purpose:
    ``Series.map`` produces NaN for labels absent from the mapping, so
    filtering afterwards removes both originally-missing values and
    unrecognised labels. Filtering first would let those NaNs through to the
    scaler and the model.

    Args:
        raw: DataFrame containing ``patient_id``, the columns in
            ``FEATURE_COLUMNS`` and ``TARGET_COLUMN``. It is not modified.

    Returns:
        A new DataFrame in which the categorical columns hold integer codes
        and no feature/target cell is missing.
    """
    # Copy so the column assignments below never write into the caller's frame.
    frame = raw.drop_duplicates(subset='patient_id').copy()

    for column, codes in CATEGORY_CODES.items():
        frame[column] = frame[column].map(codes)

    frame = frame.dropna(subset=FEATURE_COLUMNS + [TARGET_COLUMN])

    # NaNs introduced by map() force a float dtype; once they are gone the
    # codes can be stored as integers again. astype also returns a fresh frame.
    return frame.astype({column: 'int64' for column in CATEGORY_CODES})


# The guard keeps the helpers importable without running the pipeline. Inside
# a notebook cell or a script __name__ is "__main__", so everything below
# still runs and its variables remain module-level globals.
if __name__ == "__main__":
    # Load and prepare the data
    df = clean_and_encode(load_dataset())
    if df.empty:
        raise ValueError("No usable rows remain after cleaning the dataset.")

    # Select features and target variable
    X = df[FEATURE_COLUMNS]
    y = df[TARGET_COLUMN]

    # Split data into training and testing sets (70% / 30%, fixed seed)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42
    )

    # Feature scaling: fit on the training split only so that no statistics
    # from the test split leak into training
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Model training
    model = LogisticRegression()
    model.fit(X_train, y_train)

    # Predictions for the held-out test split
    y_pred = model.predict(X_test)

    # Predictions for every cleaned row (training and test rows alike),
    # stored next to the original columns
    df['predicted_asthma_progression'] = model.predict(scaler.transform(X))

    # Model evaluation on the test split
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))

    # Display the first few rows of the dataset with the predictions
    print(df.head())

FileNotFoundError: [Errno 2] No such file or directory: 'asthma_data.csv'