In [13]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score

# Load data
file_path = '/Users/zhangxijing/MasterNEU/INFO6105DataScienceEngineeringMethodsandTools/Dataset/diabetes_project.csv'
data = pd.read_csv(file_path)

# Preprocess data: Remove outliers, impute missing values, normalize
def preprocess_data(data):
    for column in data.columns:
        Q1 = data[column].quantile(0.25)
        Q3 = data[column].quantile(0.75)
        IQR = Q3 - Q1
        data = data[(data[column] >= (Q1 - 1.5 * IQR)) & (data[column] <= (Q3 + 1.5 * IQR))]

    imputer = KNNImputer(n_neighbors=5)
    data = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)

    scaler = StandardScaler()
    data = pd.DataFrame(scaler.fit_transform(data), columns=data.columns)
    return data

data = preprocess_data(data)

# Unsupervised learning for label generation
def generate_labels(data):
    clustering_features = data[['Glucose', 'BMI', 'Age']]
    kmeans = KMeans(n_clusters=2, random_state=42, n_init=10)
    clusters = kmeans.fit_predict(clustering_features)
    data['Outcome'] = clusters
    cluster0_mean_glucose = data.loc[data['Outcome'] == 0, 'Glucose'].mean()
    cluster1_mean_glucose = data.loc[data['Outcome'] == 1, 'Glucose'].mean()
    if cluster0_mean_glucose > cluster1_mean_glucose:
        data['Outcome'] = data['Outcome'].replace({0: 1, 1: 0})
    return data

data = generate_labels(data)

# Split data
X = data.drop('Outcome', axis=1)
y = data['Outcome']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Ensemble model with cross-validation
def build_ensemble(X_train, y_train):
    models = [
        ('lr', LogisticRegression(max_iter=1000)),
        ('rf', RandomForestClassifier(n_estimators=100, random_state=42))
    ]
    ensemble = VotingClassifier(estimators=models, voting='soft')
    ensemble.fit(X_train, y_train)
    return ensemble

ensemble = build_ensemble(X_train, y_train)

# Evaluate the model using cross-validation
scores = cross_val_score(ensemble, X, y, cv=5)
accuracy = scores.mean()
print(f'Cross-validated accuracy: {accuracy:.2f}')

Cross-validated accuracy: 0.99


In [23]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

# Employing the model on other datasets
# Load new data
file_path = '/Users/zhangxijing/MasterNEU/INFO6105DataScienceEngineeringMethodsandTools/Dataset/Heart_Failure.csv'
data = pd.read_csv(file_path)

# Preprocess data: Handle categorical variables, remove outliers, impute missing values, normalize
def preprocess_data(data):
    # Separate the features into numerical and categorical
    categorical_features = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
    numerical_features = ['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak']
    
    # Define transformers for numerical and categorical data
    numeric_transformer = Pipeline(steps=[
        ('imputer', KNNImputer(n_neighbors=5)),
        ('scaler', StandardScaler())
    ])
    
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])
    
    # Create the preprocessing engine
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ])
    
    # Apply transformations and return the transformed data
    data = pd.DataFrame(preprocessor.fit_transform(data), columns=preprocessor.get_feature_names_out())
    return data

# data, y = preprocess_data(data)
data_preprocessed = preprocess_data(data.drop('HeartDisease', axis=1))
y = data['HeartDisease']

# Split data
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.2, random_state=42)

# Ensemble model with cross-validation
def build_ensemble(X_train, y_train):
    models = [
        ('lr', LogisticRegression(max_iter=1000)),
        ('rf', RandomForestClassifier(n_estimators=100, random_state=42))
    ]
    ensemble = VotingClassifier(estimators=models, voting='soft')
    ensemble.fit(X_train, y_train)
    return ensemble

ensemble = build_ensemble(X_train, y_train)

# Evaluate the model using cross-validation
scores = cross_val_score(ensemble, data, y, cv=5)
accuracy = scores.mean()
print(f'Cross-validated accuracy: {accuracy:.2f}')

# Optionally, evaluate on X_test, y_test if needed for a direct test set evaluation
y_pred = ensemble.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print(f'Test set accuracy: {test_accuracy:.2f}')

NameError: name 'SimpleImputer' is not defined