In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from reparo.imputers import CDI, FRNNI, HotDeckImputation, KNNImputer, PMM, MICE, SICE
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

ModuleNotFoundError: No module named 'pandas'

# Missing Values Imputation

This notebook compares several missing-value imputation techniques from scikit-learn and reparo by measuring the accuracy of classifiers trained on the imputed housing data.

In [None]:
# Load the housing dataset; the bare filename is resolved against the current working directory.
df = pd.read_csv('HousingData.csv') 

In [None]:
# Summarise column dtypes and non-null counts to see which columns have gaps.
df.info()

In [None]:
# Per-column missing-value totals and their share of all rows, largest first.
null_flags = df.isnull()
missing_values = null_flags.sum().sort_values(ascending=False)
missing_percent = (null_flags.sum() / null_flags.count() * 100).sort_values(ascending=False)
missing_data = pd.concat({'Total': missing_values, 'Percent': missing_percent}, axis=1)

# Only columns that actually contain gaps are worth listing.
has_gaps = missing_data['Total'] > 0
print(missing_data.loc[has_gaps])

# Bar chart of the raw counts for every column (including those with none missing).
plt.figure(figsize=(12, 8))
ax = missing_values.plot(kind='bar', figsize=(12, 8))
ax.set_title('Number of Missing Values by Column', fontsize=16)
ax.set_xlabel('Columns', fontsize=14)
ax.set_ylabel('Count', fontsize=14)
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# Binary label: 1 when the MEDV value is above 25, otherwise 0.
df['target'] = df['MEDV'].gt(25).astype(int)

# Features are every column except the raw MEDV value and the label derived from it.
y = df['target']
X = df.drop(columns=['MEDV', 'target'])

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

print(f"Training set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")

## Creating Imputers

We'll create various imputers from the scikit-learn and reparo libraries:

In [None]:
# Baseline strategies from scikit-learn's SimpleImputer; the constant strategy fills gaps with 0.
imputer_mean = SimpleImputer(strategy='mean')
imputer_median = SimpleImputer(strategy='median')
imputer_most_frequent = SimpleImputer(strategy='most_frequent')
imputer_constant = SimpleImputer(strategy='constant', fill_value=0)

# reparo imputers. Their classes are imported with the rest of the dependencies
# in the notebook's import cell, so a missing package is reported up front
# rather than halfway through a run.
imputer_cdi = CDI()
imputer_frnni = FRNNI()
imputer_hotdeck = HotDeckImputation()
imputer_knn = KNNImputer(n_neighbors=5)
imputer_pmm = PMM()
imputer_sice = SICE()
imputer_mice = MICE()

# Registry of every imputer under comparison, keyed by the label used in the
# printed messages and result tables. Insertion order is the processing order.
imputers = {
    'Mean': imputer_mean,
    'Median': imputer_median,
    'Most_Frequent': imputer_most_frequent,
    'Constant': imputer_constant,
    'CDI': imputer_cdi,
    'FRNNI': imputer_frnni,
    'HotDeck': imputer_hotdeck,
    'KNN': imputer_knn,
    'PMM': imputer_pmm,
    'SICE': imputer_sice,
    'MICE': imputer_mice
}

## Training Imputers and Filling Missing Values

We'll fit each imputer on the training data and then apply it to both the training and testing data.

In [None]:
# Imputed feature splits keyed by imputer name; each value is {'train': ..., 'test': ...}.
imputed_data = {}

def apply_sklearn_imputer(imputer_name, imputer):
    """Fit ``imputer`` on the training features and impute both splits.

    Reads the notebook-level ``X_train`` and ``X_test`` frames. The imputer is
    fitted on ``X_train`` only and then used to transform each split.

    Args:
        imputer_name: Label of the imputer; accepted but not used here.
        imputer: Object exposing ``fit(X)`` and ``transform(X)``.

    Returns:
        A ``(train, test)`` tuple of DataFrames that carry the column and
        index labels of the corresponding input split.
    """
    imputer.fit(X_train)

    def _as_frame(split):
        # Re-wrap whatever transform() returns with the split's own labels.
        return pd.DataFrame(imputer.transform(split), columns=split.columns, index=split.index)

    return _as_frame(X_train), _as_frame(X_test)

def apply_reparo_imputer(imputer_name, imputer):
    """Fit ``imputer`` on the training features and impute both splits.

    Reads the notebook-level ``X_train`` and ``X_test``. Unlike
    ``apply_sklearn_imputer``, the transform results are returned exactly as
    the imputer produced them, without re-wrapping.

    Args:
        imputer_name: Label of the imputer; accepted but not used here.
        imputer: Object exposing ``fit(X)`` and ``transform(X)``.

    Returns:
        A ``(train, test)`` tuple of the imputer's raw transform outputs.
    """
    imputer.fit(X_train)
    # Transform the training split first, then the test split.
    return tuple(imputer.transform(split) for split in (X_train, X_test))

# Run every registered imputer and keep the imputed train/test splits of those that succeed.
for name, imputer in imputers.items():
    try:
        print(f"Applying {name} imputer...")
        # Dispatch on the imputer's type rather than on a hard-coded list of
        # dictionary keys, so renaming or adding a SimpleImputer entry in
        # `imputers` cannot silently route it to the wrong helper.
        if isinstance(imputer, SimpleImputer):
            X_train_imputed, X_test_imputed = apply_sklearn_imputer(name, imputer)
        else:
            X_train_imputed, X_test_imputed = apply_reparo_imputer(name, imputer)

        imputed_data[name] = {
            'train': X_train_imputed,
            'test': X_test_imputed
        }
        print(f"Successfully applied {name} imputer")
    except Exception as e:
        # Best effort: a failing imputer is reported and left out of
        # `imputed_data` so the remaining ones can still be compared.
        print(f"Error applying {name} imputer: {e}")

## Model Training and Evaluation

Now we'll train different models on each imputed dataset and evaluate their performance.

In [None]:
# Classifiers to compare; a fixed seed is passed wherever the estimator accepts one.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Gaussian Naive Bayes': GaussianNB(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42)
}

# results[imputer name][model name] -> test accuracy, or None when the fit/predict failed.
results = {}

scaler = StandardScaler()

for imputer_name, data in imputed_data.items():
    # Scaling statistics come from the imputed training split only and are
    # then reused for the test split.
    train_scaled = scaler.fit_transform(data['train'])
    test_scaled = scaler.transform(data['test'])

    results[imputer_name] = {}

    for model_name, model in models.items():
        try:
            model.fit(train_scaled, y_train)
            accuracy = accuracy_score(y_test, model.predict(test_scaled))
        except Exception as e:
            print(f"Error with {imputer_name} and {model_name}: {e}")
            results[imputer_name][model_name] = None
        else:
            results[imputer_name][model_name] = accuracy
            print(f"Imputer: {imputer_name}, Model: {model_name}, Accuracy: {accuracy:.4f}")

## Results Summary

Let's create a summary table to compare the performance of different imputation methods and models.

In [None]:
# Flatten the nested results into one record per (imputer, model) pair,
# leaving out combinations that failed (stored as None).
result_rows = [
    {
        'Imputation Algorithm': imputer_name,
        'Prediction Algorithm': model_name,
        'Accuracy': accuracy
    }
    for imputer_name, model_results in results.items()
    for model_name, accuracy in model_results.items()
    if accuracy is not None
]

# Long-format table, best accuracy first.
results_df = pd.DataFrame(result_rows).sort_values('Accuracy', ascending=False)

print("Top 10 performing combinations:")
display(results_df.head(10))

# Wide-format view: one row per imputer, one column per classifier.
pivot_results = results_df.pivot(
    index='Imputation Algorithm',
    columns='Prediction Algorithm',
    values='Accuracy',
)

print("\nComparison across all methods:")
display(pivot_results)

# Heatmap of the wide table, each cell annotated with its accuracy.
plt.figure(figsize=(12, 8))
heatmap_axes = sns.heatmap(pivot_results, annot=True, cmap='YlGnBu', fmt='.3f')
heatmap_axes.set_title('Accuracy by Imputation and Prediction Methods')
plt.tight_layout()
plt.show()