In [None]:
!unzip archive.zip

Archive:  archive.zip
  inflating: water_potability.csv    


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [None]:
# Read water_potability.csv (extracted from archive.zip above) into a DataFrame.
data = pd.read_csv(filepath_or_buffer='./water_potability.csv')

In [None]:
# 'Potability' is the label to predict; every other column is a model input.
y = data['Potability']
X = data.drop(columns='Potability')

In [None]:

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# 100-tree random forest with a fixed seed. Note: the same estimator is
# constructed again in a later cell before anything is fitted.
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)

In [None]:
from sklearn.impute import SimpleImputer

# Mean imputation: missing entries are filled with the mean of their column.
imputer = SimpleImputer(
    strategy='mean',
)

In [None]:
# Learn the per-column fill values from the training split only; the test
# split is transformed later with these same values and is never fitted on.
imputer.fit(X_train)

In [None]:

# Apply the already-fitted imputer to both splits (training first, then test).
X_train_imputed, X_test_imputed = (
    imputer.transform(split) for split in (X_train, X_test)
)

In [None]:
# The model that is actually trained below: 100 trees, seeded for
# reproducible results.
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)

In [None]:
# Fit the forest on the imputed training features and their labels.
# The call is the last expression in the cell, so the notebook also displays
# the fitted estimator it returns.
rf_classifier.fit(X_train_imputed, y_train)

In [None]:
# Predict a label for every held-out row. The imputed test matrix is used so
# the inputs match the form the forest was trained on.
predictions = rf_classifier.predict(X_test_imputed)

In [None]:
# Compare the held-out predictions against the true test labels and report
# the resulting accuracy score.
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print("Accuracy:", accuracy)

Accuracy: 0.6676829268292683


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score



In [2]:
# Read water_potability.csv from the working directory into a DataFrame.
data = pd.read_csv(filepath_or_buffer='./water_potability.csv')

In [3]:
# Target is the 'Potability' column; the feature matrix is everything else.
y = data['Potability']
X = data.drop(columns='Potability')

In [4]:

# 80/20 train/test split with a fixed seed so the same rows are held out on
# every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:

# One estimator chaining three steps, in this order: mean imputation of
# missing values, standardisation, then a seeded random forest.
# The step names ('imputer', 'scaler', 'classifier') are what hyperparameter
# keys of the form '<step>__<param>' refer to.
pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(random_state=42)),
])

# Candidate values for the forest's settings. The 'classifier__' prefix routes
# each entry to the pipeline step named 'classifier'.
# 3 x 3 x 3 x 3 = 81 combinations in total.
param_grid = dict(
    classifier__n_estimators=[100, 200, 300],
    classifier__max_depth=[None, 10, 20],
    classifier__min_samples_split=[2, 5, 10],
    classifier__min_samples_leaf=[1, 2, 4],
)

In [6]:
# 5-fold stratified splitter; rows are shuffled with a fixed seed before
# being assigned to folds, so fold membership is reproducible.
cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [7]:
# Exhaustive search over param_grid: every candidate is cross-validated on
# the folds produced by `cv` and ranked by accuracy.
# n_jobs=-1 spreads the independent candidate/fold fits over all CPU cores
# instead of running them one after another. The forest and the splitter are
# both seeded, so the selected parameters do not change.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
)

In [8]:
# Run the search on the training split. The raw X_train (with missing values)
# is passed on purpose: imputation and scaling are steps of the pipeline being
# searched, so they are fitted as part of each candidate fit rather than once
# up front. The test split is not touched here.
grid_search.fit(X_train, y_train)


In [9]:
# Take the winning pipeline from the search. With GridSearchCV's default
# refit behaviour (refit is not overridden above) this is the full
# imputer + scaler + classifier pipeline with the best-scoring parameters.
best_model = grid_search.best_estimator_

In [10]:

# Predict labels for the held-out rows. best_model is the whole pipeline, so
# the raw X_test is passed and imputation/scaling happen inside predict().
predictions = best_model.predict(X_test)

In [11]:
# Report held-out accuracy of the tuned pipeline together with the parameter
# combination the grid search selected.
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print("Best Model Accuracy:", accuracy)
print("Best Model Hyperparameters:", grid_search.best_params_)

Best Model Accuracy: 0.6844512195121951
Best Model Hyperparameters: {'classifier__max_depth': 20, 'classifier__min_samples_leaf': 2, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 300}


In [20]:
import joblib
# Persist the fitted pipeline (imputer, scaler and forest together) to disk so
# it can be reloaded without retraining.
joblib.dump(value=best_model, filename='water_poli.joblib')


['water_poli.joblib']