In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score
import pickle

# Step 1: Load and Merge Data
def load_and_merge_data(cities_path='Cities1.csv', who_pm_path='WHO_PM.csv'):
    """Load the city and WHO PM datasets and join them on country name.

    The WHO table is reduced to one row per location (the mean of
    ``FactValueNumeric``) and inner-joined to the cities table by comparing
    the lower-cased, whitespace-stripped ``Country`` and ``Location`` values.

    Args:
        cities_path: CSV with at least a ``Country`` column. Defaults to
            ``'Cities1.csv'``.
        who_pm_path: CSV with at least ``Location`` and ``FactValueNumeric``
            columns. Defaults to ``'WHO_PM.csv'``.

    Returns:
        A DataFrame with every column of the cities table plus ``Location``
        and ``Avg_PM2.5``, containing only cities whose country matched a
        WHO location.
    """
    cities_data = pd.read_csv(cities_path)
    who_pm_data = pd.read_csv(who_pm_path)

    # Build the normalized join keys first.
    cities_data['Country_Lower'] = cities_data['Country'].str.lower().str.strip()
    who_pm_data['Location_Lower'] = who_pm_data['Location'].str.lower().str.strip()

    # Aggregate on the *normalized* key. Grouping on the raw 'Location' would
    # leave spellings such as 'Germany' and 'germany ' as separate summary
    # rows that share one join key, which duplicates every matching city row
    # in the merge and splits the average between them.
    who_pm_summary = (
        who_pm_data.groupby('Location_Lower')
        .agg(**{
            'Location': ('Location', 'first'),
            'Avg_PM2.5': ('FactValueNumeric', 'mean'),
        })
        .reset_index()
    )

    merged_data = pd.merge(
        cities_data, who_pm_summary,
        left_on='Country_Lower', right_on='Location_Lower',
        how='inner'
    )

    # The normalized keys were only needed for the join.
    return merged_data.drop(columns=['Country_Lower', 'Location_Lower'])

# Step 2: Preprocess Data
def preprocess_data():
    """Build the model inputs from the merged city / WHO dataset.

    Returns:
        A ``(X, y, label_encoder)`` tuple: ``X`` holds the ``Country``
        (integer-encoded), ``AirQuality`` and ``Avg_PM2.5`` columns, ``y`` is
        the ``WaterPollution`` column, and ``label_encoder`` is the fitted
        ``LabelEncoder`` that maps country names to the codes used in ``X``.
    """
    merged = load_and_merge_data()

    # City and Region are not used as model inputs.
    trimmed = merged.drop(columns=['City', 'Region'])

    # Swap country names for integer codes; the fitted encoder is returned so
    # callers can apply the same mapping later.
    label_encoder = LabelEncoder()
    country_codes = label_encoder.fit_transform(trimmed['Country'])
    trimmed['Country'] = country_codes

    feature_columns = ['Country', 'AirQuality', 'Avg_PM2.5']
    target_column = 'WaterPollution'

    X = trimmed[feature_columns]
    y = trimmed[target_column]
    return X, y, label_encoder

# Step 3: Train and Save the Model
def train_and_save_model():
    """Train a random-forest regressor and pickle it with its encoder.

    Holds out 20% of the preprocessed rows, fits a 100-tree
    ``RandomForestRegressor`` on the rest, prints MSE and R2 for the held-out
    rows, and writes a ``(model, label_encoder)`` tuple to
    ``water_quality_model.pkl`` in the current working directory.
    """
    features, target, label_encoder = preprocess_data()

    # Fixed seed keeps the 80/20 split reproducible between runs.
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        target,
        test_size=0.2,
        random_state=42,
    )

    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Score on the held-out rows only.
    held_out_predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, held_out_predictions)
    r2 = r2_score(y_test, held_out_predictions)
    print(f"Model Performance: MSE = {mse}, R2 Score = {r2}")

    # The encoder is stored next to the model so both can be loaded together.
    artifact = (model, label_encoder)
    with open('water_quality_model.pkl', 'wb') as out_file:
        pickle.dump(artifact, out_file)
    print("Model saved as 'water_quality_model.pkl'.")

# Run the training and saving process.
# Guarded so that training (and writing the pickle file) only happens when
# this code is executed directly, not when it is imported as a module.
if __name__ == "__main__":
    train_and_save_model()

Model Performance: MSE = 592.4394392694603, R2 Score = 0.13487762672316428
Model saved as 'water_quality_model.pkl'.


In [None]:
 # Preview the first rows of the raw cities dataset (before any merging).
 df = pd.read_csv('Cities1.csv')
 df.head()

Unnamed: 0,City,Region,Country,AirQuality,WaterPollution
0,New York City,New York,United States of America,46.816038,49.50495
1,"Washington, D.C.",District of Columbia,United States of America,66.129032,49.107143
2,San Francisco,California,United States of America,60.514019,43.0
3,Berlin,,Germany,62.36413,28.612717
4,Los Angeles,California,United States of America,36.621622,61.299435
