### Core Prediction Question
*Develop a predictive model to estimate the sodium levels of rivers and lakes based on features such as CO2 levels, turbidity, dissolved oxygen, and contaminant levels.*

Authors: Nidhi Nayak, Vishali Kirthi Vallioor
ENERES 131 Final Project

#### Reading In Our Data Sets

In [None]:
import zipfile
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Load the water-quality CSV into a DataFrame and preview its first rows.
df = pd.read_csv("final_data/water_quality.csv")
df.head()

In [None]:
# List the column (characteristic) names to identify the ones relevant for sodium-level prediction.
df.columns.unique()

In [None]:
# Characteristics selected (after background research) for the sodium-level analysis.
# 'Sodium' itself stays in the list: it is the quantity plotted against every other
# feature in the EDA cells below.
relevant_sodium_characteristics = [
    'Turbidity', 'Calcium', 'Magnesium', 'Carbon dioxide', 'Sodium',
    'Hardness, Ca, Mg', 'Organic carbon', 'Silica', 'Chloride', 'Arsenic',
]
filtered_sodium_dataframe = df[relevant_sodium_characteristics]

# Report the size, then end the cell with a small rich preview instead of
# print()-ing the entire frame as plain text.
print(f"Filtered dataset shape (rows, columns): {filtered_sodium_dataframe.shape}")
filtered_sodium_dataframe.head()

In [None]:
# Structural overview: column dtypes and non-null counts.
print("Basic Info about the Dataset: ")
# DataFrame.info() writes its report straight to stdout and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line.
filtered_sodium_dataframe.info()

# Missing values per column (reported once; the original cell printed this twice).
print("\nNaN Count In Water Quality Dataset: ")
print(filtered_sodium_dataframe.isnull().sum())

# Summary statistics for each selected characteristic.
print("\nDataset Described (Stats): ")
print(filtered_sodium_dataframe.describe())

In [None]:
# Mean-impute for EDA: every missing entry is replaced by the mean of its own column.
# Despite the `_nan_vals` suffix, the resulting frame is the filled (imputed) copy;
# `filtered_sodium_dataframe` itself is left untouched.
column_means = filtered_sodium_dataframe.mean()
filtered_sodium_dataframe_nan_vals = filtered_sodium_dataframe.fillna(value=column_means)

### EDA

#### Correlation of All Relevant Variables

In [None]:
# Correlation heatmap across the selected characteristics, computed on the
# mean-imputed frame with DataFrame.corr() at its default settings.
# NOTE(review): mean-imputed values can pull correlations toward zero for sparsely
# measured columns -- confirm this is the intended basis for the matrix.
plt.figure(figsize=(12, 8))
corr_matrix = filtered_sodium_dataframe_nan_vals.corr()
sns.heatmap(corr_matrix, annot=True, cmap='rocket', fmt='.2f', linewidths=0.5)
plt.title('Correlation Matrix Between Relevant Features')
# Render explicitly, as the other plotting cells do, so the cell does not end by
# echoing the Text object returned from plt.title().
plt.show()

### Exploring Sodium Relationships In Depth

In [None]:
# Sodium readings with missing entries removed, then transformed with log(1 + x).
dropped_nan_sodium = filtered_sodium_dataframe['Sodium'].dropna()
filtered_sodium_log = np.log1p(dropped_nan_sodium)

# Histogram with a KDE overlay; labels are set on the Axes that seaborn returns.
sodium_hist_ax = sns.histplot(filtered_sodium_log, kde=True, color='orange')
sodium_hist_ax.set_title('Log-Transformed Distribution of Sodium Levels')
sodium_hist_ax.set_xlabel('Log Sodium')
sodium_hist_ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Keep only rows where both variables of each pair are present.
dropped_nan_sodium_calcium = filtered_sodium_dataframe[['Sodium', 'Calcium']].dropna()
dropped_nan_sodium_magnesium = filtered_sodium_dataframe[['Sodium', 'Magnesium']].dropna()

# Side-by-side scatter plots drawn on explicitly created axes.
fig, (calcium_ax, magnesium_ax) = plt.subplots(1, 2, figsize=(10, 5))

sns.scatterplot(data=dropped_nan_sodium_calcium, x='Calcium', y='Sodium',
                color='orange', ax=calcium_ax)
calcium_ax.set_title('Sodium vs Calcium')

sns.scatterplot(data=dropped_nan_sodium_magnesium, x='Magnesium', y='Sodium',
                color='purple', ax=magnesium_ax)
magnesium_ax.set_title('Sodium vs Magnesium')

fig.tight_layout()
plt.show()

In [None]:
# Keep only rows where both variables of each pair are present.
dropped_nan_sodium_turbidity = filtered_sodium_dataframe[['Sodium', 'Turbidity']].dropna()
dropped_nan_sodium_carbon_dioxide = filtered_sodium_dataframe[['Sodium', 'Carbon dioxide']].dropna()

# Side-by-side scatter plots drawn on explicitly created axes.
fig, (turbidity_ax, carbon_dioxide_ax) = plt.subplots(1, 2, figsize=(10, 5))

sns.scatterplot(data=dropped_nan_sodium_turbidity, x='Turbidity', y='Sodium',
                color='purple', ax=turbidity_ax)
turbidity_ax.set_title('Sodium vs Turbidity')

sns.scatterplot(data=dropped_nan_sodium_carbon_dioxide, x='Carbon dioxide', y='Sodium',
                color='orange', ax=carbon_dioxide_ax)
carbon_dioxide_ax.set_title('Sodium vs Carbon dioxide')

fig.tight_layout()
plt.show()

In [None]:
# Keep only rows where both variables of each pair are present.
dropped_nan_sodium_hardness = filtered_sodium_dataframe[['Sodium', 'Hardness, Ca, Mg']].dropna()
dropped_nan_sodium_organic_carbon = filtered_sodium_dataframe[['Sodium', 'Organic carbon']].dropna()

# Side-by-side scatter plots drawn on explicitly created axes.
fig, (hardness_ax, organic_carbon_ax) = plt.subplots(1, 2, figsize=(10, 5))

sns.scatterplot(data=dropped_nan_sodium_hardness, x='Hardness, Ca, Mg', y='Sodium',
                color='orange', ax=hardness_ax)
hardness_ax.set_title('Sodium vs Hardness, Ca, Mg')

sns.scatterplot(data=dropped_nan_sodium_organic_carbon, x='Organic carbon', y='Sodium',
                color='purple', ax=organic_carbon_ax)
organic_carbon_ax.set_title('Sodium vs Organic Carbon')

fig.tight_layout()
plt.show()

In [None]:
# Keep only rows where both variables of each pair are present.
dropped_nan_sodium_silica = filtered_sodium_dataframe[['Sodium', 'Silica']].dropna()
dropped_nan_sodium_chloride = filtered_sodium_dataframe[['Sodium', 'Chloride']].dropna()

# Side-by-side scatter plots drawn on explicitly created axes.
fig, (silica_ax, chloride_ax) = plt.subplots(1, 2, figsize=(10, 5))

sns.scatterplot(data=dropped_nan_sodium_silica, x='Silica', y='Sodium',
                color='purple', ax=silica_ax)
silica_ax.set_title('Sodium vs Silica')

sns.scatterplot(data=dropped_nan_sodium_chloride, x='Chloride', y='Sodium',
                color='orange', ax=chloride_ax)
chloride_ax.set_title('Sodium vs Chloride')

fig.tight_layout()
plt.show()