# Data Preprocessing

## Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Importing Dataset

In [None]:
# Load the raw water-quality measurements from the CSV next to the notebook.
# NOTE(review): the unicode_escape codec is presumably needed for non-ASCII
# header characters such as "µ" — confirm against the file's real encoding.
dataset = pd.read_csv(
    filepath_or_buffer="water_data.csv",
    encoding="unicode_escape",
)
dataset

## Data Visualization and Missing Values

In [None]:
# Print the column names, non-null counts and dtypes of the freshly loaded data.
dataset.info()

#### Convert to numeric

In [None]:
# Columns from position 3 onward are parsed as numbers; any value that cannot
# be parsed becomes NaN (errors="coerce").
#
# The conversion is done column-wise with DataFrame.apply (DataFrame.applymap
# is deprecated in recent pandas), and the result is assigned by column label
# so that pandas replaces the columns and they take on a numeric dtype.
# Writing through .iloc can instead store the values into the existing
# object-dtype columns and leave the dtype unchanged.
value_columns = dataset.columns[3:]
dataset[value_columns] = dataset[value_columns].apply(
    pd.to_numeric, errors="coerce"
)
dataset.info()

#### Drop Columns

In [None]:
# These columns are not referenced by any later cell of this notebook, so they
# are removed from the frame in place.
unused_columns = [
    "STATE",
    "STATION CODE",
    "LOCATIONS",
    "TOTAL COLIFORM (MPN/100ml)Mean",
]
dataset.drop(columns=unused_columns, inplace=True)

#### Check for Null Values

In [None]:
# Report, for each column, whether it contains at least one missing value.
has_missing = dataset.isna().any()
print(has_missing)

#### Removing Outliers

In [None]:
# Summary statistics per column, used to choose the outlier thresholds below.
dataset.describe()

In [None]:
# Plot PH against row position to spot outliers by eye. The x-range follows the
# actual number of rows instead of a hard-coded 1991, so the cell keeps working
# when the dataset has a different length.
plt.scatter(range(len(dataset)), dataset["PH"])

In [None]:
# Plot dissolved oxygen against row position to spot outliers by eye. The
# x-range follows the actual number of rows instead of a hard-coded 1991.
plt.scatter(range(len(dataset)), dataset["D.O. (mg/l)"])

In [None]:
# Plot conductivity against row position to spot outliers by eye. The x-range
# follows the actual number of rows instead of a hard-coded 1991.
plt.scatter(range(len(dataset)), dataset["CONDUCTIVITY (µmhos/cm)"])

In [None]:
# Plot B.O.D. against row position to spot outliers by eye. The x-range follows
# the actual number of rows instead of a hard-coded 1991.
plt.scatter(range(len(dataset)), dataset["B.O.D. (mg/l)"])

In [None]:
# Plot nitrate/nitrite against row position to spot outliers by eye. The
# x-range follows the actual number of rows instead of a hard-coded 1991.
plt.scatter(range(len(dataset)), dataset["NITRATENAN N+ NITRITENANN (mg/l)"])

In [None]:
# Plot fecal coliform against row position to spot outliers by eye. The x-range
# follows the actual number of rows instead of a hard-coded 1991.
plt.scatter(range(len(dataset)), dataset["FECAL COLIFORM (MPN/100ml)"])

In [None]:
# Keep only rows whose PH, B.O.D. and fecal coliform values lie inside the
# accepted ranges. Comparisons against NaN are False, so rows with a missing
# value in any of these three columns are dropped as well.
within_limits = (
    (dataset["PH"] < 14)
    & (dataset["PH"] > 4)
    & (dataset["B.O.D. (mg/l)"] < 190)
    & (dataset["FECAL COLIFORM (MPN/100ml)"] < 1000000000)
)
# .copy() makes the result an independent frame, so the column assignments in
# later cells do not operate on a slice of the original (SettingWithCopyWarning).
dataset = dataset.loc[within_limits].copy()

In [None]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() only added a stray "None" line.
dataset.info()
dataset.describe()

#### Fill NaN

In [None]:
# Replace the remaining missing values with the mean of their own column.
# Series.fillna is used instead of replace(np.NaN, ...): the np.NaN alias was
# removed in NumPy 2.0, and fillna states the intent directly.
columns_to_fill = (
    "Temp",
    "D.O. (mg/l)",
    "CONDUCTIVITY (µmhos/cm)",
    "NITRATENAN N+ NITRITENANN (mg/l)",
)
for column in columns_to_fill:
    dataset[column] = dataset[column].fillna(dataset[column].mean())
dataset.info()

#### Organize Dataset

In [None]:
# Collapse the measurements to one row per year (sorted, with the year as the
# index), each holding the mean of every remaining column.
per_year = dataset.groupby(by=["year"], sort=True, as_index=True)
df = per_year.mean()
df

In [None]:
# Summary statistics of the per-year means.
df.describe()

In [None]:
# y accumulates the total penalty per year; yy keeps one penalty column per
# parameter. An explicit dtype avoids the deprecation warning (and object
# dtype) that older pandas versions produce for an empty Series without one.
y = pd.Series(dtype="float64")
yy = pd.DataFrame()

In [None]:
def _ph_penalty(ph):
    """Return the penalty weight for a yearly mean pH value.

    Bands are checked from the innermost outward, so each test only needs the
    outer limits of its band:

        7.0 – 8.0                     -> 0
        6.5 – 7.0  or  8.0 – 8.5      -> 0.028
        6.3 – 6.5  or  8.5 – 9.0      -> 0.084
        6.0 – 6.3  or  9.0 – 10.0     -> 0.112
        anything else (including NaN) -> 0.14

    The original 0.084 band started at 8.8 on the alkaline side, which left
    8.5 < pH < 8.8 unmatched: those values fell through to the maximum 0.14
    penalty, more than pH 9–10 receives. The band now starts at 8.5, matching
    the contiguous bands on the acidic side.
    NOTE(review): confirm the 8.5 boundary against the index specification.
    """
    if 7 <= ph <= 8:
        return 0
    if 6.5 <= ph <= 8.5:
        return 0.028
    if 6.3 <= ph <= 9:
        return 0.084
    if 6 <= ph <= 10:
        return 0.112
    return 0.14


# Compute the pH penalty once, store it as a column, and start the running
# total from an independent copy of it.
yy["PH"] = df["PH"].apply(_ph_penalty)
y = yy["PH"].copy()

In [None]:
def _do_penalty(level):
    """Return the penalty weight for a yearly mean dissolved-oxygen value."""
    if 8 >= level >= 6.5:
        return 0
    if 6.5 >= level >= 6:
        return 0.04
    # Everything outside the two bands above (including NaN) gets the maximum.
    return 0.2


do_column = "D.O. (mg/l)"
yy[do_column] = df[do_column].apply(_do_penalty)
# Add this parameter's penalty to the running per-year total.
y = y + yy[do_column]

In [None]:
def _conductivity_penalty(level):
    """Return the penalty weight for a yearly mean conductivity value."""
    if 1500 >= level >= 50:
        return 0
    if 2000 >= level >= 1500:
        return 0.012
    if 2500 >= level >= 2000:
        return 0.048
    # Everything outside the bands above (including NaN) gets the maximum.
    return 0.06


conductivity_column = "CONDUCTIVITY (µmhos/cm)"
yy[conductivity_column] = df[conductivity_column].apply(_conductivity_penalty)
# Add this parameter's penalty to the running per-year total.
y = y + yy[conductivity_column]

In [None]:
def _nitrate_penalty(level):
    """Return the penalty weight for a yearly mean nitrate/nitrite value."""
    if 1 >= level:
        return 0
    if 1.5 >= level >= 1:
        return 0.036
    if 2 >= level >= 1.5:
        return 0.144
    # Everything above the bands (and NaN) gets the maximum.
    return 0.18


nitrate_column = "NITRATENAN N+ NITRITENANN (mg/l)"
yy[nitrate_column] = df[nitrate_column].apply(_nitrate_penalty)
# Add this parameter's penalty to the running per-year total.
y = y + yy[nitrate_column]

In [None]:
def _bod_penalty(level):
    """Return the penalty weight for a yearly mean B.O.D. value."""
    if 3 >= level >= 0:
        return 0
    if 5 >= level >= 3:
        return 0.024
    if 10 >= level >= 5:
        return 0.072
    # Everything outside the bands above (including NaN) gets the maximum.
    return 0.12


bod_column = "B.O.D. (mg/l)"
yy[bod_column] = df[bod_column].apply(_bod_penalty)
# Add this parameter's penalty to the running per-year total.
y = y + yy[bod_column]

In [None]:
def _fecal_coliform_penalty(count):
    """Return the penalty weight for a yearly mean fecal coliform value."""
    if 5000 >= count >= 0:
        return 0
    if 10000 >= count >= 5000:
        return 0.04
    if 100000 >= count >= 10000:
        return 0.12
    # Everything outside the bands above (including NaN) gets the maximum.
    return 0.2


fecal_column = "FECAL COLIFORM (MPN/100ml)"
yy[fecal_column] = df[fecal_column].apply(_fecal_coliform_penalty)
# Add this parameter's penalty to the running per-year total.
y = y + yy[fecal_column]

In [None]:
# Express the summed penalties on a 0-100 scale and keep the result next to
# the per-parameter columns.
y = y.mul(100)
yy["y"] = y

In [None]:
# Display the per-parameter penalties together with the combined score.
yy

In [None]:
# The group index of df (the years) becomes the model input.
x=df.index.tolist()

In [None]:
# Wrap every year in its own one-element list, giving one single-feature row
# per sample.
x = [[year] for year in x]

In [None]:
# Display the feature rows.
x

In [None]:
# Turn the score Series into a plain list of target values.
y=list(y)

In [None]:
# Display the target values.
y

#### Splitting Data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed random_state makes the
# split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

### Model Training

In [None]:
from sklearn.linear_model import LinearRegression

# fit() returns the estimator itself, so creation and training are chained;
# the trailing expression keeps the estimator as the cell's displayed value.
lr = LinearRegression().fit(x_train, y_train)
lr

#### Predicting for Test Data

In [None]:
# Predict the score for the held-out years and display the predictions.
y_pre=lr.predict(x_test)
y_pre

In [None]:
# Actual test scores as points, with the model's predictions drawn as a red
# line over them.
plt.scatter(x_test,y_test)
plt.plot(x_test,y_pre,"r")

In [None]:
# Extrapolate the fitted line to the year 2025.
lr.predict(X=[[2025]])

In [None]:
from sklearn.metrics import r2_score

# Coefficient of determination of the predictions on the held-out data.
r2_score(y_true=y_test, y_pred=y_pre)

In [None]:
from sklearn.metrics import mean_squared_error

# Mean squared error of the predictions on the held-out data.
mean_squared_error(y_true=y_test, y_pred=y_pre)

In [None]:
from joblib import dump

# Persist the fitted regression model to disk.
dump(value=lr, filename="model.save")