# Library Import

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
import pickle

# Synthetic Data Generation

In [4]:
# Seed NumPy's global RNG so every run draws the same synthetic samples
np.random.seed(42)

# Base water usage added for each task; the dict's insertion order also
# defines the order of the task list that np.random.choice samples from
base_water_usage = {
    'Cooking': 2,
    'Showering': 8,
    'Gardening': 17,
    'PersonalHygiene': 1,
    'Cleaning': 3,
    'OutdoorCleaning': 10
}
task_types = list(base_water_usage)

# Number of synthetic rows to generate
n_samples = 1000

# Draw one duration (in minutes) per row, uniformly from [1, 45)
durations = np.random.uniform(1, 45, n_samples)

# Pick a task for each row, then add that task's base usage to the duration;
# the result is clipped at zero so usage can never be negative
task_type = np.random.choice(task_types, n_samples)
base_per_sample = np.array([base_water_usage[name] for name in task_type])
water_usage = np.maximum(durations + base_per_sample, 0)

# Assemble the table and one-hot encode the task type column
df = pd.get_dummies(
    pd.DataFrame({
        'Duration': durations,
        'TaskType': task_type,
        'WaterUsage': water_usage
    }),
    columns=['TaskType']
)

print(df.head(n=100))
df.to_csv('refined_dataset.csv', index=False)

     Duration  WaterUsage  TaskType_Cleaning  TaskType_Cooking  \
0   17.479765   18.479765              False             False   
1   42.831429   52.831429              False             False   
2   33.207733   50.207733              False             False   
3   27.340973   30.340973               True             False   
4    7.864820    9.864820              False              True   
..        ...         ...                ...               ...   
95  22.727006   32.727006              False             False   
96  24.000244   32.000244              False             False   
97  19.811805   20.811805              False             False   
98   2.118442   10.118442              False             False   
99   5.747223   15.747223              False             False   

    TaskType_Gardening  TaskType_OutdoorCleaning  TaskType_PersonalHygiene  \
0                False                     False                      True   
1                False                      True   

# Load Data

In [5]:
# Reload the dataset from the CSV written by the data-generation cell
df = pd.read_csv(filepath_or_buffer='refined_dataset.csv')

# Preview the first 100 rows as the cell's rich output
df.head(100)

Unnamed: 0,Duration,WaterUsage,TaskType_Cleaning,TaskType_Cooking,TaskType_Gardening,TaskType_OutdoorCleaning,TaskType_PersonalHygiene,TaskType_Showering
0,17.479765,18.479765,False,False,False,False,True,False
1,42.831429,52.831429,False,False,False,True,False,False
2,33.207733,50.207733,False,False,True,False,False,False
3,27.340973,30.340973,True,False,False,False,False,False
4,7.864820,9.864820,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...
95,22.727006,32.727006,False,False,False,True,False,False
96,24.000244,32.000244,False,False,False,False,False,True
97,19.811805,20.811805,False,False,False,False,True,False
98,2.118442,10.118442,False,False,False,False,False,True


# Data Preprocessing

In [6]:
# Features are every column except the target; the target is WaterUsage
X = df.drop(columns=['WaterUsage'])
y = df.loc[:, 'WaterUsage']

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Unfitted scaler; it is fitted later as the first step of the model pipeline
scaler = StandardScaler()

In [13]:
# Chain feature scaling and Lasso regression so both are fitted (and later
# pickled) as a single estimator
pipe = Pipeline([('scaler', scaler), ('lasso', Lasso())])

pipe.fit(X_train, y_train)

# Describe the example once, so the feature row and the printed message
# cannot disagree about which task is being predicted
sample_task = 'OutdoorCleaning'
sample_minutes = 10
task_column = f'TaskType_{sample_task}'
assert task_column in X_train.columns, f"unknown task type: {sample_task}"

# One-row frame with the training column layout: every feature is zero,
# then the duration and the one-hot flag for the chosen task are set.
# Named `sample` rather than `input` to avoid shadowing the builtin.
sample = pd.DataFrame(0, index=[0], columns=X_train.columns)
sample.loc[0, 'Duration'] = sample_minutes
sample.loc[0, task_column] = 1

prediction = pipe.predict(sample)
print(f"{sample_task} for {sample_minutes} minutes will use {prediction[0]:.2f} liters of water.")

Cooking for 10 minutes will use 18.79 liters of water:


In [14]:
# Persist the fitted pipeline (scaler + Lasso) to disk in binary pickle format.
# NOTE(review): the filename reads "laso", presumably meant "lasso"; kept
# unchanged because other code may load the model from this exact path.
with open('laso_model.pkl', mode='wb') as model_file:
    model_file.write(pickle.dumps(pipe))