In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import mlflow 
import mlflow.sklearn
import numpy as np



In [2]:
# Load the housing dataset from its CSV file (path is relative to the working directory).
housing = pd.read_csv("datasets/housing/housing.csv")

In [3]:
# Preview the first rows of the loaded frame.
housing.head()


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [4]:
# Hold out 20% of the rows for testing. The fixed random_state makes the split
# (and therefore every metric computed from it) identical on each fresh run.
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

In [5]:
# Training features (target column removed), the same features without the
# ocean_proximity column, and a copy of the target column.
housing = train_set.drop(columns="median_house_value")
housing_num = housing.drop(columns="ocean_proximity")
housing_labels = train_set["median_house_value"].copy()


In [6]:
# Split the held-out set into features and a copy of the target column.
X_test = test_set.drop(columns="median_house_value")
Y_test = test_set["median_house_value"].copy()

In [7]:
from sklearn.impute import SimpleImputer
# Learn per-column medians from housing_num, then use them to fill missing values.
imputer = SimpleImputer(strategy="median")
imputer.fit(housing_num)
X = imputer.transform(housing_num)

In [8]:
from sklearn.base import BaseEstimator, TransformerMixin
# Positions of the source columns inside the 2-D array handed to transform().
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio columns computed from fixed column positions.

    Two columns are always appended: rooms / households and
    population / households. When ``add_bedrooms_per_room`` is true a third
    column, bedrooms / rooms, is appended after them.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default True
        Whether to append the bedrooms-per-room column.
    """

    def __init__(self, add_bedrooms_per_room=True):
        # Plain keyword argument only (no *args / **kwargs).
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the ratio columns appended on the right.

        ``X`` is indexed as ``X[:, i]``, so it must support 2-D positional
        indexing.
        """
        rooms = X[:, rooms_ix]
        households = X[:, households_ix]
        extra_columns = [
            rooms / households,                 # rooms per household
            X[:, population_ix] / households,   # population per household
        ]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_ix] / rooms)  # bedrooms per room
        # np.c_ receives the same tuple of operands as an explicit
        # np.c_[X, col1, col2, ...] subscript would.
        return np.c_[(X, *extra_columns)]

### MLflow Tracking

In [9]:
# Point MLflow at a tracking server on localhost. The server can be started with:
#   mlflow server --backend-store-uri mlruns/ --default-artifact-root mlruns/ --host 127.0.0.1 --port 5000
remote_tracking_uri = "http://127.0.0.1:5000"
mlflow.set_tracking_uri(remote_tracking_uri)

In [10]:
# Confirm which tracking URI MLflow is currently using.
mlflow.get_tracking_uri()

'http://127.0.0.1:5000'

In [11]:
# Select the experiment that subsequent runs are recorded under.
# NOTE(review): the name is spelled "regressiom"; it is kept unchanged here
# because renaming it would make MLflow use a different experiment.
exp_name = "linear_regressiom_housing_data"
mlflow.set_experiment(exp_name)

2022/05/03 19:28:13 INFO mlflow.tracking.fluent: Experiment with name 'linear_regressiom_housing_data' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlruns/1', experiment_id='1', lifecycle_stage='active', name='linear_regressiom_housing_data', tags={}>

In [12]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder

In [13]:
# One parent run with three nested child runs: data preparation, model
# training and model scoring.
with mlflow.start_run(run_name="PARENT_RUN") as parent_run:
    # Stage 1: build the preprocessing pipeline and fit it on the training features.
    with mlflow.start_run(run_name="Data_prep", nested=True) as data_prep:
        num_pipeline = Pipeline([
            ("imputer", SimpleImputer(strategy="median")),
            ("attribs_adder", CombinedAttributesAdder()),
            ("std_scaler", StandardScaler()),
        ])
        num_attribs = list(housing_num)
        cat_attribs = ["ocean_proximity"]
        full_pipeline = ColumnTransformer([
            ("num", num_pipeline, num_attribs),
            # handle_unknown="ignore": a category that shows up only in the
            # test split is encoded as all zeros instead of making
            # transform() raise (the encoder's default is to error out).
            ("cat", OneHotEncoder(handle_unknown="ignore"), cat_attribs),
        ])
        housing_prepared = full_pipeline.fit_transform(housing)

    # Stage 2: fit a linear regression on the prepared training data.
    with mlflow.start_run(run_name="model_training", nested=True) as model_train:
        lin_reg = LinearRegression()
        lin_reg.fit(housing_prepared, housing_labels)

    # Stage 3: score on the held-out set. The pipeline is only transformed
    # here (never re-fitted), so test rows do not influence preprocessing.
    with mlflow.start_run(run_name="model_scoring", nested=True) as model_score:
        X_test_prep = full_pipeline.transform(X_test)
        predictions = lin_reg.predict(X_test_prep)
        mse = mean_squared_error(Y_test, predictions)
        rmse = np.sqrt(mse)
        mlflow.log_metric(key="rmse", value=rmse)

    # Logged on the parent run. Only the regressor is saved, so anyone loading
    # this model must apply full_pipeline to raw data before calling predict().
    mlflow.sklearn.log_model(lin_reg, "model")
        
        
        