In [3]:
import os
import pandas as pd
import numpy as np
import joblib
 
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor

MODEL_FILE = "model.pkl"
PIPELINE_FILE = "pipeline.pkl"

def build_pipeline(num_attribute , cat_attribute):
    """Build the preprocessing transformer for the feature columns.

    Parameters
    ----------
    num_attribute : list
        Column names routed through the numeric branch
        (median imputation followed by standard scaling).
    cat_attribute : list
        Column names routed through the categorical branch
        (one-hot encoding with ``handle_unknown="ignore"``).

    Returns
    -------
    ColumnTransformer
        An unfitted transformer combining both branches.
    """
    numeric_steps = [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
    categorical_steps = [
        ("OneHot", OneHotEncoder(handle_unknown="ignore")),
    ]

    return ColumnTransformer([
        ("num", Pipeline(numeric_steps), num_attribute),
        ("cat", Pipeline(categorical_steps), cat_attribute),
    ])

# Train when either saved artifact is missing; otherwise load both and run
# inference. Both files are checked because the inference branch loads both:
# a model.pkl without pipeline.pkl must trigger retraining rather than a
# FileNotFoundError.
if not (os.path.exists(MODEL_FILE) and os.path.exists(PIPELINE_FILE)):

    housing = pd.read_csv("housing.csv")

    # Bucket median income into five categories, used only to stratify the split.
    housing["income_cat"] = pd.cut(
        housing["median_income"],
        bins=[0.0, 1.5, 3.0, 4.5, 6.0, np.inf],
        labels=[1, 2, 3, 4, 5],
    )

    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    # n_splits=1, so take the single split directly. split() yields positional
    # indices, hence .iloc (not .loc, which only matches for a default index).
    train_index, _ = next(split.split(housing, housing["income_cat"]))
    train_set = housing.iloc[train_index].drop("income_cat", axis=1)

    housing_labels = train_set["median_house_value"].copy()
    housing_features = train_set.drop("median_house_value", axis=1)

    num_attribute = housing_features.drop("ocean_proximity", axis=1).columns.tolist()
    cat_attribute = ["ocean_proximity"]

    pipeline = build_pipeline(num_attribute, cat_attribute)
    housing_prepared = pipeline.fit_transform(housing_features)

    # Fixed seed so retraining yields the same forest on every run.
    model = RandomForestRegressor(random_state=42)
    model.fit(housing_prepared, housing_labels)

    joblib.dump(model, MODEL_FILE)
    joblib.dump(pipeline, PIPELINE_FILE)

    print("MODEL IS TRAINED SUCCESSFULLY")

else:
    # joblib.load unpickles arbitrary objects: only load artifacts that this
    # notebook itself produced.
    model = joblib.load(MODEL_FILE)
    pipeline = joblib.load(PIPELINE_FILE)

    # NOTE(review): housing.csv is the same file the training split was drawn
    # from, so these predictions include rows the model was fitted on and do
    # not estimate error on unseen data.
    input_data = pd.read_csv("housing.csv")
    prepared_data = pipeline.transform(input_data)
    input_data["Predicted_prices"] = model.predict(prepared_data)

    input_data.to_csv("output.csv", index=False)

    print("OUTPUT IS STORED INTO THE OUTPUT CSV")

OUTPUT IS STORED INTO THE OUTPUT CSV


In [4]:
# Reload the predictions file written by the inference branch above.
df = pd.read_csv("output.csv")

In [5]:
# Preview the first rows: actual median_house_value next to Predicted_prices.
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,Predicted_prices
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,442223.04
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,383021.05
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,376215.04
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,352893.05
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,301180.0


In [8]:
# Preview the last rows: actual median_house_value next to Predicted_prices.
df.tail()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,Predicted_prices
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND,75492.0
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND,78057.0
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7,92300.0,INLAND,89902.0
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND,84794.0
20639,-121.24,39.37,16.0,2785.0,616.0,1387.0,530.0,2.3886,89400.0,INLAND,89554.0


CONCLUSION
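-The model works well and predicted the house prices with small errors on the rows shown above. Note that these predictions were made on `housing.csv`, the same file the training data was drawn from, so they do not measure accuracy on unseen data.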
