# House-Price-Model(Linear Regression)

###  Importing modules

In [393]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error
import joblib

### Reading File

### Taking features which has correlation greater than 0.6

In [398]:
def numerical_features(df):
    numerical_data = df[['OverallQual','GrLivArea','GarageCars','GarageArea','TotalBsmtSF','SalePrice']] 
    return numerical_data

In [400]:
def get_categorical_feature(df):
    features_categorical=df.select_dtypes(include=['object'])
    return features_categorical

In [404]:
def concat_num_cat_features(categorical,numerical):
    categorical.index = numerical.index
    train= pd.concat([categorical, numerical], axis=1)
    return train

In [421]:
def compute_rmsle(y_test: np.ndarray, y_pred: np.ndarray, precision: int = 2) -> float:
    rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
    return round(rmsle, precision)

In [477]:
def split_data(features,target):
    features_train, features_validation, target_train, target_validation = train_test_split(features,target, test_size = 0.2, random_state=0)
    return features_train, features_validation, target_train, target_validation

In [478]:
def Add_encoder(features_train_categorical):
    ordinal=OrdinalEncoder(handle_unknown='use_encoded_value',unknown_value=100)
    ordinal.fit(features_train_categorical)
    joblib.dump(ordinal,"../models/ordinalEncoding")

In [479]:
def Add_Scaling_object(features_train):
    sc = StandardScaler()
    sc.fit(features_train)
    joblib.dump(sc,"../models/Scaler")

In [489]:
def fill_missing_value_object(features_train):
    imputer = SimpleImputer(missing_values=np.nan, strategy='median')
    imputer.fit(features_train)
    joblib.dump(imputer,"../models/SimpleImputer")

In [490]:
def scaling(features_train):
    sft=joblib.load("../models/Scaler")
    features_train=sft.transform(features_train)
    return features_train

In [491]:
def fill_missing_value(features_train):
    SI=joblib.load("../models/SimpleImputer")
    features_train = SI.transform(features_train)
    return features_train

In [492]:
def encoder(features_train_categorical):
    oe=joblib.load("../models/ordinalEncoding")
    features_train_categorical[features_train_categorical.columns]=pd.DataFrame(oe.transform(features_train_categorical))
    return features_train_categorical

In [493]:
def model_preprocessing(features_train):
    
    features_train_numerical=numerical_features(features_train)
    features_train_categorical=get_categorical_feature(features_train)
    Add_encoder(features_train_categorical)
    features_train_categorical=encoder(features_train_categorical)
    features_train=concat_num_cat_features(features_train_categorical,features_train_numerical)
    features_train=features_train[['OverallQual','GrLivArea','Foundation','Neighborhood']]
    Add_Scaling_object(features_train)
    features_train=scaling(features_train)
    fill_missing_value_object(features_train)
    features_train=fill_missing_value(features_train)
    return features_train

In [494]:
def model_training(features_train,target_train):
    multi_regression_model = LinearRegression()
    multi_regression_model.fit(features_train,target_train)
    joblib.dump(multi_regression_model,"../models/model")

In [495]:
def model_evaluation(features_validation,target_validation):
    features_validation= model_preprocessing(features_validation)
    pr=joblib.load("../models/model")
    feature_target_prediction = pr.predict(features_validation)
    feature_target_prediction = pd.DataFrame(feature_target_prediction,columns=['SalePrice'])
    rmse=compute_rmsle(target_validation,feature_target_prediction)
    return rmse

In [496]:
def build_model(features):  
    Train_data_copy=features
    target=features['SalePrice']
    features_train, features_validation, target_train, target_validation=split_data(features,target)
    #features_train, features_validation, target_train, target_validation
    features_train = model_preprocessing(features_train)
    multi_regression_model = LinearRegression()
    multi_regression_model.fit(features_train,target_train)
    joblib.dump(multi_regression_model,"../models/model")
    eva=model_evaluation(features_validation,target_validation)
    return eva

In [497]:
features=pd.read_csv('F:/Master SEM-2/dsp-yashkumar-jain/data/house-prices-advanced-regression-techniques/train.csv')
a=build_model(features)
print(a)

0.21


In [504]:
def make_predictions(test):
    test_numerical=test[['OverallQual','GrLivArea','GarageCars','GarageArea','TotalBsmtSF']]
    test_categorical=get_categorical_feature(test)
    test_categorical=encoder(test_categorical)
    test=concat_num_cat_features(test_categorical,test_numerical)
    test=test[['OverallQual','GrLivArea','Foundation','Neighborhood']]
    test=scaling(test)
    test=fill_missing_value(test)
    model=joblib.load("../models/model")
    Final_prediction = model.predict(test)
    return Final_prediction

In [505]:
test=pd.read_csv('F:/Master SEM-2/dsp-yashkumar-jain/data/house-prices-advanced-regression-techniques/test.csv')
final=make_predictions(test)
final

array([114647.10585787, 168384.45842006, 148939.24351283, ...,
       132198.86573247, 113813.65534678, 230134.11246829])