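# House prices modeling

Baseline linear-regression model that predicts `SalePrice` from two continuous features (`LotArea`, `GrLivArea`) and two categorical features (`MSZoning`, `Neighborhood`), evaluated with RMSLE on a 20% hold-out split.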

In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder

## Load the dataset

In [3]:
# Directory holding train.csv / test.csv. The default is the original
# machine-specific location; set HOUSE_PRICES_DATA_DIR to run the notebook
# elsewhere without editing this cell.
DATA_DIR = Path(os.environ.get('HOUSE_PRICES_DATA_DIR', 'C:/dsp-thirumurugan-kumar/data'))

train_df = pd.read_csv(DATA_DIR / 'train.csv')
test_df = pd.read_csv(DATA_DIR / 'test.csv')

## Split the data

In [4]:
# Target is the sale price; every other column stays in the feature frame.
y = train_df['SalePrice']
X = train_df.drop(columns='SalePrice')

# 80/20 train / hold-out split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Preprocessing 

### Feature selection

In [5]:
# Numeric columns that get standard-scaled before modelling.
continuous_features = [
    'LotArea',
    'GrLivArea',
]

# String-valued columns that get imputed and one-hot encoded.
categorical_features = [
    'MSZoning',
    'Neighborhood',
]

### Feature processing (scaling and encoding)

In [6]:
# Work on explicit copies: the frames returned by train_test_split are derived
# from X, and assigning columns on them can trigger pandas'
# SettingWithCopyWarning. Copying also makes this cell safe to re-run.
X_train = X_train.copy()
X_test = X_test.copy()

# Every transformer below is fitted on the training split only and then
# applied to the hold-out split, so nothing from the hold-out rows influences
# the fitted statistics.

# Fill missing categorical values with the most frequent training category.
imputer = SimpleImputer(strategy='most_frequent')
X_train[categorical_features] = imputer.fit_transform(X_train[categorical_features])
X_test[categorical_features] = imputer.transform(X_test[categorical_features])

# Standardise the continuous features.
scaler = StandardScaler()
X_train[continuous_features] = scaler.fit_transform(X_train[continuous_features])
X_test[continuous_features] = scaler.transform(X_test[continuous_features])

# handle_unknown='ignore' encodes a category that never appeared in the
# training split as an all-zero row instead of raising at transform time.
encoder = OneHotEncoder(handle_unknown='ignore')
X_train_encoded = encoder.fit_transform(X_train[categorical_features]).toarray()
X_test_encoded = encoder.transform(X_test[categorical_features]).toarray()

# Final design matrices: scaled continuous columns followed by the one-hot
# columns. Both pieces are plain arrays in the same row order as X_train /
# X_test, so they line up with y_train / y_test.
X_train_processed = np.concatenate(
    [X_train[continuous_features].to_numpy(), X_train_encoded], axis=1
)
X_test_processed = np.concatenate(
    [X_test[continuous_features].to_numpy(), X_test_encoded], axis=1
)

## Model training

In [8]:
# Linear-regression baseline fitted on the processed training matrix.
# LinearRegression comes from the imports cell at the top of the notebook,
# so it is not re-imported here.
model = LinearRegression()
model.fit(X_train_processed, y_train)

## Model Evaluation

In [10]:
# Predict sale prices for the hold-out split using the fitted model.
y_pred = model.predict(X_test_processed)

def compute_rmsle(y_test, y_pred, precision=2):
    """Return the root mean squared logarithmic error, rounded.

    Args:
        y_test: True target values.
        y_pred: Predicted target values.
        precision: Number of decimal places kept in the result (default 2).

    Returns:
        The square root of ``mean_squared_log_error(y_test, y_pred)``,
        rounded to ``precision`` decimal places.
    """
    msle = mean_squared_log_error(y_test, y_pred)
    return round(np.sqrt(msle), precision)

# Score the hold-out predictions.
rmsle = compute_rmsle(y_test, y_pred)
print(f'RMSLE: {rmsle}')

# Display a random sample of 20 rows from the test set predictions.
# random_state pins the sample so the same rows are shown on every run.
print("\nRandom sample of 20 rows from the test set predictions:")
predictions_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(predictions_df.sample(20, random_state=42))

RMSLE: 0.21

Random sample of 20 rows from the test set predictions:
      Actual      Predicted
275   205000  146445.119242
274   124500  118198.265794
691   755000  465618.706973
259    97000   86438.884128
1160  146000  158954.530898
478   297000  217006.833869
1033  230000  210887.494671
51    114500  124788.486455
588   143000  191252.387538
937   253000  284495.885313
925   175000  146929.159437
679   128500  120312.750670
239   113000  138395.338889
984   126000  186748.419496
233   128200  156396.473991
353   105900   72172.003489
887   135500  135819.905667
1432   64500   85412.146392
620    67000   92287.312496
78    136500  178161.504144
