In [65]:
import numpy as np
#cleanup tasks - venkata
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
import seaborn as sns
from matplotlib import pyplot as plt

scaler = StandardScaler()

In [None]:
test_df = pd.read_csv('../docs/test.csv')
# test_df.head()
train_df = pd.read_csv('../docs/train.csv')
train_df.head()

# missing_values_in_test = test_df.isnull().sum()
# print(missing_values_in_test)

def remove_highly_correlated_features(train_df, test_df, threshold=0.85):
    # Compute the correlation matrix
    corr_matrix = train_df.corr(numeric_only=True)

    to_drop = [column for column in corr_matrix.columns if any(corr_matrix[column] > threshold)]

    # Drop the highly correlated columns
    train_df = train_df.drop(columns=to_drop)
    test_df = test_df.drop(columns=to_drop)
    return train_df, test_df


def process_correlation(train_df, test_df):
    # Compute the correlation matrix
    correlation_matrix = train_df.corr(numeric_only=True)

    # Visualize the correlation matrix
    # plt.figure(figsize=(10, 8))
    # sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
    # plt.title('Correlation Matrix')
    # plt.show()
    remove_highly_correlated_features(train_df, test_df)

def preprocess_data(train_df, test_df):

    # Identify numerical and categorical columns
    numerical_cols = train_df.select_dtypes(include=['int64', 'float64']).columns
    categorical_cols = train_df.select_dtypes(include=['object', 'category']).columns

    # Scale numerical columns
    scaler = StandardScaler()
    train_df[numerical_cols] = scaler.fit_transform(train_df[numerical_cols])
    test_df[numerical_cols] = scaler.transform(test_df[numerical_cols])
    # Encode categorical columns
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    encoded_categorical = encoder.fit_transform(train_df[categorical_cols])
    encoded_categorical_test = encoder.transform(test_df[categorical_cols])
    # Convert encoded values to a DataFrame
    encoded_categorical_df = pd.DataFrame(
        encoded_categorical,
        columns=encoder.get_feature_names_out(categorical_cols),
        index=train_df.index
    )
    encoded_categorical_df_test = pd.DataFrame(encoded_categorical_test,
        columns=encoder.get_feature_names_out(categorical_cols),
        index=test_df.index
    )
    # Drop original categorical columns and concatenate the encoded DataFrame
    train_df = pd.concat([train_df.drop(categorical_cols, axis=1), encoded_categorical_df], axis=1)
    test_df = pd.concat([test_df.drop(categorical_cols, axis=1), encoded_categorical_df_test], axis=1)

    # Align the train and test DataFrames
    train_df = train_df.drop(columns=['Id'])  # Drop ID column
    test_ids = test_df['Id']
    test_df = test_df.drop(columns=['Id'])

    # process correlation features
    process_correlation(train_df, test_df)

    return train_df, test_df

In [None]:
# Load the dataset
X = train_df.drop('SalePrice', axis=1)  # Replace 'target_column' with the actual target column name
y = train_df['SalePrice']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# preprocess
X_train_ready, X_test_ready = preprocess_data(X_train, X_test)
print(X_train_ready.info())

In [68]:
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train_ready, y_train)

# Predict on the test data
predictions_rf = model.predict(X_test_ready)
X_test_ready['SalePriceRF'] = predictions_rf

# Calculate the mean squared error
mse = mean_squared_error(y_test, predictions_rf)

# Calculate the root mean squared error
rmse = np.sqrt(mse)

# Print the result
print(f"Root Mean Squared Error of Random Forest: {rmse}")
print(X_test_ready.head())

Root Mean Squared Error of Random Forest: 28931.374675911215
      MSSubClass  LotFrontage   LotArea  OverallQual  OverallCond  YearBuilt  \
892    -0.866764    -0.013818 -0.211594    -0.088934     2.165000  -0.259789   
1105    0.074110     1.111406  0.145643     1.374088    -0.524174   0.751222   
413    -0.631546    -0.576430 -0.160826    -0.820445     0.372217  -1.433867   
522    -0.161109    -0.817550 -0.529035    -0.088934     1.268609  -0.781602   
1036   -0.866764     0.749727  0.205338     2.105599    -0.524174   1.175195   

      YearRemodAdd  MasVnrArea  BsmtFinSF1  BsmtFinSF2  ...  SaleType_New  \
892       0.873470   -0.599984    0.472844   -0.285504  ...           0.0   
1105      0.487465    1.493012    1.276986   -0.285504  ...           0.0   
413      -1.683818   -0.599984   -0.971996   -0.285504  ...           0.0   
522      -1.683818   -0.599984   -0.102477   -0.285504  ...           0.0   
1036      1.114724   -0.195261    1.255193   -0.285504  ...           0.0