In [None]:
import numpy as np
#cleanup tasks - venkata
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
import seaborn as sns
from matplotlib import pyplot as plt

# Module-level StandardScaler instance. Note that preprocess_data() creates
# its own local StandardScaler, so this object is not the one fitted there.
scaler = StandardScaler()

In [None]:
# Read the training and test CSV files from ../docs into DataFrames.
train_df = pd.read_csv('../docs/train.csv')
test_df = pd.read_csv('../docs/test.csv')

# Preview call kept from the original cell.
train_df.head()

def remove_highly_correlated_features(train_df, test_df, threshold=0.85):
    """Drop numeric features that are highly correlated with an earlier feature.

    Pairwise correlations are computed on the numeric columns of ``train_df``
    only. For every pair whose absolute correlation exceeds ``threshold``,
    the column that appears later in ``train_df`` is dropped. The same
    columns are then removed from ``test_df``.

    Args:
        train_df: DataFrame used to measure the correlations.
        test_df: DataFrame from which the same columns are removed. Columns
            selected for removal that are absent here are skipped.
        threshold: Absolute correlation above which the later column of a
            pair is dropped.

    Returns:
        tuple: ``(train_df, test_df)`` as new DataFrames; the inputs are not
        modified.
    """
    # Absolute value so strong negative correlations count as well.
    corr_matrix = train_df.corr(numeric_only=True).abs()

    # Keep only the strict upper triangle. This masks the diagonal (every
    # column correlates perfectly with itself, which would otherwise flag
    # every column) and ensures each pair is inspected exactly once.
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper = corr_matrix.where(upper_mask)

    # A column is dropped when it exceeds the threshold against any column
    # that precedes it. Masked (NaN) cells compare as False.
    to_drop = upper.columns[(upper > threshold).any(axis=0)].tolist()

    train_df = train_df.drop(columns=to_drop)
    # errors='ignore': the test frame may lack some train-only columns.
    test_df = test_df.drop(columns=to_drop, errors='ignore')
    return train_df, test_df


def process_correlation(train_df, test_df, plot=False):
    """Remove highly correlated features, optionally plotting the correlations.

    Args:
        train_df: DataFrame whose numeric columns drive the correlation check.
        test_df: DataFrame that receives the same column removals.
        plot: When True, draw a heatmap of the numeric correlation matrix of
            ``train_df`` before pruning. Defaults to False.

    Returns:
        tuple: ``(train_df, test_df)`` exactly as returned by
        ``remove_highly_correlated_features``.
    """
    if plot:
        # Only compute the matrix here when it is actually going to be shown;
        # the pruning step computes its own.
        correlation_matrix = train_df.corr(numeric_only=True)
        plt.figure(figsize=(10, 8))
        sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
        plt.title('Correlation Matrix')
        plt.show()

    # Hand the pruned frames back instead of silently discarding them.
    return remove_highly_correlated_features(train_df, test_df)

def preprocess_data(train_df, test_df):
    """Standardise numeric columns and one-hot encode categorical columns.

    The scaler and the encoder are fitted on ``train_df`` only and then
    applied to ``test_df``. The ``Id`` column is removed from both frames.

    Args:
        train_df: Training features; must contain an ``Id`` column.
        test_df: Features to transform with the train-fitted scaler and
            encoder; must contain an ``Id`` column and the numeric and
            categorical columns found in ``train_df``.

    Returns:
        tuple: ``(train_df, test_df)`` as new DataFrames holding the scaled
        numeric columns followed by the one-hot encoded columns. The input
        frames are left unmodified.

    Raises:
        KeyError: If ``Id`` is missing from either frame.
    """
    # Drop the identifier first: it is not a feature, so there is no point in
    # scaling it. drop() returns new frames, so the column assignments below
    # never write into the caller's DataFrames.
    train_df = train_df.drop(columns=['Id'])
    test_df = test_df.drop(columns=['Id'])

    # Identify numerical and categorical columns from the training frame.
    numerical_cols = train_df.select_dtypes(include=['int64', 'float64']).columns
    categorical_cols = train_df.select_dtypes(include=['object', 'category']).columns

    # Scale numerical columns (statistics learned from the training data only).
    scaler = StandardScaler()
    train_df[numerical_cols] = scaler.fit_transform(train_df[numerical_cols])
    test_df[numerical_cols] = scaler.transform(test_df[numerical_cols])

    # Encode categorical columns; categories unseen during fit become all-zero rows.
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    encoded_train = encoder.fit_transform(train_df[categorical_cols])
    encoded_test = encoder.transform(test_df[categorical_cols])
    encoded_names = encoder.get_feature_names_out(categorical_cols)

    # Wrap the encoded arrays in DataFrames that share the source row index
    # so the column-wise concat below lines up row for row.
    encoded_train_df = pd.DataFrame(encoded_train, columns=encoded_names, index=train_df.index)
    encoded_test_df = pd.DataFrame(encoded_test, columns=encoded_names, index=test_df.index)

    # Replace the original categorical columns with their encoded counterparts.
    train_df = pd.concat([train_df.drop(categorical_cols, axis=1), encoded_train_df], axis=1)
    test_df = pd.concat([test_df.drop(categorical_cols, axis=1), encoded_test_df], axis=1)

    # NOTE(review): the result of this call is not used here, so the frames
    # returned below are unaffected by it. Decide whether correlation-based
    # pruning should actually feed into the returned features.
    process_correlation(train_df, test_df)

    return train_df, test_df

In [81]:
def get_training_data():
    """Split the module-level ``train_df`` and preprocess both partitions.

    ``SalePrice`` is used as the target. The rows are split 80/20 with a
    fixed random seed, and the feature partitions are passed through
    ``preprocess_data``. A summary of the preprocessed training frame is
    printed.

    Side effects:
        Assigns the module-level names ``y_train``, ``y_test``,
        ``X_train_ready`` and ``X_test_ready``. This is kept for code outside
        this function that reads those names; prefer the return value.

    Returns:
        tuple: ``(X_train_ready, X_test_ready, y_train, y_test)``.
    """
    global y_train, y_test, X_train_ready, X_test_ready

    # Separate the features from the SalePrice target.
    X = train_df.drop('SalePrice', axis=1)
    y = train_df['SalePrice']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Scale / encode, fitting on the training partition only.
    X_train_ready, X_test_ready = preprocess_data(X_train, X_test)

    # DataFrame.info() prints its summary itself and returns None, so it must
    # not be wrapped in print() (that emitted a stray "None" line).
    X_train_ready.info()
    return X_train_ready, X_test_ready, y_train, y_test


In [82]:
# Fetch the preprocessed feature frames and their targets.
X_train, X_test, y_train, y_test = get_training_data()

# Fit a 100-tree random forest with a fixed seed on the training partition.
model = RandomForestRegressor(random_state=42, n_estimators=100)
model.fit(X_train, y_train)

# Score the held-out partition and store the predictions next to its features.
predictions_rf = model.predict(X_test)
X_test['SalePriceRF'] = predictions_rf

# RMSE is the square root of the mean squared error against the true targets.
mse = mean_squared_error(y_test, predictions_rf)
rmse = np.sqrt(mse)

# Report the error and show the first rows, including the prediction column.
print(f"Root Mean Squared Error of Random Forest: {rmse}")
print(X_test.head())

<class 'pandas.core.frame.DataFrame'>
Index: 1168 entries, 254 to 1126
Columns: 301 entries, MSSubClass to SaleCondition_Partial
dtypes: float64(301)
memory usage: 2.7 MB
None
Root Mean Squared Error of Random Forest: 28931.374675911215
      MSSubClass  LotFrontage  LotArea  OverallQual  OverallCond  YearBuilt  \
892     -0.86676     -0.01382 -0.21159     -0.08893      2.16500   -0.25979   
1105     0.07411      1.11141  0.14564      1.37409     -0.52417    0.75122   
413     -0.63155     -0.57643 -0.16083     -0.82044      0.37222   -1.43387   
522     -0.16111     -0.81755 -0.52903     -0.08893      1.26861   -0.78160   
1036    -0.86676      0.74973  0.20534      2.10560     -0.52417    1.17519   

      YearRemodAdd  MasVnrArea  BsmtFinSF1  BsmtFinSF2  ...  SaleType_New  \
892        0.87347    -0.59998     0.47284    -0.28550  ...       0.00000   
1105       0.48746     1.49301     1.27699    -0.28550  ...       0.00000   
413       -1.68382    -0.59998    -0.97200    -0.28550  .