In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Load the dataset
df = pd.read_csv('HousingData.csv')

# Handling missing values
# Replace 'NA' with NaN
df.replace('NA', pd.NA, inplace=True)

# Convert columns to appropriate data types
df = df.apply(pd.to_numeric, errors='ignore')

# Impute missing values
imputer = SimpleImputer(strategy='mean')
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

# Features and target variable
X = df_imputed.drop(columns=['MEDV'])
y = df_imputed['MEDV']

# Feature Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Initialize and train the Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Making predictions
y_pred = model.predict(X_test)

# Calculate R-squared score and Mean Squared Error (MSE)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(f"R-squared score: {r2:.3f}")
print(f"Mean Squared Error: {mse:.3f}")


R-squared score: 0.659
Mean Squared Error: 25.018


In [5]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Load the dataset
df = pd.read_csv('HousingData.csv')

# Handling missing values
# Replace 'NA' with NaN
df.replace('NA', pd.NA, inplace=True)

# Convert columns to appropriate data types
df = df.apply(pd.to_numeric, errors='ignore')

# Impute missing values
imputer = SimpleImputer(strategy='mean')
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

# Features and target variable
X = df_imputed.drop(columns=['MEDV'])
y = df_imputed['MEDV']

# Feature Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Initialize and train the Random Forest model
model = RandomForestRegressor(random_state=42, n_estimators=100)
model.fit(X_train, y_train)

# Making predictions
y_pred = model.predict(X_test)

# Calculate R-squared score and Mean Squared Error (MSE)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(f"R-squared score: {r2:.3f}")
print(f"Mean Squared Error: {mse:.3f}")


R-squared score: 0.888
Mean Squared Error: 8.244


In [6]:
import pandas as pd
import xgboost as xgb
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Load the dataset
df = pd.read_csv('HousingData.csv')

# Handling missing values
# Replace 'NA' with NaN
df.replace('NA', pd.NA, inplace=True)

# Convert columns to appropriate data types
df = df.apply(pd.to_numeric, errors='ignore')

# Impute missing values
imputer = SimpleImputer(strategy='mean')
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

# Features and target variable
X = df_imputed.drop(columns=['MEDV'])
y = df_imputed['MEDV']

# Feature Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Initialize and train the XGBoost model
model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
model.fit(X_train, y_train)

# Making predictions
y_pred = model.predict(X_test)

# Calculate R-squared score and Mean Squared Error (MSE)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(f"R-squared score: {r2:.3f}")
print(f"Mean Squared Error: {mse:.3f}")


R-squared score: 0.915
Mean Squared Error: 6.238
