In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Data Exploration


This assignment is to predict the values of the sale price by applying regression models

- analyze the correlation of the features and choose those whose correlation with the sale price is larger than 0.2
- deal with categorical data
- deal with missing data


In [None]:
# DATA: House Price Prediction
# Load the raw dataset and keep an untouched copy for later reference.
data = pd.read_csv("data.csv")
originalData = data.copy()

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly instead of being wrapped in print() (which would also print "None").
data.info()

# Leave head() as the last expression so the notebook actually renders the preview.
data.head()

In [None]:
# Feature matrix: every column except the target ("SalePrice") and the "Id" column.
X = data.drop(columns=["SalePrice", "Id"])

# Regression target.
Y = data["SalePrice"]

In [None]:
# Scatter matrix of the features in X.
# NOTE: figsize=(100, 100) requests a very large figure, so this cell may be slow to
# render; comment the call out to skip it.
pd.plotting.scatter_matrix(X, figsize=(100, 100))

## Data Cleaning

Please see: https://www.bogotobogo.com/python/scikit-learn/scikit_machine_learning_Data_Preprocessing-Missing-Data-Categorical-Data.php


### Missing Data (finding and replacing)


In [None]:
from sklearn.impute import SimpleImputer

# Keep only the columns that have at least 50% non-missing values
# (thresh is the minimum number of non-NA entries a column needs to survive).
X.dropna(axis=1, thresh=0.5 * len(X), inplace=True)

# Fill the remaining gaps in the numeric columns with each column's mean.
cols = X.select_dtypes(include="number").columns
mean_imputer = SimpleImputer(strategy="mean")
X[cols] = mean_imputer.fit_transform(X[cols])

### Categorical Data (removing or replacing them)


In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Impute categorical (object-dtype) columns with each column's most frequent value.
cols = X.select_dtypes(include=[object]).columns
X[cols] = SimpleImputer(strategy="most_frequent").fit_transform(X[cols])

# One-hot encode the categorical columns; drop="first" removes one level per
# column so the dummy columns are not perfectly collinear.
encoder = OneHotEncoder(handle_unknown="ignore", drop="first", sparse_output=False)
encoded = pd.DataFrame(
    encoder.fit_transform(X[cols]),
    columns=encoder.get_feature_names_out(cols),
    # pd.concat(axis=1) aligns rows on the index. Reusing X's index keeps the
    # encoded rows attached to the right samples even if X no longer has a
    # default 0..n-1 index (otherwise the concat would introduce NaN rows).
    index=X.index,
)

# Replace the original categorical columns with their encoded counterparts.
X.drop(cols, axis=1, inplace=True)
X = pd.concat([X, encoded], axis=1)

## Correlation


In [None]:
# Correlation matrix of all features together with the target.
correlation = pd.concat([X, Y], axis=1).corr()

# Correlation of every feature with SalePrice (the target's own entry removed),
# keeping only the features whose correlation exceeds 0.2.
sale_price_corr = correlation["SalePrice"].drop("SalePrice")
features = sale_price_corr[sale_price_corr > 0.2]

# Restrict the feature matrix to the selected columns.
X = X[features.index]

## Heatmap


In [None]:
# Recompute the correlation matrix on the selected features plus the target,
# then draw it as an annotated heatmap on a large canvas.
correlation = pd.concat([X, Y], axis=1).corr()

plt.figure(figsize=(50, 50))
sns.heatmap(correlation, cmap="coolwarm", annot=True)

# Linear Regression


Please answer the following questions:

1. what is linear regression?

**Linear regression** is a model that assumes a linear relationship between input variables ($x$, or _features_) and a single output variable ($y$), where the output variable ($y$) can be calculated from a linear combination of the input variables ($x$). [In class definition]

2. why do we choose the linear regression model?

**Linear regression** is a simple model which is easy to understand and interpret. It is a good starting point for regression tasks. It is also a good model to use when the relationship between the input and output variables is approximately linear.


In [None]:
# Data preparation: hold out 20% of the samples as a test set.
# random_state is fixed so the split is reproducible.
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42, test_size=0.2)

In [None]:
# Fit an ordinary least-squares linear regression on the training split.
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X=X_train, y=Y_train)

In [None]:
# Evaluation of the fitted model on the held-out test split.
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

Y_pred = model.predict(X_test)


def printScores(Y_test, Y_pred):
    """Print MAE, RMSE and R2 for the given true and predicted target values.

    Y_test: ground-truth target values.
    Y_pred: predicted target values, passed to the sklearn metric functions
        together with Y_test.
    """
    mae = mean_absolute_error(Y_test, Y_pred)
    # RMSE is the square root of the mean squared error.
    rmse = np.sqrt(mean_squared_error(Y_test, Y_pred))
    r2 = r2_score(Y_test, Y_pred)

    for label, value in (("MAE: ", mae), ("RMSE: ", rmse), ("R2 Score: ", r2)):
        print(label, value)


printScores(Y_test, Y_pred)

3.  what are scaling techniques?

**Scaling** is a technique used to normalize the range of independent variables or features of data. It is generally useful to normalize the input data before feeding it to the model. There are two common scaling techniques: **Standardization** and **Normalization**. Standardization is the process of rescaling the features so that they have the properties of a standard normal distribution with a mean of 0 and a standard deviation of 1. Normalization is the process of scaling individual samples to have unit norm.

Use two scaling methods to train the linear regression model again.\
Comment on the differences (if any) in the results.

Comments:

- The results of the model trained with the scaled data are better than the model trained with the unscaled data.

- The results of the normalized data are significantly worse than the results of the standardized data and the unscaled data.


In [None]:
# Using normalization
# Normalizer rescales each sample (row) to unit norm.
from sklearn.preprocessing import Normalizer

scaler = Normalizer()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Wrap the arrays back into DataFrames so the feature names are preserved.
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)

# Train a fresh estimator instead of refitting whatever `model` happens to be
# left over in the kernel; this keeps the cell independent of execution order
# and consistent with the standardization experiment.
model = LinearRegression()
model.fit(X_train_scaled, Y_train)

Y_pred = model.predict(X_test_scaled)

printScores(Y_test, Y_pred)

In [None]:
# Using Standardization
# The scaler statistics are learned on the training split only and then applied
# to both splits.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)

X_train_std = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
X_test_std = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

# Fresh linear model trained on the standardized features.
model = LinearRegression()
model.fit(X_train_std, Y_train)

Y_pred = model.predict(X_test_std)
printScores(Y_test, Y_pred)

# Logistic Regression


Please answer the following questions:

1. what is logistic regression?

Logistic regression is a classification method used to assign observations to a discrete set of classes. The method is similar to linear regression with the difference that, instead of outputting continuous values, logistic regression generates outputs over a discrete set of values (classes, categories). To determine the classes, logistic regression uses a function (sigmoid, softmax) to map a probabilistic output to two or more discrete classes. [In class definition]

2. why do we use the logistic regression model?

Logistic regression is a simple model that is easy to understand and interpret. It is a good starting point for classification tasks. It is also a good model to use when the relationship between the input and output variables is approximately linear. It allows us to predict the probability of an observation belonging to a particular class.

Please assume: if the sale price is less than 200,000 - label 0, otherwise - label 1.


In [None]:
# Data preparation (assign categories, split ...)

# Binary target: 0 if the sale price is below 200,000, otherwise 1.
# The labels are derived from the untouched copy of the raw data rather than
# from Y itself, so re-running this cell does not threshold already-binarized
# labels again (which would turn every label into 0).
Y = (~(originalData["SalePrice"] < 200000)).astype(int)

# Same split parameters as for the regression task, now with the binary target.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [None]:
# Fit LogisticRegression() and display precision, recall and the
# classification report on the held-out test split.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, precision_score, recall_score

# NOTE: the features are not scaled here, and the solver's default limit of 100
# iterations is frequently too low to converge in that situation; a higher
# max_iter gives the optimizer room to finish instead of stopping early.
model = LogisticRegression(max_iter=1000)

model.fit(X_train, Y_train)

Y_pred = model.predict(X_test)

print("Precision: ", precision_score(Y_test, Y_pred))
print("Recall: ", recall_score(Y_test, Y_pred))
print(classification_report(Y_test, Y_pred))