In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [6]:
df=pd.read_csv(r"../Lab1/Housing.csv")
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [8]:
df.shape

(545, 13)

In [9]:
print(df.columns.tolist())

['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'parking', 'prefarea', 'furnishingstatus']


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB


In [11]:
df.isna().sum()

price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64

In [12]:
def encode_categorical(df):
    df = df.copy()
    binary_mappings = {'yes': 1, 'no': 0}
    df['mainroad'] = df['mainroad'].map(binary_mappings)
    df['guestroom'] = df['guestroom'].map(binary_mappings)
    df['basement'] = df['basement'].map(binary_mappings)
    df['hotwaterheating'] = df['hotwaterheating'].map(binary_mappings)
    df['airconditioning'] = df['airconditioning'].map(binary_mappings)
    df['prefarea'] = df['prefarea'].map(binary_mappings)
    furnishing_map = {'unfurnished': 0, 'semi-furnished': 1, 'furnished': 2}
    df['furnishingstatus'] = df['furnishingstatus'].map(furnishing_map)
    return df


In [13]:
df_encoded = encode_categorical(df)

In [14]:
def handle_missing_values(df, method='zero'):
    df = df.copy()
    for col in df.columns:
        if df[col].dtype in [np.float64, np.int64]:
            if method == 'zero':
                df[col].fillna(0, inplace=True)
            elif method == 'mean':
                mean_val = df[col].mean(skipna=True)
                df[col].fillna(mean_val, inplace=True)
            elif method == 'median':
                median_val = df[col].median(skipna=True)
                df[col].fillna(median_val, inplace=True)
    return df

In [15]:
def normalize_minmax(df):
    df = df.copy()
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        min_val = df[col].min()
        max_val = df[col].max()
        if max_val != min_val:
            df[col] = (df[col] - min_val) / (max_val - min_val)
        else:
            df[col] = 0  # All values are the same
    return df

In [16]:
def normalize_standard(df):
    df = df.copy()
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        mean_val = df[col].mean()
        std_val = df[col].std()
        if std_val != 0:
            df[col] = (df[col] - mean_val) / std_val
        else:
            df[col] = 0  # All values the same
    return df


In [17]:
def detect_outliers(df):
    outlier_indices = set()
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        outlier_rows = df[(df[col] < lower_bound) | (df[col] > upper_bound)].index
        outlier_indices.update(outlier_rows)
    return list(outlier_indices)

In [18]:
class SimpleLinearRegression:
    def __init__(self):
        self.coefficients = None  # Array for coefficients
        self.intercept = None

    def fit(self, X, y):
        # Add bias term (column of 1s)
        X_b = np.c_[np.ones((X.shape[0], 1)), X]
        # Calculate coefficients using Normal Equation: theta = (X.T X)^(-1) X.T y
        theta = np.linalg.pinv(X_b.T @ X_b) @ X_b.T @ y
        self.intercept = theta[0]
        self.coefficients = theta[1:]

    def predict(self, X):
        return X @ self.coefficients + self.intercept


In [19]:
def r2_score_custom(y_true, y_pred):
    ss_res = np.sum((y_true - y_pred) ** 2)
    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
    return 1 - ss_res / ss_tot if ss_tot != 0 else 0


In [20]:
def run_experiment(df, missing_method, normalization_method):
    df_preprocessed = handle_missing_values(df, missing_method)
    if normalization_method == 'minmax':
        df_preprocessed = normalize_minmax(df_preprocessed)
    elif normalization_method == 'standard':
        df_preprocessed = normalize_standard(df_preprocessed)
    # Prepare arrays for regression
    X = df_preprocessed.drop('price', axis=1).values
    y = df_preprocessed['price'].values
    model = SimpleLinearRegression()
    model.fit(X, y)
    y_pred = model.predict(X)
    score = r2_score_custom(y, y_pred)
    return score, df_preprocessed

In [21]:
results = {}
for missing_method in ['zero', 'mean', 'median']:
    for norm_method in ['minmax', 'standard']:
        score, processed_df = run_experiment(df_encoded, missing_method, norm_method)
        results[(missing_method, norm_method)] = {'score_before_outlier_removal': score}
        # Detect and remove outliers
        outliers = detect_outliers(processed_df)
        df_no_outliers = processed_df.drop(outliers)
        if len(df_no_outliers) > 0:
            X_no_out = df_no_outliers.drop('price', axis=1).values
            y_no_out = df_no_outliers['price'].values
            model_no_out = SimpleLinearRegression()
            model_no_out.fit(X_no_out, y_no_out)
            y_pred_no_out = model_no_out.predict(X_no_out)
            score_no_out = r2_score_custom(y_no_out, y_pred_no_out)
        else:
            score_no_out = None  # No data left after outlier removal
        results[(missing_method, norm_method)]['score_after_outlier_removal'] = score_no_out
        results[(missing_method, norm_method)]['outliers_detected'] = outliers


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

Fo

In [22]:
# Display results
print("Results: ")
for key, value in results.items():
    print(f"Missing: {key[0]}, Normalization: {key[1]} => R² before outlier removal: {value['score_before_outlier_removal']:.4f}, "
          f"R² after outlier removal: {value['score_after_outlier_removal']}, Outliers detected: {value['outliers_detected']}")

Results: 
Missing: zero, Normalization: minmax => R² before outlier removal: 0.6801, R² after outlier removal: 0.48644344837157805, Outliers detected: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 19, 20, 21, 22, 23, 25, 26, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 46, 47, 49, 50, 51, 52, 53, 54, 56, 57, 58, 59, 60, 63, 64, 66, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 82, 83, 84, 86, 88, 89, 91, 92, 93, 94, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 109, 110, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 124, 125, 126, 128, 129, 130, 131, 134, 135, 139, 140, 141, 143, 144, 145, 148, 149, 150, 151, 152, 155, 156, 157, 159, 160, 161, 162, 163, 164, 165, 166, 169, 171, 172, 173, 174, 175, 177, 178, 180, 181, 182, 183, 184, 186, 187, 188, 189, 191, 192, 193, 194, 196, 197, 201, 204, 206, 210, 211, 216, 217, 218, 220, 224, 225, 226, 229, 230, 231, 233, 235, 236, 240, 244, 245, 246, 247, 248, 249, 253, 262, 263, 265, 26