In [1]:
## Load libraries
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import learning_curve
from sklearn.model_selection import validation_curve
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Location of the merged dataset on disk
csv_path = './data/merged_data.csv'

# Columns that are discarded right after loading
columns_to_drop = [
    'saleType',
    'guid',
    'latitude',
    'longitude',
    'estateCode',
    'city',
    'groupKey',
    'canGetVR',
    'bfEnr',
]

# Read the CSV and strip the unwanted columns in one chained step
merged_data = pd.read_csv(csv_path).drop(columns=columns_to_drop)

In [3]:
# Show the first record to check which columns remain after the drop above
merged_data.iloc[0]

estateId                                             2017881
address                                         Granhegnet 3
zipCode                                                 4850
price                                                 850000
soldDate                           2023-12-30 23:00:00+00:00
propertyType                                               4
sqmPrice                                            14912.28
rooms                                                      3
size                                                      57
buildYear                                               1976
change                                             -5.027933
municipalityCode                                         376
ouId                                              1094495391
ouAddress                    granhegnet-3-4850-stubbekoebing
Municipality                                    Guldborgsund
year                                                    2023
area_of_sports_facilitie

In [4]:
# Work on a copy of the merged frame restricted to rows where 'sqmPrice' is present
data = merged_data.dropna(subset=['sqmPrice'])


In [5]:
def split_data(X, y):
    """Clean, split, standardize and log-transform a regression dataset.

    Rows are kept only when every feature in ``X`` is present and the target
    ``y`` is present and strictly positive (the target is log-transformed, so
    non-positive values cannot be used). The surviving rows are split 80/20
    with a fixed seed, the features are standardized with statistics fitted
    on the training part only, and the natural log of the target is taken.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; must share its index with ``y``.
    y : pandas.Series
        Target values.

    Returns
    -------
    X_train_scaled, X_test_scaled
        Standardized feature matrices as returned by ``StandardScaler``.
    y_train_log, y_test_log
        Log-transformed targets, row-aligned with the matrices above.
    """
    # Filter on the target *before* splitting: dropping non-positive targets
    # afterwards (from y only) would leave X and y with different row counts.
    mask = ~(X.isna().any(axis=1) | y.isna()) & (y > 0)
    X = X[mask]
    y = y[mask]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Fit the scaler on the training rows only so no test information leaks in
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Safe: every remaining target is strictly positive
    y_train_log = np.log(y_train)
    y_test_log = np.log(y_test)

    return X_train_scaled, X_test_scaled, y_train_log, y_test_log

# 2. Lasso

## Fit lasso

In [6]:
# Feature matrix: everything except the listed columns (which include the target itself)
X = data.drop(
    columns=[
        'propertyType',
        'sqmPrice',
        'estateId',
        'address',
        'zipCode',
        'price',
        'soldDate',
        'ouId',
        'ouAddress',
        'change',
        'Municipality',
        'population',
        'danish_origin',
    ]
)

# Regression target
y = data['sqmPrice']


In [8]:
# split_data returns standardized feature matrices and log-transformed targets,
# so everything fitted or scored below works on the log scale of sqmPrice.
X_train, X_test, y_train, y_test = split_data(X, y)

In [9]:
# Fit a Lasso regression on the standardized features against the log target
lasso_model = Lasso(alpha=0.000115)
lasso_model.fit(X_train, y_train)

y_pred = lasso_model.predict(X_test)

# y_test is log-transformed by split_data, so this error is on the log scale
mse_log = mean_squared_error(y_test, y_pred)
# Built-in round() works whether the metric comes back as a NumPy scalar or a
# plain Python float; calling .round() on the value fails for the latter.
print(f"Mean Squared Error: {round(mse_log, 5)}")

# Pair each fitted coefficient with its feature name
lasso_coef3 = pd.Series(lasso_model.coef_, index=X.columns)
lasso_coef3

Mean Squared Error: 0.43879


rooms                       -0.041752
size                        -0.106268
buildYear                    0.112104
municipalityCode            -0.104420
year                         0.174301
area_of_sports_facilities   -0.013983
descendants                  0.113877
crimes                      -0.036286
full_time_unemployed         0.018321
gini_coefficient             0.008192
average_age                  0.137413
immigrants                  -0.061326
municipal_tax               -0.160232
primaryschool               -0.261512
kvu                         -0.297873
lvu                          0.059970
job                         -0.060886
dtype: float64

## Removing insignificant variables

In [10]:
# Coefficients of the most recent Lasso fit, indexed by feature name
lasso_coeffs = lasso_coef3

# Absolute coefficient size at or below which a feature is treated as negligible
threshold = 1e-3  # Adjust this threshold based on your needs

# Column positions of the features whose coefficient magnitude exceeds the threshold
significant_indices = np.flatnonzero((lasso_coeffs.abs() > threshold).to_numpy())

# Restrict the scaled train and test matrices to those columns
X_train_selected = X_train[:, significant_indices]
X_test_selected = X_test[:, significant_indices]

In [11]:
# Refit Lasso using only the columns kept by the coefficient threshold.
# NOTE(review): alpha here (0.00015) differs from the first fit (0.000115) — confirm this is intentional.
lasso_model = Lasso(alpha=0.00015)
lasso_model.fit(X_train_selected, y_train)

y_pred_log = lasso_model.predict(X_test_selected)

# Targets are log-transformed, so this error is on the log scale
mse_log = mean_squared_error(y_test, y_pred_log)
# Built-in round() works whether the metric comes back as a NumPy scalar or a
# plain Python float; calling .round() on the value fails for the latter.
print(f"Mean Squared Error: {round(mse_log, 5)}")

# Label the refitted coefficients with the names of the retained features
lasso_coef3 = pd.Series(lasso_model.coef_, index=X.columns[significant_indices])
lasso_coef3

Mean Squared Error: 0.43879


rooms                       -0.041718
size                        -0.106242
buildYear                    0.112065
municipalityCode            -0.104402
year                         0.173125
area_of_sports_facilities   -0.013531
descendants                  0.113133
crimes                      -0.036252
full_time_unemployed         0.018311
gini_coefficient             0.008021
average_age                  0.138304
immigrants                  -0.060191
municipal_tax               -0.159892
primaryschool               -0.259801
kvu                         -0.295047
lvu                          0.060160
job                         -0.060497
dtype: float64

## Validation

In [12]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline
from joblib import Parallel, delayed
def evaluate_model(lambda_, X, y):
    """Return the mean cross-validated MSE of a standardized Lasso model.

    Parameters
    ----------
    lambda_ : float
        Regularization strength, passed to ``Lasso`` as ``alpha``.
    X, y
        Features and target, forwarded unchanged to ``cross_val_score``.

    Returns
    -------
    The mean squared error averaged over 3 folds, as a positive number:
    the ``neg_mean_squared_error`` scores are negated before returning, so
    lower is better.
    """
    # Scaling is part of the pipeline that is handed to cross_val_score
    scaler_step = StandardScaler()
    lasso_step = Lasso(alpha=lambda_, random_state=80499, max_iter=5000, tol=1e-4)
    pipe_lasso = make_pipeline(scaler_step, lasso_step)

    cv_options = dict(cv=3, scoring='neg_mean_squared_error', n_jobs=-1)
    neg_mse_per_fold = cross_val_score(pipe_lasso, X, y, **cv_options)

    # Flip the sign so callers receive an ordinary (positive) MSE
    return -neg_mse_per_fold.mean()