In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats

In [7]:
# Load the CSV below into `df`; the relative path is resolved against the
# kernel's current working directory.
df = pd.read_csv('../data/latest/preprocessed_data_latest.csv')

In [11]:
# Frequency of each distinct value in `rental_period`.
# NOTE(review): the recorded output lists price-like strings (e.g. "34.900 kr.")
# and 1132 distinct values for this column — possibly misaligned fields in the
# upstream parsing; verify before using this column.
df['rental_period'].value_counts()

rental_period
24+ måneder      33
Ubegrænset       25
34.900 kr.       13
12-23 måneder    11
7.225 kr.         9
                 ..
54.180 kr.        1
55.180 kr.        1
87.279 kr.        1
85.350 kr.        1
27.400 kr.        1
Name: count, Length: 1132, dtype: int64

In [None]:
# Ensure the folder that the plotting cells save into exists
# (exist_ok=True: no error when it is already there).
plots_dir = Path("./plots")
plots_dir.mkdir(exist_ok=True)

In [None]:
# Print a concise structural summary of `df`.
df.info()

In [None]:
# Frequency of each distinct value in `energy_mark`.
df['energy_mark'].value_counts()

In [None]:
# Names of the columns whose dtype is object or bool.
df.select_dtypes(include=['object', 'bool']).columns

### First Attempt of Summary Statistics

In [None]:
# Show the rows whose `monthly_rent` lies more than `threshold` standard
# deviations away from the column mean.
#
# nan_policy='omit' matters: with the default ('propagate') a single missing
# value turns every z-score into NaN, and the filter below then selects no rows.
# The column is passed as a plain float array so the result is an ndarray that
# lines up positionally with the rows of `df`.
z_scores = np.abs(
    stats.zscore(
        df['monthly_rent'].to_numpy(dtype=float, na_value=np.nan),
        nan_policy='omit',
    )
)

# Set a threshold (commonly 3)
threshold = 3

# Identify outliers (a NaN z-score compares False, so rows with a missing
# value are simply not reported)
df[z_scores > threshold]

In [None]:
# Show the rows whose `monthly_aconto` lies more than `threshold` standard
# deviations away from the column mean.
#
# nan_policy='omit' matters: with the default ('propagate') a single missing
# value turns every z-score into NaN, and the filter below then selects no rows.
# The column is passed as a plain float array so the result is an ndarray that
# lines up positionally with the rows of `df`.
z_scores = np.abs(
    stats.zscore(
        df['monthly_aconto'].to_numpy(dtype=float, na_value=np.nan),
        nan_policy='omit',
    )
)

# Set a threshold (commonly 3)
threshold = 3

# Identify outliers (a NaN z-score compares False, so rows with a missing
# value are simply not reported)
df[z_scores > threshold]

In [None]:
# 1. Quick Summary Statistics
# Descriptive statistics of `df`, printed under a heading.
summary_statistics = df.describe()
print("Summary Statistics:")
print(summary_statistics)

In [None]:
# Column-name lists reused by later cells:
#   continuous_vars       - every numeric column
#   continuous_ratio_vars - only the float64 columns
continuous_vars = list(df.select_dtypes(include='number').columns)
continuous_ratio_vars = list(df.select_dtypes(include='float64').columns)

In [None]:
# 2. Correlation Matrix for the numerical variables
# Pairwise correlations between all numeric columns, printed under a heading.
correlation_matrix = df.loc[:, continuous_vars].corr()
print("\nCorrelation Matrix:", correlation_matrix, sep="\n")

In [None]:
# 3. Heatmap of Correlation Matrix
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix Heatmap')
# Adjust the margins BEFORE saving: savefig writes the figure as it is at the
# moment of the call, so adjusting afterwards only affects what is displayed
# and leaves the PNG on disk with the unadjusted layout.
plt.subplots_adjust(left=0.35, bottom=0.5)
plt.savefig('plots/cor_matrix_heatmap.png')
plt.show()

In [None]:
# Percentage of rows whose `monthly_aconto` is exactly 0.
zero_aconto_rows = int((df['monthly_aconto'] == 0).sum())
zero_aconto_rows / len(df) * 100

In [None]:
# Largest value in `monthly_aconto`.
df['monthly_aconto'].max()

In [None]:
# 4. Scatterplot Matrix with Histograms (only continuous ratio variables)
sns.pairplot(df[df.select_dtypes(include=['float64']).columns.tolist()])
# pairplot draws a whole grid of axes, so plt.title would label only the single
# panel that happens to be the current axes. suptitle titles the figure itself;
# y slightly above 1 keeps it clear of the top row of panels.
plt.suptitle('Scatterplot Matrix', y=1.02)
plt.show()

In [None]:
# distribution of total monthly rent
# Histogram with a KDE overlay, saved to the plots folder and then displayed.
fig, ax = plt.subplots(figsize=(8, 8))
sns.histplot(
    df['total_monthly_rent'],
    kde=True,
    color='lightcoral',
    edgecolor='mistyrose',
    ax=ax,
)

# add labels and title
ax.set_xlabel('Total Monthly Rent')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Total Monthly Rent')

fig.savefig("./plots/distribution_total_monthly_rent.png")

plt.show()

In [None]:
# One box plot per float64 column; each is saved as ./plots/<column>_boxplot.png
# and then displayed.
float_columns = df.select_dtypes(include=['float64']).columns.tolist()
for column in float_columns:
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.boxplot(x=df[column], ax=ax)
    ax.set_title(f'Box Plot of {column}')
    ax.set_xlabel(column)
    fig.savefig(f"./plots/{column}_boxplot.png")
    plt.show()

In [None]:
# Bar charts of `total_monthly_rent` grouped by each object-dtype column.
# NOTE(review): every object column is plotted, including ones with very many
# distinct values — consider restricting to low-cardinality columns.
# NOTE(review): newer seaborn releases deprecate `ci` in favour of
# `errorbar=None`; confirm the installed version before switching.
discrete_vars = list(df.select_dtypes(include=['object']).columns)
continuous_var = 'total_monthly_rent' # We choose only the dependent variable we later want to predict

# Creating bar charts
for category_col in discrete_vars:
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.barplot(data=df, x=category_col, y=continuous_var, ci=None, color='teal', ax=ax)
    ax.set_title(f'Bar Chart of Avg. {continuous_var} by {category_col}')
    ax.set_xlabel(category_col)
    ax.set_ylabel(continuous_var)
    plt.show()

In [None]:
# Preview the first rows of `df`.
df.head()

### Attempt with the Log-Transformed Variables

A technique we were already familiar with for obtaining more interpretable results is to transform the data with a function. In this case the logarithm works well: the data has a long tail and contains outliers, so applying a log transformation to the variables helps normalize their distributions and makes the histograms easier to interpret.

In [None]:
# Add a `<name>_log` column holding log(1 + x) for every float column except
# the two listed here, which are kept on their original scale.
untransformed = ('size_sqm', 'days_on_website')
for column in continuous_ratio_vars:
    if column in untransformed:
        continue
    df[f'{column}_log'] = np.log1p(df[column])

In [None]:
# Float columns used in the log-scale analysis: those with 'log' in their name
# plus the two columns that were kept untransformed.
untransformed_vars = ('size_sqm', 'days_on_website')
continuous_ratio_log_vars = [
    name
    for name in df.select_dtypes(include=['float64']).columns.tolist()
    if 'log' in name or name in untransformed_vars
]

In [None]:
# 1. Quick Summary Statistics
# Descriptive statistics of `df`, printed under a heading.
print("Summary Statistics:", df.describe(), sep="\n")

In [None]:
# 2. Correlation Matrix for the numerical variables
# Build a NEW list with `+` rather than calling list.extend(): extend() returns
# None (so the name on the left would be bound to None) and mutates
# `continuous_ratio_log_vars` in place, which would append 'floor' and 'rooms'
# again on every re-run of this cell and leak them into every later cell that
# iterates over that list.
correlation_matrix_vars = continuous_ratio_log_vars + ['floor', 'rooms']
correlation_matrix = df[correlation_matrix_vars].corr()
print("\nCorrelation Matrix:")
print(correlation_matrix)

In [None]:
# 3. Heatmap of Correlation Matrix
# Annotated heatmap of `correlation_matrix`; margins are widened before the
# figure is written to disk, then it is displayed.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Correlation Matrix Heatmap')
fig.subplots_adjust(left=0.2, bottom=0.2)
fig.savefig('plots/log_cor_matrix_heatmap.png')
plt.show()

In [None]:
# 4. Scatterplot Matrix with Histograms (only continuous ratio variables)
sns.pairplot(df[continuous_ratio_log_vars])
# pairplot draws a whole grid of axes, so plt.title would label only the single
# panel that happens to be the current axes. suptitle titles the figure itself;
# y slightly above 1 keeps it clear of the top row of panels.
plt.suptitle('Scatterplot Matrix', y=1.02)
plt.show()

In [None]:
# Logarithmic distribution of total monthly rent
# Histogram with a KDE overlay of the log column, saved to the plots folder and
# then displayed.
fig, ax = plt.subplots(figsize=(8, 8))
sns.histplot(
    df['total_monthly_rent_log'],
    kde=True,
    color='lightcoral',
    edgecolor='mistyrose',
    ax=ax,
)

# add labels and title
ax.set_xlabel('Total Monthly Rent (Log)')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of log-transformed Total Monthly Rent')

fig.savefig("./plots/log_distribution_total_monthly_rent.png")

plt.show()

In [None]:
# One box plot per variable in `continuous_ratio_log_vars`, saved and displayed.
for var in continuous_ratio_log_vars:
    # Only the columns created with the `_log` suffix hold transformed values;
    # anything else in the list is on its raw scale and must not be titled
    # "log-transformed".
    if var.endswith('_log'):
        plot_title = f'Box Plot of log-transformed {var}'
    else:
        plot_title = f'Box Plot of {var}'

    fig, ax = plt.subplots(figsize=(8, 5))
    sns.boxplot(x=df[var], color='teal', ax=ax)
    ax.set_title(plot_title)
    ax.set_xlabel(var)
    # File-name pattern deliberately left as-is so existing references to the
    # saved images keep resolving.
    fig.savefig("./plots/{}_log_boxplot.png".format(var))
    plt.show()
    # Release the figure so a long variable list does not accumulate open figures.
    plt.close(fig)

In [None]:
# 5. Bar charts of continuous variables by discrete variables
# NOTE(review): this cell sits in the log-transformed section but plots the raw
# `total_monthly_rent`, and unlike the earlier bar-chart cell it keeps seaborn's
# default error bars — confirm both are intended.
discrete_vars = list(df.select_dtypes(include=['object']).columns)
continuous_var = 'total_monthly_rent' # We choose only the dependent variable we later want to predict

# Creating bar charts
for category_col in discrete_vars:
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.barplot(data=df, x=category_col, y=continuous_var, color='teal', ax=ax)
    ax.set_title(f'Bar Chart of Avg. {continuous_var} by {category_col}')
    ax.set_xlabel(category_col)
    ax.set_ylabel(continuous_var)
    plt.show()

In [None]:
# New frame without the raw float columns, except `size_sqm` and
# `days_on_website`, which are kept.
raw_columns_to_drop = [
    name
    for name in continuous_ratio_vars
    if name not in ('size_sqm', 'days_on_website')
]
df1 = df.drop(columns=raw_columns_to_drop)

In [None]:
# Write `df1` to a UTF-8 CSV in the current working directory, with a header
# row and without the index column.
output_csv = 'preprocessed_log_data.csv'
df1.to_csv(output_csv, encoding='utf-8', header=True, index=False)