In [12]:
### Dependencies ----------------------------------------------------------------
import pandas as pd # used for working with data sets
import numpy as np # used for working with arrays
import matplotlib.pyplot as plt # used for plotting
import seaborn as sns # used for plotting, see examples at https://seaborn.pydata.org/examples/index.html
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
import re

###COMBINE ALL .CSVS INTO ONE
file_paths = [r"..\data\listings_detail_6-23.csv" , r"..\data\listings_detail_9-23.csv" , r"..\data\listings_detail_12-23.csv" ]

# List to store DataFrames for each CSV
dfs = []

# Read each CSV file into a DataFrame and store in the list
for file_path in file_paths:
    df = pd.read_csv(file_path)
    dfs.append(df)

# Concatenate all DataFrames in the list along the rows
combined_df = pd.concat(dfs, ignore_index=True)

# Write the combined DataFrame to a new CSV file
combined_df.to_csv(r"..\data\combined.csv" , index=False)

path_to_data = r"..\data\combined.csv" 

listings = pd.read_csv(path_to_data)

listings = pd.read_csv(path_to_data)

##############################################DATA PREPARATION#############################################

#TODO WITH DATA:

# Handling values
numeric_columns = listings.select_dtypes(include=['int64', 'float64']).columns
categorical_columns = listings.select_dtypes(include=['object']).columns

# Impute missing values for numerical columns with mean
listings[numeric_columns] = listings[numeric_columns].fillna(listings[numeric_columns].mean())

# Feature selection and dropping obsolete columns
exclude_columns = ['id', 'listing_url', 'scrape_id', 'last_scraped', 'source', 'name', 'description', 'neighborhood_overview', 
                   'neighbourhood', 'picture_url', 'host_url', 'host_acceptance_rate',
                   'host_thumbnail_url', 'host_picture_url', 'neighbourhood_group_cleansed', 
                   'latitude', 'longtitude', 'calendar_updated'
                   'has_availability', 'reviews_per_month', 'amenities', 'license']
columns_to_drop = [col for col in exclude_columns if col in listings.columns]
listings = listings.drop(columns=columns_to_drop)

###### Data transformation --->:

# Convert 'price' column to numeric
listings['price'] = listings['price'].replace('[\$,]', '', regex=True).astype(float)

# Convert 'host_response_rate' column to numeric
listings['host_response_rate'] = listings['host_response_rate'].replace('[\%,]', '', regex=True).astype(float)

# Bathroom needs to be ajdusted
# Fill empty values in the 'bathrooms_text' column with '1'
listings['bathrooms_text'] = listings['bathrooms_text'].fillna('1')
# Replace "Half-bath" with "0.5" in the 'bathrooms_text' column
listings['bathrooms_text'] = listings['bathrooms_text'].replace("Half-bath", "0.5")
listings['bathrooms_text'] = listings['bathrooms_text'].replace("Private half-bath", "0.5")
listings['bathrooms_text'] = listings['bathrooms_text'].replace("Shared half-bath", "0.5 shared")
# Function to extract the number from the 'bathrooms_text' column
listings['bathrooms'] = listings['bathrooms_text'].apply(lambda x: float(x.split()[0]) if pd.notnull(x) else None)
# Create the 'bathrooms_shared' column
listings['bathrooms_shared'] = listings['bathrooms_text'].str.contains('shared').astype(int)
# create new column of length of host about
listings['host_about'].fillna('n', inplace=True)


listings['host_about_len'] = listings['host_about'].str.len().astype(int)


listings.head()


  listings['price'] = listings['price'].replace('[\$,]', '', regex=True).astype(float)
  listings['host_response_rate'] = listings['host_response_rate'].replace('[\%,]', '', regex=True).astype(float)
  listings = pd.read_csv(path_to_data)
  listings = pd.read_csv(path_to_data)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  listings['host_about'].fillna('n', inplace=True)


Unnamed: 0,host_id,host_name,host_since,host_location,host_about,host_response_time,host_response_rate,host_is_superhost,host_neighbourhood,host_listings_count,...,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,bathrooms_shared,host_about_len
0,7287777,Simon,2013-07-04,"Prague, Czechia",Me and my son Simon are ready to show you the ...,within a day,100.0,,Prague 16,2,...,4.95,4.54,4.53,f,2,1,1,0,1,198
1,7469380,Jarda,2013-07-14,"Prague, Czechia","My name is Jarda, I work in Prague :-) and I l...",within an hour,100.0,t,Nové Město,2,...,4.89,4.8,4.73,t,1,1,0,0,0,344
2,3279605,Elena And Kira,2012-08-16,"Prague, Czechia",We work for The Royal Court Apartments as Rese...,within a few hours,100.0,t,Prague 2,17,...,4.78,4.35,4.74,f,16,15,1,0,0,466
3,2635161,Pavel,2012-06-14,"Prague, Czechia",We started to handle this business in 1999. F...,within an hour,100.0,f,Josefov,4,...,4.72,4.96,4.67,f,4,4,0,0,0,163
4,2635161,Pavel,2012-06-14,"Prague, Czechia",We started to handle this business in 1999. F...,within an hour,100.0,f,Josefov,4,...,4.92,5.0,4.57,f,4,4,0,0,0,163


In [16]:

selected_columns = ['price','host_response_rate', 'bathrooms_shared', 'host_about_len','review_scores_value','review_scores_location']  # Zde nahraďte názvy sloupců, které chcete použít

# Vyberte pouze vybrané sloupce z DataFrame
selected_data = listings[selected_columns]

# Výpočet Pearsonovy korelační matice
correlation_matrix = selected_data.corr(method='pearson')


# Vytisknutí korelační matice
print(correlation_matrix)

                           price  host_response_rate  bathrooms_shared  \
price                   1.000000           -0.028249         -0.026723   
host_response_rate     -0.028249            1.000000         -0.036766   
bathrooms_shared       -0.026723           -0.036766          1.000000   
host_about_len         -0.004711            0.103983         -0.030453   
review_scores_value    -0.004635            0.122035         -0.098439   
review_scores_location  0.004533            0.111594         -0.078957   

                        host_about_len  review_scores_value  \
price                        -0.004711            -0.004635   
host_response_rate            0.103983             0.122035   
bathrooms_shared             -0.030453            -0.098439   
host_about_len                1.000000             0.048021   
review_scores_value           0.048021             1.000000   
review_scores_location        0.070837             0.556229   

                        review_scores_l

In [None]:

###### Data encoding --->:
# Encoding categorical variables
label_encoders = {}
for column in categorical_columns:
    if column in listings.columns:
      label_encoders[column] = LabelEncoder()
      listings[column] = label_encoders[column].fit_transform(listings[column])

##############################################DATA MODELING ############################################

# Splitting the data
X = listings.drop(columns=['price'])
y = listings['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Output the preprocessed data
#X_train.to_csv('X_train_regression.csv', index=False)
#y_train.to_csv('y_train_regression.csv', index=False)
#X_test.to_csv('X_test_regression.csv', index=False)
#y_test.to_csv('y_test_regression.csv', index=False)

##### TESTING MODELLING IGNORE #######

from sklearn.impute import SimpleImputer

# Handling missing values
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Train the regression model
regression_model = LinearRegression()
regression_model.fit(X_train_imputed, y_train)

# Predict on the test data
y_pred = regression_model.predict(X_test_imputed)

# Plotting the actual vs predicted values
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, color='blue', label='Actual vs Predicted')

# Plotting the ideal linear relationship
plt.plot(np.unique(y_test), np.poly1d(np.polyfit(y_test, y_test, 1))(np.unique(y_test)),
         color='red', linestyle='--', label='Ideal Linear Relationship')

plt.xlabel('Actual Price')
plt.ylabel('Predicted Price')
plt.title('Actual vs Predicted Price')
plt.legend()
plt.grid(True)
plt.show()
