In [138]:
import pandas as pd
import numpy as  np

In [139]:
#load the dataset
houses = pd.read_csv('Data/listings.csv')
houses.columns


Index(['id', 'name', 'host_id', 'host_name', 'neighbourhood_group',
       'neighbourhood', 'latitude', 'longitude', 'room_type', 'price',
       'minimum_nights', 'number_of_reviews', 'last_review',
       'reviews_per_month', 'calculated_host_listings_count',
       'availability_365', 'number_of_reviews_ltm', 'license'],
      dtype='object')

In [140]:
houses.head(3)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm,license
0,3191,Malleson Garden Cottage,3754,Brigitte,,Ward 57,-33.94762,18.47599,Entire home/apt,783.0,3,79,2024-06-18,0.56,1,11,4,
1,15007,Blaauwberg House on the beach in Bloubergstrand,59072,Dirk,,Ward 23,-33.80001,18.46063,Entire home/apt,6550.0,2,47,2024-10-19,0.35,3,46,2,
2,15068,Grande Bay,59318,Linda,,Ward 23,-33.78826,18.4594,Entire home/apt,3000.0,4,0,,,6,356,0,


In [141]:
#Describe the dataset
houses.describe()

Unnamed: 0,id,host_id,neighbourhood_group,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm
count,25816.0,25816.0,0.0,25816.0,25816.0,21510.0,25816.0,25816.0,19458.0,25816.0,25816.0,25816.0
mean,5.871331e+17,204091800.0,,-33.95559,18.472027,4404.111018,4.396266,23.116478,0.932417,12.84467,200.388325,6.119654
std,5.316011e+17,200048400.0,,0.098417,0.12409,9540.179791,20.776909,46.209194,1.091803,28.473072,129.64558,10.544251
min,3191.0,3754.0,,-34.26284,18.318968,144.0,1.0,0.0,0.01,1.0,0.0,0.0
25%,32581900.0,34914030.0,,-34.01493,18.400966,1200.0,1.0,1.0,0.18,1.0,75.0,0.0
50%,7.337961e+17,116051400.0,,-33.927548,18.425413,2129.0,2.0,5.0,0.57,2.0,238.0,2.0
75%,1.086186e+18,370727200.0,,-33.910028,18.482063,4071.0,3.0,24.0,1.26,8.0,320.0,8.0
max,1.320402e+18,668819000.0,,-33.57218,18.93603,273736.0,999.0,798.0,22.65,175.0,365.0,154.0


In [142]:
#info about the dataset
houses.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25816 entries, 0 to 25815
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              25816 non-null  int64  
 1   name                            25815 non-null  object 
 2   host_id                         25816 non-null  int64  
 3   host_name                       25745 non-null  object 
 4   neighbourhood_group             0 non-null      float64
 5   neighbourhood                   25816 non-null  object 
 6   latitude                        25816 non-null  float64
 7   longitude                       25816 non-null  float64
 8   room_type                       25816 non-null  object 
 9   price                           21510 non-null  float64
 10  minimum_nights                  25816 non-null  int64  
 11  number_of_reviews               25816 non-null  int64  
 12  last_review                     

From the output displayed above, a few observations can be made :

1. There are 18 columns in this dataset
2. They include both integers and objects(numbers and strings)
3. Some columns have null values - further exploarion will be done to determine the best approach to handle the null values


#### Check for missing values

In [143]:
#check for missing values in percentages
houses.isnull().sum()/len(houses)*100

id                                  0.000000
name                                0.003874
host_id                             0.000000
host_name                           0.275023
neighbourhood_group               100.000000
neighbourhood                       0.000000
latitude                            0.000000
longitude                           0.000000
room_type                           0.000000
price                              16.679579
minimum_nights                      0.000000
number_of_reviews                   0.000000
last_review                        24.628138
reviews_per_month                  24.628138
calculated_host_listings_count      0.000000
availability_365                    0.000000
number_of_reviews_ltm               0.000000
license                            99.674620
dtype: float64

The output above shows the percentages of the columns with missing values. 
1. All the rows in the neighbourhood_group column have null values
2. The Name column has some few missing values in the column ~ 0.03
3. Host name column has 27% of the rows with missing values
4. Price has 16% of its rows with missing values
5. Last review and reviews_per_month  columns have ~25% null values in their rows
6. The last column has nearly 99% of it rows with missing values

For columns that have missing values with more than 50% of their values missing will be dropped.
The columns whose missing values are below @5% will be imputed with either the mean, median or mode
and those with less thab 10%, their rows will be dropped as this will not affect the dataset 




In [144]:
#drop column neighbourhood_group
houses.drop(columns="neighbourhood_group", inplace=True)


In [145]:
houses.columns

Index(['id', 'name', 'host_id', 'host_name', 'neighbourhood', 'latitude',
       'longitude', 'room_type', 'price', 'minimum_nights',
       'number_of_reviews', 'last_review', 'reviews_per_month',
       'calculated_host_listings_count', 'availability_365',
       'number_of_reviews_ltm', 'license'],
      dtype='object')

In [146]:
#drop the license column 
houses.drop(columns="license", inplace=True)

In [147]:
#check for the columns with missing values but less than 10%
houses.columns[houses.isnull().sum() > 0 ]


Index(['name', 'host_name', 'price', 'last_review', 'reviews_per_month'], dtype='object')

In [None]:
#Drop the rows in the name column  and host_name with missing values
houses["name"].dropna(inplace=True)

In [149]:
#check the name column for missing values
houses["name"].isnull().sum()

1

In [150]:
#check for missing values in percentages
houses.isnull().sum()/len(houses)*100

id                                 0.000000
name                               0.003874
host_id                            0.000000
host_name                          0.275023
neighbourhood                      0.000000
latitude                           0.000000
longitude                          0.000000
room_type                          0.000000
price                             16.679579
minimum_nights                     0.000000
number_of_reviews                  0.000000
last_review                       24.628138
reviews_per_month                 24.628138
calculated_host_listings_count     0.000000
availability_365                   0.000000
number_of_reviews_ltm              0.000000
dtype: float64