In [49]:
import numpy as np     
import pandas as pd      
import matplotlib.pyplot as plt   
import seaborn as sns           

# Statistical analysis
import scipy.stats as stats

# Jupyter Notebook magic commands for inline plotting
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [264]:
df = pd.read_csv('data/raw/listings.csv')

In [265]:
df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm,license
0,3191,Home in Southern Suburbs · ★4.81 · 1 bedroom ·...,3754,Brigitte,,Ward 57,-33.94762,18.47599,Entire home/apt,650.0,3,75,2023-12-23,0.58,1,306,6,
1,15007,Home in Cape Town · ★4.82 · 3 bedrooms · 4 bed...,59072,Dirk,,Ward 23,-33.80001,18.46063,Entire home/apt,4029.0,2,45,2023-12-03,0.37,3,294,6,
2,15068,Rental unit in Cape Town · 3 bedrooms · 5 beds...,59318,Linda,,Ward 23,-33.78826,18.4594,Entire home/apt,2000.0,4,0,,,4,354,0,
3,15077,Rental unit in Tableview - Sunset Beach · ★5.0...,59342,Georg,,Ward 4,-33.858356,18.490376,Private room,2377.0,2,7,2022-06-16,0.05,6,83,0,
4,15199,Rental unit in Cape Town · 1 bedroom · 1 bed ·...,59694,Alexa,,Ward 115,-33.91115,18.41235,Entire home/apt,2500.0,14,2,2016-04-15,0.02,1,365,0,


In [266]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22966 entries, 0 to 22965
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              22966 non-null  int64  
 1   name                            22966 non-null  object 
 2   host_id                         22966 non-null  int64  
 3   host_name                       22962 non-null  object 
 4   neighbourhood_group             0 non-null      float64
 5   neighbourhood                   22966 non-null  object 
 6   latitude                        22966 non-null  float64
 7   longitude                       22966 non-null  float64
 8   room_type                       22966 non-null  object 
 9   price                           21574 non-null  float64
 10  minimum_nights                  22966 non-null  int64  
 11  number_of_reviews               22966 non-null  int64  
 12  last_review                     

In [267]:
df.columns

Index(['id', 'name', 'host_id', 'host_name', 'neighbourhood_group',
       'neighbourhood', 'latitude', 'longitude', 'room_type', 'price',
       'minimum_nights', 'number_of_reviews', 'last_review',
       'reviews_per_month', 'calculated_host_listings_count',
       'availability_365', 'number_of_reviews_ltm', 'license'],
      dtype='object')

## Work on Property Type data:

In [268]:
# create property info dataframe

df_prop_inf = pd.DataFrame()

In [269]:
# split name column into five columns

df_prop_inf[['property type', 'rating', 'bedrooms', 'beds', 'baths']] = df['name'].str.split(' · ', expand=True)
df_prop_inf.head()

Unnamed: 0,property type,rating,bedrooms,beds,baths
0,Home in Southern Suburbs,★4.81,1 bedroom,1 bed,1 bath
1,Home in Cape Town,★4.82,3 bedrooms,4 beds,3 baths
2,Rental unit in Cape Town,3 bedrooms,5 beds,2 baths,
3,Rental unit in Tableview - Sunset Beach,★5.0,1 bedroom,2 beds,1.5 baths
4,Rental unit in Cape Town,1 bedroom,1 bed,1 bath,


In [270]:
# check if property type column only contains property type data 

percentage_with_in = (df_prop_inf['property type'].str.contains('in').sum() / len(df_prop_inf)) * 100

print("Percentage of rows with 'in' in the 'property type' column:", percentage_with_in)

Percentage of rows with 'in' in the 'property type' column: 100.0


In [271]:
# isolate property type column for further cleaning by splitting the property type and area

df_prop_inf_clean = df_prop_inf['property type'].str.split(' in ', expand=True)
df_prop_inf_clean.head()

Unnamed: 0,0,1,2
0,Home,Southern Suburbs,
1,Home,Cape Town,
2,Rental unit,Cape Town,
3,Rental unit,Tableview - Sunset Beach,
4,Rental unit,Cape Town,


In [272]:
# check which row does not have null value because the 3rd column should not exist

df_prop_inf_clean[df_prop_inf_clean.notna().all(axis=1)]

Unnamed: 0,0,1,2
1920,Rental unit,Simon's Town,Cape Town


In [273]:
# change all 3rd column values to None

df_prop_inf_clean.loc[df_prop_inf_clean[2].notna(), 2] = None
df_prop_inf_clean[2].value_counts()

Series([], Name: count, dtype: int64)

In [274]:
# drop third column

df_prop_inf_clean = df_prop_inf_clean.drop(columns=[2])

In [275]:
# change 2nd column values to 'Cape Town'

df_prop_inf_clean[1] = 'Cape Town'

In [276]:
# change column names

df_prop_inf_clean.columns = ['property_type', 'city']
df_prop_inf_clean.head()

Unnamed: 0,property_type,city
0,Home,Cape Town
1,Home,Cape Town
2,Rental unit,Cape Town
3,Rental unit,Cape Town
4,Rental unit,Cape Town


In [277]:
# remove the first column from df

df_prop_inf = df_prop_inf.iloc[:, 1:]

In [278]:
# add clean property type and city columns to property info df

df_prop_inf['property_type'] = df_prop_inf_clean['property_type']
df_prop_inf['city'] = df_prop_inf_clean['city']

df_prop_inf.head()

Unnamed: 0,rating,bedrooms,beds,baths,property_type,city
0,★4.81,1 bedroom,1 bed,1 bath,Home,Cape Town
1,★4.82,3 bedrooms,4 beds,3 baths,Home,Cape Town
2,3 bedrooms,5 beds,2 baths,,Rental unit,Cape Town
3,★5.0,1 bedroom,2 beds,1.5 baths,Rental unit,Cape Town
4,1 bedroom,1 bed,1 bath,,Rental unit,Cape Town


In [279]:
df[['property_type', 'city']] = df_prop_inf[['property_type', 'city']]

In [280]:
df.columns

Index(['id', 'name', 'host_id', 'host_name', 'neighbourhood_group',
       'neighbourhood', 'latitude', 'longitude', 'room_type', 'price',
       'minimum_nights', 'number_of_reviews', 'last_review',
       'reviews_per_month', 'calculated_host_listings_count',
       'availability_365', 'number_of_reviews_ltm', 'license', 'property_type',
       'city'],
      dtype='object')

## Work on ratings column

In [281]:
# create ratings dataframe

df_ratings = pd.DataFrame()

In [282]:
df_ratings.head()

In [283]:
# populate with ratings column from original dataframe

df_ratings = df_prop_inf['rating'].copy()
df_ratings.head()

0         ★4.81
1         ★4.82
2    3 bedrooms
3          ★5.0
4     1 bedroom
Name: rating, dtype: object

In [285]:
#if the value does not contain a star, change it to No Rating

df_ratings[df_ratings.str.contains('★') == False ] = 'No rating'

In [287]:
# check changes 

df_ratings.value_counts()

rating
No rating    7288
★5.0         2939
★New         2051
★4.67         474
★4.80         390
             ... 
★3.17           1
★2.60           1
★4.02           1
★3.93           1
★2.67           1
Name: count, Length: 154, dtype: int64

In [288]:
# if value contains the string New, change it to No Rating

df_ratings[df_ratings.str.contains('New') == True ] = 'No rating'

In [289]:
# check changes

df_ratings.value_counts()

rating
No rating    9339
★5.0         2939
★4.67         474
★4.80         390
★4.88         388
             ... 
★3.17           1
★2.60           1
★4.02           1
★3.93           1
★2.67           1
Name: count, Length: 153, dtype: int64

In [290]:
# remove the star

df_ratings.loc[df_ratings.str.contains('.')] = df_ratings.str.replace('★', '')
df_ratings.head()

0         4.81
1         4.82
2    No rating
3          5.0
4    No rating
Name: rating, dtype: object

In [291]:
#repplace No rating with NaN

df_ratings.replace('No rating', np.nan, inplace=True)
df_ratings.head()

0    4.81
1    4.82
2     NaN
3     5.0
4     NaN
Name: rating, dtype: object

In [292]:
df_ratings = df_ratings.astype(float)
df_ratings.info()

<class 'pandas.core.series.Series'>
RangeIndex: 22966 entries, 0 to 22965
Series name: rating
Non-Null Count  Dtype  
--------------  -----  
13627 non-null  float64
dtypes: float64(1)
memory usage: 179.6 KB


In [293]:
df['rating'] = df_ratings

In [294]:
df['rating'].info

<bound method Series.info of 0        4.81
1        4.82
2         NaN
3        5.00
4         NaN
         ... 
22961     NaN
22962     NaN
22963     NaN
22964     NaN
22965     NaN
Name: rating, Length: 22966, dtype: float64>

## Working on bedroom column

In [296]:
df_bedrooms = pd.DataFrame()

In [298]:
df_bedrooms = df_prop_inf[['rating', 'bedrooms']].copy()
df_bedrooms.head()

Unnamed: 0,rating,bedrooms
0,★4.81,1 bedroom
1,★4.82,3 bedrooms
2,3 bedrooms,5 beds
3,★5.0,1 bedroom
4,1 bedroom,1 bed


In [301]:
def assign_bedrooms(row):
    if 'bedroom' in row['rating']:
        return row['rating']
    else:
        return row['bedrooms']

# Apply the custom function to each row of the DataFrame
df_bedrooms['bedrooms'] = df_bedrooms.apply(assign_bedrooms, axis=1)

In [304]:
df_bedrooms['bedrooms']

0         1 bedroom
1        3 bedrooms
2        3 bedrooms
3         1 bedroom
4         1 bedroom
            ...    
22961    2 bedrooms
22962     1 bedroom
22963     1 bedroom
22964     1 bedroom
22965     1 bedroom
Name: bedrooms, Length: 22966, dtype: object

In [305]:
df['bedrooms'] = df_bedrooms['bedrooms']

In [306]:
df.columns

Index(['id', 'name', 'host_id', 'host_name', 'neighbourhood_group',
       'neighbourhood', 'latitude', 'longitude', 'room_type', 'price',
       'minimum_nights', 'number_of_reviews', 'last_review',
       'reviews_per_month', 'calculated_host_listings_count',
       'availability_365', 'number_of_reviews_ltm', 'license', 'property_type',
       'city', 'rating', 'bedrooms'],
      dtype='object')

## Working on beds column

In [307]:
df_beds = pd.DataFrame()

In [308]:
df_bed = df_prop_inf[['bedrooms', 'beds']].copy()
df_bed.head()

Unnamed: 0,bedrooms,beds
0,1 bedroom,1 bed
1,3 bedrooms,4 beds
2,5 beds,2 baths
3,1 bedroom,2 beds
4,1 bed,1 bath
