## **AIRBNB Data Cleaning**

**Steps**
1. Importing all dependencies (libraries)
2. Loading Datasets
3. Initial Exploration
4. Data Cleaning

**1. Importing all Dependencies**

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

**2. Loading Datasets**

In [4]:
# Read the raw listings CSV. With encoding_errors='ignore', bytes that cannot
# be decoded are skipped instead of raising an error.
data = pd.read_csv(
    filepath_or_buffer='airbnb_data0.csv',
    encoding_errors='ignore',
)

**3. Initial Exploration**

In [8]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

(20770, 22)

In [5]:
# Preview the first five rows to get a feel for the columns and values.
data.head(5)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,...,last_review,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm,license,rating,bedrooms,beds,baths
0,1312228.0,Rental unit in Brooklyn · ★5.0 · 1 bedroom,7130382,Walter,Brooklyn,Clinton Hill,40.68371,-73.96461,Private room,55.0,...,20/12/15,0.03,1.0,0.0,0.0,No License,5.0,1,1,Not specified
1,45277540.0,Rental unit in New York · ★4.67 · 2 bedrooms ·...,51501835,Jeniffer,Manhattan,Hell's Kitchen,40.76661,-73.9881,Entire home/apt,144.0,...,01/05/23,0.24,139.0,364.0,2.0,No License,4.67,2,1,1
2,9.71e+17,Rental unit in New York · ★4.17 · 1 bedroom · ...,528871354,Joshua,Manhattan,Chelsea,40.750764,-73.994605,Entire home/apt,187.0,...,18/12/23,1.67,1.0,343.0,6.0,Exempt,4.17,1,2,1
3,3857863.0,Rental unit in New York · ★4.64 · 1 bedroom · ...,19902271,John And Catherine,Manhattan,Washington Heights,40.8356,-73.9425,Private room,120.0,...,17/09/23,1.38,2.0,363.0,12.0,No License,4.64,1,1,1
4,40896610.0,Condo in New York · ★4.91 · Studio · 1 bed · 1...,61391963,Stay With Vibe,Manhattan,Murray Hill,40.75112,-73.9786,Entire home/apt,85.0,...,03/12/23,0.24,133.0,335.0,3.0,No License,4.91,Studio,1,1


In [6]:
# Column names, non-null counts and dtypes; a quick way to spot missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20770 entries, 0 to 20769
Data columns (total 22 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              20770 non-null  float64
 1   name                            20770 non-null  object 
 2   host_id                         20770 non-null  int64  
 3   host_name                       20770 non-null  object 
 4   neighbourhood_group             20770 non-null  object 
 5   neighbourhood                   20763 non-null  object 
 6   latitude                        20763 non-null  float64
 7   longitude                       20763 non-null  float64
 8   room_type                       20763 non-null  object 
 9   price                           20736 non-null  float64
 10  minimum_nights                  20763 non-null  float64
 11  number_of_reviews               20763 non-null  float64
 12  last_review                     

In [9]:
# Summary statistics for every column; include='all' adds the non-numeric
# columns (count/unique/top/freq) alongside the numeric ones.
data.describe(include='all')

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,...,last_review,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm,license,rating,bedrooms,beds,baths
count,20770.0,20770,20770.0,20770,20770,20763,20763.0,20763.0,20763,20736.0,...,20763,20763.0,20763.0,20763.0,20763.0,20770,20770,20770.0,20770.0,20770.0
unique,,9836,,5815,5,221,,,4,,...,1878,,,,,879,162,12.0,,17.0
top,,Rental unit in New York · 1 bedroom · 1 bed · ...,,Jeniffer,Manhattan,Bedford-Stuyvesant,,,Entire home/apt,,...,04/09/23,,,,,No License,No rating,1.0,,1.0
freq,,409,,187,8050,1583,,,11555,,...,326,,,,,17579,3595,13445.0,,17026.0
mean,3.033858e+17,,174904900.0,,,,40.726821,-73.939179,,187.71494,...,,1.257589,18.866686,206.067957,10.848962,,,,1.723592,
std,3.901221e+17,,172565700.0,,,,0.060293,0.061403,,1023.245124,...,,1.904472,70.921443,135.077259,21.354876,,,,1.211993,
min,2595.0,,1678.0,,,,40.500314,-74.24984,,10.0,...,,0.01,1.0,0.0,0.0,,,,1.0,
25%,27072600.0,,20411840.0,,,,40.684159,-73.980755,,80.0,...,,0.21,1.0,87.0,1.0,,,,1.0,
50%,49928520.0,,108699000.0,,,,40.72289,-73.949597,,125.0,...,,0.65,2.0,215.0,3.0,,,,1.0,
75%,7.22e+17,,314399700.0,,,,40.763106,-73.917475,,199.0,...,,1.8,5.0,353.0,15.0,,,,2.0,


**4. Data Cleaning**

In [47]:
# Number of missing values in each column.
data.isna().sum()

id                                0
name                              0
host_id                           0
host_name                         0
neighbourhood_group               0
neighbourhood                     0
latitude                          0
longitude                         0
room_type                         0
price                             0
minimum_nights                    0
number_of_reviews                 0
last_review                       0
reviews_per_month                 0
calculated_host_listings_count    0
availability_365                  0
number_of_reviews_ltm             0
license                           0
rating                            0
bedrooms                          0
beds                              0
baths                             0
dtype: int64

In [48]:
# Drop every row that contains at least one missing value (the whole row is
# removed, not just the empty cell), then confirm that no nulls remain.
# Rebinding `data` is used instead of inplace=True, which pandas discourages.
data = data.dropna()
data.isnull().sum()

id                                0
name                              0
host_id                           0
host_name                         0
neighbourhood_group               0
neighbourhood                     0
latitude                          0
longitude                         0
room_type                         0
price                             0
minimum_nights                    0
number_of_reviews                 0
last_review                       0
reviews_per_month                 0
calculated_host_listings_count    0
availability_365                  0
number_of_reviews_ltm             0
license                           0
rating                            0
bedrooms                          0
beds                              0
baths                             0
dtype: int64

In [49]:
# Dimensions as (rows, columns) at this point in the cleaning.
data.shape

(20724, 22)

In [50]:
# Count the rows that exactly repeat an earlier row (all columns equal).
data.duplicated().sum()

0

In [51]:
# Display the rows flagged as duplicates of an earlier row, if any.
is_repeat = data.duplicated()
data.loc[is_repeat]

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,...,last_review,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm,license,rating,bedrooms,beds,baths


In [52]:
# Remove rows that exactly repeat an earlier row (the first occurrence is
# kept), then confirm that no duplicates remain.
# Rebinding `data` is used instead of inplace=True, which pandas discourages.
data = data.drop_duplicates()
data.duplicated().sum()

0

In [57]:
# Inspect each column's dtype to decide which columns need type casting.
data.dtypes

id                                 object
name                               object
host_id                             int64
host_name                          object
neighbourhood_group                object
neighbourhood                      object
latitude                          float64
longitude                         float64
room_type                          object
price                             float64
minimum_nights                    float64
number_of_reviews                 float64
last_review                        object
reviews_per_month                 float64
calculated_host_listings_count    float64
availability_365                  float64
number_of_reviews_ltm             float64
license                            object
rating                             object
bedrooms                           object
beds                                int64
baths                              object
dtype: object

In [59]:
# Treat the two ID columns as identifiers rather than quantities.
# `id` is parsed as float64, so casting it straight to object would keep float
# values, which would be written out as e.g. `1312228.0`. Going through int64
# first stores whole numbers instead.
# NOTE(review): ids displayed in scientific notation (e.g. 9.71e+17) may
# already be rounded in the source file; the integer cast cannot restore any
# digits lost there. Verify against the raw CSV.
data['id'] = data['id'].astype('int64').astype(object)
data['host_id'] = data['host_id'].astype(object)
data.dtypes

id                                 object
name                               object
host_id                            object
host_name                          object
neighbourhood_group                object
neighbourhood                      object
latitude                          float64
longitude                         float64
room_type                          object
price                             float64
minimum_nights                    float64
number_of_reviews                 float64
last_review                        object
reviews_per_month                 float64
calculated_host_listings_count    float64
availability_365                  float64
number_of_reviews_ltm             float64
license                            object
rating                             object
bedrooms                           object
beds                                int64
baths                              object
dtype: object

In [60]:
# Write the cleaned frame to disk, leaving the row index out of the file.
data.to_csv(path_or_buf="airbnb_data1.csv", index=False)