In [2]:
!pip install nltk



In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from geopy.distance import geodesic

In [4]:
# Load the raw data set from the CSV file; the relative path is resolved
# against the notebook's current working directory.
df = pd.read_csv("AB_NYC_2019.csv")

In [5]:
def preprocess(data):
    """
    Clean a listings dataframe and add engineered features.

    The input frame is NOT modified; all work happens on a new frame, so the
    function can be called repeatedly on the same raw data with the same result.

    Steps performed:
      1. Drop rows whose 'name' or 'host_name' is missing.
      2. Parse 'last_review' into datetime values.
      3. One-hot encode 'room_type' (one new column per distinct value, named
         after that value) and remove the original column.
      4. Replace missing 'reviews_per_month' with 0.
      5. Add 'is_rated': 1 when the row has a 'last_review' date, else 0.
      6. Fill missing 'last_review' with the earliest date found in the column.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'name', 'host_name', 'last_review',
        'room_type' and 'reviews_per_month'.

    Returns
    -------
    pandas.DataFrame
        A new, preprocessed dataframe.
    """
    # Drop rows with a missing listing name or host name. Using the
    # non-inplace form returns a new frame and leaves the caller's data intact.
    data = data.dropna(subset=['name', 'host_name'])

    # Convert the last_review column to datetime format
    data['last_review'] = pd.to_datetime(data['last_review'])

    # One-hot encode room type; the join aligns on the (unchanged) row index
    one_hot_room = pd.get_dummies(data['room_type'])
    data = data.drop('room_type', axis=1).join(one_hot_room)

    # Reviews per month - replace NA with 0
    data['reviews_per_month'] = data['reviews_per_month'].fillna(0)

    # Flag listings that have a review date. This must be computed BEFORE the
    # missing dates are filled below, otherwise every row would look rated.
    data['is_rated'] = data['last_review'].notna().astype(int)

    # Replace missing 'last_review' values with the earliest observed date,
    # used here as a placeholder for "no review on record".
    min_date = data['last_review'].min()
    data['last_review'] = data['last_review'].fillna(min_date)

    return data

In [6]:
# Run the cleaning / feature-engineering step on the loaded frame
df2 = preprocess(df) 
# Preview the first rows of the preprocessed result
df2.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,Entire home/apt,Private room,Shared room,is_rated
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,149,1,9,2018-10-19,0.21,6,365,0,1,0,1
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,225,1,45,2019-05-21,0.38,2,355,1,0,0,1
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,150,3,0,2011-03-28,0.0,1,365,0,1,0,0
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,89,1,270,2019-07-05,4.64,1,194,1,0,0,1
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,80,10,9,2018-11-19,0.1,1,0,1,0,0,1


In [None]:
# Save the preprocessed frame to CleanedData.csv; index=False keeps the row
# index out of the file so it is not written as an extra column.
df2.to_csv("CleanedData.csv", index = False)  