In [1]:
!pip install --upgrade pip
!pip install nltk
!pip install textblob
!pip install geopy

Collecting pip
  Downloading pip-23.0.1-py3-none-any.whl (2.1 MB)
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 21.2.4
    Uninstalling pip-21.2.4:
      Successfully uninstalled pip-21.2.4
Successfully installed pip-23.0.1
Collecting textblob
  Downloading textblob-0.17.1-py2.py3-none-any.whl (636 kB)
     -------------------------------------- 636.8/636.8 kB 8.0 MB/s eta 0:00:00
Installing collected packages: textblob
Successfully installed textblob-0.17.1
Collecting geopy
  Downloading geopy-2.3.0-py3-none-any.whl (119 kB)
     -------------------------------------- 119.8/119.8 kB 3.5 MB/s eta 0:00:00
Collecting geographiclib<3,>=1.52
  Downloading geographiclib-2.0-py3-none-any.whl (40 kB)
     ---------------------------------------- 40.3/40.3 kB 2.0 MB/s eta 0:00:00
Installing collected packages: geographiclib, geopy
Successfully installed geographiclib-2.0 geopy-2.3.0


In [12]:
# All imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from geopy.distance import geodesic
from textblob import TextBlob
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\CompuTop\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\CompuTop\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Import the data set
# The CSV path is relative to the notebook's working directory.
df = pd.read_csv("AB_NYC_2019.csv")
# Preview the first rows to confirm the columns loaded as expected.
df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [6]:
# Define a function to calculate the sentiment of the listing names
def get_sentiment(text):
    """
    Score the sentiment of a piece of text with TextBlob.

    The input is coerced with ``str()`` first, so non-string values are
    scored on their string representation instead of raising.

    Returns the ``sentiment.polarity`` value reported by TextBlob.
    """
    return TextBlob(str(text)).sentiment.polarity

In [13]:
##Alternatively more hands-on:
# Keyword vocabularies, built once instead of on every call.
# Sets give O(1) membership tests and absorb the duplicated entries the
# original lists contained. Note that 'chic' is deliberately left in both
# vocabularies, so it counts toward both totals (as it did before).
_HIGH_PRICE_KEYWORDS = frozenset([
    'luxury', 'spacious', 'panoramic', 'stunning', 'elegant', 'modern',
    'designer', 'sleek', 'upscale', 'chic', 'exquisite', 'pristine', 'grand',
    'lavish', 'stylish', 'opulent', 'deluxe', 'magnificent', 'breathtaking',
    'regal', 'exclusive', 'premier', 'finest', 'posh', 'glamorous', 'high-end',
])
_LOW_PRICE_KEYWORDS = frozenset([
    'cozy', 'quaint', 'cute', 'rustic', 'charming', 'vintage', 'eclectic',
    'simple', 'homey', 'comfy', 'quirky', 'bohemian', 'shabby', 'chic',
    'artsy', 'funky', 'unique', 'authentic', 'modest', 'budget-friendly',
    'economical', 'affordable', 'value',
])

# English stop words, loaded lazily on first use and then reused; the
# original re-read the NLTK corpus for every single listing name.
_ENGLISH_STOP_WORDS = None


def extract_features(text):
    """
    Takes a text input and returns a dictionary of features extracted from the text.

    The text is coerced with ``str()`` (matching ``get_sentiment``), lowercased
    and tokenized with ``word_tokenize``. Stop words are discarded.

    Returns a dict with the keys:
        'num_words'           - number of purely alphabetic, non-stop-word tokens
        'high_price_count'    - tokens found in the high-price vocabulary
        'low_price_count'     - tokens found in the low-price vocabulary
        'price_keyword_ratio' - high_price_count / low_price_count, or just
                                high_price_count when low_price_count is 0
    """
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))

    # str() guards against non-string input (e.g. NaN) crashing on .lower()
    tokens = word_tokenize(str(text).lower())

    # Tokens that survive stop-word removal; punctuation is still included here
    candidates = [token for token in tokens if token not in _ENGLISH_STOP_WORDS]

    # Alphabetic-only tokens are what we report as "words"
    words = [token for token in candidates if token.isalpha()]

    # Keywords are matched against `candidates`, not `words`: hyphenated
    # keywords such as 'high-end' and 'budget-friendly' fail str.isalpha(),
    # so matching against the alphabetic-only list could never count them.
    high_price_count = sum(token in _HIGH_PRICE_KEYWORDS for token in candidates)
    low_price_count = sum(token in _LOW_PRICE_KEYWORDS for token in candidates)

    # Ratio of high to low price keywords; fall back to the raw high count
    # when there are no low-price keywords, to avoid dividing by zero.
    if low_price_count == 0:
        ratio = high_price_count
    else:
        ratio = high_price_count / low_price_count

    return {
        'num_words': len(words),
        'high_price_count': high_price_count,
        'low_price_count': low_price_count,
        'price_keyword_ratio': ratio,
    }

In [8]:
def preprocess(data, reference_date=None):
    """
    Clean the raw listings frame and engineer the modelling features.

    The input frame is NOT modified; a new DataFrame is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw listings. The columns read here are: name, host_name, host_id,
        price, last_review, neighbourhood_group, neighbourhood, room_type,
        reviews_per_month, minimum_nights, number_of_reviews,
        calculated_host_listings_count, availability_365, latitude, longitude.
    reference_date : date-like, optional
        Date against which 'time_since_last_review' is measured. Defaults to
        today, which makes that column change from one run to the next; pass
        a fixed date for reproducible output.

    Returns
    -------
    pandas.DataFrame
        Processed frame. 'price' holds the natural log of the original price,
        so predictions must be transformed back with np.exp.
    """
    # Drop rows with missing name / host_name.
    # Not done in place, so the caller's frame is left untouched and
    # re-running the cell gives the same result.
    data = data.dropna(subset=['name', 'host_name'])

    # Remove outliers (price more than 3 standard deviations from the mean)
    z_scores = np.abs((data['price'] - data['price'].mean()) / data['price'].std())
    data = data[z_scores <= 3]

    # Remove listings with price 0 (also required before taking the log below).
    # .copy() makes the column assignments that follow write to an independent
    # frame instead of a slice of the input.
    data = data[data['price'] > 0].copy()

    # Convert the last_review column to datetime format
    data['last_review'] = pd.to_datetime(data['last_review'])

    # One-hot encode neighbourhood groups
    one_hot_neighbourhood_group = pd.get_dummies(data['neighbourhood_group'])
    data = data.drop('neighbourhood_group', axis=1)
    data = data.join(one_hot_neighbourhood_group)

    # One-hot encode only the 25 most frequent neighbourhoods
    # (capturing 72% of the data); the rest are marked "Outside"
    counts = data['neighbourhood'].value_counts(normalize=True)
    top_25 = set(counts.index[:25])
    data['neighbour'] = data['neighbourhood'].where(
        data['neighbourhood'].isin(top_25), 'Outside'
    )
    one_hot_neighbourhood = pd.get_dummies(data['neighbour'])
    data = data.join(one_hot_neighbourhood)

    # One-hot encode room type
    one_hot_room = pd.get_dummies(data['room_type'])
    data = data.drop('room_type', axis=1)
    data = data.join(one_hot_room)

    # Reviews per month - replace NA with 0
    data['reviews_per_month'] = data['reviews_per_month'].fillna(0)

    # is_rated must be computed BEFORE the missing review dates are filled,
    # otherwise every listing would appear to have been rated.
    data['is_rated'] = data['last_review'].notna().astype(int)

    # Replace missing 'last_review' values with the earliest observed date
    min_date = data['last_review'].min()
    data['last_review'] = data['last_review'].fillna(min_date)

    # Word count of the listing name
    data['length_name'] = data['name'].astype(str).str.split().str.len()

    # Sentiment polarity of the listing name
    data['sentiment'] = data['name'].apply(get_sentiment)

    # Keyword-based name features. The dicts are expanded into a frame in one
    # go (much cheaper than apply(pd.Series)); 'num_words' is not kept.
    # Cast to float to keep the column dtypes the previous expansion produced.
    keyword_cols = ['high_price_count', 'low_price_count', 'price_keyword_ratio']
    name_features = pd.DataFrame(
        [extract_features(name) for name in data['name']],
        index=data.index,
        columns=keyword_cols,
    )
    data[keyword_cols] = name_features.astype(float)

    # Number of listings per host within this (filtered) data
    data['host_total_listings_count'] = data.groupby('host_id')['host_id'].transform('count')

    # Host activity level: sum of calculated_host_listings_count over the host's rows
    data['host_activity'] = data.groupby('host_id')['calculated_host_listings_count'].transform('sum')

    # Distance (miles) from each listing to the mean coordinate of its
    # neighbourhood. The centre is broadcast to every row with transform, so
    # geodesic is evaluated once per listing rather than re-scanning the whole
    # frame for every neighbourhood.
    by_neighbourhood = data.groupby('neighbourhood')
    center_lat = by_neighbourhood['latitude'].transform('mean')
    center_lon = by_neighbourhood['longitude'].transform('mean')
    data['distance_to_center'] = [
        geodesic((c_lat, c_lon), (lat, lon)).miles
        for c_lat, c_lon, lat, lon in zip(
            center_lat, center_lon, data['latitude'], data['longitude']
        )
    ]

    # Days since last review, measured against reference_date (today by default)
    if reference_date is None:
        reference_date = pd.to_datetime('today')
    else:
        reference_date = pd.to_datetime(reference_date)
    data['time_since_last_review'] = (reference_date - data['last_review']).dt.days

    # Calendar features of the last review ('last_review' is already datetime)
    data['last_review_year'] = data['last_review'].dt.year
    data['last_review_month'] = data['last_review'].dt.month
    data['last_review_dayofweek'] = data['last_review'].dt.dayofweek

    # Log of price (zero prices were removed above).
    # Remember to transform predictions back.
    data['price'] = np.log(data['price'])

    # Square of minimum_nights, to allow for a non-linear relationship.
    # Computed from the raw values, before standardization.
    data['min_nights2'] = data['minimum_nights'] ** 2

    # Standardize numerical columns.
    # NOTE(review): the scaler is fit on the full frame; if a train/test split
    # happens downstream, consider fitting on the training rows only.
    scaler = StandardScaler()
    num_cols = ['minimum_nights', 'number_of_reviews', 'reviews_per_month',
                'calculated_host_listings_count', 'availability_365',
                'length_name', 'min_nights2']
    data[num_cols] = scaler.fit_transform(data[num_cols])

    # Remove columns that have been replaced by their one-hot encodings
    data = data.drop(columns=['neighbourhood', 'neighbour'])

    return data

In [9]:
# Build the processed frame from the raw listings and preview it.
df2 = preprocess(df) # took roughly 7 minutes when originally run
df2.head()

Unnamed: 0,id,name,host_id,host_name,latitude,longitude,price,minimum_nights,number_of_reviews,last_review,...,low_price_count,price_keyword_ratio,host_total_listings_count,host_activity,distance_to_center,time_since_last_review,last_review_year,last_review_month,last_review_dayofweek,min_nights2
0,2539,Clean & quiet apt home by the park,2787,John,40.64749,-73.97237,5.003946,-0.301207,-0.322171,2018-10-19,...,0.0,0.0,6,36,0.433121,1611,2018,10,4,-0.037593
1,2595,Skylit Midtown Castle,2845,Jennifer,40.75362,-73.98377,5.4161,-0.301207,0.483623,2019-05-21,...,0.0,0.0,2,4,0.425709,1397,2019,5,1,-0.037593
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,40.80902,-73.9419,5.010635,-0.200157,-0.52362,2011-03-28,...,0.0,0.0,1,1,0.580629,4373,2011,3,0,-0.036908
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,40.68514,-73.95976,4.488636,-0.301207,5.519837,2019-07-05,...,1.0,0.0,1,1,0.317519,1352,2019,7,4,-0.037593
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,40.79851,-73.94399,4.382027,0.153517,-0.322171,2018-11-19,...,0.0,1.0,1,1,0.186646,1580,2018,11,0,-0.029119


In [17]:
# Checking that we removed all NA values
# isna().sum().sum() totals the missing cells across every column of df2.
print('Now we have', df2.isna().sum().sum(), 'NAs')

Now we have 0 NAs


In [19]:
# List the distinct column dtypes in the processed frame.
# Note: besides int, float and datetime, the result still includes object
# dtype (text columns such as name and host_name are kept), so those columns
# need to be dropped or encoded before fitting a numeric-only model.
df2.dtypes.unique()

array([dtype('int64'), dtype('O'), dtype('float64'), dtype('<M8[ns]'),
       dtype('uint8'), dtype('int32')], dtype=object)

In [10]:
## Sanity check on is_rated: list the listings that are flagged as not rated.
## If missing last_review values were filled with the minimum date before
## is_rated was computed, every listing would look rated and this would be
## empty. preprocess computes is_rated first, so unreviewed listings show up
## here with last_review set to the minimum date.
df2[df2['is_rated']!=1].head(10)

Unnamed: 0,id,name,host_id,host_name,latitude,longitude,price,minimum_nights,number_of_reviews,last_review,...,low_price_count,price_keyword_ratio,host_total_listings_count,host_activity,distance_to_center,time_since_last_review,last_review_year,last_review_month,last_review_dayofweek,min_nights2
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,40.80902,-73.9419,5.010635,-0.200157,-0.52362,2011-03-28,...,0.0,0.0,1,1,0.580629,4372,2011,3,0,-0.036908
19,7750,Huge 2 BR Upper East Cental Park,17985,Sing,40.79685,-73.94872,5.247024,0.001942,-0.52362,2011-03-28,...,0.0,0.0,2,4,0.341721,4372,2011,3,0,-0.033484
26,8700,Magnifique Suite au N de Manhattan - vue Cloitres,26394,Claude & Sophie,40.86754,-73.92639,4.382027,-0.149632,-0.52362,2011-03-28,...,0.0,0.0,1,1,0.176643,4372,2011,3,0,-0.036309
36,11452,Clean and Quiet in Brooklyn,7355,Vt,40.68876,-73.94312,3.555348,2.679759,-0.52362,2011-03-28,...,0.0,0.0,1,1,0.20512,4372,2011,3,0,0.270462
38,11943,Country space in the city,45445,Harriet,40.63702,-73.96327,5.010635,-0.301207,-0.52362,2011-03-28,...,0.0,0.0,1,1,0.565836,4372,2011,3,0,-0.037593
193,51438,1 Bedroom in 2 Bdrm Apt- Upper East,236421,Jessica,40.77333,-73.95199,4.867534,0.355616,-0.52362,2011-03-28,...,0.0,0.0,2,4,0.146883,4372,2011,3,0,-0.020902
204,54466,Beautiful Uptown Manhattan apartmnt,253385,Douglas,40.80234,-73.95603,5.298317,1.164014,-0.52362,2011-03-28,...,0.0,0.0,1,1,1.034889,4372,2011,3,0,0.039357
260,63588,LL3,295128,Carol Gloria,40.81309,-73.85514,4.49981,-0.250682,-0.52362,2011-03-28,...,0.0,0.0,7,49,0.12658,4372,2011,3,0,-0.037336
265,63913,"HOSTING YOUR SUNNY, SPACIOUS NYC ROOM",312288,Paula,40.86648,-73.9263,4.317488,0.001942,-0.52362,2011-03-28,...,0.0,1.0,2,4,0.115331,4372,2011,3,0,-0.033484
267,64015,Prime East Village 1 Bedroom,146944,David,40.72807,-73.98594,5.298317,-0.200157,-0.52362,2011-03-28,...,0.0,0.0,1,1,0.142513,4372,2011,3,0,-0.036908


In [None]:
# Save pre-processed file
# index = False leaves the DataFrame index out of the file. CSV stores
# last_review as text, so it has to be parsed as a date again on reload.
df2.to_csv("CleanedData.csv", index = False)