In [6]:
!pip install nltk
!pip install textblob


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Collecting textblob
  Downloading textblob-0.17.1-py2.py3-none-any.whl (636 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m636.8/636.8 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m00:01[0m
Installing collected packages: textblob
Successfully installed textblob-0.17.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

In [4]:
# Read the raw listings CSV into a DataFrame and preview its first rows
df = pd.read_csv(filepath_or_buffer="AB_NYC_2019.csv")
df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [7]:
from textblob import TextBlob

# Define a function to calculate the sentiment of the listing names
def get_sentiment(text):
    """
    Compute the TextBlob sentiment polarity of ``text``.

    The input is coerced to ``str`` before analysis, and the polarity score
    reported by TextBlob for that string is returned.
    """
    return TextBlob(str(text)).sentiment.polarity

In [8]:
# Alternatively, a more hands-on approach: hand-crafted keyword features using NLTK
import nltk
# Fetch the NLTK resources only when they are not already available locally,
# so the cell does not need network access on every run. nltk.download()
# reports failure by returning False instead of raising, so surface that
# explicitly rather than letting a later tokenizer/stopword lookup fail.
for _resource_path, _package_name in (
    ('tokenizers/punkt', 'punkt'),
    ('corpora/stopwords', 'stopwords'),
):
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        if not nltk.download(_package_name):
            print(
                f"WARNING: NLTK package '{_package_name}' is not installed and "
                "could not be downloaded; extract_features() will fail until it is available."
            )

from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 

# Keywords taken as indicative of a higher price.
_HIGH_PRICE_KEYWORDS = frozenset([
    'luxury', 'spacious', 'panoramic', 'stunning', 'elegant', 'modern',
    'designer', 'sleek', 'upscale', 'chic', 'exquisite', 'pristine', 'grand',
    'lavish', 'stylish', 'opulent', 'deluxe', 'magnificent', 'breathtaking',
    'regal', 'exclusive', 'premier', 'finest', 'posh', 'glamorous', 'high-end',
])

# Keywords taken as indicative of a lower price (duplicates removed).
# NOTE(review): 'chic' is present in BOTH keyword sets, so it increments both
# counters; kept as-is to preserve the existing counts -- confirm the intent.
_LOW_PRICE_KEYWORDS = frozenset([
    'cozy', 'quaint', 'cute', 'rustic', 'charming', 'vintage', 'eclectic',
    'simple', 'homey', 'comfy', 'quirky', 'bohemian', 'shabby', 'chic',
    'artsy', 'funky', 'unique', 'authentic', 'modest', 'budget-friendly',
    'economical', 'affordable', 'value',
])


def extract_features(text):
    """
    Takes a text input and returns a dictionary of features extracted from the text.

    The text is lowercased and tokenized with ``word_tokenize``; tokens that
    are purely alphabetic and not English stopwords are counted as words.

    Parameters
    ----------
    text : str or None
        Text to analyse. ``None`` and float NaN are treated as an empty
        string; any other non-string value is converted with ``str()``.

    Returns
    -------
    dict
        ``num_words``: number of alphabetic, non-stopword tokens.
        ``high_price_count``: number of tokens that are high-price keywords.
        ``low_price_count``: number of tokens that are low-price keywords.
        ``price_keyword_ratio``: ``high_price_count / low_price_count``, or
        ``high_price_count`` itself when no low-price keyword is present.
    """
    # Missing values would otherwise crash on .lower(); treat them as empty text.
    if text is None or (isinstance(text, float) and text != text):
        text = ''

    # Convert text to lowercase and tokenize it
    tokens = word_tokenize(str(text).lower())

    # Remove stopwords and punctuation
    stop_words = set(stopwords.words('english'))
    words = [word for word in tokens if word.isalpha() and word not in stop_words]

    # Count keywords on the raw tokens rather than on `words`: the isalpha()
    # filter above discards hyphenated tokens, so 'high-end' and
    # 'budget-friendly' could never be matched. For every other keyword the
    # result is the same as counting over `words`, because the remaining
    # keywords are alphabetic and none of them is a stopword.
    high_price_count = sum(1 for token in tokens if token in _HIGH_PRICE_KEYWORDS)
    low_price_count = sum(1 for token in tokens if token in _LOW_PRICE_KEYWORDS)

    # Calculate ratio of high to low price keywords; fall back to the raw
    # high-price count when the denominator would be zero.
    if low_price_count == 0:
        ratio = high_price_count
    else:
        ratio = high_price_count / low_price_count

    # Create dictionary of features
    features = {
        'num_words': len(words),
        'high_price_count': high_price_count,
        'low_price_count': low_price_count,
        'price_keyword_ratio': ratio
    }

    return features

[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:997)>
[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:997)>


In [4]:
def preprocess(data, reference_date=None):
    """
    Clean the listings frame and derive review/date features.

    The input frame is NOT modified; a new DataFrame is returned.

    Steps:
      * drop rows whose ``name`` or ``host_name`` is missing;
      * parse ``last_review`` as datetime;
      * replace ``room_type`` with one indicator column per room type;
      * fill missing ``reviews_per_month`` with 0;
      * add ``is_rated`` (1 when ``last_review`` is present, else 0);
      * fill missing ``last_review`` with the earliest review date in the data;
      * add ``time_since_last_review`` (whole days before the reference date)
        and ``last_review_year`` / ``last_review_month`` /
        ``last_review_dayofweek``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least the columns ``name``, ``host_name``,
        ``room_type``, ``last_review`` and ``reviews_per_month``.
    reference_date : datetime-like or str, optional
        Date from which ``time_since_last_review`` is measured. Defaults to
        the current date/time (the original behaviour); pass a fixed date to
        make the feature reproducible across runs.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame with the derived columns added.
    """
    # Drop rows with a missing listing name or host name. Working on a copy
    # keeps the caller's frame intact, so the cell can be re-run safely.
    data = data.dropna(subset=['name', 'host_name']).copy()

    # Convert the last_review column to datetime format
    data['last_review'] = pd.to_datetime(data['last_review'])

    # One-Hot Encode Room Type
    one_hot_room = pd.get_dummies(data['room_type'])
    data = data.drop('room_type', axis=1)
    data = data.join(one_hot_room)

    # Reviews per Month - replace NA with 0
    data['reviews_per_month'] = data['reviews_per_month'].fillna(0)

    # Apartment rated: must be computed before the missing dates are filled
    data['is_rated'] = data['last_review'].notnull().astype(int)

    # Replace NaN values in 'last_review' with the minimum date
    min_date = data['last_review'].min()
    data['last_review'] = data['last_review'].fillna(min_date)

    # Days since last review, relative to the reference date
    if reference_date is None:
        reference = pd.to_datetime('today')
    else:
        reference = pd.to_datetime(reference_date)
    data['time_since_last_review'] = (reference - data['last_review']).dt.days

    # Create new features for dates (column is already datetime here)
    data['last_review_year'] = data['last_review'].dt.year
    data['last_review_month'] = data['last_review'].dt.month
    data['last_review_dayofweek'] = data['last_review'].dt.dayofweek

    return data

In [5]:
# Run the preprocessing step on the loaded frame and preview the result
df2 = preprocess(data=df)
df2.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,price,minimum_nights,...,calculated_host_listings_count,availability_365,Entire home/apt,Private room,Shared room,is_rated,time_since_last_review,last_review_year,last_review_month,last_review_dayofweek
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,149,1,...,6,365,0,1,0,1,1606,2018,10,4
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,225,1,...,2,355,1,0,0,1,1392,2019,5,1
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,150,3,...,1,365,0,1,0,0,4368,2011,3,0
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,89,1,...,1,194,1,0,0,1,1347,2019,7,4
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,80,10,...,1,0,1,0,0,1,1575,2018,11,0


In [None]:
# Write the pre-processed frame to disk, leaving out the index column
df2.to_csv(path_or_buf="CleanedData.csv", index=False)