# Airbnb preprocess
- 1.Select cities and neighborhoods to explore <br>
- 2.Get crime rate for neighborhoods from areavibes <br>
- 3.Generate common words and remove city specific words <br>

- The preprocessing steps above are designed only for the Airbnb dataset

In [2]:
import requests, time, operator, re, json, csv, pickle, random
import pandas as pd
import numpy as np
from collections import Counter
from bs4 import BeautifulSoup
import nltk.data
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import KFold

import warnings
# Silence every warning category for the rest of the notebook session.
warnings.filterwarnings("ignore")
# Prefix for the artifacts this notebook writes (pickles, word lists).
data_path = '/data/2/zwang/2018_S_WordTreatment/V2_airbnb/0_Data/'
# Directory holding one "<city>_listings.csv" file per city.
list_path = '/data/2/zwang/2017_M_F/airbnb/Data/listing_by_city/'

<a id='part1'></a>
### 1.Select neighborhoods to explore [Home](#home)<br>
- Define cities to explore<br>
- Only explore cities that have crime data from the FBI<br>
> Average crime per person<br>
> http://www.freep.com/story/news/2017/09/25/database-2016-fbi-crime-statistics-u-s-city/701445001/

In [3]:
# Per-city reference table, keyed by the short city name used in the listing file names.
# Each value is [state, population, crime, crime_per_1000].
city_info = {
    'WDC': ['District of Columbia', 681170, 7711, 11],
    'SantaCruz': ['California', 64969, 521, 8],
    'Portland': ['Oregon', 642129, 3163, 5],
    'Austin': ['Texas', 956911, 3903, 4],
    'LA': ['California', 4007905, 28817, 7],
    'Denver': ['Colorado', 699259, 4597, 7],
    'Boston': ['Massachusetts', 673880, 4767, 7],
    'NewOrleans': ['Louisiana', 397208, 4249, 11],
    'Seattle': ['Washington', 700313, 4294, 6],
    'Oakland': ['California', 424998, 6059, 14],
    'NY': ['New York', 8566917, 49124, 6],
    'Nashville': ['Tennessee', 668685, 7371, 11],
    'Chicago': ['Illinois', 2725153, 30126, 11],
    'SanDiego': ['California', 1413414, 5332, 4],
    'Asheville': ['North Carolina', 89546, 537, 6],
    'SanFrancisco': ['California', 871155, 6190, 7],
}

- 1.2 Count neighborhoods in each city

In [4]:
def get_city_hoods(mydata,city_names):
    """
    Count, for every city, how many distinct neighborhood overviews each neighborhood has.

    Parameters
    ----------
    mydata : str
        Path prefix of the listing files; "<mydata><city>_listings.csv" is read for each city.
    city_names : iterable of str
        Cities to process.

    Returns
    -------
    dict
        city_all_hoods[city] = Counter({hood: n_frequency}), where n_frequency is the
        number of de-duplicated, non-null overviews attached to that neighborhood.
    """
    city_all_hoods = {}

    for city in city_names:
        # Open the file in a context manager so the handle is closed after parsing
        # (the previous pd.read_csv(open(...)) left it open until garbage collection).
        with open(mydata+city+"_listings.csv") as listing_file:
            city_pd = pd.read_csv(listing_file)

        # Identical overview texts are counted once; the last occurrence is the one kept.
        dedup_city_pd = city_pd.drop_duplicates('neighborhood_overview',keep='last')
        city_pd_notnull = dedup_city_pd[dedup_city_pd['neighborhood_overview'].notnull()]

        # Rows without a neighborhood label cannot be attributed to any hood.
        labelled = city_pd_notnull[city_pd_notnull['neighbourhood_cleansed'].notnull()]
        hood_ct = Counter(labelled.neighbourhood_cleansed)
        city_all_hoods[city] = hood_ct

        print("get %d neighborhoods for %s" % (len(hood_ct),city))

    return city_all_hoods

In [5]:
# Count neighborhoods for every city that has an entry in city_info.
city_all_hoods = get_city_hoods(list_path,list(city_info.keys()))

get 219 neighborhoods for NY
get 92 neighborhoods for Portland
get 37 neighborhoods for SanFrancisco
get 8 neighborhoods for Asheville
get 5 neighborhoods for SantaCruz
get 69 neighborhoods for NewOrleans
get 96 neighborhoods for SanDiego
get 35 neighborhoods for Nashville
get 74 neighborhoods for Chicago
get 74 neighborhoods for Denver
get 87 neighborhoods for Seattle
get 39 neighborhoods for WDC
get 25 neighborhoods for Boston
get 108 neighborhoods for Oakland
get 248 neighborhoods for LA
get 43 neighborhoods for Austin


In [6]:
# Inspect the per-neighborhood overview counts of a single city.
city_all_hoods['Boston']

Counter({'Allston': 188,
         'Back Bay': 209,
         'Bay Village': 21,
         'Beacon Hill': 144,
         'Brighton': 137,
         'Charlestown': 90,
         'Chinatown': 20,
         'Dorchester': 162,
         'Downtown': 105,
         'East Boston': 105,
         'Fenway': 155,
         'Hyde Park': 22,
         'Jamaica Plain': 302,
         'Leather District': 4,
         'Longwood Medical Area': 12,
         'Mattapan': 19,
         'Mission Hill': 59,
         'North End': 103,
         'Roslindale': 52,
         'Roxbury': 114,
         'South Boston': 132,
         'South Boston Waterfront': 39,
         'South End': 226,
         'West End': 26,
         'West Roxbury': 28})

In [7]:
def select_common_hoods(city_all_hoods, threshold):
    """
    Keep, per city, only the neighborhoods with at least `threshold` overviews.

    Every neighborhood that falls below the threshold is reported on stdout.
    Returns a new {city: {hood: count}} mapping; the input is not modified.
    """
    kept_by_city = {}
    for city_name, hood_counts in city_all_hoods.items():
        kept = {}
        for hood_name, n_views in hood_counts.items():
            if n_views >= threshold:
                kept[hood_name] = n_views
                continue
            print("Drop neighborhood %s in city %s" % (hood_name,city_name))
        kept_by_city[city_name] = kept
    return kept_by_city

# With threshold=1 no neighborhood is dropped: every counted neighborhood has at least one overview.
city_hoods = select_common_hoods(city_all_hoods, threshold=1)

In [8]:
# Report how many neighborhoods were kept per city, and the grand total (reused later).
n_total = 0
for city, kept_hoods in city_hoods.items():
    n_hood = len(kept_hoods)
    print("%d neighborhoods in %s" % (n_hood,city))
    n_total = n_total + n_hood
print("\n--------------\n%d hoods for %d cities" % (n_total,len(city_hoods)))

219 neighborhoods in NY
92 neighborhoods in Portland
37 neighborhoods in SanFrancisco
8 neighborhoods in Asheville
5 neighborhoods in SantaCruz
69 neighborhoods in NewOrleans
96 neighborhoods in SanDiego
35 neighborhoods in Nashville
74 neighborhoods in Chicago
74 neighborhoods in Denver
87 neighborhoods in Seattle
39 neighborhoods in WDC
25 neighborhoods in Boston
108 neighborhoods in Oakland
248 neighborhoods in LA
43 neighborhoods in Austin

--------------
1259 hoods for 16 cities


<a id='part2'></a>
### 2.Get crime rate for neighborhoods [Home](#home)<br>
- From areavibes

In [9]:
def get_state_abbre(timeout=30):
    """
    Scrape the short name of each state; it is used to build areavibes URLs
    when scraping neighborhood crime rates.

    Parameters
    ----------
    timeout : float
        Seconds to wait for the web server before requests raises a timeout error.
        Without it a stalled connection would block the notebook indefinitely.

    Returns
    -------
    dict
        {state name: abbreviation}. Empty when the page is unavailable or the
        expected table is not present in it.
    """
    state_abbre = {}
    url = "https://www.50states.com/abbreviations.htm"
    r = requests.get(url, timeout=timeout)
    if not r.ok:
        print("Website not available")
        return state_abbre

    soup = BeautifulSoup(r.text, 'html.parser')
    table = soup.find('table', attrs={'class': 'spaced stripedRows'})
    if table is None:
        # Page layout changed: report it instead of failing with AttributeError on None.
        print("Abbreviation table not found")
        return state_abbre

    # Each text line is read as "<state name><2-char abbreviation>";
    # lines of two characters or fewer leave no name and are skipped.
    for state in table.text.split('\n'):
        if(state[:-2]):
            state_abbre[state[:-2]] = state[-2:]
    return state_abbre

# Build the state name -> abbreviation lookup needed to construct areavibes URLs.
state_abbre = get_state_abbre()

In [11]:
# Cache the scraped abbreviations on disk; the context manager guarantees the
# file is flushed and closed once the dump finishes.
with open(data_path+'state_abbrev.pickle','wb') as abbrev_file:
    pickle.dump(state_abbre, abbrev_file)

In [12]:
# Translate this notebook's city keys into the city names used in areavibes URLs.
# Cities that are not listed here are used unchanged.
map_city = {
    'SanFrancisco': 'San+Francisco',
    'NewOrleans': 'New+Orleans',
    'LA': 'Los+Angeles',
    'NY': 'New+York',
    'WDC': 'Washington',
    'SanDiego': 'San+Diego',
    'SantaCruz': 'Santa+Cruz',
}

In [13]:
def _fetch_first_ok(urls, timeout, pause=.5):
    """Try each URL in order and return the first OK response, or None if none succeeds."""
    for url in urls:
        try:
            r = requests.get(url, timeout=timeout)
        except requests.RequestException as e:
            # A network error on one page must not abort the whole (long) scrape.
            print('>>>request error for %s\n%s' % (url, e))
            r = None
        # Pause after every request to stay polite to the server.
        time.sleep(pause)
        if r is not None and r.ok:
            return r
    return None


def scrape_hood_crime(city_hoods,city_info,map_city,state_abbre,timeout=30):
    """
    For every city, iterate over all neighborhoods in that city and scrape the
    crime rate of each neighborhood from areavibes.

    Parameters
    ----------
    city_hoods : dict
        {city: iterable of neighborhood names}.
    city_info : dict
        {city: [state, ...]}; only the state (first item) is used here.
    map_city : dict
        {city: name used in areavibes URLs} for cities whose URL name differs.
    state_abbre : dict
        {state name: abbreviation}.
    timeout : float
        Seconds to wait for each HTTP response; a request that times out or
        errors is treated like a failed page.

    Returns
    -------
    dict
        city_hood_crime[city] = {hood: crime rate}, containing only neighborhoods
        whose page was fetched and parsed. The rate is 1 / N, where N is the last
        token of the page's 'chance-item vc' element.

    Side effects
    ------------
    Pickles a flat list of {'city', 'hood', 'crime'} records to
    data_path + 'city_hood_crime.pickle' (data_path is a module-level constant).
    """
    crime_info = []
    city_hood_crime = {}
    for city in city_hoods:
        hood_crime = {}
        state = state_abbre[city_info[city][0]]
        city_name = map_city.get(city, city)

        print('---------------',"crime rate for neighborhoods in", city,state,'---------------')

        for hood in city_hoods[city]:
            hoodname = '+'.join(str(hood).split()).lower()
            # First try the hood as a sub-area of the city, then as a place of its own.
            urls = [
                "http://www.areavibes.com/%s-%s/%s/crime/" % (city_name.lower(),state.lower(),hoodname),
                'http://www.areavibes.com/%s-%s/crime/' % (hoodname, state.lower()),
            ]
            r = _fetch_first_ok(urls, timeout)
            if r is None:
                print('>>>failed %s' % hood)
                continue

            soup = BeautifulSoup(r.text, 'html.parser')
            print('hood: %s' % hood)

            try:
                rate = 1 / int(soup.find('div', attrs={'class': 'chance-item vc'}).text.split()[-1])
            except (AttributeError, IndexError, ValueError, ZeroDivisionError) as e:
                # Element missing, empty, not an integer, or zero: skip this hood.
                print('>>>exception parsing crime rate for %s\n%s' % (hood, e))
            else:
                hood_crime[hood] = rate
                crime_info.append({'city':city, 'hood':hood, 'crime':rate})

        city_hood_crime[city] = hood_crime

    # Close (and flush) the pickle file deterministically.
    with open(data_path + 'city_hood_crime.pickle','wb') as crime_file:
        pickle.dump(crime_info, crime_file)

    return city_hood_crime

In [14]:
# This process takes about 20 minutes
# Besides returning the per-city rates, the call pickles the flat record list under data_path.
city_hood_crime = scrape_hood_crime(city_hoods,city_info,map_city,state_abbre)

--------------- crime rate for neighborhoods in NY NY ---------------
hood: Cobble Hill
hood: Queens Village
hood: West Village
>>>failed Astoria
>>>failed Vinegar Hill
>>>failed Gravesend
>>>failed Stuyvesant Town
>>>failed Sunnyside
>>>failed Morrisania
>>>failed Long Island City
hood: Upper West Side
>>>failed East Morrisania
>>>failed Arden Heights
hood: University Heights
>>>failed Fort Greene
>>>failed Kingsbridge
hood: Financial District
hood: Bensonhurst
>>>failed Civic Center
hood: Howland Hook
hood: New Brighton
hood: Brownsville
hood: Borough Park
>>>failed North Riverdale
>>>failed Nolita
hood: Springfield Gardens
>>>failed Arrochar
hood: Jackson Heights
>>>failed Ditmars Steinway
hood: Little Italy
hood: Woodrow
>>>failed Grant City
>>>failed Richmond Hill
hood: Jamaica
>>>failed Bay Terrace
hood: Laurelton
>>>failed Stapleton
>>>failed Prospect Heights
>>>failed Sheepshead Bay
>>>failed Unionport
hood: Riverdale
>>>failed Pelham Gardens
hood: Flushing
hood: SoHo
>>>failed

In [15]:
# Per-city coverage: neighborhoods with a scraped crime rate vs. neighborhoods requested.
n_total_hoodcrime = 0
for city, scraped_rates in city_hood_crime.items():
    n_hoodcrime = len(scraped_rates)
    print("Crime rate for %d / %d neighborhoods in %s" % (n_hoodcrime,len(city_hoods[city]),city))
    n_total_hoodcrime = n_total_hoodcrime + n_hoodcrime

print("\n-------------------------------------------\nGet crime rate for %d out of %d neighborhoods." % (n_total_hoodcrime,n_total))

Crime rate for 84 / 219 neighborhoods in NY
Crime rate for 43 / 92 neighborhoods in Portland
Crime rate for 0 / 8 neighborhoods in Asheville
Crime rate for 28 / 37 neighborhoods in SanFrancisco
Crime rate for 71 / 74 neighborhoods in Chicago
Crime rate for 104 / 108 neighborhoods in Oakland
Crime rate for 77 / 96 neighborhoods in SanDiego
Crime rate for 0 / 35 neighborhoods in Nashville
Crime rate for 0 / 5 neighborhoods in SantaCruz
Crime rate for 13 / 74 neighborhoods in Denver
Crime rate for 57 / 87 neighborhoods in Seattle
Crime rate for 0 / 39 neighborhoods in WDC
Crime rate for 10 / 25 neighborhoods in Boston
Crime rate for 46 / 69 neighborhoods in NewOrleans
Crime rate for 177 / 248 neighborhoods in LA
Crime rate for 0 / 43 neighborhoods in Austin

-------------------------------------------
Get crime rate for 710 out of 1259 neighborhoods.


- For each neighborhood that has crime info, pickle one record per overview with the fields ['city','city_crime','hood','hood_crime','hood_overviews']

In [16]:
def store_crimehood_overview(mypath,info_file,city_hood_crime,city_stats=None):
    """
    Pickle one record per neighborhood overview into `info_file` (a list of dictionaries).

    Each record has the keys
    ['city','city_crime','hood','hood_crime','hood_overviews'].

    Parameters
    ----------
    mypath : str
        Path prefix of the listing files; "<mypath><city>_listings.csv" is read per city.
    info_file : str
        Destination of the pickled list.
    city_hood_crime : dict
        {city: {hood: crime rate}}; only these neighborhoods are exported.
    city_stats : dict, optional
        {city: [..., crime_per_1000]}; the last item divided by 1000 becomes
        'city_crime'. Defaults to the module-level `city_info` table.

    Returns
    -------
    None
    """
    if city_stats is None:
        city_stats = city_info

    list_of_infos = []

    for city, hood_rates in city_hood_crime.items():
        print("\n",city)
        # Context manager: the CSV handle is closed as soon as it has been parsed.
        with open(mypath+city+"_listings.csv") as listing_file:
            city_pd = pd.read_csv(listing_file)
        # Same filtering as the counting step: unique, non-null overviews only.
        dedup_city_pd = city_pd.drop_duplicates('neighborhood_overview',keep='last')
        city_pd_notnull = dedup_city_pd[dedup_city_pd['neighborhood_overview'].notnull()]

        for hood, hood_rate in hood_rates.items():
            in_hood = city_pd_notnull['neighbourhood_cleansed'] == hood
            list_of_hoodviews = city_pd_notnull[in_hood].neighborhood_overview.values.tolist()
            list_of_infos.extend(
                {
                    'city': city,
                    # crime_per_1000 -> crimes per person
                    'city_crime': city_stats[city][-1]/1000,
                    'hood': hood,
                    'hood_crime': hood_rate,
                    'hood_overviews': view,
                }
                for view in list_of_hoodviews
            )
            print("%s\t%d" % (hood,len(list_of_hoodviews)))

    # Close (and flush) the output file deterministically.
    with open(info_file,'wb') as out_file:
        pickle.dump(list_of_infos,out_file)
    return

In [17]:
# Export one record per overview for every neighborhood that has a scraped crime rate.
store_crimehood_overview(list_path,data_path+'city_hood_byview.pickle',city_hood_crime)


 NY
Cobble Hill	150
Sunset Park	398
Boerum Hill	283
Upper East Side	2517
Bedford-Stuyvesant	3675
West Village	1237
Tribeca	232
Middle Village	34
Lower East Side	1463
Canarsie	92
Woodside	149
Belmont	10
Morris Heights	18
Midtown	1419
Kensington	188
Norwood	41
University Heights	9
Williamsburg	5032
New Brighton	21
Harlem	3492
City Island	16
Dyker Heights	37
Chelsea	1655
Financial District	684
Crown Heights	1858
Bensonhurst	67
Bellerose	13
Maspeth	71
Corona	62
Rosebank	2
Forest Hills	147
Brownsville	51
Borough Park	98
South Beach	5
Gramercy	473
East Village	3033
Springfield Gardens	30
Spuyten Duyvil	27
Jackson Heights	242
Huguenot	2
Little Italy	143
Hunts Point	1
Baychester	4
Country Club	2
Mott Haven	51
Woodrow	1
Carroll Gardens	309
Bay Ridge	159
Chinatown	549
Mariners Harbor	11
Midland Beach	15
Jamaica	115
Laurelton	9
Great Kills	5
Upper West Side	2684
Morris Park	6
Greenwich Village	590
Oakwood	4
Howland Hook	2
Park Slope	685
Riverdale	27
Richmondtown	4
Glendale	29
Flushing	282
Fordha

- Check the stored info

In [18]:
# Reload the stored records into a DataFrame; the context manager closes the pickle file after reading.
with open(data_path+'city_hood_byview.pickle','rb') as info_file:
    info_pd = pd.DataFrame(pickle.load(info_file))

In [19]:
# Preview the first rows of the stored records.
info_pd.head()

Unnamed: 0,city,city_crime,hood,hood_crime,hood_overviews
0,NY,0.006,Cobble Hill,0.002294,Cobble Hill is a wonderful neighborhood with s...
1,NY,0.006,Cobble Hill,0.002294,I love everything about the neighborhood. It i...
2,NY,0.006,Cobble Hill,0.002294,Squeezed between the similarly charming (and h...
3,NY,0.006,Cobble Hill,0.002294,An ideal spot. One of the best neighborhoods ...
4,NY,0.006,Cobble Hill,0.002294,"We are located on Smith Street, and are in wal..."


In [20]:
# Spot-check the first 10 records of a single neighborhood.
info_pd[info_pd['hood'] == 'Ocean View'][:10]

Unnamed: 0,city,city_crime,hood,hood_crime,hood_overviews
49291,SanFrancisco,0.007,Ocean View,0.006098,We are located in Merced Heights which is a 15...
49292,SanFrancisco,0.007,Ocean View,0.006098,Ingleside still retains old-school charm of Sa...
49293,SanFrancisco,0.007,Ocean View,0.006098,We are located in Merced Heights which is a 15...
49294,SanFrancisco,0.007,Ocean View,0.006098,"There are tennis, basketball courts and soccer..."
49295,SanFrancisco,0.007,Ocean View,0.006098,Cultural diversity of this neighborhood is phe...
49296,SanFrancisco,0.007,Ocean View,0.006098,Daly City is the next city just south of San F...
49297,SanFrancisco,0.007,Ocean View,0.006098,Ingleside is a quite residential area for the ...
49298,SanFrancisco,0.007,Ocean View,0.006098,Our charming cozy house is close to 280 freewa...
49299,SanFrancisco,0.007,Ocean View,0.006098,Ingleside is a lesser know and wonderful neigb...
49300,SanFrancisco,0.007,Ocean View,0.006098,The location in Ingleside is close to City Col...


<a id='part3'></a>
### 3.Generate common words among several cities [Home](#home)<br>
#### 3.1. Common words, remove city specific words<br>

In [21]:
def iter_views(data_pd):
    """
    Yield every overview text in `data_pd`, grouped by city and then by neighborhood.

    Cities, and the neighborhoods inside each city, are visited in order of first
    appearance; overviews of one neighborhood keep their row order.
    """
    for city_name in Counter(data_pd.city):
        in_city = data_pd[data_pd['city'] == city_name]
        for hood_name in Counter(in_city.hood):
            hood_rows = in_city[in_city['hood'] == hood_name]
            for overview in hood_rows['hood_overviews'].values:
                yield overview

In [23]:
def select_general_wds(data_file,n_tf,n_city):
    """
    Select words that are common across cities, dropping city-specific vocabulary.

    A word is "frequent" in a city when it occurs in at least `n_tf` of that
    city's overviews (counts are binary per overview). A word is kept when it is
    frequent in at least `n_city` cities.

    Parameters
    ----------
    data_file : str
        Pickle holding a list of records with 'city', 'hood' and 'hood_overviews' keys.
    n_tf : int
        Minimum number of overviews of one city that must contain the word.
    n_city : int
        Minimum number of cities in which the word must be frequent.

    Returns
    -------
    all_vocab : list of str
        The full vocabulary over all overviews.
    remain_feats_info : dict
        {word: [min_tf, n_cities]} for the kept words; min_tf is the smallest
        per-city count among the cities where the word is frequent and n_cities
        is the number of such cities.
    """
    with open(data_file,'rb') as f:
        data_pd = pd.DataFrame(pickle.load(f))

    data_vec = CountVectorizer(binary=True, ngram_range=(1,1))
    X_all = data_vec.fit_transform(iter_views(data_pd))

    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and fall back to the old name on older releases.
    if hasattr(data_vec, 'get_feature_names_out'):
        all_vocab = data_vec.get_feature_names_out().tolist()
    else:
        all_vocab = data_vec.get_feature_names()
    print(X_all.shape)

    n_feat = X_all.shape[1]
    n_city_feat = np.zeros(n_feat)
    # Integer array throughout, so the reported min_tf values stay integers.
    n_min_tf = np.zeros(n_feat, dtype=np.int64)

    for city in Counter(data_pd.city):
        print(city)
        city_views = data_pd[data_pd['city'] == city]['hood_overviews'].values

        # The fitted vectorizer already holds the shared vocabulary, so columns line up.
        city_tf = np.asarray(data_vec.transform(city_views).sum(axis=0)).ravel()
        frequent = city_tf >= n_tf
        n_city_feat += frequent
        n_tf_feat = np.where(frequent, city_tf, 0)

        # Record each word's min term frequency over the cities where it is frequent:
        # take the min when both values are set, otherwise whichever one is non-zero.
        both_set = (n_tf_feat > 0) & (n_min_tf > 0)
        n_min_tf = np.where(both_set,
                            np.minimum(n_tf_feat, n_min_tf),
                            np.maximum(n_tf_feat, n_min_tf))

    remain_feat_idx = np.where(n_city_feat>=n_city)[0]
    remain_feats_info = {}
    for idx in remain_feat_idx:
        remain_feats_info[all_vocab[idx]] = [n_min_tf[idx],n_city_feat[idx]]

    return all_vocab, remain_feats_info

In [24]:
# Keep words that occur in at least 8 overviews of a city (n_tf) in at least 6 cities (n_city).
all_vocab, remain_feats_info = select_general_wds(data_path+'city_hood_byview.pickle',n_tf=8,n_city=6)

(99620, 45866)
NY
Portland
Denver
SanFrancisco
Chicago
Seattle
NewOrleans
Boston
Oakland
SanDiego
LA


In [25]:
# Report how much of the full vocabulary survived the cross-city filter.
print("%d out of %d words are selected." % (len(remain_feats_info),len(all_vocab)))

1550 out of 45866 words are selected.


In [26]:
def write_feat_tofile(wd_log,feats_info):
    """
    Write the selected words to a CSV file with columns word, min_tf, n_cities.

    Parameters
    ----------
    wd_log : str
        Destination CSV path (overwritten if it exists).
    feats_info : dict
        {word: [min_tf, n_cities]}.
    """
    # newline='' lets the csv module control line endings itself; without it
    # every row ends in "\r\r\n" on platforms that translate "\n".
    with open(wd_log,'w',newline='') as fw:
        myfields = ['word','min_tf','n_cities']
        feat_csv = csv.DictWriter(fw,fieldnames = myfields)
        feat_csv.writeheader()
        for wd, (min_tf, n_cities) in feats_info.items():
            feat_csv.writerow({'word':wd, 'min_tf':min_tf, 'n_cities':n_cities})
#write_feat_tofile(data_path+'common_wds.csv',remain_feats_info)

#### Stop words are not removed in CountVectorizer
- Words that would be removed by using stop_words='english':
- http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words
- Some words in the stop-word list are meaningful, so they are kept.