# Finding Gentrification Areas Within LA County using K-means Clustering

### Below is the technique that was used to analyze the data to find the information I was looking for

In [12]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

### Import geo data from the LA Almanac website

In [13]:
website_url = requests.get('http://www.laalmanac.com/geography/ge09.php').text
soup = BeautifulSoup(website_url,'lxml')

In [14]:
tables = soup.find_all('table')

In [15]:
for table in tables:
    ths = table.find_all('th')
    headings = [th.text.strip() for th in ths]
    if headings[:2] == ['Place', 'Geographic Coordinates']:
        break

In [16]:
P=[]
G=[]
for tr in table.find_all('tr'):
    tds = tr.find_all('td')
    if not tds:
        continue
    Place, x, y, z, Geo = [td.text.strip() for td in tds[:5]]
    P.append(Place)
    G.append(Geo)

In [17]:
G2 = pd.DataFrame(G)

In [18]:
GeoData = G2[0].str.split(" ",n=1,expand=True)

In [19]:
D1 = pd.DataFrame({'Place':P,'Lat':GeoData[0],'Lon':GeoData[1]})

In [202]:
print(D1.shape)
D1.head(14)

(142, 3)


Unnamed: 0,Place,Lat,Lon
0,Los Angeles County,34.196398,-118.261862
1,Acton*,34.49626,-118.183891
2,Agoura Hills,34.148925,-118.763917
3,Agua Dulce*,34.501757,-118.320567
4,Alhambra,34.083571,-118.136444
5,Alondra Park*,33.889678,-118.335541
6,Altadena*,34.192212,-118.135589
7,Arcadia,34.132689,-118.036347
8,Artesia,33.867593,-118.080635
9,Avalon,33.332675,-118.330166


### Plot a map of all the geolocations for the various cities

In [140]:
import folium
# create map of LA using latitude and longitude values
latitude = D1.iloc[0]['Lat']
longitude = D1.iloc[0]['Lon']
map_T = folium.Map(location=[float(latitude), float(longitude)], zoom_start=9)

# add markers to map
for lat, lng, place in zip(D1['Lat'], D1['Lon'], D1['Place']):
    label = '{}'.format(place)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [float(lat), float(lng)],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_T)  
    
map_T

### Grab Data from FourSquare

In [210]:
CLIENT_ID = 'T4VRPSYEVOLDPDHPSDW3VC1OAU1YYDCHMFOIBULZWJRETILU' # your Foursquare ID
CLIENT_SECRET = '1JGOFHRKUQTN33EXCAUZDGQSMQHKYQU25CGF1SZSUEN354ZG' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 300

In [211]:
def getNearbyVenues(names, latitudes, longitudes, radius=2500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [212]:
ven = getNearbyVenues(D1['Place'], D1['Lat'], D1['Lon'])

Los Angeles County
Acton*
Agoura Hills
Agua Dulce*
Alhambra
Alondra Park*
Altadena*
Arcadia
Artesia
Avalon
Avocado Heights*
Azusa
Baldwin Park
Bell
Bell Gardens
Bellflower
Beverly Hills
Bradbury
Burbank
Calabasas
Carson
Castaic*
Cerritos
Charter Oak*
Citrus*
Claremont
Commerce
Compton
Covina
Cudahy
Culver City
Del Aire*
Desert View Highlands*
Diamond Bar
Downey
Duarte
East La Mirada*
East Los Angeles*
East Pasadena*
East Rancho Dominguez*
East San Gabriel*
El Monte
El Segundo
Elizabeth Lake*
Florence-Graham*
Gardena
Glendale
Glendora
Green Valley*
Hacienda Heights*
Hasley Canyon*
Hawaiian Gardens
Hawthorne
Hermosa Beach
Hidden Hills
Huntington Park
Industry
Inglewood
Irwindale
La Cañada Flintridge
La Crescenta-Montrose*
La Habra Heights
La Mirada
La Puente
La Verne
Ladera Heights*
Lake Hughes*
Lake Los Angeles*
Lakewood
Lancaster
Lawndale
Lennox*
Leona Valley*
Littlerock*
Lomita
Long Beach
Los Angeles
Lynwood
Malibu
Manhattan Beach
Marina del Rey*
Mayflower Village*
Maywood
Monrovia
Mo

In [213]:
manhattan_venues = ven
# one hot encoding
manhattan_onehot = pd.get_dummies(manhattan_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
manhattan_onehot['Neighborhood'] = manhattan_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [manhattan_onehot.columns[-1]] + list(manhattan_onehot.columns[:-1])
manhattan_onehot = manhattan_onehot[fixed_columns]

manhattan_onehot.shape
manhattan_onehot.head(0)

Unnamed: 0,Yoga Studio,ATM,Accessories Store,Advertising Agency,African Restaurant,Airport,Airport Service,Airport Terminal,American Restaurant,Amphitheater,...,Warehouse Store,Watch Shop,Water Park,Weight Loss Center,Whisky Bar,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store


In [214]:
ven.shape

(10983, 7)

In [215]:
manhattan_grouped = manhattan_onehot.groupby('Neighborhood').mean().reset_index()
manhattan_grouped.head()
num_top_venues = 5

for hood in manhattan_grouped['Neighborhood']:
    #print("----"+hood+"----")
    temp = manhattan_grouped[manhattan_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    #print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    #print('\n')

### Return Most Common Establishments

In [228]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [229]:
import numpy as np
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = manhattan_grouped['Neighborhood']

for ind in np.arange(manhattan_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(manhattan_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head(3)

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Acton*,Construction & Landscaping,Fast Food Restaurant,Café,Sandwich Place,Mexican Restaurant,Pizza Place,American Restaurant,Furniture / Home Store,Gym,Pet Store
1,Agoura Hills,Deli / Bodega,Park,Italian Restaurant,Pizza Place,Furniture / Home Store,Pharmacy,Fast Food Restaurant,Breakfast Spot,Mexican Restaurant,Bakery
2,Agua Dulce*,Construction & Landscaping,Restaurant,Bakery,Grocery Store,Gift Shop,Trail,Café,Pizza Place,Mexican Restaurant,Park


### Cluster Cities According to Most Common Establishments

In [226]:
from sklearn.cluster import KMeans
# set number of clusters
kclusters = 20

manhattan_grouped_clustering = manhattan_grouped.drop('Neighborhood', 1)

# run k-means clustering

kmeans = KMeans(n_clusters=kclusters, init='random', n_init=150).fit(manhattan_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([ 5, 15, 13,  3,  7, 10, 15, 11, 11, 12,  1, 12, 12,  1,  0, 11,  1,
       15,  8, 15, 19, 11, 10, 10, 11, 15,  0, 10, 12, 11, 15, 15,  3, 10,
       14,  1, 12, 15,  0,  3,  1, 15, 18,  1,  7, 14, 15,  2,  1,  8, 10,
       10, 15, 11, 12,  1,  0, 15, 15, 15, 12,  1,  1,  1, 15,  6, 13, 15,
       15, 10,  0,  5,  9,  7, 15, 11,  8,  0,  8, 11, 11,  1, 12, 11, 12,
        3,  1,  1, 16, 14,  1, 11,  1,  0, 16, 14, 15, 14,  7, 14,  3,  3,
        1, 12,  3, 15, 15, 15,  1, 15, 14, 15,  1,  0,  0, 15,  3, 15,  1,
       15,  4,  3, 11, 15, 17,  1,  0,  0,  1, 15, 12, 16,  7, 15, 11, 12,
       16,  0, 11, 16, 15, 16])

In [230]:
manhattan_data = D1
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

In [231]:
manhattan_merged = manhattan_data

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
manhattan_merged = manhattan_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Place')

#manhattan_merged.head(10) # check the last columns!

In [232]:
drop=[]
for i in range(len(manhattan_merged)):
    if np.isnan(manhattan_merged.iloc[i]['Cluster Labels']):
        drop.append(i)
manhattan_merged = manhattan_merged.drop(manhattan_merged.index[drop])

In [233]:
manhattan_merged.head(3)

Unnamed: 0,Place,Lat,Lon,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Los Angeles County,34.196398,-118.261862,8,Trail,Library,Park,Intersection,History Museum,Scenic Lookout,Women's Store,Farmers Market,Entertainment Service,Event Service
1,Acton*,34.49626,-118.183891,5,Construction & Landscaping,Fast Food Restaurant,Café,Sandwich Place,Mexican Restaurant,Pizza Place,American Restaurant,Furniture / Home Store,Gym,Pet Store
2,Agoura Hills,34.148925,-118.763917,15,Deli / Bodega,Park,Italian Restaurant,Pizza Place,Furniture / Home Store,Pharmacy,Fast Food Restaurant,Breakfast Spot,Mexican Restaurant,Bakery


### These are the areas that are grouped with Compton and therefore are good potential places where gentrification could be occuring

In [236]:
manhattan_merged.loc[manhattan_merged['Cluster Labels'] == 0.0]

Unnamed: 0,Place,Lat,Lon,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
15,Bellflower,33.887821,-118.12725,0,Convenience Store,Sandwich Place,Mexican Restaurant,Burger Joint,Coffee Shop,Fast Food Restaurant,Thai Restaurant,BBQ Joint,Pharmacy,Grocery Store
27,Compton,33.892614,-118.227374,0,Sandwich Place,Fast Food Restaurant,Mexican Restaurant,Burger Joint,Discount Store,Coffee Shop,Department Store,Pizza Place,Video Game Store,Convenience Store
39,East Rancho Dominguez*,33.894834,-118.195588,0,Mexican Restaurant,Fast Food Restaurant,Pizza Place,Convenience Store,Sandwich Place,Burger Joint,Park,Fried Chicken Joint,Pharmacy,Coffee Shop
57,Inglewood,33.956068,-118.344274,0,Fast Food Restaurant,Convenience Store,Mexican Restaurant,Grocery Store,Burger Joint,Pharmacy,Discount Store,Coffee Shop,Southern / Soul Food Restaurant,Ice Cream Shop
71,Lennox*,33.938064,-118.358543,0,Mexican Restaurant,Pizza Place,Hotel,Convenience Store,Sandwich Place,Rental Car Location,Seafood Restaurant,Coffee Shop,Fast Food Restaurant,Latin American Restaurant
77,Lynwood,33.923962,-118.201647,0,Mexican Restaurant,Fast Food Restaurant,Burger Joint,Sandwich Place,Pizza Place,Convenience Store,Pharmacy,Coffee Shop,Video Game Store,Mobile Phone Shop
93,Pomona,34.058595,-117.761266,0,Mexican Restaurant,Convenience Store,Fast Food Restaurant,Pharmacy,Pizza Place,Sandwich Place,Chinese Restaurant,Hotel,Bar,Coffee Shop
113,South Gate,33.944159,-118.192761,0,Convenience Store,Fast Food Restaurant,Pharmacy,Mexican Restaurant,Coffee Shop,Sandwich Place,Pizza Place,Grocery Store,Burger Joint,Big Box Store
114,South Monrovia Island*,34.123435,-117.99586,0,Mexican Restaurant,Racetrack,Coffee Shop,Fast Food Restaurant,Breakfast Spot,Grocery Store,Burger Joint,Furniture / Home Store,BBQ Joint,Sandwich Place
126,Vernon,34.001123,-118.210869,0,Convenience Store,Fast Food Restaurant,Burger Joint,Mexican Restaurant,Food Truck,Pharmacy,Pizza Place,Sandwich Place,Shipping Store,Café


### Location plot of grouped cities
Cities that are grouped with Compton are areas that are likely undergoing gentrification and deserve to be looked at closely for investment opportunities. The group we care about is teal colored below.

In [235]:
import matplotlib.cm as cm
import matplotlib.colors as colors
# create map
map_clusters = folium.Map(location=[float(latitude), float(longitude)], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(manhattan_merged['Lat'], manhattan_merged['Lon'], manhattan_merged['Place'], manhattan_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [float(lat), float(lon)],
        radius=5,
        popup=label,
        color=rainbow[(int(cluster)-1)],
        fill=True,
        fill_color=rainbow[(int(cluster)-1)],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters