In [1]:
# imports
import pandas as pd
import numpy as np
from google.cloud import storage
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

import nltk

import geopy
from geopy.geocoders import Nominatim

import folium
from folium import FeatureGroup, LayerControl, Map, Marker
from folium.plugins import HeatMap
from folium.plugins import TimestampedGeoJson
from folium.plugins import MarkerCluster

%matplotlib inline

import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (8,6)

# Load Files

In [2]:
# Get files from GCS bucket

BUCKET_NAME = 'salary-data'

client = storage.Client()
bucket = client.get_bucket(BUCKET_NAME)

blobs = bucket.list_blobs()
files = []
for blob in blobs:
    files.append(blob.name)

print(files)

['2011_Travel_to_Work_Areas_summary_statistics_V5.csv', 'Location_Tree.csv', 'Test_rev1.csv', 'Train_rev1.csv', 'Valid_rev1.csv', 'mean_benchmark.csv', 'random_forest_benchmark_test_rev1.csv', 'test.csv']


In [3]:
# Read data
df_TTWA = pd.read_csv('gs://{}/{}'.format(BUCKET_NAME,files[0]))
df_tree = pd.read_csv('gs://{}/{}'.format(BUCKET_NAME,files[1]))
df_train = pd.read_csv('gs://{}/{}'.format(BUCKET_NAME,files[3]))

## Investigate Location values

In [109]:
df_train.LocationNormalized.describe()

norm_counts = df_train.LocationNormalized.value_counts()

norm_counts[0:20].plot.barh()

df_UK = df_train.loc[df_train.LocationNormalized == 'UK']
df_london = df_train.loc[df_train.LocationNormalized == 'London']
df_thecity = df_train.loc[df_train.LocationNormalized == 'The City']

'Banbury'

## Match raw location to a TTWA

In [6]:
# Function to find TTWA_names in raw locations
def get_TTWA(TTWA_names, raw_location):
    
    indices = []
    for i in range(0,len(TTWA_names)):
        indices.append(raw_location[raw_location.str.contains(TTWA_names.iloc[i])].index)
        
    return indices

In [19]:
TTWA_names = df_TTWA['TTWA Name']
TTWA_names = TTWA_names.dropna()
raw_location = df_train.LocationRaw

In [20]:
indices = get_TTWA(TTWA_names, raw_location)

Now to add an extra column to df_train with TTWA values

In [21]:
df_train['TTWA'] = np.nan

for i in range(0,len(TTWA_names)):

    df_train.TTWA.loc[indices[i]] = TTWA_names.iloc[i]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


## Match raw location to county

In [23]:
url = 'https://raw.githubusercontent.com/andreafalzetti/uk-counties-list/master/uk-counties/uk-counties-list.csv'
counties = pd.read_csv(url, header = None)


In [24]:
counties.columns = ['Country', 'County']
counties.head()

Unnamed: 0,Country,County
0,England,Bedfordshire
1,England,Buckinghamshire
2,England,Cambridgeshire
3,England,Cheshire
4,England,Cleveland


In [25]:
indices = get_TTWA(counties.County, raw_location)

In [26]:
df_train['County'] = np.nan

for i in range(0,len(counties)):

    df_train.County.loc[indices[i]] = counties.County.iloc[i]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


## Create one feature out of TTWA and county

In [39]:
df_Loc = df_train[['TTWA','County']]

df_Loc['TTWA_County'] = df_Loc.TTWA
df_Loc.TTWA_County[df_Loc.TTWA_County.isna()] = df_Loc.County[df_Loc.TTWA_County.isna()] 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._where(~key, value, inplace=True)
