# Events location processing

After scraping the ted website with `scraper.py`, we still have missing events. Let's analyze them and fill in as much missing data as possible.

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd

# Uncomment this if you want to see all rows and columns when displaying a pandas object
# pd.set_option("display.max_rows", None, "display.max_columns", None)

In [91]:
events = pd.read_csv('data/locations_scraped.csv')
events = events.fillna('')

print('Total number of events: {0}'.format(events.shape[0]))
print('Total number of events with location: {0}'.format(events[events['city'].notnull()].shape[0]))

Total number of events: 356
Total number of events with location: 356


In [92]:
events[events['city'] != ''].shape[0]

180

In [2]:
import pkg_resources
import types
def get_imports():
    for name, val in globals().items():
        if isinstance(val, types.ModuleType):
            # Split ensures you get root package, 
            # not just imported function
            name = val.__name__.split(".")[0]

        elif isinstance(val, type):
            name = val.__module__.split(".")[0]

        # Some packages are weird and have different
        # imported names vs. system/pip names. Unfortunately,
        # there is no systematic way to get pip names from
        # a package's imported name. You'll have to had
        # exceptions to this list manually!
        poorly_named_packages = {
            "PIL": "Pillow",
            "sklearn": "scikit-learn"
        }
        if name in poorly_named_packages.keys():
            name = poorly_named_packages[name]

        yield name
imports = list(set(get_imports()))

# The only way I found to get the version of the root package
# from only the name of the package is to cross-check the names 
# of installed packages vs. imported packages
requirements = []
for m in pkg_resources.working_set:
    if m.project_name in imports and m.project_name!="pip":
        requirements.append((m.project_name, m.version))

for r in requirements:
    print("{}=={}".format(*r))

pandas==0.25.1
numpy==1.17.2


## Number of missing locations per event type

In [93]:
no_location = events[events['city'] == '']
location = events[events['city'] != '']

In [94]:
events_type = events['event_type'].unique()

for event in events_type:
    a = no_location[no_location['event_type'] == event].shape[0]
    b = location[location['event_type'] == event].shape[0]
    print('For {0}, {1} have no location while {2} have location'.format(event, a, b))

For main, 21 have no location while 1 have location
For external, 40 have no location while 0 have location
For satellite, 45 have no location while 0 have location
For global, 12 have no location while 0 have location
For med, 8 have no location while 0 have location
For salon, 15 have no location while 0 have location
For women, 4 have no location while 0 have location
For youth, 5 have no location while 0 have location
For tedx, 26 have no location while 179 have location


main: all main events are in 3 locations (Monterey, Vancouver or Long Beach) <br />
external: these are not exactly ted events, just talks which are ted worthy <br />
For the others we are going to manually input the data where we can, if the location is mentioned in the event name: TED@Bangalore, TED@London

### Women

In [95]:
no_location[no_location['event_type'] == 'women']

Unnamed: 0,event,event_type,avg_views,avg_speed_of_speech,avg_duration,film_date,nof_talks,talks,event_id,city,country
136,TEDWomen 2010,women,1298086.0,144.759908,12.066667,2010-02-12,34,"[826, 827, 828, 829, 831, 834, 835, 839, 842, ...",135,,
137,TEDWomen 2013,women,2504863.0,148.644765,12.944444,2013-04-12,9,"[1639, 1647, 1649, 1651, 1659, 1663, 1670, 167...",136,,
138,TEDWomen 2015,women,1449902.0,133.74315,12.6375,2015-05-27,28,"[1991, 1995, 1996, 1997, 2001, 2006, 2008, 201...",137,,
139,TEDWomen 2016,women,1340357.0,142.858163,14.31,2016-10-26,25,"[2320, 2321, 2325, 2330, 2334, 2338, 2341, 234...",138,,


In [96]:
# Manually fill in location for women (from ted.com)
events.loc[events['event'] == 'TEDWomen 2010', 'city'] = 'Washington'
events.loc[events['event'] == 'TEDWomen 2010', 'country'] = 'United States'
events.loc[events['event'] == 'TEDWomen 2013', 'city'] = 'San Francisco'
events.loc[events['event'] == 'TEDWomen 2013', 'country'] = 'United States'
events.loc[events['event'] == 'TEDWomen 2015', 'city'] = 'Monterey'
events.loc[events['event'] == 'TEDWomen 2015', 'country'] = 'United States'
events.loc[events['event'] == 'TEDWomen 2016', 'city'] = 'San Francisco'
events.loc[events['event'] == 'TEDWomen 2016', 'country'] = 'United States'

### Youth

In [97]:
no_location[no_location['event_type'] == 'youth']

Unnamed: 0,event,event_type,avg_views,avg_speed_of_speech,avg_duration,film_date,nof_talks,talks,event_id,city,country
140,TEDYouth 2011,youth,1343913.0,120.817798,7.6,2011-11-19,3,"[1157, 1379, 1395]",139,,
141,TEDYouth 2012,youth,274986.0,0.0,6.466667,2012-11-17,1,[1467],140,,
142,TEDYouth 2013,youth,962806.3,177.753402,6.705556,2013-11-16,3,"[1641, 1668, 1688]",141,,
143,TEDYouth 2014,youth,1206283.0,168.505825,6.540476,2014-04-11,7,"[1888, 1893, 1900, 1911, 1912, 1914, 1964]",142,,
144,TEDYouth 2015,youth,1771106.0,146.717383,6.426667,2015-11-14,5,"[2099, 2100, 2101, 2119, 2150]",143,,


In [98]:
# Manually fill in location for youth (from ted.com)
events.loc[events['event'] == 'TEDYouth 2011', 'city'] = 'New York'
events.loc[events['event'] == 'TEDYouth 2011', 'country'] = 'United States'
events.loc[events['event'] == 'TEDYouth 2012', 'city'] = 'New York'
events.loc[events['event'] == 'TEDYouth 2012', 'country'] = 'United States'
events.loc[events['event'] == 'TEDYouth 2013', 'city'] = 'New Orleans'  # !! Different than the rest
events.loc[events['event'] == 'TEDYouth 2013', 'country'] = 'United States'
events.loc[events['event'] == 'TEDYouth 2014', 'city'] = 'New York'
events.loc[events['event'] == 'TEDYouth 2014', 'country'] = 'United States'
events.loc[events['event'] == 'TEDYouth 2015', 'city'] = 'New York'
events.loc[events['event'] == 'TEDYouth 2015', 'country'] = 'United States'

### Med

In [99]:
no_location[no_location['event_type'] == 'med']

Unnamed: 0,event,event_type,avg_views,avg_speed_of_speech,avg_duration,film_date,nof_talks,talks,event_id,city,country
109,TEDMED 2009,med,2246144.0,163.313356,17.571212,2009-10-10,11,"[596, 598, 602, 609, 611, 617, 621, 636, 639, ...",108,,
110,TEDMED 2010,med,476313.2,167.484619,18.775,2010-10-10,4,"[850, 855, 858, 877]",109,,
111,TEDMED 2011,med,772050.3,148.891851,14.22619,2011-10-24,7,"[1077, 1084, 1089, 1096, 1100, 1122, 1128]",110,,
112,TEDMED 2012,med,1351933.0,152.173905,13.948333,2012-04-15,10,"[1252, 1258, 1265, 1271, 1282, 1336, 1342, 134...",111,,
113,TEDMED 2013,med,1681900.0,140.529529,14.9625,2013-04-16,8,"[1517, 1533, 1557, 1591, 1599, 1602, 1650, 2016]",112,,
114,TEDMED 2014,med,1743996.0,135.038771,13.826667,2014-09-09,10,"[1832, 1840, 1844, 1849, 1853, 1860, 1863, 186...",113,,
115,TEDMED 2015,med,2053347.0,139.67306,11.7,2015-11-18,8,"[2131, 2138, 2147, 2151, 2197, 2268, 2301, 2305]",114,,
116,TEDMED 2016,med,1241131.0,128.910646,13.445,2016-11-30,10,"[2374, 2403, 2419, 2424, 2434, 2446, 2452, 247...",115,,


In [100]:
# Manually fill in location for med (from wikipedia)
events.loc[events['event'] == 'TEDMED 2009', 'city'] = 'San Diego'
events.loc[events['event'] == 'TEDMED 2009', 'country'] = 'United States'
events.loc[events['event'] == 'TEDMED 2010', 'city'] = 'San Diego'
events.loc[events['event'] == 'TEDMED 2010', 'country'] = 'United States'
events.loc[events['event'] == 'TEDMED 2011', 'city'] = 'Washington'
events.loc[events['event'] == 'TEDMED 2011', 'country'] = 'United States'
events.loc[events['event'] == 'TEDMED 2012', 'city'] = 'Washington'
events.loc[events['event'] == 'TEDMED 2012', 'country'] = 'United States'
events.loc[events['event'] == 'TEDMED 2013', 'city'] = 'Washington'
events.loc[events['event'] == 'TEDMED 2013', 'country'] = 'United States'
events.loc[events['event'] == 'TEDMED 2014', 'city'] = 'Washington'
events.loc[events['event'] == 'TEDMED 2014', 'country'] = 'United States'
events.loc[events['event'] == 'TEDMED 2015', 'city'] = 'Washington'
events.loc[events['event'] == 'TEDMED 2015', 'country'] = 'United States'
events.loc[events['event'] == 'TEDMED 2016', 'city'] = 'Palm Springs'
events.loc[events['event'] == 'TEDMED 2016', 'country'] = 'United States'

### Global

In [101]:
no_location[no_location['event_type'] == 'global']

Unnamed: 0,event,event_type,avg_views,avg_speed_of_speech,avg_duration,film_date,nof_talks,talks,event_id,city,country
95,TEDGlobal 2005,global,1639063.0,166.48097,18.496154,2005-01-07,26,"[19, 25, 26, 30, 32, 33, 34, 35, 36, 42, 50, 5...",94,,
96,TEDGlobal 2007,global,562149.8,131.862544,15.073457,2007-04-06,27,"[127, 128, 129, 130, 131, 132, 133, 134, 135, ...",95,,
97,TEDGlobal 2009,global,1679021.0,148.224575,13.52,2009-07-07,65,"[491, 492, 494, 495, 496, 497, 498, 499, 500, ...",96,,
98,TEDGlobal 2010,global,1338910.0,153.532743,14.411515,2010-07-13,55,"[728, 729, 730, 731, 732, 733, 734, 736, 740, ...",97,,
99,TEDGlobal 2011,global,1717761.0,149.17275,13.21348,2011-07-13,68,"[978, 979, 980, 981, 982, 983, 984, 985, 986, ...",98,,
100,TEDGlobal 2012,global,2072436.0,148.676288,12.854524,2012-06-19,70,"[1273, 1274, 1275, 1276, 1277, 1280, 1281, 128...",99,,
101,TEDGlobal 2013,global,2584163.0,144.112618,13.044192,2013-06-13,66,"[1523, 1524, 1525, 1526, 1527, 1528, 1529, 153...",100,,
102,TEDGlobal 2014,global,1316167.0,139.924767,12.816667,2014-02-10,51,"[1837, 1838, 1839, 1842, 1843, 1846, 1848, 185...",101,,
103,TEDGlobal 2017,global,556081.3,118.419158,12.933333,2017-08-27,3,"[2535, 2542, 2544]",102,,
104,TEDGlobal>Geneva,global,3385408.0,150.692296,14.933333,2015-08-12,11,"[2106, 2110, 2114, 2117, 2120, 2123, 2124, 212...",103,,


In [102]:
# Manually fill in location for gloabl (from ted.com)
events.loc[events['event'] == 'TEDGlobal 2005', 'city'] = 'Oxford'
events.loc[events['event'] == 'TEDGlobal 2005', 'country'] = 'United Kingdom'
events.loc[events['event'] == 'TEDGlobal 2007', 'city'] = 'Arusha'
events.loc[events['event'] == 'TEDGlobal 2007', 'country'] = 'Tanzania'
events.loc[events['event'] == 'TEDGlobal 2009', 'city'] = 'Oxford'
events.loc[events['event'] == 'TEDGlobal 2009', 'country'] = 'United Kingdom'
events.loc[events['event'] == 'TEDGlobal 2010', 'city'] = 'Oxford'
events.loc[events['event'] == 'TEDGlobal 2010', 'country'] = 'United Kingdom'
events.loc[events['event'] == 'TEDGlobal 2011', 'city'] = 'Edinburgh'
events.loc[events['event'] == 'TEDGlobal 2011', 'country'] = 'United Kingdom'
events.loc[events['event'] == 'TEDGlobal 2012', 'city'] = 'Edinburgh'
events.loc[events['event'] == 'TEDGlobal 2012', 'country'] = 'United Kingdom'
events.loc[events['event'] == 'TEDGlobal 2013', 'city'] = 'Edinburgh'
events.loc[events['event'] == 'TEDGlobal 2013', 'country'] = 'United Kingdom'
events.loc[events['event'] == 'TEDGlobal 2014', 'city'] = 'Rio de Janeiro'
events.loc[events['event'] == 'TEDGlobal 2014', 'country'] = 'Brazil'
events.loc[events['event'] == 'TEDGlobal 2017', 'city'] = 'Arusha'
events.loc[events['event'] == 'TEDGlobal 2017', 'country'] = 'Tanzania'

### Tedx

In [103]:
no_location[no_location['event_type'] == 'tedx']

Unnamed: 0,event,event_type,avg_views,avg_speed_of_speech,avg_duration,film_date,nof_talks,talks,event_id,city,country
146,TEDxAmazonia,tedx,984973.0,144.694981,21.583333,2010-08-11,1,[1824],145,,
158,TEDxBend,tedx,2981683.0,141.556208,12.391667,2015-04-18,2,"[2057, 2256]",157,,
181,TEDxChange,tedx,1032390.0,159.632886,17.829167,2010-09-20,4,"[780, 784, 787, 1202]",180,,
188,TEDxConcorde,tedx,1193896.0,0.952381,3.15,2013-01-24,1,[1444],187,,
192,TEDxDU 2010,tedx,900105.5,133.290068,12.4,2010-05-13,2,"[809, 817]",191,,
193,TEDxDU 2011,tedx,369363.0,146.745914,11.216667,2011-05-13,1,[1092],192,,
196,TEDxDelft,tedx,919304.0,133.882962,6.891667,2012-05-10,2,"[1434, 2157]",195,,
201,TEDxEQChCh,tedx,2829484.0,124.373178,17.15,2012-01-09,1,[1386],200,,
219,TEDxGöteborg 2010,tedx,463366.0,194.277108,16.6,2010-11-20,1,[852],218,,
220,TEDxHamburg,tedx,539207.0,124.937238,11.95,2016-08-06,1,[2528],219,,


In [104]:
# Manually fill in location for tedx (from ted.com)
events.loc[events['event'] == 'TEDxAmazonia', 'city'] = 'Manaus'
events.loc[events['event'] == 'TEDxAmazonia', 'country'] = 'Brazil'
events.loc[events['event'] == 'TEDxChange', 'city'] = 'New Yrok'
events.loc[events['event'] == 'TEDxChange', 'country'] = 'United States'
events.loc[events['event'] == 'TEDxDU 2010', 'city'] = 'Denver'
events.loc[events['event'] == 'TEDxDU 2010', 'country'] = 'United States'
events.loc[events['event'] == 'TEDxDU 2011', 'city'] = 'Denver'
events.loc[events['event'] == 'TEDxDU 2011', 'country'] = 'United Kingdom'
events.loc[events['event'] == 'TEDxEQChCh', 'city'] = 'Christchurch'
events.loc[events['event'] == 'TEDxEQChCh', 'country'] = 'New Zealand'
events.loc[events['event'] == 'TEDxKrakow', 'city'] = 'Krakow'
events.loc[events['event'] == 'TEDxKrakow', 'country'] = 'Poland'
events.loc[events['event'] == 'TEDxMidwest', 'city'] = 'Chicago'
events.loc[events['event'] == 'TEDxMidwest', 'country'] = 'United States'
events.loc[events['event'] == 'TEDxNorrkoping', 'city'] = 'Norrkoping'
events.loc[events['event'] == 'TEDxNorrkoping', 'country'] = 'Sweden'
events.loc[events['event'] == 'TEDxRC2', 'city'] = 'Geneve'
events.loc[events['event'] == 'TEDxRC2', 'country'] = 'Switzerland'
events.loc[events['event'] == 'TEDxSF', 'city'] = 'San Francisco'
events.loc[events['event'] == 'TEDxSF', 'country'] = 'United States'
events.loc[events['event'] == 'TEDxUF', 'city'] = 'Gainesville'
events.loc[events['event'] == 'TEDxUF', 'country'] = 'United States'
events.loc[events['event'] == 'TEDxUM', 'city'] = 'Oxford'
events.loc[events['event'] == 'TEDxUM', 'country'] = 'United States'
events.loc[events['event'] == 'TEDxWomen 2011', 'city'] = 'New York'
events.loc[events['event'] == 'TEDxWomen 2011', 'country'] = 'United States'
events.loc[events['event'] == 'TEDxWomen 2012', 'city'] = 'Washington'
events.loc[events['event'] == 'TEDxWomen 2012', 'country'] = 'United States'

### Main

In [105]:
no_location[no_location['event_type'] == 'main']

Unnamed: 0,event,event_type,avg_views,avg_speed_of_speech,avg_duration,film_date,nof_talks,talks,event_id,city,country
48,TED1984,main,974087.0,154.313854,25.383333,1984-02-02,1,[200],47,,
49,TED1990,main,620806.0,128.917102,44.633333,1990-03-03,1,[202],48,,
50,TED1994,main,581419.0,165.234783,19.166667,1994-02-20,1,[1131],49,,
51,TED1998,main,750886.3,154.339962,18.875,1998-02-02,6,"[260, 290, 316, 376, 382, 395]",50,,
52,TED2001,main,1858949.0,128.508424,19.677778,2001-01-02,3,"[351, 413, 415]",51,,
53,TED2002,main,951474.9,140.123049,19.408929,2002-01-03,28,"[66, 70, 75, 96, 100, 104, 113, 119, 146, 178,...",52,,
54,TED2003,main,1111202.0,148.435738,16.858824,2001-02-02,34,"[52, 67, 71, 78, 88, 89, 91, 93, 94, 106, 107,...",53,,
55,TED2004,main,2693644.0,144.197724,18.323656,2004-01-03,31,"[16, 21, 24, 27, 28, 29, 69, 72, 82, 83, 85, 9...",54,,
56,TED2005,main,1786187.0,148.47356,18.730631,2005-02-02,37,"[18, 31, 38, 39, 40, 41, 44, 45, 47, 51, 58, 5...",55,,
57,TED2006,main,3274345.0,144.821671,16.376296,2006-02-02,45,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",56,,


In [106]:
# Manually fill in location for main (from ted.com)
# Most are at Monterey (12) so we fill them all at Monterey
events.loc[events['event_type'] == 'main', 'city'] = 'Monterey'
events.loc[events['event_type'] == 'main', 'country'] = 'United States'

# Some of them are in Long Beach (5)
long_beach = ['TED2013', 'TED2012', 'TED2011', 'TED2010', 'TED2009']
events.loc[events['event'].isin(long_beach), 'city'] = 'Long Beach'
events.loc[events['event'].isin(long_beach), 'country'] = 'United States'

# Some of them are in Vancouver (4)
vancouver = ['TED2017', 'TED2016', 'TED2015', 'TED2014']
events.loc[events['event'].isin(vancouver), 'city'] = 'Vancouver'
events.loc[events['event'].isin(vancouver), 'country'] = 'Canada'

In [5]:
# Recompute
no_location = events[events['city'] == '']
location = events[events['city'] != '']
events_type = events['event_type'].unique()

for event in events_type:
    a = no_location[no_location['event_type'] == event].shape[0]
    b = location[location['event_type'] == event].shape[0]
    print('For {0}, {1} have no location while {2} have location'.format(event, a, b))

For main, 0 have no location while 22 have location
For external, 0 have no location while 40 have location
For satellite, 0 have no location while 45 have location
For global, 0 have no location while 12 have location
For med, 0 have no location while 8 have location
For salon, 0 have no location while 15 have location
For women, 0 have no location while 4 have location
For youth, 0 have no location while 5 have location
For tedx, 0 have no location while 205 have location


We will **not** include location data for external and satellite event types.

In [12]:
#Some manual work done
events = pd.read_csv('data/partial_events_locations.csv')
events = events[(events['event_type'] != 'external')&(events['city']!='undefined')]

## Longitude and latitude

Steps:
1. get all cities with countries in a separate file
2. input the file here https://www.gpsvisualizer.com/geocoder/ (source: MapQuest, easier to use, must have an account before)
3. get the output and add number of events, save them in resources/cities.csv

In [14]:
# 1
events['address'] = events['city'] + ', ' + events['country']
all_address = events['address'].unique()
file = open('temp.txt', 'w')
for address in all_address:
    file.write(address + '\n')
file.close()

In [15]:
# 2
# Fill in the name of the file
gps_visualizer_csv = 'data/cities_coordinates.csv'

In [16]:
# 3
# This file has one address per line, the name field represents the address
address_coordinates = pd.read_csv(gps_visualizer_csv)
events_coordinates = events.merge(address_coordinates, left_on='address', right_on='name')
events_coordinates.shape

(308, 19)

In [11]:
events_coordinates.sort_values(by='film_date', inplace=True, ascending=False)
cities_coordinates = events_coordinates.groupby(['latitude', 'longitude', 'desc'], as_index='False')['event'].apply(list).reset_index(name='events')
cities_coordinates['count'] = cities_coordinates['events'].str.len()
cities_coordinates['desc'] = cities_coordinates['desc'].apply(lambda x: x[:x.find(',')])

In [12]:
cities_coordinates

Unnamed: 0,latitude,longitude,desc,events,count
0,-43.530955,172.636646,Christchurch,[TEDxEQChCh],1
1,-35.282071,149.128667,Canberra,[TEDxCanberra],1
2,-34.612869,-58.445979,Buenos Aires,[TEDxRiodelaPlata],1
3,-33.854816,151.216454,Sydney,"[TEDxYouth@Sydney, TEDxSydney]",2
4,-32.927288,151.781253,Newcastle,[TEDxNewy],1
...,...,...,...,...,...
167,56.878718,14.809439,Växjö,[TEDxLinnaeusUniversity],1
168,57.706407,11.968629,Gothenburg,[TEDxGöteborg 2010],1
169,58.460278,8.766667,Arendal,[TEDxArendal],1
170,58.590913,16.190348,Norrköping,[TEDxNorrkoping],1


In [13]:
cities_coordinates.to_csv('resources/cities.csv', index=False, header=True)