In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline
import json
from collections import defaultdict
import re
import folium
from folium.plugins.marker_cluster import MarkerCluster
import googlemaps
from googleplaces import GooglePlaces, types, lang
import googlemaps

## Plotting Countries on Map

##### Let's plot, on a folium map, the number of times each country in the world is mentioned in a post

To do this, we combine the output of our LDA model with the folium visualization library to generate a color-coded map of the world, where each country's color reflects how often it is mentioned relative to all the other countries. This will be accomplished using:

* Google maps
* folium
* Web scraping using lxml and html

##### Google maps API will be used to geocode countries

In [2]:
# Read the Google API key from a local file so the secret is not hard-coded in the notebook.
with open('Google API Key.txt', 'r') as f:
    # Strip surrounding whitespace: a trailing newline left by a text editor
    # would otherwise be passed along as part of the key.
    api_key = f.read().strip()

# Both Google clients are created from the same key.
google_places = GooglePlaces(api_key)
gmaps = googlemaps.Client(key=api_key)

# World-level base map; markers are added through a MarkerCluster layer attached to it.
globe_countries_map = folium.Map(location=[0, 0], zoom_start=1, tiles='CartoDB positron')
marker_cluster = MarkerCluster().add_to(globe_countries_map)

##### Reading in .json file to pd.DataFrame

Capitals must be extracted and transformed to Title format so they can be searched on Google maps

* First we must map each country to its capital {country : capital}

In [3]:
# Load the topic-model export; the file handle is only needed for the parse itself.
with open('tsne.json') as json_data:
    d = json.load(json_data)

# Collect, per term, every frequency listed for it (in file order).
topic_rankings_default = defaultdict(list)
for topic, freq in zip(d['tinfo']['Term'], d['tinfo']['Freq']):
    topic_rankings_default[topic].append(freq)

# Keep only the first frequency recorded for each term.
most_occuring_words_dict = {term: freqs[0] for term, freqs in topic_rankings_default.items()}
most_occuring_words_list = list()

df_country_caps = pd.read_csv('countries.txt')

# Everything that is not an ASCII letter, whitespace or a hyphen is removed from names.
# Compiled once instead of being re-parsed for every row.
non_name_chars = re.compile(r'[^a-zA-Z\s\-]')

country_capital_map = dict()
for country, capital in zip(df_country_caps['country'], df_country_caps['capital']):
    country_key = non_name_chars.sub('', country).lower()
    # NOTE(review): str() turns a missing capital (NaN) into the literal 'nan' -
    # confirm countries.txt has a capital on every row.
    capital_name = non_name_chars.sub('', str(capital)).lower()
    # When several capitals are listed ("x and y"), keep only the first one.
    # Splitting on the whole word ' and ' avoids cutting inside a name that
    # merely starts with "and" (e.g. "san andres and ..." must not become "san").
    capital_name = capital_name.split(' and ')[0]
    country_capital_map[country_key] = capital_name

# Preview the first five country -> capital pairs.
for k, v in list(country_capital_map.items())[:5]:
    print(k,v)

afghanistan kabul
albania tirana
andorra andorra la vella
angola luanda
antigua and barbuda st johns


##### Create object with following structure {country : { capital: { lat, long : mentions } } }

Once each country is mapped to its capital, we can use the Google Maps API to geocode the capital and place a marker on the map at its coordinates.

In [4]:
# Build {country: {capital: {"lat,lng": mentions}}} for every term that is a known country.
country_rankings = dict()

# Walk the terms from the most to the least frequent.
for term, term_freq in sorted(most_occuring_words_dict.items(), key=lambda x: x[1], reverse=True):
    topic_no_punc_no_nums = re.sub(r'[^a-zA-Z\s\-]', '', term)

    # Only terms that are country names in the capital lookup are geocoded.
    if topic_no_punc_no_nums not in country_capital_map:
        continue

    # Terms arrive in descending frequency, so the first hit for a country carries
    # its largest count. Skipping repeats keeps a smaller count from overwriting it
    # and avoids a second geocoding request for the same capital.
    if topic_no_punc_no_nums in country_rankings:
        continue

    capital = str(country_capital_map[topic_no_punc_no_nums])
    geo = gmaps.geocode(capital.title() + ',' + topic_no_punc_no_nums)
    if not geo:
        # No geocoding candidate came back; there is nothing to place on the map.
        continue

    position = geo[0]['geometry']['location']
    coordinates = ",".join([str(position['lat']), str(position['lng'])])
    country_rankings[topic_no_punc_no_nums] = {capital: {coordinates: term_freq}}

# Preview the first five entries.
for k, v in list(country_rankings.items())[:5]:
    print(k,v)

usa {'washington dc': {'38.9071923,-77.0368707': 8189.0}}
canada {'ottawa': {'45.4215296,-75.69719309999999': 7211.0}}
india {'new delhi': {'28.6139391,77.2090212': 6467.0}}
france {'paris': {'48.856614,2.3522219': 5280.0}}
japan {'tokyo': {'35.6894875,139.6917064': 3549.5810733857998}}


##### Enhance code readability

Unravel complex structure to make code more readable

In [5]:
# Parallel lists: index i of each list describes the same country.
locations = []
countries = []
citis = []
mentions = []

# Every value of country_rankings is read as {capital: {"lat,lng": mentions}};
# only the first item at each nesting level is used.
for country_name, capital_entry in country_rankings.items():
    capital_name, coords_entry = list(capital_entry.items())[0]
    coords_text, mention_count = list(coords_entry.items())[0]

    countries.append(country_name.title())
    citis.append(str(capital_name).title())
    # "lat,lng" text -> [lat, lng] as floats, after dropping any stray characters.
    locations.append([float(re.sub(r'[^\w\s\.\-]', '', part)) for part in coords_text.split(',')])
    # Frequencies may be fractional; the map shows whole counts.
    mentions.append(int(mention_count))

##### Make the mapping from countries to mentions {country : mention }

Producing a colorbar that shows each country's relevancy on the forum relative to the others is an easy job:

* First we must map each country to its count of mentions

In [6]:
# Pair each title-cased country name with its mention count.
country_mention_dict = dict(zip(countries, mentions))

##### Transform dictionary to pd.DataFrame

In [7]:
# One row per country: the dict keys become the 'Country' column and the
# values become 'Mentions_in_Posts'.
country_mention_heatmap_pd_df = (
    pd.DataFrame
    .from_dict(country_mention_dict, orient='index')
    .reset_index()
    .rename(columns={0:'Mentions_in_Posts', 'index':'Country'})
)
country_mention_heatmap_pd_df.head()

Unnamed: 0,Country,Mentions_in_Posts
0,Usa,8189
1,Canada,7211
2,India,6467
3,France,5280
4,Japan,3549


##### Match countries in pd.DataFrame to countries in .json file (encoded using ISO ALPHA-3)

American Samoa -> ASM

To do this we will scrape some data from a table off the internet using lxml and html

The scraped data will be used in a pd.DataFrame

In [8]:
from lxml import html

url = "http://www.nationsonline.org/oneworld/country_code_list.htm"
xpath = "//*[@id='codelist']"

# Fetch the page, pick the first element with id 'codelist' and serialise it back to markup.
tree = html.parse(url)
table = tree.xpath(xpath)[0]
raw_html = html.tostring(table)

# Let pandas parse the serialised table; the first row supplies the column headers.
dta = pd.read_html(raw_html, header=0)[0]

# Runs after serialisation, so it changes `table` but not `raw_html` or `dta`.
table.make_links_absolute()

# Drop the unnamed leading column, then every row with a missing value.
dta = dta.drop('Unnamed: 0', axis=1).dropna()

In [9]:
# Preview the first rows of the scraped country-code table.
dta.head()

Unnamed: 0,Country or Area Name,"ISO ""ALPHA-2 Code",ISO ALPHA-3 Code,ISO Numeric Code UN M49 Numerical Code
1,Afghanistan,AF,AFG,4.0
2,Aland Islands,AX,ALA,248.0
3,Albania,AL,ALB,8.0
4,Algeria,DZ,DZA,12.0
5,American Samoa,AS,ASM,16.0


##### Create country to isoalpha3 dictionary {country : iso-alpha3}

To connect our completed pd.DataFrame to the data provided inside the geo.json file we must make use of the ISO-ALPHA3 code for each country as a lookup key to correctly place the overlay on the map

In [10]:
# Scraped country name -> ISO ALPHA-3 code (codes coerced to str).
country_isoalpha3_map = {
    name: str(code)
    for name, code in zip(dta['Country or Area Name'], dta['ISO ALPHA-3 Code'])
}

# Hand-written entries; applied after the scraped ones, so they win on a key clash.
country_isoalpha3_map.update({
    'Usa': 'USA',
    'Iran': 'IRN',
    'Vietnam': 'VNM',
    'Taiwan': 'TWN',
    'Russia': 'RUS',
    'Laos': 'LAO',
    'England': 'GBR',
})

# Preview the first five name -> code pairs.
for k, v in list(country_isoalpha3_map.items())[:5]:
    print(k,v)

Afghanistan AFG
Aland Islands ALA
Albania ALB
Algeria DZA
American Samoa ASM


##### Perform mapping and add iso-alpha3 as column to pd.DataFrame

In [11]:
# The 'Country' values were produced with str.title() (e.g. "Antigua And Barbuda"),
# while the lookup keys keep whatever capitalisation the scraped table uses, so an
# exact-match lookup can miss a country only because of letter case. Matching on
# lower-cased names on both sides removes that source of NaN codes.
# Later keys win on a clash, so the hand-written entries (added last) take precedence.
iso_by_lowercase_name = {
    str(name).lower(): iso_code
    for name, iso_code in country_isoalpha3_map.items()
}
country_mention_heatmap_pd_df['Country_iso_alpha3'] = (
    country_mention_heatmap_pd_df['Country'].str.lower().map(iso_by_lowercase_name)
)
country_mention_heatmap_pd_df.head()

Unnamed: 0,Country,Mentions_in_Posts,Country_iso_alpha3
0,Usa,8189,USA
1,Canada,7211,CAN
2,India,6467,IND
3,France,5280,FRA
4,Japan,3549,JPN


##### Plot created overlay on folium map along with clusters

Finally! After much data massaging and wrangling, we are ready to examine how the countries of the world stack up to each other!

In [12]:
geo_path = "countries_geo.json"

# Popup markup, defined once because it is the same for every marker.
# It is deliberately NOT called `html`: that name is the lxml module imported by the
# scraping cell, and rebinding it to a string makes that cell fail when re-run.
popup_template = """
        <h3>{0}</h3>
        Mentions: {1}<br>
    """

# One clustered marker per country, placed at the capital's coordinates.
# `citi` is unpacked to keep the four lists aligned but is not shown in the popup.
for citi, country, location, mention in zip(citis, countries, locations, mentions):
    iframe = folium.IFrame(html=popup_template.format(str(country), mention), width=400, height=200)
    popup = folium.Popup(iframe, max_width=1000)
    folium.Marker(location, popup=popup).add_to(marker_cluster)

# Colour each country shape by its mention count; rows are joined to the GeoJSON
# features through the ISO ALPHA-3 code ('feature.id').
globe_countries_map.choropleth(geo_path, data=country_mention_heatmap_pd_df,
                               columns=['Country_iso_alpha3', 'Mentions_in_Posts'],
                               key_on='feature.id',
                               fill_color= 'YlOrRd',
                               legend_name='Mentions in Posts (Count)',
                               highlight=True)
globe_countries_map

To view folium map visit my nbviewer at nbviewer.jupyter.org:

http://nbviewer.jupyter.org/gist/tjefferies/eb3b00664504eb7a260e1def69b59d88

* USA, Europe, and India dominate the rest of the countries in post mentions
* Working with country name data from the internet is very difficult due to the variety of ways a country can be named
* Countries alone do not tell the whole picture: adding a map from cities to their parent country's iso-alpha3 code would be much more telling
    * TODO: Drill down to city level
* Africa has almost zero representation (may need to refine mapping -see bullet above)

# TODO:

##### This observation begs a timeless travel question:

##### Are residents partial to mentioning their home country when giving/receiving travel advice?

To answer this question, let's piece together location information for users and plot clusters on top of our existing overlay

* Country popups with number of mentions will be added directly to map
* User locations will be added to popup cluster

# TODO:

##### Text summarization for each country

##### What's the overall summary for travelling to each country?

To answer this question, let's piece together the posts tagged to each country and show a summary of all of them inside that country's popup. We could also add each post's parent user location as a cluster on top of the overlay with the country summaries.

* Country popups with number of mentions will be added directly to map will have a text summarization added to it
* Post parent user locations will be added to popup clusters inside each country