# 6.3 Geographic Visualization 

### 1. Import data and libraries
### 2. Data cleaning 
### 3. Data wrangling
### 4. Plotting a choropleth

## Disclaimer:
Geographical Visualization is not an important/relevant part of my project. 

## 1. Import data and libraries

In [34]:
# Import Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import os
import folium
import json

In [35]:
# This command propts matplotlib visuals to appear in the notebook 

%matplotlib inline

In [36]:
#Set Path
path = r'/Users/tristansavella/Desktop/Important Things/Data Analytics/CareerFoundry/Data Immersion/Achievement 6/Master Folder ATP/02 Data'

In [37]:
#Import Datasets
df_matchstats = pd.read_pickle(os.path.join(path, 'Prepared Data','df_matchstats.pkl'))
df_players = pd.read_csv(os.path.join(path, 'Original Data', 'players.csv'), index_col = False)
#I will also use df_players for this exercise

In [38]:
#Import Countries JSON (3 Letter Abbreviation)
countries = r'/Users/tristansavella/Desktop/Important Things/Data Analytics/CareerFoundry/Data Immersion/Achievement 6/Master Folder ATP/02 Data/Original Data/countries.json'

In [39]:
# That's just in case you want to look at the JSON file contents here too:

f = open(r'/Users/tristansavella/Desktop/Important Things/Data Analytics/CareerFoundry/Data Immersion/Achievement 6/Master Folder ATP/02 Data/Original Data/countries.json',)
  
# returns JSON object asa dictionary
data = json.load(f)
  
# Iterating through the json list
for i in data['features']:
    print(i)

TypeError: list indices must be integers or slices, not str

## 2. Data Cleaning

#### df_players only

In [40]:
#Check Head
df_players.head()

Unnamed: 0,player_id,name_first,name_last,hand,dob,ioc,height,wikidata_id
0,100001,Gardnar,Mulloy,R,19131122.0,USA,185.0,Q54544
1,100002,Pancho,Segura,R,19210620.0,ECU,168.0,Q54581
2,100003,Frank,Sedgman,R,19271002.0,AUS,180.0,Q962049
3,100004,Giuseppe,Merlo,R,19271011.0,ITA,,Q1258752
4,100005,Richard,Gonzalez,R,19280509.0,USA,188.0,Q53554


In [41]:
#show all values under 'ioc'

unique_values = df_players['ioc'].unique()

print(unique_values)

['USA' 'ECU' 'AUS' 'ITA' 'RSA' 'DEN' 'HUN' 'CHI' 'POL' 'PER' 'IND' 'SWE'
 'ESP' 'SUI' 'GER' 'ROU' 'CRO' 'JPN' 'CZE' 'RUS' 'GBR' 'BRA' 'FRA' 'ZIM'
 'SRB' 'NED' 'CAN' 'GRE' 'MEX' 'JAM' 'COL' 'ARG' 'BEL' 'NZL' 'VEN' 'EGY'
 'BOL' 'AUT' 'PAK' 'YUG' 'NGR' 'PUR' 'PAR' 'IRL' 'IRI' 'FIN' 'URU' 'ISR'
 'KOR' 'CRC' 'MAR' 'HAI' 'LAT' 'INA' 'SEN' 'LUX' 'BAH' 'SVK' 'UKR' 'PHI'
 'TUR' 'HKG' 'CUB' 'BUL' 'NOR' 'MAS' 'KEN' 'POR' 'GEO' 'THA' 'AND' 'CHN'
 'CIV' 'MKD' 'TPE' 'AHO' 'ESA' 'GUA' 'MON' 'GHA' 'SLO' 'EST' 'TRI' 'ALG'
 'DOM' 'BLR' 'UZB' 'CAF' 'ARM' 'QAT' 'BIH' 'LIB' 'LTU' 'MDA' 'KUW' 'BEN'
 'MRI' 'TUN' 'AZE' 'KAZ' 'ISL' 'UAE' 'MNE' 'SLE' 'VIE' 'CYP' 'TOG' 'SRI'
 'BAR' 'SOL' 'ARU' 'TJK' 'NAM' 'MAD' 'SYR' 'MLT' 'CAM' 'UGA' 'KGZ' 'LBA'
 'BUR' 'ZAM' 'BER' 'MLI' 'SGP' 'OMA' 'VAN' 'KSA' 'BRN' 'GUM' 'URS' 'PNG'
 'FIJ' 'SAM' 'COK' 'MHL' 'BRU' 'SMR' 'UNK' 'NIG' 'ECA' 'BAN' 'IRQ' 'CAR'
 'CGO' 'CMR' 'RHO' 'JOR' 'PAN' 'BDI' 'ISV' 'TCH' 'FRG' 'ANZ' 'BRI' 'HAW'
 'TAN' 'HON' 'SCG' 'BOT' 'CAY' 'COD' 'VIN' 'RWA' 'T

In [42]:
#drop irrelevant columns
df_players1 = df_players.drop(columns = ['wikidata_id'])

In [43]:
# Convert float64 variable to string and extract first four characters
df_players1['yob'] = df_players['dob'].astype(str).str[:4]

In [44]:
df_players1.head()

Unnamed: 0,player_id,name_first,name_last,hand,dob,ioc,height,yob
0,100001,Gardnar,Mulloy,R,19131122.0,USA,185.0,1913
1,100002,Pancho,Segura,R,19210620.0,ECU,168.0,1921
2,100003,Frank,Sedgman,R,19271002.0,AUS,180.0,1927
3,100004,Giuseppe,Merlo,R,19271011.0,ITA,,1927
4,100005,Richard,Gonzalez,R,19280509.0,USA,188.0,1928


In [45]:
#duplicates check

df_players1_dups = df_players1[df_players1.duplicated()]
df_players1_dups.shape

#no duplicates

(0, 8)

In [46]:
#check shape
df_players1.shape

(58687, 8)

In [47]:
#checking for missing values
df_players1.isnull().sum()
#'ioc' is the column that shows which country a player comes from
#I will remove all enries with missing ioc/dob vales

player_id         0
name_first      355
name_last        41
hand            240
dob           13547
ioc             101
height        55899
yob               0
dtype: int64

In [48]:
#removing all missing ioc vales (there are only 101 out of 58,687 missing)
df_players2 = df_players1[df_players1['ioc'].isnull()== False]

In [49]:
df_players2.shape

(58586, 8)

In [50]:
#checking data type
df_players2.dtypes

player_id       int64
name_first     object
name_last      object
hand           object
dob           float64
ioc            object
height        float64
yob            object
dtype: object

In [51]:
#I do not need yob and dob
df_players3 = df_players2.drop(columns = ['dob'])

In [52]:
df_players3.head()

Unnamed: 0,player_id,name_first,name_last,hand,ioc,height,yob
0,100001,Gardnar,Mulloy,R,USA,185.0,1913
1,100002,Pancho,Segura,R,ECU,168.0,1921
2,100003,Frank,Sedgman,R,AUS,180.0,1927
3,100004,Giuseppe,Merlo,R,ITA,,1927
4,100005,Richard,Gonzalez,R,USA,188.0,1928


In [53]:
#checking for mixed data types
for col in df_players2.columns.tolist():
      weird = (df_players2[[col]].applymap(type) != df_players2[[col]].iloc[0].apply(type)).any(axis = 1)
      if len (df_players2[weird]) > 0:
        print (col)
#the following columns are not important for analysis

name_first
name_last
hand


### 3. Data wrangling

#### I am interested to know how many players came from each country

In [54]:
#Checking value counts for 'ioc'
value_counts = df_players3['ioc'].value_counts()

In [55]:
# Convert the Series to a DataFrame
df_players_ioc = pd.DataFrame({'ioc': value_counts.index, 'number of players': value_counts.values})

print(df_players_ioc)

     ioc  number of players
0    USA               9850
1    AUS               2862
2    ESP               2857
3    GBR               2738
4    GER               2557
..   ...                ...
217  SWZ                  1
218  PLW                  1
219  CPV                  1
220  MGL                  1
221  NPL                  1

[222 rows x 2 columns]


### Plotting a Chloropleth

In [56]:
# Create a data frame with ioc and number of players

data_to_plot = df_players_ioc[['ioc','number of players']]
data_to_plot.head()

Unnamed: 0,ioc,number of players
0,USA,9850
1,AUS,2862
2,ESP,2857
3,GBR,2738
4,GER,2557


In [57]:
data_to_plot.dtypes

ioc                  object
number of players     int64
dtype: object

In [58]:
# Setup a folium map at a high-level zoom
map = folium.Map(location = [100, 0], zoom_start = 1.5)

# Choropleth maps bind Pandas Data Frames and json geometries.This allows us to quickly visualize data combinations
folium.Choropleth(
    geo_data = countries, 
    data = data_to_plot,
    columns = ['ioc', 'number of players'],
    key_on = 'code', # this part is very important - check your json file to see where the KEY is located
    fill_color = 'YlOrBr', fill_opacity=0.6, line_opacity=0.1,
    legend_name = "number of players").add_to(map)
folium.LayerControl().add_to(map)

map

TypeError: list indices must be integers or slices, not str