In [1]:
import csv
import os
import sys

import folium
import geopandas as gpd
import numpy as np
import pandas as pd

pd.options.display.max_rows = 200

In [2]:
# Load the raw wine reviews. The file's leading unnamed column is read as an
# ordinary "Unnamed: 0" column (see the preview below); no later cell in this
# notebook reads it.
x = pd.read_csv("./data/winemag-data_first150k.csv")

In [3]:
# Preview the first rows to check which columns were loaded.
x.head()

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
0,0,US,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,96,235.0,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz
1,1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,96,110.0,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez
2,2,US,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,96,90.0,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley
3,3,US,"This spent 20 months in 30% new French oak, an...",Reserve,96,65.0,Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Ponzi
4,4,France,"This is the top wine from La Bégude, named aft...",La Brûlade,95,66.0,Provence,Bandol,,Provence red blend,Domaine de la Bégude


No rows with empty country, province, variety, and winery

no location info:
- @suskostrzewa
- @kerinokeefe
- @vossroger
- @wineschach


can't find handle:
- @winechristina

In [4]:
# Load the 'naturalearth_lowres' country polygons through GeoPandas' dataset
# helper; the next cell takes their centroids as each country's (lat, long).
# NOTE(review): `gpd.datasets` is deprecated in recent GeoPandas releases and,
# I believe, removed in 1.0 -- confirm the installed version, or load the
# Natural Earth data from an explicit file instead.
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))

In [5]:
# Build the (country, lat, long) lookup table from each polygon's centroid.
# NOTE(review): the centroids are computed in whatever CRS `world` carries; if
# that is a geographic (degree-based) CRS they are only approximate -- confirm
# that is acceptable for placing map markers.
centroids = world.centroid
latlong = pd.DataFrame({
    'country': world['name'],
    'lat': centroids.y,   # y coordinate -> latitude
    'long': centroids.x,  # x coordinate -> longitude
})
latlong.head()

Unnamed: 0,country,lat,long
0,Fiji,-17.316309,163.853165
1,Tanzania,-6.257732,34.75299
2,W. Sahara,24.291173,-12.137831
3,Canada,61.469076,-98.142381
4,United States of America,45.705628,-112.599436


## Remove rows with missing country value

In [6]:
# Keep only the reviews that have a country recorded.
x = x[x['country'].notna()]

In [7]:
# Rename countries so they match the names used in the centroid table.
COUNTRY_RENAMES = {
    'US': 'United States of America',
    'Czech Republic': 'Czechia',
    'England': 'United Kingdom',
}
x = x.assign(country=x['country'].replace(COUNTRY_RENAMES))

# Drop the rows carrying these two labels.
# NOTE(review): presumably because neither label has a matching row in the
# centroid table -- confirm.
EXCLUDED_COUNTRIES = ['US-France', 'Bosnia and Herzegovina']
x = x[~x['country'].isin(EXCLUDED_COUNTRIES)]

## This should leave us with a total of 46 countries

In [8]:
# Alphabetically sorted list of the distinct countries left after cleaning.
unique_countries = sorted(x['country'].unique())
print('There are %d different countries in the wine reviews dataset:\n' % len(unique_countries))
print(', '.join(unique_countries))

There are 46 different countries in the wine reviews dataset:

Albania, Argentina, Australia, Austria, Brazil, Bulgaria, Canada, Chile, China, Croatia, Cyprus, Czechia, Egypt, France, Georgia, Germany, Greece, Hungary, India, Israel, Italy, Japan, Lebanon, Lithuania, Luxembourg, Macedonia, Mexico, Moldova, Montenegro, Morocco, New Zealand, Portugal, Romania, Serbia, Slovakia, Slovenia, South Africa, South Korea, Spain, Switzerland, Tunisia, Turkey, Ukraine, United Kingdom, United States of America, Uruguay


In [9]:
def find_latlong(countries, centroids=None):
    """Look up the centroid latitude and longitude of each country.

    Parameters
    ----------
    countries : iterable of str
        Country names to look up.
    centroids : pandas.DataFrame, optional
        Table with 'country', 'lat' and 'long' columns. Defaults to the
        notebook-level ``latlong`` table.

    Returns
    -------
    tuple of (list of float, list of float)
        Latitudes and longitudes, in the same order as ``countries``.

    Raises
    ------
    KeyError
        If any requested country has no row in the centroid table.
    """
    if centroids is None:
        centroids = latlong

    # One pass over the table instead of one full scan per country. If a name
    # appears more than once, the first row wins.
    coords = {}
    for name, la, lo in zip(centroids['country'], centroids['lat'], centroids['long']):
        coords.setdefault(name, (float(la), float(lo)))

    countries = list(countries)
    # Fail with the offending names rather than an opaque conversion error on
    # an empty selection.
    missing = [country for country in countries if country not in coords]
    if missing:
        raise KeyError('no centroid found for: %s' % ', '.join(map(str, missing)))

    lat = [coords[country][0] for country in countries]
    long = [coords[country][1] for country in countries]
    return lat, long

# Centroid coordinates, one entry per country in `unique_countries` (same order).
lat, long = find_latlong(unique_countries)

In [10]:
def find_review_freq(countries, reviews=None):
    """Count the reviews recorded for each country.

    Parameters
    ----------
    countries : iterable of str
        Country names to count.
    reviews : pandas.DataFrame, optional
        Review table with a 'country' column. Defaults to the notebook-level
        ``x`` frame.

    Returns
    -------
    list of int
        Number of rows per country, in the same order as ``countries``;
        0 for a country with no rows.
    """
    if reviews is None:
        reviews = x

    # Tally every country in a single pass instead of masking the whole frame
    # once per country.
    counts = reviews['country'].value_counts()
    return [int(counts.get(country, 0)) for country in countries]

# Review counts, one entry per country in `unique_countries` (same order).
freq = find_review_freq(unique_countries)

In [11]:
def find_review_avgs(countries, reviews=None):
    """Compute the mean review points and mean price for each country.

    Parameters
    ----------
    countries : iterable of str
        Country names to summarise.
    reviews : pandas.DataFrame, optional
        Review table with 'country', 'points' and 'price' columns. Defaults to
        the notebook-level ``x`` frame.

    Returns
    -------
    tuple of (list of float, list of float)
        Mean points and mean prices, in the same order as ``countries``.
        An entry is NaN when the country has no rows, or (for prices) when
        none of its rows has a price.
    """
    if reviews is None:
        reviews = x

    # Aggregate once for all countries, then pick out the requested ones;
    # reindex yields NaN for countries that are absent from the table.
    means = (
        reviews.groupby('country')[['points', 'price']]
        .mean()
        .reindex(list(countries))
    )
    return means['points'].tolist(), means['price'].tolist()

# Mean points and mean price, one entry per country in `unique_countries` (same order).
avg_points, avg_prices = find_review_avgs(unique_countries)

In [12]:
# Write the per-country review counts, with centroid coordinates, to CSV.

FILE = './wine-freq-loc.csv'

# An existing file is never overwritten; delete it to regenerate the output.
if not os.path.exists(FILE):
    # newline='' leaves line endings to the csv writer, which also quotes any
    # field that contains a comma. The `with` block closes the file itself.
    with open(FILE, 'w', newline='') as file:
        writer = csv.writer(file, lineterminator='\n')
        writer.writerow(['country', 'lat', 'long', 'freq'])
        writer.writerows(zip(unique_countries, lat, long, freq))

In [13]:
# Write the per-country average review points, with centroid coordinates, to CSV.

FILE = './wine-points-loc.csv'

# An existing file is never overwritten; delete it to regenerate the output.
if not os.path.exists(FILE):
    # newline='' leaves line endings to the csv writer, which also quotes any
    # field that contains a comma. The `with` block closes the file itself.
    with open(FILE, 'w', newline='') as file:
        writer = csv.writer(file, lineterminator='\n')
        writer.writerow(['country', 'lat', 'long', 'points'])
        writer.writerows(zip(unique_countries, lat, long, avg_points))

In [14]:
# Write the per-country average prices, with centroid coordinates, to CSV.

FILE = './wine-price-loc.csv'

# An existing file is never overwritten; delete it to regenerate the output.
if not os.path.exists(FILE):
    # newline='' leaves line endings to the csv writer, which also quotes any
    # field that contains a comma. The `with` block closes the file itself.
    # A missing average price is written as the text 'nan'.
    with open(FILE, 'w', newline='') as file:
        writer = csv.writer(file, lineterminator='\n')
        writer.writerow(['country', 'lat', 'long', 'prices'])
        writer.writerows(zip(unique_countries, lat, long, avg_prices))

In [41]:
# Smallest per-country review count in `freq`.
np.min(freq)

2

In [13]:
# Scale factor that turns a review count into a circle radius.
FREQ_RADIUS_SCALE = 25

map_freq = folium.Map(
    name='Map of wine review frequencies',
    location=[0, 0],
    tiles='OpenStreetMap',
    zoom_start=1,
)

# One circle per country, placed at its centroid and sized by review count.
for country, la, lo, val in zip(unique_countries, lat, long, freq):
    count = int(val)
    marker = folium.Circle(
        location=[la, lo],
        tooltip='%s: %d' % (country, count),
        radius=count * FREQ_RADIUS_SCALE,
        color='crimson',
        fill=True,
        fill_color='crimson',
    )
    marker.add_to(map_freq)

In [14]:
# Render the review-frequency map inline.
map_freq

In [15]:
# Scale factor that turns an average score into a circle radius.
POINTS_RADIUS_SCALE = 4000

map_points = folium.Map(
    name='Map of average wine reviews',
    location=[0, 0],
    tiles='OpenStreetMap',
    zoom_start=1,
)

# One circle per country, placed at its centroid and sized by average points.
# The average is shown to one decimal and used unrounded for the radius;
# int() would truncate it (e.g. 89.9 -> 89).
for country, la, lo, val in zip(unique_countries, lat, long, avg_points):
    folium.Circle(
        location=[la, lo],
        tooltip='%s: %.1f' % (country, val),
        radius=val * POINTS_RADIUS_SCALE,
        color='crimson',
        fill=True,
        fill_color='crimson',
    ).add_to(map_points)

In [16]:
# Render the average-points map inline.
map_points

In [17]:
# Scale factor that turns an average price into a circle radius.
PRICE_RADIUS_SCALE = 5000

map_prices = folium.Map(
    name='Map of average wine prices',
    location=[0, 0],
    tiles='OpenStreetMap',
    zoom_start=1,
)

# One circle per country, placed at its centroid and sized by average price.
for country, la, lo, val in zip(unique_countries, lat, long, avg_prices):
    if pd.isna(val):
        # No average price is available for this country, so draw nothing.
        continue
    # Show cents and use the unrounded value for the radius; int() would
    # truncate the average (e.g. 35.99 -> 35).
    folium.Circle(
        location=[la, lo],
        tooltip='%s: $%.2f' % (country, val),
        radius=val * PRICE_RADIUS_SCALE,
        color='crimson',
        fill=True,
        fill_color='crimson',
    ).add_to(map_prices)

In [18]:
# Render the average-price map inline.
map_prices