In [180]:
%load_ext autoreload
%autoreload 2
import os
import glob
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
pd.options.display.max_columns = 999
#pd.options.display.max_rows = 100
import seaborn as sns 
import indicoio
import sys
sys.path.append("../../Code")
import utils
sys.path.append("../../Tools/python-client")
from __future__ import print_function
import time
from pprint import pprint
import settings as s
import images as im
import cognitive as cog
import files as f
import profiles
import preprocessing as pre

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Load Country Data 

In [181]:
countries = pd.read_csv('../1/country_info.csv', index_col=0)

In [182]:
countries = countries.rename(columns={'languages': 'country_languages'})

In [183]:
countries.head()

Unnamed: 0,alpha2Code,alpha3Code,country_languages,borders,capital,gini,latlng,name,nativeName,population,region,regionalBlocs,subregion
0,AF,AFG,"['Pashto', 'Uzbek', 'Turkmen']","['IRN', 'PAK', 'TKM', 'UZB', 'TJK', 'CHN']",Kabul,27.8,"[33.0, 65.0]",Afghanistan,افغانستان,27657145,Asia,['SAARC'],Southern Asia
1,AX,ALA,['Swedish'],[],Mariehamn,,"[60.116667, 19.9]",Åland Islands,Åland,28875,Europe,['EU'],Northern Europe
2,AL,ALB,['Albanian'],"['MNE', 'GRC', 'MKD', 'KOS']",Tirana,34.5,"[41.0, 20.0]",Albania,Shqipëria,2886026,Europe,['CEFTA'],Southern Europe
3,DZ,DZA,['Arabic'],"['TUN', 'LBY', 'NER', 'ESH', 'MRT', 'MLI', 'MAR']",Algiers,35.3,"[28.0, 3.0]",Algeria,الجزائر,40400000,Africa,"['AU', 'AL']",Northern Africa
4,AS,ASM,"['English', 'Samoan']",[],Pago Pago,,"[-14.33333333, -170.0]",American Samoa,American Samoa,57100,Oceania,[],Polynesia


In [184]:
country_cols = ['alpha2Code', 'region', 'regionalBlocs', 'subregion', 'latlng', 'borders', 'country_languages']

In [185]:
def choose_match_name(cnty_tbl, cult_table):
    """Pick, for every country row, the name used to look it up in the cultural table.

    Preference order for each row of ``cnty_tbl``:
      1. ``nativeName`` when that exact value appears in ``cult_table['country']``;
      2. the hand-written correction for ``name`` when one exists;
      3. ``name`` unchanged.

    Args:
        cnty_tbl: DataFrame with ``name`` and ``nativeName`` columns.
        cult_table: DataFrame with a ``country`` column.

    Returns:
        list with one entry per row of ``cnty_tbl``, in row order.
    """
    # NOTE(review): "Sierra Leon" presumably mirrors the spelling used in the
    # cultural-factors file - confirm before "fixing" it.
    corrections = {
        "Russian Federation": "Russia",
        "Sierra Leone": "Sierra Leon",
        "Korea (Republic of)": "South Korea",
        "Iran (Islamic Republic of)": "Iran"
    }
    # Built once instead of once per row; missing values are dropped so a
    # missing nativeName can never count as a match.
    known_countries = set(cult_table['country'].dropna())

    names = []
    for name, native_name in zip(cnty_tbl['name'], cnty_tbl['nativeName']):
        if native_name in known_countries:
            names.append(native_name)
        else:
            names.append(corrections.get(name, name))
    return names

In [186]:
cultural_factors = pd.read_csv('../../Tools/world_cultural_factors.csv')
cultural_factor_cols = ['country', 'selected_for', 'power_distance', 'individualism', 'masculinity', 'uncertainty_avoidance']

In [187]:
countries['matchName'] = choose_match_name(countries, cultural_factors)

In [188]:
# Attach the cultural-factor columns to each country through its match name,
# then keep only the first joined row for every alpha-2 code.
cntry_cult_tbl = (
    countries
    .join(cultural_factors[cultural_factor_cols].set_index('country'), on='matchName')
    .drop_duplicates(subset="alpha2Code", keep="first")
)

In [189]:
cntry_cult_tbl[~cntry_cult_tbl.power_distance.isnull()].head(2)

Unnamed: 0,alpha2Code,alpha3Code,country_languages,borders,capital,gini,latlng,name,nativeName,population,region,regionalBlocs,subregion,matchName,selected_for,power_distance,individualism,masculinity,uncertainty_avoidance
10,AR,ARG,"['Spanish', 'Guaraní']","['BOL', 'BRA', 'CHL', 'PRY', 'URY']",Buenos Aires,44.5,"[-34.0, -64.0]",Argentina,Argentina,43590400,Americas,['USAN'],South America,Argentina,addition,49.0,46.0,56.0,86.0
13,AU,AUS,['English'],[],Canberra,30.5,"[-27.0, 133.0]",Australia,Australia,24117360,Oceania,[],Australia and New Zealand,Australia,High individualisim,36.0,90.0,61.0,51.0


# Load Review, Host, and Guest Tables

### Listings 

In [190]:
# Read the listings once, then split them on room type:
# whole-home rentals versus every other room type.
listings = pd.read_csv('../2/nr_restrict_listings.csv', index_col=0)
_is_entire_home = listings['room_type'] == 'Entire home/apt'
full_listings = listings[_is_entire_home]
shared_listings = listings[~_is_entire_home]

In [191]:
len(listings)

6924

### Guests

In [192]:
# Use the `man_guests` table: here we want information about the hosts' guests, not about the reviews themselves.

In [193]:
guests = pd.read_csv('../2/man_guests.csv', low_memory=False, index_col=0)

In [194]:
guests = guests.join(countries[country_cols].set_index('alpha2Code'), on='ccode')

In [195]:
guests = guests.drop_duplicates(subset="id", keep="last")

In [196]:
len(guests)

181376

In [197]:
guests_cols = ['id', 'ccode', 'region', 'subregion', 'country_languages']

### Hosts

In [198]:
# Load the host table and discard rows whose index label is missing.
hosts = pd.read_csv('../2/nr_restrict_hosts.csv', index_col=0)
hosts = hosts.loc[hosts.index.notnull()]

In [199]:
len(hosts)

6924

### Reviews

In [200]:
reviews = pd.read_csv('../2/nr_restrict_reviews.csv', index_col=0)

### Reduce reviews to those where we have both hosts and guests 

In [201]:
reviews_match, h_ids, g_ids = utils.reduce_reviews(reviews, hosts, guests)

Total Reviews: 105600 
Unique Hosts: 5591 
Unique Guests: 96255


In [202]:
results_table = reviews_match[['id', 'date', 'reviewer_id', 'listing_id', 'recipient_id']].reset_index(drop=True)

### Get Trip table 

In [203]:
hosts_trip = pd.read_csv('../2/man_hostTrips.csv', index_col=0)

In [204]:
host_trip_summary = profiles.summarize_host_table(hosts_trip, countries)

### Get Guest Review Table 

In [205]:
hg_reviews = pd.read_csv('../2/man_hostReviews.csv', low_memory=False, index_col=0)

#### Cleaning up Reviews

#### Getting Count Tables

In [206]:
# Mean of `total_host_reviews` per `recipient_id`; both columns are cast to int first.
reviews_per_guest = (
    hg_reviews.loc[:, ['recipient_id', 'total_host_reviews']]
    .astype(int)
    .groupby('recipient_id')
    .mean()
)

In [208]:
# Number of rows per `reviewer_id`, ignoring rows where the reviewer id is missing.
reviews_from_host = (
    hg_reviews.dropna(subset=['reviewer_id'])[['reviewer_id', 'total_host_reviews']]
    .astype(int)
    .groupby('reviewer_id')
    .count()
)

## Get information about where the host has visited

In [209]:
import geopy.distance

In [210]:
def get_num_countries(x, htrips):
    """Count the distinct countries recorded for host ``x`` in ``htrips``.

    The previous version called ``.keys()`` on the filtered *Series*, which
    returns its row index, so it reported the number of matching rows rather
    than the number of countries stored in ``countries_visited``.

    Args:
        x: host id to look up in ``htrips.id``.
        htrips: DataFrame with an ``id`` column and a ``countries_visited``
            column whose entries are dict-like (country -> value); entries
            without ``keys()`` (e.g. NaN) are ignored.

    Returns:
        int: number of distinct keys across the host's rows, 0 if the host
        does not appear in ``htrips``.
    """
    visited = htrips.loc[htrips.id == x, 'countries_visited']
    countries = set()
    for entry in visited:
        # Missing entries carry no keys and simply contribute nothing.
        if hasattr(entry, 'keys'):
            countries.update(entry.keys())
    return len(countries)

In [211]:
results_table = results_table.join(host_trip_summary.set_index('id'), on='recipient_id')

In [212]:
# Sum of the values stored in each row's `regions_visited` mapping; rows without
# a mapping (missing after the join) get 0.
# A dict-like check is used because `v not in [None, "NaN", np.nan]` only
# recognises NaN by object identity and would call .values() on any other NaN.
results_table['h_min_used'] = [
    np.sum(list(v.values())) if hasattr(v, 'keys') else 0
    for v in results_table['regions_visited']
]

In [213]:
# Number of keys in each row's `countries_visited` mapping; rows without a
# mapping (missing after the join) get 0.
# A dict-like check is used because `v not in [None, "NaN", np.nan]` only
# recognises NaN by object identity and would call .keys() on any other NaN.
results_table['h_min_cntry'] = [
    len(v.keys()) if hasattr(v, 'keys') else 0
    for v in results_table['countries_visited']
]

In [214]:
# Number of keys in each row's `regions_visited` mapping; rows without a
# mapping (missing after the join) get 0.
# A dict-like check is used because `v not in [None, "NaN", np.nan]` only
# recognises NaN by object identity and would call .keys() on any other NaN.
results_table['h_min_regions'] = [
    len(v.keys()) if hasattr(v, 'keys') else 0
    for v in results_table['regions_visited']
]

In [215]:
# Number of keys in each row's `sub_regions_visited` mapping; rows without a
# mapping (missing after the join) get 0.
# A dict-like check is used because `v not in [None, "NaN", np.nan]` only
# recognises NaN by object identity and would call .keys() on any other NaN.
results_table['h_min_subregions'] = [
    len(v.keys()) if hasattr(v, 'keys') else 0
    for v in results_table['sub_regions_visited']
]

In [216]:
results_table = results_table.join(guests[guests_cols].set_index('id'), on='reviewer_id')

In [217]:
import ast  # local to this cell: only needed to parse the stringified language lists

# `country_languages` comes from CSV as text such as "['English', 'Samoan']".
# ast.literal_eval parses that literal without executing arbitrary code (eval would),
# and the isinstance check sends every missing value to 0.
results_table['eng_in_guest_clangs'] = [
    1 if isinstance(v, str) and "English" in ast.literal_eval(v) else 0
    for v in results_table['country_languages']
]

In [218]:
def get_h_visit_g(val, opts):
    """Return 1 when ``val`` is one of the keys of ``opts``, otherwise 0.

    Args:
        val: the key to look for (e.g. a country code or region name).
        opts: a dict-like object, or a missing-value marker
            (``None``, the string ``"NaN"``, or any float NaN).

    Returns:
        int: 1 if ``val`` is a key of ``opts``; 0 if it is not or if ``opts``
        is a missing-value marker.
    """
    # `x != x` is true only for NaN, so this catches every NaN float, not just
    # the np.nan singleton that a list-membership test matches by identity.
    is_missing = (
        opts is None
        or (isinstance(opts, str) and opts == "NaN")
        or (isinstance(opts, float) and opts != opts)
    )
    if is_missing:
        return 0
    return 1 if val in opts.keys() else 0

In [219]:
# 1 when the guest's country code is a key of the host's `countries_visited`, else 0.
results_table['h_visit_g_cntry'] = [
    get_h_visit_g(guest_ccode, visited)
    for guest_ccode, visited in zip(results_table['ccode'], results_table['countries_visited'])
]

In [220]:
# 1 when the guest's region is a key of the host's `regions_visited`, else 0.
results_table['h_visit_g_region'] = [
    get_h_visit_g(guest_region, visited)
    for guest_region, visited in zip(results_table['region'], results_table['regions_visited'])
]

In [221]:
# 1 when the guest's subregion is a key of the host's `sub_regions_visited`, else 0.
results_table['h_visit_g_sregion'] = [
    get_h_visit_g(guest_subregion, visited)
    for guest_subregion, visited in zip(results_table['subregion'], results_table['sub_regions_visited'])
]

In [222]:
# 1 where the guest's country code equals the hosts' country code from settings, else 0.
results_table['same_country'] = (results_table['ccode'] == s.HOSTS_CCODE).astype(int)

In [223]:
# 1 where the guest's region equals the hosts' region from settings, else 0.
results_table['same_region'] = (results_table['region'] == s.HOSTS_REGION).astype(int)

In [224]:
# 1 where the guest's subregion equals the hosts' subregion from settings, else 0.
results_table['same_subregion'] = (results_table['subregion'] == s.HOSTS_SUBREGION).astype(int)

In [225]:
# Bring the four cultural scores of the guest's country onto every review row,
# matching the guest's `ccode` against the country's alpha-2 code.
results_table = results_table.join(
    cntry_cult_tbl.set_index('alpha2Code')[
        ['power_distance', 'individualism', 'masculinity', 'uncertainty_avoidance']
    ],
    on='ccode',
)

In [226]:
# Power-distance score of the hosts' country. `.iloc[0]` extracts the scalar
# explicitly, because int(<single-element Series>) is deprecated in pandas.
h_power = int(cntry_cult_tbl.loc[cntry_cult_tbl.alpha2Code == s.HOSTS_CCODE, 'power_distance'].iloc[0])

In [227]:
# Individualism score of the hosts' country. `.iloc[0]` extracts the scalar
# explicitly, because int(<single-element Series>) is deprecated in pandas.
h_individualism = int(cntry_cult_tbl.loc[cntry_cult_tbl.alpha2Code == s.HOSTS_CCODE, 'individualism'].iloc[0])

In [228]:
# Masculinity and uncertainty-avoidance scores of the hosts' country. `.iloc[0]`
# extracts each scalar explicitly, because int(<single-element Series>) is
# deprecated in pandas.
h_masculinity = int(cntry_cult_tbl.loc[cntry_cult_tbl.alpha2Code == s.HOSTS_CCODE, 'masculinity'].iloc[0])
h_uncertainty_avoid = int(cntry_cult_tbl.loc[cntry_cult_tbl.alpha2Code == s.HOSTS_CCODE, 'uncertainty_avoidance'].iloc[0])

In [229]:
def get_cultural_difference(tbl, col, hval):
    """Compute ``hval - tbl[col]`` for every row, keeping missing values as NaN.

    Replaces a Python-level ``iterrows`` loop with one vectorised subtraction.

    Args:
        tbl: DataFrame containing the column ``col``.
        col: name of the column holding the per-row score.
        hval: the reference value each row's score is subtracted from.

    Returns:
        list with one number per row of ``tbl`` (NaN where the score is missing).
    """
    scores = tbl[col]
    if scores.dtype == object:
        # None / NaN placeholders in an object column become float NaN, so the
        # subtraction yields NaN for them instead of raising.
        scores = scores.astype(float)
    return (hval - scores).tolist()

In [230]:
results_table['power_difference'] = get_cultural_difference(results_table, 'power_distance', h_power)                       

In [231]:
results_table['individualism_difference']  = get_cultural_difference(results_table, 'individualism', h_individualism)


In [232]:
results_table['masculanity_difference']  = get_cultural_difference(results_table, 'masculinity', h_masculinity)

In [233]:
results_table['uncertainty_avoid_difference']  = get_cultural_difference(results_table, 'uncertainty_avoidance', h_uncertainty_avoid)

In [234]:
# Remove the intermediate columns that were only needed to derive the features above.
results_table = results_table.drop(
    columns=[
        'country_languages',
        'countries_visited',
        'regions_visited',
        'sub_regions_visited',
        'region',
        'subregion',
        'power_distance',
        'individualism',
        'masculinity',
        'uncertainty_avoidance',
    ],
)



In [235]:
def get_host_guest_countries(src_tbl, host):
    """Return the ratio of distinct guest country codes to review rows for ``host``.

    Rows of ``src_tbl`` are selected where ``recipient_id`` equals ``host``;
    the result is (number of distinct ``ccode`` values) / (number of selected
    rows), or 0 when no row matches.
    """
    host_rows = src_tbl['recipient_id'] == host
    codes = src_tbl.loc[host_rows, 'ccode']
    total = len(codes)
    if total == 0:
        return 0
    return len(set(codes)) / total

In [236]:
# Ratio of distinct guest country codes to review rows for every host, computed
# in one grouped pass instead of re-filtering results_table once per host.
_diversity_by_host = (
    results_table.groupby('recipient_id')['ccode']
    .agg(lambda codes: len(set(codes)) / len(codes))
)
# Hosts with no rows in results_table get 0, the same value the per-host loop produced.
hosts['diverse_guests'] = hosts['id'].map(_diversity_by_host).fillna(0)

In [237]:
results_table = results_table.join(hosts[['id', 'diverse_guests']].set_index('id'), on='recipient_id')

In [238]:
# Preview only the first rows rather than dumping the whole table into the output.
results_table.head(10)

Unnamed: 0,id,date,reviewer_id,listing_id,recipient_id,h_min_used,h_min_cntry,h_min_regions,h_min_subregions,ccode,eng_in_guest_clangs,h_visit_g_cntry,h_visit_g_region,h_visit_g_sregion,same_country,same_region,same_subregion,power_difference,individualism_difference,masculanity_difference,uncertainty_avoid_difference,diverse_guests
0,22938,2010-01-03,61471,4989,7118,5,3,2,2,AU,1,0,0,0,0,0,0,4.0,1.0,1.0,-5.0,0.375000
1,234213,2011-04-21,446902,4989,7118,5,3,2,2,AU,1,0,0,0,0,0,0,4.0,1.0,1.0,-5.0,0.375000
2,264416,2011-05-15,540879,4989,7118,5,3,2,2,CA,1,1,1,1,0,1,1,1.0,11.0,10.0,-2.0,0.375000
3,843520,2012-01-06,1357796,4989,7118,5,3,2,2,AU,1,0,0,0,0,0,0,4.0,1.0,1.0,-5.0,0.375000
4,993034,2012-03-12,1757830,4989,7118,5,3,2,2,US,1,1,1,1,1,1,1,0.0,0.0,0.0,0.0,0.375000
5,1119849,2012-04-12,2019278,4989,7118,5,3,2,2,NL,0,0,1,0,0,0,0,2.0,11.0,48.0,-7.0,0.375000
6,1129816,2012-04-14,1857814,4989,7118,5,3,2,2,US,1,1,1,1,1,1,1,0.0,0.0,0.0,0.0,0.375000
7,1292160,2012-05-16,161198,4989,7118,5,3,2,2,US,1,1,1,1,1,1,1,0.0,0.0,0.0,0.0,0.375000
8,1624497,2012-07-03,1810355,4989,7118,5,3,2,2,CA,1,1,1,1,0,1,1,1.0,11.0,10.0,-2.0,0.375000
9,4737934,2013-05-22,6043058,4989,7118,5,3,2,2,SE,0,0,1,0,0,0,0,9.0,20.0,57.0,17.0,0.375000


In [179]:
# Write the per-review feature table to the current working directory
# (the row index is written as the first column by default).
results_table.to_csv("country_cultural_diff.csv")

# By Regions

In [72]:
profiles.print_country_table(reviews_match, guests, 'region')

Unnamed: 0,average,guest_count,region,review_count,stddev
0,0.754966,5172,Asia,5520,0.160857
1,0.783109,23910,Europe,25127,0.14057
2,0.791789,7141,Oceania,7483,0.136964
3,0.762943,52968,Americas,59678,0.149074
4,0.76879,736,Africa,789,0.157066
5,0.838414,1,Polar,2,0.060018


In [75]:
profiles.print_country_table_simple(reviews, guests, 'region', full_listings)

Unnamed: 0,average,guest_count,region,review_count,stddev
0,0.758868,34085,Same,38058,0.151063
1,0.774534,22310,Different,23414,0.148194


In [77]:
profiles.print_country_table_simple(reviews, guests, 'region', shared_listings)

Unnamed: 0,average,guest_count,region,review_count,stddev
0,0.770116,19784,Same,21620,0.14523
1,0.789508,14930,Different,15504,0.136223


In [71]:
profiles.print_country_table(reviews_match, guests, 'subregion')

Unnamed: 0,average,guest_count,review_count,stddev,subregion
0,0.743393,2334,2485,0.167751,Eastern Asia
1,0.775023,1253,1338,0.151366,Western Asia
2,0.777277,10241,10770,0.14219,Western Europe
3,0.768789,3085,3201,0.149927,Southern Europe
4,0.791825,7129,7471,0.136996,Australia and New Zealand
5,0.796882,9303,9818,0.131335,Northern Europe
6,0.762805,48890,55376,0.148983,Northern America
7,0.765422,3052,3187,0.148292,South America
8,0.750306,1150,1226,0.160864,South-Eastern Asia
9,0.764691,784,842,0.151703,Central America


In [74]:
profiles.print_country_table_simple(reviews, guests, 'subregion', full_listings)

Unnamed: 0,average,guest_count,review_count,stddev,subregion
0,0.759139,31750,35599,0.150782,Same
1,0.772682,24647,25875,0.148964,Different


In [76]:
profiles.print_country_table_simple(reviews, guests, 'subregion', shared_listings)

Unnamed: 0,average,guest_count,review_count,stddev,subregion
0,0.769406,18004,19777,0.145456,Same
1,0.788248,16713,17350,0.136958,Different
