This module gets the dataframe with id, sentiment, canton for instagram json files.

In [1]:
import pandas as pd
import requests
import time
import pickle
import numpy as np
import folium
import scipy.stats as stats
import matplotlib.pyplot as plt
import math
import json
%matplotlib inline

In [2]:
import os
import sys

spark_path = os.environ["SPARK_PATH"]
os.environ['SPARK_HOME'] = spark_path
os.environ['HADOOP_HOME'] = spark_path

sys.path.append(spark_path + "/bin")
sys.path.append(spark_path + "/python")
sys.path.append(spark_path + "/python/pyspark/")
sys.path.append(spark_path + "/python/lib")
sys.path.append(spark_path + "/python/lib/pyspark.zip")
sys.path.append(spark_path + "/python/lib/py4j-0.9-src.zip")

## Defining Spark Context

In [None]:
import json
import subprocess
import fnmatch
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

# Defining sc
conf = SparkConf().setAppName("ADA-GCL")
sc = SparkContext(conf=conf)

# use SQL context
sqlContext = SQLContext(sc)

## Spark Application

Quite simple at the moment, just getting the dataframe. 
Started running at

In [None]:
# Path definitions
sample_path ='hdfs://iccluster046.iccluster.epfl.ch:8020/datasets/goodcitylife/april/harvest3r_instagram_data_28-04_0.json'
base_path = 'hdfs://iccluster046.iccluster.epfl.ch:8020/datasets/goodcitylife'
insta_files='*/harvest3r_instagram_data*.json'

# getting the dataframe
# sdf = sqlContext.read.json(sample_path)

# Get all files under a particular directory
def get_paths(dir_path, recurse):
    t = []
    files = []
    if(recurse == True):
        cat = subprocess.Popen(["hadoop", "fs", "-ls", "-R", dir_path], stdout=subprocess.PIPE)
    else:
        cat = subprocess.Popen(["hadoop", "fs", "-ls", dir_path], stdout=subprocess.PIPE)
    for line in cat.stdout:
        t.append(line.decode())
        
    for l in t[1:]:
        files.append(l.split(' ')[-1].rstrip())
    return(files)

# getting just the relevant paths, in this case just the Instagram ones.
file_list = []
all_paths = get_paths(base_path, True)
for path in all_paths:
    if(fnmatch.fnmatch(path, insta_files)):
        file_list.append(path)

# do the operation for all the files
sdf_list = []
for file_path in file_list:
    sdf_list.append(sqlContext.read.json(file_path))


# Displays the content of the DataFrame to stdout
# sdf.show()
print(len(sdf_list))

# convert dataframe to pandas
# df = sdf.toPandas()

# Local Operations
* will need to move this into Spark Applications with Spark DataFrames, but developing locally using pandas DataFrames now...

### Create a good canton list

In [3]:
geo_map = {'city':['zurich','geneva','lausanne','zermatt','bern','basel','geneve','winterthur','luzern','lucerne','st-gallen','lugano'], 
           'canton': ['Zurich', 'Geneva','Vaud', 'Valais', "Bern", 'Basel-City', 'Geneva', 'Zurich', 'Lucerne','Lucerne', 'St-Gallen', 'Ticino']}

In [4]:
df_geo = pd.DataFrame.from_dict(geo_map)

In [5]:
df_geo.head()

Unnamed: 0,canton,city
0,Zurich,zurich
1,Geneva,geneva
2,Vaud,lausanne
3,Valais,zermatt
4,Bern,bern


In [6]:
top_population = ['zurich','geneva','lausanne','zermatt','bern','basel','geneve','winterthur','luzern','lucerne','st-gallen','lugano','biel','thun','fribourg']
top_tourist = ['zermatt','montreux','jungfrau','interlaken',]

In [7]:
popular_cities = ['zurich','geneva','lausanne','zermatt','bern','basel','geneve','winterthur','luzern','lusern','st-gallen','lugano']

In [8]:
df_geo.city.values

array(['zurich', 'geneva', 'lausanne', 'zermatt', 'bern', 'basel',
       'geneve', 'winterthur', 'luzern', 'lucerne', 'st-gallen', 'lugano'], dtype=object)

### Full Cantons
Downloaded an Excel File

In [9]:
df_cantons = pd.read_csv('cities_cantons.csv')
df_cantons = df_cantons[['Ortschaftsname','Gemeindename','Kantonskürzel']].drop_duplicates()
df_cantons.columns = ['City','Municipality','Canton']
df_cantons.head()

Unnamed: 0,City,Municipality,Canton
0,Aadorf,Aadorf,TG
1,Aarau,Aarau,AG
3,Aarau Rohr,Aarau,AG
4,Aarberg,Aarberg,BE
5,Aarburg,Aarburg,AG


## Dataframe with Geo and Sentiment
* 1) turning the raw dataframe with id, index, score, source and type into a dataframe with id, tags, and sentiment
* 2) turn the list of geo_id into one new column with just one geolocation
* 3) map that geolocation to another column called canton

### 0) Original DataFrame
read from Spark Application

In [10]:
df = pd.read_json('Sample Data/harvest3r_instagram_data_15-04_0.json')

In [11]:
df.head()

Unnamed: 0,_id,_index,_score,_source,_type
0,1460752286000003072,merged_content_2016_04_15_to_2016_04_21,0.048309,"{'main_format': 'TEXT', 'author_name': 'Fabio ...",content
1,1460727556000007168,merged_content_2016_04_15_to_2016_04_21,0.029747,"{'main_format': 'TEXT', 'author_name': 'MiRO',...",content
2,1460728673000006656,merged_content_2016_04_15_to_2016_04_21,0.026873,"{'main_format': 'TEXT', 'sentiment': 'NEUTRAL'...",content
3,1460749002000007168,merged_content_2016_04_15_to_2016_04_21,0.022194,"{'main_format': 'TEXT', 'sentiment': 'POSITIVE...",content
4,1460742655000008960,merged_content_2016_04_15_to_2016_04_21,0.021687,"{'main_format': 'TEXT', 'sentiment': 'POSITIVE...",content


### 1) Getting id, tags, sentiment

In [12]:
def get_tags(source):
    return source['tags']

In [13]:
def get_sentiment(source):
    return source.get('sentiment') # getting the value from key using dictionary

In [14]:
df_sentiment = df._source.apply(get_sentiment).to_frame('Sentiment')

In [15]:
df_tags = df._source.apply(get_tags).to_frame('Tags')

In [16]:
df_extracted = df._id.to_frame('_id').join(df_sentiment)
df_extracted = df_extracted.join(df_tags)
df = df_extracted

## Turning Tags into Categories

Ideas: if they are travels, then they are a traveller.
- there are some common tags as well
- food (food, desert, drinks)
- culture (books, movies, music)

In [17]:
# returns a list of tags with duplicates removed
def tags2list(tag_strings):
    tag_list = []
    for subcategory in tag_strings:
        tag_list.extend(subcategory.split(' #'))
    return list(set(tag_list)) #removes duplicates in list

In [77]:
food_tags = ["food #foodporn #yum #instafood #yummy #amazing #instagood #photooftheday #sweet #dinner #lunch #breakfast #fresh #tasty #food #delish #delicious #eating #foodpic #foodpics #eat #hungry #foodgasm #hot #foods",
            "dessert #food #desserts #yum #yummy #amazing #instagood #instafood #sweet #chocolate #cake #icecream #dessertporn #delish #foods #delicious #tasty #eat #eating #hungry #foodpics #sweettooth",
            "drink #drinks #slurp #pub #bar #liquor #yum #yummy #thirst #thirsty #instagood #cocktail #cocktails #drinkup #glass #can #photooftheday #beer #beers #wine"]

culture_tags = ["movies #theatre #video #movie #film #films #videos #actor #actress #cinema #dvd #amc #instamovies #star #moviestar #photooftheday #hollywood #goodmovie #instagood #flick #flicks #instaflick #instaflicks",
              "music #genre #song #songs #melody #hiphop #rnb #pop #love #rap #dubstep #instagood #beat #beats #jam #myjam #party #partymusic #newsong #lovethissong #remix #favoritesong #bestsong #photooftheday #bumpin #repeat #listentothis #goodmusic #instamusic",
              "movies #theatre #video #movie #film #films #videos #actor #actress #cinema #dvd #amc #instamovies #star #moviestar #photooftheday #hollywood #goodmovie #instagood #flick #flicks #instaflick #instaflicks"]

travel_tags = ["travel #traveling #vacation #visiting #instatravel #instago #instagood #trip #holiday #photooftheday #fun #travelling #tourism #tourist #instapassport #instatraveling #mytravelgram #travelgram #travelingram #igtravel"]

sports_tags = ["soccer #ball #futbol #futball #kick #pass #shoot #score #goal #field #net #team #soccerball #photooftheday #instafutbol #instagood #grass #run #soccergame #fifa #worldcup",
              "hockey #hockeystick #puck #ice #rink #icerink #hockeyplayer #instagood #hockeyplayers #fight #photooftheday #shot #skate #hockeygram #stanleycup #score #hockeylife #pucklife #nhl",
              "football #ball #pass #footballgame #footballseason #footballgames #footballplayer #instagood #pass #jersey #stadium #field #yards #photooftheday #yardline #pads #touchdown #catch #quarterback #fit #grass #nfl #superbowl #kickoff #run",
               "basketball #basket #ball #baller #hoop #balling #sports #sport #court #net #rim #backboard #instagood #game #photooftheday #active #pass #throw #shoot #instaballer #instaball #jump #nba #bball",
               "baseball #base #ball #bases #homerun #bat #throw #catch #swing #photooftheday #field #pitcher #mlb #firstbase #game #instagood #secondbase #thirdbase #inning #baseballbat #mitt #gloves #out #sport #sports",
               "sports #sport #active #fit #football #soccer #basketball #futball #ball #balls #fun #game #games #crowd #fans #play #playing #player #field #green #grass #score #goal #action #kick #throw #pass #win #winning",]

nature_tags = ["nature #sky #sun #summer #beach #beautiful #pretty #sunset #sunrise #blue #flowers #night #tree #twilight #clouds #beauty #light #cloudporn #photooftheday #love #green #skylovers #dusk #weather #day #red #iphonesia #mothernature",
              "clouds #cloud #cloudporn #weather #lookup #sky #skies #skyporn #cloudy #instacloud #instaclouds #instagood #nature #beautiful #gloomy #skyline #horizon #overcast #instasky #epicsky #crazyclouds #photooftheday #cloud_skye #skyback #insta_sky_lovers #iskyhub",
               "fall #autumn #leaves #falltime #season #seasons #instafall #instagood #instaautumn #photooftheday #leaf #foliage #colorful #orange #red #autumnweather #fallweather #nature",
               "snow #snowing #winter #cold #ice #white #weather #sky #skies #frosty #frost #chilly #nature #snowflakes #instagood #instawinter #instasnow #photooftheday #snowfall #blizzard",
               "sunset #sunrise #sun #pretty #beautiful #red #orange #pink #sky #skyporn #cloudporn #nature #clouds #horizon #photooftheday #instagood #gorgeous #warm #view #night #morning #silhouette #instasky #all_sunsets",
               "winter #cold #holidays #snow #rain #christmas #snowing #blizzard #snowflakes #wintertime #staywarm #cloudy #instawinter #instagood #holidayseason #photooftheday #season #seasons #nature"]

In [144]:
general_tags = ['amazing','fresh','photooftheday','instagood','green','fun','love','blue','red','hot']

In [76]:
tags_list = [food_tags, culture_tags, travel_tags, sports_tags, nature_tags]

In [114]:
tags2list(food_tags)

['dinner',
 'glass',
 'cake',
 'foodporn',
 'desserts',
 'dessert',
 'pub',
 'thirsty',
 'foods',
 'yum',
 'eating',
 'delish',
 'hungry',
 'breakfast',
 'foodpic',
 'yummy',
 'slurp',
 'delicious',
 'amazing',
 'can',
 'hot',
 'instafood',
 'chocolate',
 'beer',
 'icecream',
 'sweettooth',
 'drink',
 'eat',
 'cocktails',
 'foodpics',
 'dessertporn',
 'photooftheday',
 'thirst',
 'lunch',
 'wine',
 'foodgasm',
 'drinkup',
 'tasty',
 'sweet',
 'beers',
 'instagood',
 'cocktail',
 'drinks',
 'bar',
 'food',
 'fresh',
 'liquor']

In [115]:
# removes duplicates and combines all the tags from subcategories
def new_tags_list(tags_list):
    new_tags_list = []
    for tags in tags_list:
        new_tags_list.append(list(set(tags2list(tags)) - set(general_tags)))
    return new_tags_list

In [116]:
new_tags_list = new_tags_list(tags_list)

In [117]:
new_tags_list

[['dinner',
  'glass',
  'cake',
  'foodporn',
  'desserts',
  'dessert',
  'pub',
  'thirsty',
  'foods',
  'yum',
  'eating',
  'delish',
  'hungry',
  'breakfast',
  'foodpic',
  'yummy',
  'slurp',
  'delicious',
  'can',
  'hot',
  'instafood',
  'chocolate',
  'beer',
  'sweettooth',
  'icecream',
  'drink',
  'eat',
  'cocktails',
  'foodpics',
  'dessertporn',
  'thirst',
  'lunch',
  'wine',
  'foodgasm',
  'drinkup',
  'tasty',
  'sweet',
  'beers',
  'cocktail',
  'drinks',
  'bar',
  'food',
  'liquor'],
 ['dvd',
  'lovethissong',
  'flicks',
  'cinema',
  'theatre',
  'actor',
  'pop',
  'favoritesong',
  'song',
  'genre',
  'videos',
  'bestsong',
  'moviestar',
  'music',
  'flick',
  'rnb',
  'bumpin',
  'goodmovie',
  'amc',
  'goodmusic',
  'beats',
  'party',
  'myjam',
  'listentothis',
  'films',
  'movies',
  'dubstep',
  'instamovies',
  'actress',
  'hollywood',
  'melody',
  'rap',
  'hiphop',
  'star',
  'video',
  'movie',
  'newsong',
  'film',
  'instaflic

In [118]:
def find_set_intersection(pop_tags):
    return list(set(orig_tags) - (set(orig_tags) - set(pop_tags)))

In [119]:
category_names = ['Food', 'Sports', 'Nature', 'Culture', 'Travel']

In [120]:
for index in range(len(new_tags_list)):
    df[category_names[index]] = df.apply(lambda x: set(x['Tags']) - (set(x['Tags']) - set(new_tags_list[index])), axis=1)

In [121]:
df.head()

Unnamed: 0,_id,Sentiment,Tags,Food,Sports,Nature,Culture,Travel
0,1460752286000003072,,"[Switzerland, ch, genebra, geneve, genf, ginev...",{},{},{},{},{}
1,1460727556000007168,,"[art, border, ch, eidgenoss, helvetia, land, m...",{},{},{},{},{}
2,1460728673000006656,NEUTRAL,"[Wintersport, earnyourturns, flylowgear, jungf...",{},{},{},{},{}
3,1460749002000007168,POSITIVE,"[bahnhofklatscher, eidgenoss, eidgenosse, eidg...",{},{},{},{},{}
4,1460742655000008960,POSITIVE,"[Aarau, Austria, Austrija, Basel, Bazel, Bern,...",{},{},{},{},{}


In [132]:
# checking the the number of categories
for category in category_names:
    print(category + ": " + str(len(df[df.apply(lambda x: len(x[category]),axis=1) > 0])))

Food: 58
Sports: 117
Nature: 92
Culture: 82
Travel: 188


In [138]:
for category in category_names:
    df[category] = df[category].map(len)

In [143]:
df[df.Sports > 0]

Unnamed: 0,_id,Sentiment,Tags,Food,Sports,Nature,Culture,Travel
66,1460720573000010240,POSITIVE,"[and, ataxia, booyah, club, dancefloor, dims, ...",0,1,0,0,2
67,1460720573000010240,POSITIVE,"[and, ataxia, booyah, club, dancefloor, dims, ...",0,1,0,0,2
68,1460720574000013056,POSITIVE,"[and, ataxia, booyah, club, dancefloor, dims, ...",0,1,0,0,2
89,1460701241000001536,,"[Bar, George, Grill, Restaurant, afterwork, be...",3,1,0,0,2
118,1460742792000012032,POSITIVE,"[abstract, actor, architecture, awesome, bouti...",0,2,0,0,0
141,1460757080000015360,POSITIVE,"[brennereiclub, captainsclub, dasschwarzeschaf...",0,4,0,0,0
144,1460730835000011008,,"[alps, interlaken, movie, selca, selfie, skydi...",0,1,0,0,0
148,1460715627000008192,,"[feelgood, fridaymorning, keeponrunning, lakeg...",0,1,0,1,0
173,1460715481000016128,,"[Coach, H24, Lugano, arte, breakfast, coachlug...",1,1,1,1,0
187,1460729770000011520,,"[artist, beatmaker, beats, bern, compo, compos...",0,5,1,0,0


Total is 1224, now I have 537, which is good

In [133]:
58 + 117 + 92 + 82 + 188

537

## Removing General Tags

### 2) Getting one single city from list of tags

In [None]:
CITY_LIST = df_geo.city.values # gets more complicated as df_geo gets more complicated

def extract_city(tag_list):
    # geo_map['city]
    return next((city for city in tag_list if city in CITY_LIST), None)

In [None]:
df_city = df_extracted.Tags.apply(extract_city).to_frame('City')

In [None]:
df_extracted = df_extracted.join(df_city)

In [None]:
geo_map

In [None]:
geo_map = {'zurich':'ZH','geneva':'GE','lausanne':'VD','zermatt':'VS','bern':'BE',
           'basel':'BS','geneve':'GE','winterthur':'ZH','luzern':'LU','lucerne':'LU',
          'st-gallen':'SG','lugano':'TI'}

### 3) Mapping City to Canton

In [None]:
df_extracted['Canton'] = df_extracted['City'].map(geo_map)

In [None]:
df_extracted.dropna()

In [None]:
df_extracted.shape[0]

Only 123 out of 1225 have both sentiment and canton. It's just 10% now. We need to get more sentiment using CNN and get more geolocation by using a more comprehensive dictionary of cities and cantons. The current manual list fills about 33% of total locations.

In [None]:
df_viz = df_extracted.dropna()
df_viz['Sentiment_Val'] = df_viz['Sentiment'].replace(sent_map)

In [None]:
sent_map = {'NEUTRAL': 0, 'POSITIVE':1, 'NEGATIVE':-1}

In [None]:
df_viz = pd.DataFrame(df_viz.groupby('Canton').mean()['Sentiment_Val'])
df_viz.reset_index(inplace=1) 
df_viz.columns = ['Canton', 'Sentiment_Val']

In [None]:
df_viz

In [None]:
swiss_cantons = ['AG', 'AI', 'AR', 'BE', 'BL', 'BS', 'FR', 'GE', 'GL', 'GR', 'JU',
'LU', 'NE', 'NW', 'OW', 'SG', 'SH', 'SO', 'SZ', 'TG', 'TI', 'UR',
'VD', 'VS', 'ZG', 'ZH']

In [None]:
df_viz = pd.merge(df_viz, pd.Series(swiss_cantons).to_frame('Canton'), how='outer',on='Canton')

In [None]:
df_viz.fillna(0,inplace=True)

## Building the Swiss Sentiment Map

In [None]:
# Build the Scale with the Legend
Legend = folium.colormap.linear.YlGn.scale( 0, 1 )
Legend = Legend.to_step(10)
Legend.caption = 'Happiness of Swiss Cantons'
Legend

In [None]:
# Load geojson data
geo_json_data = json.load(open(r'Data/ch-cantons.geojson.json'))

In [None]:
canton_dict = df_viz.set_index('Canton')['Sentiment_Val']

In [None]:
swiss_map = folium.Map(location=[47, 8],tiles='cartodbpositron', zoom_start=8)

folium.GeoJson(
    geo_json_data,
    style_function=lambda feature: {
        'fillColor': Legend(canton_dict[feature['id']]),
        'color': 'black',
        'weight': 1,
        'dashArray': '1',
        'fillOpacity': 0.7,
    }
).add_to(swiss_map)

Legend.add_to(swiss_map)

swiss_map