This module gets the dataframe with id, sentiment, canton for instagram json files.

In [1]:
import pandas as pd
import requests
import time
import pickle
import numpy as np
import folium
import scipy.stats as stats
import matplotlib.pyplot as plt
import math
import json
%matplotlib inline

In [2]:
import os
import sys

spark_path = os.environ["SPARK_PATH"]
os.environ['SPARK_HOME'] = spark_path
os.environ['HADOOP_HOME'] = spark_path

sys.path.append(spark_path + "/bin")
sys.path.append(spark_path + "/python")
sys.path.append(spark_path + "/python/pyspark/")
sys.path.append(spark_path + "/python/lib")
sys.path.append(spark_path + "/python/lib/pyspark.zip")
sys.path.append(spark_path + "/python/lib/py4j-0.9-src.zip")

## Defining Spark Context

In [3]:
import json
import subprocess
import fnmatch
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

# Defining sc
conf = SparkConf().setAppName("ADA-GCL")
sc = SparkContext(conf=conf)

# use SQL context
sqlContext = SQLContext(sc)

## Spark Application

Quite simple at the moment, just getting the dataframe.

In [4]:
# Path definitions
# base_path = 'hdfs://iccluster046.iccluster.epfl.ch:8020/datasets/goodcitylife'
# insta_files='*/harvest3r_instagram_data*.json'
sample_path ='hdfs://iccluster046.iccluster.epfl.ch:8020/datasets/goodcitylife/april/harvest3r_instagram_data_28-04_0.json'

# getting the dataframe
df = sqlContext.read.json(sample_path)

# function
# def get_tags(source):
#     return source['tags']

# df_tags = df.select("_source").foreach(get_tags)

# Displays the content of the DataFrame to stdout
df.show()

+-------------------+--------------------+-----------+--------------------+-------+
|                _id|              _index|     _score|             _source|  _type|
+-------------------+--------------------+-----------+--------------------+-------+
|1461840004439800092|merged_content_20...| 0.18902478|[UNKNOWN,pdeleona...|content|
|1461845617018500058|merged_content_20...| 0.17140923|[UNKNOWN,lolig_sw...|content|
|1461843807915600114|merged_content_20...| 0.16049755|[UNKNOWN,bestworl...|content|
|1461865216725800014|merged_content_20...| 0.13359328|[MALE,netzwandern...|content|
|1461870662929600135|merged_content_20...|  0.1325924|[UNKNOWN,pdeleona...|content|
|1461882421947400206|merged_content_20...|0.122113265|[UNKNOWN,d_e_n_i_...|content|
|1461882806802000025|merged_content_20...| 0.12073777|[UNKNOWN,d_e_n_i_...|content|
|1461883237548800035|merged_content_20...| 0.12073777|[UNKNOWN,d_e_n_i_...|content|
|1461830185754200033|merged_content_20...| 0.08652698|[UNKNOWN,gruyeres...|c

# Local Operations
* will need to move this into Spark Applications with Spark DataFrames, but developing locally using pandas DataFrames now...

### Create a good canton list

In [5]:
geo_map = {'city':['zurich','geneva','lausanne','zermatt','bern','basel','geneve','winterthur','luzern','lucerne','st-gallen','lugano'], 
           'canton': ['Zurich', 'Geneva','Vaud', 'Valais', "Bern", 'Basel-City', 'Geneva', 'Zurich', 'Lucerne','Lucerne', 'St-Gallen', 'Ticino']}

In [6]:
df_geo = pd.DataFrame.from_dict(geo_map)

In [7]:
df_geo.head()

Unnamed: 0,canton,city
0,Zurich,zurich
1,Geneva,geneva
2,Vaud,lausanne
3,Valais,zermatt
4,Bern,bern


In [8]:
top_population = ['zurich','geneva','lausanne','zermatt','bern','basel','geneve','winterthur','luzern','lucerne','st-gallen','lugano','biel','thun','fribourg']
top_tourist = ['zermatt','montreux','jungfrau','interlaken',]

In [9]:
popular_cities = ['zurich','geneva','lausanne','zermatt','bern','basel','geneve','winterthur','luzern','lusern','st-gallen','lugano']

In [10]:
df_geo.city.values

array(['zurich', 'geneva', 'lausanne', 'zermatt', 'bern', 'basel',
       'geneve', 'winterthur', 'luzern', 'lucerne', 'st-gallen', 'lugano'], dtype=object)

### Full Cantons
Downloaded an Excel File

In [11]:
df_cantons = pd.read_csv('cities_cantons.csv')
df_cantons = df_cantons[['Ortschaftsname','Gemeindename','Kantonskürzel']].drop_duplicates()
df_cantons.columns = ['City','Municipality','Canton']
df_cantons.head()

Unnamed: 0,City,Municipality,Canton
0,Aadorf,Aadorf,TG
1,Aarau,Aarau,AG
3,Aarau Rohr,Aarau,AG
4,Aarberg,Aarberg,BE
5,Aarburg,Aarburg,AG


## Dataframe with Geo and Sentiment
* 1) turning the raw dataframe with id, index, score, source and type into a dataframe with id, tags, and sentiment
* 2) turn the list of geo_id into one new column with just one geolocation
* 3) map that geolocation to another column called canton

### 0) Original DataFrame

In [12]:
df = pd.read_json('Sample Data/harvest3r_instagram_data_15-04_0.json')

In [13]:
df.head()

Unnamed: 0,_id,_index,_score,_source,_type
0,1460752286000003072,merged_content_2016_04_15_to_2016_04_21,0.048309,"{'sequence_range': 83081, 'source_link': 'http...",content
1,1460727556000007168,merged_content_2016_04_15_to_2016_04_21,0.029747,"{'sequence_range': 70901, 'source_link': 'http...",content
2,1460728673000006656,merged_content_2016_04_15_to_2016_04_21,0.026873,"{'sequence_range': 42239, 'source_link': 'http...",content
3,1460749002000007168,merged_content_2016_04_15_to_2016_04_21,0.022194,"{'sequence_range': 87456, 'source_link': 'http...",content
4,1460742655000008960,merged_content_2016_04_15_to_2016_04_21,0.021687,"{'sequence_range': 58253, 'source_link': 'http...",content


### 1) Getting id, tags, sentiment

In [14]:
def get_tags(source):
    return source['tags']

In [15]:
def get_sentiment(source):
    return source.get('sentiment') # getting the value from key using dictionary

In [16]:
df_sentiment = df._source.apply(get_sentiment).to_frame('Sentiment')

In [17]:
df_tags = df._source.apply(get_tags).to_frame('Tags')

In [18]:
df_extracted = df._id.to_frame('_id').join(df_sentiment)
df_extracted = df_extracted.join(df_tags)
df_extracted

Unnamed: 0,_id,Sentiment,Tags
0,1460752286000003072,,"[Switzerland, ch, genebra, geneve, genf, ginev..."
1,1460727556000007168,,"[art, border, ch, eidgenoss, helvetia, land, m..."
2,1460728673000006656,NEUTRAL,"[Wintersport, earnyourturns, flylowgear, jungf..."
3,1460749002000007168,POSITIVE,"[bahnhofklatscher, eidgenoss, eidgenosse, eidg..."
4,1460742655000008960,POSITIVE,"[Aarau, Austria, Austrija, Basel, Bazel, Bern,..."
5,1460742313000008192,,"[europa, europe, flag, makeitrain, schweiz, su..."
6,1460762902000002560,NEUTRAL,"[ahmadiyya, antwort, basel, beatiful, bern, be..."
7,1460724976000008960,,"[AtTheAirport2016, blackandwhite, bnw, fujifee..."
8,1460859717000004864,,"[CH, bern, citylife, citytrain, commute, europ..."
9,1460748532000009216,,"[AtTheAirport2016, blackandwhite, bnw, fujifee..."


In [19]:
# tags are a list
df_extracted['Tags'][0]

['Switzerland',
 'ch',
 'genebra',
 'geneve',
 'genf',
 'ginevra',
 'onu',
 'schweiz',
 'suisse',
 'svizzera',
 'swiss',
 'un']

### 2) Getting one single city from list of tags

In [20]:
CITY_LIST = df_geo.city.values # gets more complicated as df_geo gets more complicated

def extract_city(tag_list):
    # geo_map['city]
    return next((city for city in tag_list if city in CITY_LIST), None)

In [21]:
df_city = df_extracted.Tags.apply(extract_city).to_frame('City')

In [22]:
df_extracted = df_extracted.join(df_city)

In [23]:
geo_map

{'canton': ['Zurich',
  'Geneva',
  'Vaud',
  'Valais',
  'Bern',
  'Basel-City',
  'Geneva',
  'Zurich',
  'Lucerne',
  'Lucerne',
  'St-Gallen',
  'Ticino'],
 'city': ['zurich',
  'geneva',
  'lausanne',
  'zermatt',
  'bern',
  'basel',
  'geneve',
  'winterthur',
  'luzern',
  'lucerne',
  'st-gallen',
  'lugano']}

In [24]:
geo_map = {'zurich':'Zurich','geneva':'Geneva','lausanne':'Vaud','zermatt':'Valais','bern':'Bern',
           'basel':'Basel-City','geneve':'Geneva','winterthur':'Zurich','luzern':'Lucerne','lucerne':'Lucerne',
          'st-gallen':'St-Gallen','lugano':'Ticino'}

### 3) Mapping City to Canton

In [25]:
df_extracted['Canton'] = df_extracted['City'].map(geo_map)

In [26]:
df_extracted.dropna()

Unnamed: 0,_id,Sentiment,Tags,City,Canton
6,1460762902000002560,NEUTRAL,"[ahmadiyya, antwort, basel, beatiful, bern, be...",basel,Basel-City
10,1460740192000012544,POSITIVE,"[all, alles, country, danke, europe, eurowing,...",zurich,Zurich
13,1460714111000013056,NEUTRAL,"[aeroport, choices, geneva, genevaguide, genev...",geneva,Geneva
16,1460746382000002816,POSITIVE,"[allblackeverything, art, beauty, beautyblogge...",geneve,Geneva
39,1460715820000002304,POSITIVE,"[aeroport, airport, beautiful, bigworld, board...",geneva,Geneva
41,1460731351000002048,NEUTRAL,"[djpush, matterhorn, pjsweetcouple, push_dj, p...",zermatt,Valais
43,1460709766000001792,NEUTRAL,"[badboy, badboys, bijoux, bikerlife, booba, ch...",geneve,Geneva
50,1460751107000014336,POSITIVE,"[black, blackandwhite, blackandwhitephotograph...",geneva,Geneva
52,1460697581000008704,NEUTRAL,"[Butterfly, KATRINLANGER, Lucerne, accessories...",luzern,Lucerne
65,1460706032000000256,NEUTRAL,"[beautiful, bling, celiabrown, celiabrownjewel...",geneva,Geneva


In [27]:
df_extracted.shape[0]

1225

Only 123 out of 1225 have both sentiment and canton. It's just 10% now. We need to get more sentiment using CNN and get more geolocation by using a more comprehensive dictionary of cities and cantons. The current manual list fills about 33% of total locations.