# Snapchat Results Analysis

### Importing Requisite Libraries

In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
from tqdm import tqdm

import os
import sys
import csv
import requests
import time
import json
import numpy as np
import pandas as pd

from pprint import pprint
from subprocess import call

### Initializing Matplotlib

In [None]:
from __future__ import division, print_function
# In a notebook environment, display the plots inline
%matplotlib inline

# Set some parameters to apply to all plots. These can be overridden
# in each plot if desired
# import matplotlib
# Default figure size: 10" x 8"
matplotlib.rc('figure', figsize = (10, 8))
# Default font size: 24
matplotlib.rc('font', size = 24)
# Do not display top and right frame lines
matplotlib.rc('axes.spines', top = False, right = False)
# Remove grid lines
matplotlib.rc('axes', grid = False)
# Set background color to white
matplotlib.rc('axes', facecolor = 'white')

### Import prediction results data

In [2]:
# Load the combined prediction results (presumably one row per snap -- confirm
# against the export). Later cells relabel the `non_driving` flag as
# 0 -> 'driving' and 1 -> 'non-driving'.
df = pd.read_csv("combined_v5.csv")
df.head()

Unnamed: 0,id,city,lat,lon,utc_timestamp,local_timestamp,scraped_date,duration,url,non_driving,local_date,local_time,local_hour,local_minute,local_second,local_weekday,local_day
0,W7_EDlXWTBiXAEEniNoMPwAAY3AiPrLhH65pdAWmCqu4nA...,Delhi,28.674125,77.064029,1552674600000,2019-03-16 00:00:00,16_03_2019,9.296667,https://s.sc-cdn.net/c8AAKzM0ru6kt8ftIHTCod001...,1,2019-03-16,00:00:00,0,0,0,Saturday,16
1,W7_EDlXWTBiXAEEniNoMPwAAYmSj9OwxbeKXrAWmCqul9A...,Delhi,28.674125,77.064029,1552674600000,2019-03-16 00:00:00,16_03_2019,10.0,https://s.sc-cdn.net/MoZ5SufxR6oQNk32WBYBu8o4Z...,1,2019-03-16,00:00:00,0,0,0,Saturday,16
2,W7_EDlXWTBiXAEEniNoMPwAAYmJ1wsiKtCM_uAWmDYGmiA...,Amman,31.95327,35.912186,1552687200000,2019-03-16 00:00:00,16_03_2019,10.0,https://s.sc-cdn.net/FqdN13UMj9SOpvXTZAH_mp90-...,1,2019-03-16,00:00:00,0,0,0,Saturday,16
3,W7_EDlXWTBiXAEEniNoMPwAAY7tay4k7ISEREAWmCoJ7-A...,Mumbai,19.06464,72.87139,1552674600000,2019-03-16 00:00:00,16_03_2019,10.0,https://s.sc-cdn.net/qH7cPflldGHu88ettD-GWS3Uq...,1,2019-03-16,00:00:00,0,0,0,Saturday,16
4,W7_EDlXWTBiXAEEniNoMPwAAY3VSbhsZ7kE2ZAWmCns9sA...,Mumbai,19.046673,72.899907,1552674600000,2019-03-16 00:00:00,16_03_2019,10.0,https://s.sc-cdn.net/JI29SEAn7MWkkDtbsx70cDqES...,1,2019-03-16,00:00:00,0,0,0,Saturday,16


##### Compute data by class & hour

In [38]:
# Number of snaps per (class, local hour); `id` is only the column being counted.
data = (
    df.groupby(['non_driving', 'local_hour'])['id']
    .count()
    .reset_index(name='count')
)

# Relabel the 0/1 flag for display. The result is assigned back instead of
# calling `data.non_driving.replace(..., inplace=True)`: that form mutates an
# intermediate Series, which recent pandas warns about and which has no effect
# on `data` once copy-on-write is enabled.
data['non_driving'] = data['non_driving'].replace({0: 'driving', 1: 'non-driving'})
data.head()

Unnamed: 0,non_driving,local_hour,count
0,driving,0,81935
1,driving,1,64256
2,driving,2,49212
3,driving,3,34322
4,driving,4,21259


In [42]:
# One [driving, non-driving] pair per hour of the day; an hour with no rows in
# `data` keeps its zero.
hours_list = [[0, 0] for _ in range(24)]
for label, hour, n_snaps in zip(data["non_driving"], data["local_hour"], data["count"]):
    column = 0 if label == "driving" else 1
    hours_list[hour][column] = n_snaps

hours_list = np.array(hours_list)
hours_list

array([[ 81935, 231464],
       [ 64256, 191337],
       [ 49212, 132524],
       [ 34322,  81808],
       [ 21259,  49673],
       [ 19952,  40451],
       [ 36214,  56197],
       [ 46024,  75695],
       [ 43962,  98167],
       [ 46085, 129073],
       [ 50940, 164400],
       [ 57933, 194310],
       [ 58806, 211742],
       [ 68355, 236904],
       [ 74139, 247603],
       [ 64073, 238103],
       [ 73780, 252610],
       [ 89399, 288834],
       [ 82655, 311395],
       [ 81502, 329552],
       [ 80421, 347832],
       [ 99634, 377141],
       [100303, 357780],
       [ 90189, 271608]])

In [43]:
# Correlation matrix between the hourly driving counts (column 0) and the
# hourly non-driving counts (column 1).
np.corrcoef(hours_list[:, 0], hours_list[:, 1])

array([[1.        , 0.95451707],
       [0.95451707, 1.        ]])

#### Compute daytime and night-time driving ratio

In [25]:
# Driving rows that fall in the night window: hours 18-23 plus hours 0-2.
is_driving = data["non_driving"] == "driving"
hour_of_day = data["local_hour"]
is_night = (hour_of_day >= 18) | hour_of_day.between(0, 2)
new_data = data[is_driving & is_night]
new_data.head()

Unnamed: 0,non_driving,local_hour,count
0,driving,0,81935
1,driving,1,64256
2,driving,2,49212
18,driving,18,82655
19,driving,19,81502


#### Compute daytime and night-time ratio for all snaps

In [28]:
# Total number of snaps (both classes together) per local hour.
data = (
    df.groupby('local_hour')['id']
    .count()
    .reset_index(name='count')
)
data.head()

Unnamed: 0,local_hour,count
0,0,313399
1,1,255593
2,2,181736
3,3,116130
4,4,70932


In [30]:
# Rows in the night window: hours 18-23 plus hours 0-2.
hour_of_day = data["local_hour"]
is_night = (hour_of_day >= 18) | hour_of_day.between(0, 2)
new_data = data[is_night]
new_data.head()

Unnamed: 0,local_hour,count
0,0,313399
1,1,255593
2,2,181736
18,18,394050
19,19,411054


In [None]:
sns.set(font_scale=1.5)
sns.set_style("whitegrid", {'axes.grid': False})

# Build the per-class hourly counts inside this cell. The plot needs a
# `non_driving` column for the hue, but the cell just above reassigns `data`
# to the all-snaps hourly totals (no `non_driving` column), so relying on
# `data` here fails when the notebook is run top to bottom.
diurnal_data = (
    df.groupby(['non_driving', 'local_hour'])['id']
    .count()
    .reset_index(name='count')
)
diurnal_data['non_driving'] = diurnal_data['non_driving'].replace({0: 'driving', 1: 'non-driving'})

# Cubic fit (order=3) of snap volume against hour, one curve per class.
ax = sns.lmplot(data=diurnal_data, x='local_hour', y='count', hue='non_driving',
                fit_reg=True, order=3, legend=False, markers=['x', 'o'], height=7, aspect=1.2)
ax.set(xlabel='Hour of the day', ylabel='Number of Snaps')
plt.legend(loc='upper left')
plt.xlim([0, 24])

ax.set(yticks=[100000, 200000, 300000, 400000, 500000], yticklabels=['100k', '200k', '300k', '400k', '500k'])

# Make sure the output directory exists before saving.
os.makedirs('plots', exist_ok=True)
ax.savefig('plots/diurnal.png', dpi=200, bbox_inches='tight')

### Day of the week analysis

In [None]:
# Per-weekday occurrence counts -- presumably the number of times each weekday
# falls inside the scraped date range; TODO confirm against the data.
num_saturdays = 5
num_sundays = 5
num_mondays = 5
num_tuesdays = 4
num_wednesdays = 4
num_thursdays = 4
num_fridays = 4

In [None]:
# Number of snaps per (class, weekday); `id` is only the column being counted.
data = (
    df.groupby(['non_driving', 'local_weekday'])['id']
    .count()
    .reset_index(name='count')
)

# Relabel the 0/1 flag by assigning the result back. Calling
# `data.non_driving.replace(..., inplace=True)` mutates an intermediate Series,
# which recent pandas warns about and which is a no-op under copy-on-write.
data['non_driving'] = data['non_driving'].replace({0: 'driving', 1: 'non-driving'})
# head(-1) shows every row except the last one.
data.head(-1)

In [None]:
# Turn per-weekday totals into per-day averages by dividing each total by the
# number of times that weekday occurs (the num_* constants defined above).
weekday_occurrences = {
    "Saturday": num_saturdays,
    "Sunday": num_sundays,
    "Monday": num_mondays,
    "Tuesday": num_tuesdays,
    "Wednesday": num_wednesdays,
    "Thursday": num_thursdays,
    "Friday": num_fridays,
}
occurrences = data['local_weekday'].map(weekday_occurrences)

# Fail loudly on an unexpected weekday label instead of silently averaging it
# with the wrong divisor.
unknown_weekdays = data.loc[occurrences.isna(), 'local_weekday'].unique()
if len(unknown_weekdays):
    raise ValueError("Unexpected weekday labels: {}".format(list(unknown_weekdays)))

# Keep the raw totals so that re-running this cell does not divide twice.
if 'total_count' not in data.columns:
    data['total_count'] = data['count']
# Whole-column assignment (instead of per-row `.at` writes) also avoids
# storing floats into an integer column one cell at a time.
data['count'] = data['total_count'] / occurrences
data.head()

In [None]:
a4_dims = (11.7, 8.27)
sns.set(rc={'figure.figsize': a4_dims}, font_scale=1.5)
sns.set_style("whitegrid", {'axes.grid': False})

# Show the bars in calendar order rather than the alphabetical order that the
# groupby produces.
weekday_order = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
ax = sns.barplot(x="local_weekday", y="count", hue="non_driving", data=data, order=weekday_order)

ax.set(xlabel='Day of the Week', ylabel='Number of Snaps')
plt.legend(loc='upper left')

# Save next to the other figures in the relative `plots/` directory instead of
# an absolute, user-specific path.
os.makedirs('plots', exist_ok=True)
plt.savefig('plots/day-of-the-week.png')

### Day of the month analysis

In [None]:
# Number of snaps per (class, day of month); `id` is only the column being counted.
data = (
    df.groupby(['non_driving', 'local_day'])['id']
    .count()
    .reset_index(name='count')
)

# Relabel the 0/1 flag by assigning the result back. Calling
# `data.non_driving.replace(..., inplace=True)` mutates an intermediate Series,
# which recent pandas warns about and which is a no-op under copy-on-write.
data['non_driving'] = data['non_driving'].replace({0: 'driving', 1: 'non-driving'})
data.head()

In [None]:
a4_dims = (20, 8.27)
sns.set(rc={'figure.figsize': a4_dims}, font_scale=1.5)
sns.set_style("whitegrid", {'axes.grid': False})
# Always draw days 1..31 in order, even if a day is missing from the data.
order = list(range(1, 32))

ax = sns.barplot(x="local_day", y="count", hue="non_driving", data=data, order=order)
ax.set(xlabel='Day of the Month', ylabel='Number of Snaps')
plt.legend(loc='upper left')

# Save next to the other figures in the relative `plots/` directory instead of
# an absolute, user-specific path.
os.makedirs('plots', exist_ok=True)
plt.savefig('plots/day-of-the-month.png')

##### Add secondary computations by various metrics

In [None]:
# Hours as rows, classes as columns (0 = driving, 1 = non-driving). An hour
# with no snaps of one class gets 0 instead of NaN.
data = (
    df.groupby(['non_driving', 'local_hour'])['id']
    .count()
    .unstack(level=0, fill_value=0)
)

driving_counts = data[0]
non_driving_counts = data[1]
hourly_totals = driving_counts + non_driving_counts
total_snaps = len(df)

# Share of each class's own snaps that falls in this hour. The previous
# `.ix`-based slices (an indexer removed in pandas 1.0) summed across both
# classes, and the second one even included the freshly added percent column.
data['driving_percent_class'] = driving_counts / driving_counts.sum() * 100
data['non_driving_percent_class'] = non_driving_counts / non_driving_counts.sum() * 100
# Share of all snaps in the dataset.
data['driving_percent_overall'] = driving_counts * 100 / total_snaps
data['non_driving_percent_overall'] = non_driving_counts * 100 / total_snaps
# Share of the snaps posted within the same hour.
data['driving_percent_hourly'] = driving_counts * 100 / hourly_totals
data['non_driving_percent_hourly'] = non_driving_counts * 100 / hourly_totals

# Turn `local_hour` back into a regular column.
data = data.reset_index()
data

In [None]:
# Long format for seaborn: one row per (hour, series). The series name goes in
# `non_driving` so it can drive the hue, and `hour` comes from `local_hour`
# rather than from the positional row index.
percent_class = (
    data[['local_hour', 'driving_percent_class', 'non_driving_percent_class']]
    .melt(id_vars='local_hour', var_name='non_driving', value_name='percent')
    .rename(columns={'local_hour': 'hour'})
)

ax = sns.lmplot(data=percent_class, x='hour', y='percent', hue='non_driving',
                fit_reg=True, order=3, legend=False, markers=['x', 'o'])
ax.set(xlabel='Hour of the day', ylabel='Percent Class-wise')
plt.legend(loc='upper left')

# Save next to the other figures in the relative `plots/` directory instead of
# an absolute, user-specific path.
os.makedirs('plots', exist_ok=True)
ax.savefig('plots/percent-class-wise.png')

In [None]:
sns.set_style("whitegrid", {'axes.grid': False})
# Stacked bars of the within-hour class split, labelled by the actual hour
# instead of the positional row index.
ax = data.plot(x='local_hour', y=['driving_percent_hourly', 'non_driving_percent_hourly'],
               kind='bar', stacked=True)
ax.set(xlabel='Hour of the day', ylabel='Percent Hourly')
# Single legend placed above the axes (an earlier `plt.legend(loc='upper left')`
# call was immediately replaced by this one, so it is dropped).
plt.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3,
           ncol=2, mode="expand", borderaxespad=0.)

In [None]:
# Long format for seaborn: one row per (hour, series). The series name goes in
# `non_driving` so it can drive the hue, and `hour` comes from `local_hour`
# rather than from the positional row index.
percent_class = (
    data[['local_hour', 'driving_percent_hourly', 'non_driving_percent_hourly']]
    .melt(id_vars='local_hour', var_name='non_driving', value_name='percent')
    .rename(columns={'local_hour': 'hour'})
)

ax = sns.lmplot(data=percent_class, x='hour', y='percent', hue='non_driving',
                fit_reg=True, order=3, legend=False, markers=['x', 'o'])
ax.set(xlabel='Hour of the day', ylabel='Percent Hourly')

# Single legend placed above the axes (an earlier `plt.legend(loc='upper left')`
# call was immediately replaced by this one, so it is dropped).
plt.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3,
           ncol=1, mode="expand", borderaxespad=0.)

# Save next to the other figures in the relative `plots/` directory instead of
# an absolute, user-specific path.
os.makedirs('plots', exist_ok=True)
ax.savefig('plots/percent-hourly.png', dpi=300)

In [None]:
# Long format for seaborn: one row per (hour, series). The columns are named
# once, in their final order: `non_driving` (series name), `hour`, `percent`.
percent_class = (
    data[['local_hour', 'driving_percent_overall', 'non_driving_percent_overall']]
    .melt(id_vars='local_hour', var_name='non_driving', value_name='percent')
    .rename(columns={'local_hour': 'hour'})
    [['non_driving', 'hour', 'percent']]
)

ax = sns.lmplot(data=percent_class, x='hour', y='percent', hue='non_driving',
                fit_reg=True, order=3, legend=False, markers=['x', 'o'])
ax.set(xlabel='Hour of the day', ylabel='Percent Overall')
plt.legend(loc='upper left')

# Save next to the other figures in the relative `plots/` directory instead of
# an absolute, user-specific path.
os.makedirs('plots', exist_ok=True)
ax.savefig('plots/percent-overall.png')

### City analysis

In [44]:
# Driving (0) and non-driving (1) snap counts side by side, one row per city.
city_counts = (
    df.groupby(['non_driving', 'city'])['id']
    .count()
    .unstack(level=0)
    .reset_index()
)
# Fraction of each city's snaps that were classified as driving.
city_counts['driving_percent'] = city_counts[0] / (city_counts[0] + city_counts[1])
ranked_cities = city_counts.sort_values(by='driving_percent', ascending=False)
# Keep only cities whose driving share is at least 0.197.
city_data = ranked_cities[ranked_cities['driving_percent'] >= 0.197]
city_data.head()

non_driving,city,0,1,driving_percent
39,Chandigarh,9320,12437,0.428368
8,Amritsar,6472,9180,0.413493
143,Riyadh,398276,599051,0.399343
14,Baghdad,37522,59074,0.388443
3,Ahmedabad,4444,7372,0.3761


In [None]:
# City names and their driving share, in the row order of `city_data`.
x = city_data['city'].tolist()
y = city_data['driving_percent'].tolist()

In [None]:
a4_dims = (20, 10)
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and the
# old names were removed later; use whichever name this installation provides.
white_style = 'seaborn-v0_8-white' if 'seaborn-v0_8-white' in plt.style.available else 'seaborn-white'
# Apply the style before creating the figure so that it takes effect on it.
plt.style.use(white_style)
plt.figure(figsize=a4_dims)

y_pos = range(len(x))
# `y` holds fractions in [0, 1]; the axis is labelled as a percentage, so scale.
y_percent = [fraction * 100 for fraction in y]

bars = plt.bar(x=y_pos, height=y_percent)
plt.xlabel('City', fontsize=24)
plt.ylabel('Percentage of Driving Snaps', fontsize=24)
plt.xticks(y_pos, x, rotation=90, fontsize=24)
plt.yticks(fontsize=24)
plt.xlim([-1, 30])

# Make sure the output directory exists before saving.
os.makedirs('plots', exist_ok=True)
plt.savefig('plots/city-percent.png', dpi=200, bbox_inches="tight")

### Grid center data and analysis

In [None]:
# Load the grid tile table. A later cell groups it by `city` and counts
# `tile_number`, so both columns are expected to be present.
grid_csv = pd.read_csv('grid_center.csv')
grid_csv.shape

In [None]:
grid_csv.head()

In [None]:
# Number of snaps recorded at each distinct (lat, lon) coordinate.
grid_data = (
    df.groupby(['lat', 'lon'])['id']
    .count()
    .reset_index(name='snap_count')
)
grid_data.head()

In [None]:
# Same count, restricted to snaps flagged as driving (non_driving == 0).
driving_snaps = df[df.non_driving == 0]
driving_grid_data = (
    driving_snaps.groupby(['lat', 'lon'])['id']
    .count()
    .reset_index(name='driving_count')
)
driving_grid_data.head()

In [None]:
# Attach the driving count to every coordinate; coordinates without any
# driving snap are missing from `driving_grid_data` and come back as NaN.
grid_data = pd.merge(grid_data, driving_grid_data, on=['lat', 'lon'], how='left')
# Assign the filled column back. `grid_data.driving_count.fillna(..., inplace=True)`
# mutates an intermediate Series, which recent pandas warns about and which
# leaves the NaNs in place under copy-on-write.
grid_data['driving_count'] = grid_data['driving_count'].fillna(0.0)
grid_data.sum()

In [None]:
grid_data.head()

In [None]:
# Look up the city of every coordinate from the snap table.
tile_cities = df[['city', 'lat', 'lon']].drop_duplicates()
grid_data = pd.merge(grid_data, tile_cities, on=['lat', 'lon'], how='left')
grid_data.head()

In [None]:
# Work on a copy: a plain `grid_data_new = grid_data` is only an alias, so the
# new columns would silently be added to `grid_data` as well.
grid_data_new = grid_data.copy()

# Per-city totals broadcast back onto each tile row.
city_totals = grid_data_new.groupby('city')[['snap_count', 'driving_count']].transform('sum')

# Share of the tile's own snaps that are driving snaps.
grid_data_new['driving_tile_percent'] = grid_data_new['driving_count'] * 100 / grid_data_new['snap_count']
# Tile's share of its city's snaps, and of its city's driving snaps.
grid_data_new['snap_city_percent'] = grid_data_new['snap_count'] * 100 / city_totals['snap_count']
grid_data_new['driving_city_percent'] = grid_data_new['driving_count'] * 100 / city_totals['driving_count']

In [None]:
# Fix the set and order of the exported columns.
export_columns = [
    'city', 'lat', 'lon',
    'snap_count', 'driving_count',
    'driving_tile_percent', 'snap_city_percent', 'driving_city_percent',
]
grid_data_new = grid_data_new[export_columns]
grid_data_new.head()

In [None]:
# Write the per-tile summary to grid_data.csv, omitting the DataFrame index.
grid_data_new.to_csv("grid_data.csv", index=False)

In [None]:
# Tiles defined for each city vs. tiles that received at least one snap.
tiles_per_city = grid_csv.groupby('city')['tile_number'].count()
tiles_with_snaps = grid_data.groupby('city')['snap_count'].count()

# Align both counts on the union of cities. A city with no snaps at all has
# zero observed tiles, so every one of its tiles is an empty tile; previously
# the subtraction produced NaN for such a city and it was reported as 0.
all_cities = tiles_per_city.index.union(tiles_with_snaps.index)
empty_tiles = (
    tiles_per_city.reindex(all_cities)
    - tiles_with_snaps.reindex(all_cities, fill_value=0)
)

# Cities that appear only in the snap data have no tile definition; they keep
# the previous value of 0.
num_zero_tiles = (
    empty_tiles.fillna(0.0)
    .rename('num_zero')
    .rename_axis('city')
    .reset_index()
)
num_zero_tiles.head()

In [None]:
# Quadratic fit (order=2) of driving snaps against total snaps, one point per
# row of `grid_data`.
ax = sns.lmplot(
    data=grid_data,
    x='snap_count',
    y='driving_count',
    fit_reg=True,
    order=2,
    markers='o',
    legend=False,
    scatter_kws={"s": 1},
)
# ax.set(xscale="log", yscale="log")
ax.set(xlabel='Number of Snaps', ylabel='Number of Driving Snaps')

### Heatmap

In [None]:
import folium

In [None]:
def generateBaseMap(default_location=[40.6452228,-74.015037], default_zoom_start=10):
    """Create an empty folium map with a scale control.

    Parameters
    ----------
    default_location: array_like
        [lat, lon] the map is centred on.
    default_zoom_start: integer
        Initial zoom level.

    Returns
    -------
    folium.Map
    """
    return folium.Map(
        location=default_location,
        zoom_start=default_zoom_start,
        control_scale=True,
    )

#### New York

In [None]:
from folium.plugins import HeatMap

# All New York snaps; each coordinate is weighted by the sum of `non_driving`
# at that coordinate.
new_york_df = df[df.city == "New York"]
base_map = generateBaseMap()
tile_weights = (
    new_york_df[['lat', 'lon', 'non_driving']]
    .groupby(['lat', 'lon'])
    .sum()
    .reset_index()
)
HeatMap(data=tile_weights.values.tolist(), radius=8, max_zoom=10).add_to(base_map)
base_map

In [None]:
# New York driving snaps only; each coordinate is weighted by its row count.
is_new_york = df.city == "New York"
is_driving = df.non_driving == 0
new_york_df = df[is_new_york & is_driving]
base_map = generateBaseMap()
tile_weights = (
    new_york_df[['lat', 'lon', 'non_driving']]
    .groupby(['lat', 'lon'])
    .count()
    .reset_index()
)
HeatMap(data=tile_weights.values.tolist(), radius=8, max_zoom=10).add_to(base_map)
base_map

#### Riyadh

In [None]:
# Riyadh driving snaps only; each coordinate is weighted by its row count.
is_riyadh = df.city == "Riyadh"
is_driving = df.non_driving == 0
city_df = df[is_riyadh & is_driving]
base_map = generateBaseMap(default_location=[24.7241504,46.2620616])
tile_weights = (
    city_df[['lat', 'lon', 'non_driving']]
    .groupby(['lat', 'lon'])
    .count()
    .reset_index()
)
HeatMap(data=tile_weights.values.tolist(), radius=8, max_zoom=10).add_to(base_map)
base_map

In [None]:
# All Riyadh snaps; each coordinate is weighted by its row count.
city_df = df[df.city == "Riyadh"]
base_map = generateBaseMap(default_location=[24.7241504,46.2620616])
tile_weights = (
    city_df[['lat', 'lon', 'non_driving']]
    .groupby(['lat', 'lon'])
    .count()
    .reset_index()
)
HeatMap(data=tile_weights.values.tolist(), radius=8, max_zoom=10).add_to(base_map)
base_map

#### NYC Grid

In [None]:
def get_geojson_grid(upper_right, lower_left, n=6):
    """Split a bounding box into an n x n grid of GeoJSON rectangles.

    Parameters
    ----------
    upper_right: array_like
        [lat, lon] of the upper right hand corner of the bounding box.

    lower_left: array_like
        [lat, lon] of the lower left hand corner of the bounding box.

    n: integer
        The number of rows/columns in the (n, n) grid. Must be at least 1.

    Returns
    -------

    list
        One "geojson style" FeatureCollection dictionary per grid cell, ordered
        row by row from the lowest latitude, each row from the lowest longitude.
        Each dictionary holds a single Polygon feature, and its "properties"
        carry the cell's "lower_left" and "upper_right" corners. Note that these
        corners and the polygon coordinates are in GeoJSON [lon, lat] order,
        the reverse of the [lat, lon] order of the arguments.

    Raises
    ------
    ValueError
        If n is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be a positive integer, got {!r}".format(n))

    lat_edges = np.linspace(lower_left[0], upper_right[0], n + 1)
    lon_edges = np.linspace(lower_left[1], upper_right[1], n + 1)

    lat_stride = lat_edges[1] - lat_edges[0]
    lon_stride = lon_edges[1] - lon_edges[0]

    all_boxes = []

    for lat in lat_edges[:-1]:
        for lon in lon_edges[:-1]:
            # Cell bounds as plain Python floats. Separate names are used so
            # that the `upper_right` / `lower_left` arguments are not rebound.
            south, north = float(lat), float(lat + lat_stride)
            west, east = float(lon), float(lon + lon_stride)

            box_upper_left = [west, north]
            box_upper_right = [east, north]
            box_lower_right = [east, south]
            box_lower_left = [west, south]

            # Closed ring: the first corner is repeated at the end.
            ring = [
                box_upper_left,
                box_upper_right,
                box_lower_right,
                box_lower_left,
                box_upper_left,
            ]

            all_boxes.append({
                "type": "FeatureCollection",
                "properties": {
                    "lower_left": box_lower_left,
                    "upper_right": box_upper_right,
                },
                "features": [{
                    "type": "Feature",
                    "geometry": {
                        "type": "Polygon",
                        "coordinates": [ring],
                    },
                }],
            })

    return all_boxes

In [None]:
# Bounding box as [lat, lon] corners, split into a 15 x 15 grid.
upper_right = [40.908600, -73.702201]
lower_left = [40.495366, -74.247241]
grid = get_geojson_grid(upper_right, lower_left, n=15)

# Number of rows of `new_york_df` inside each grid cell, in grid order.
# `new_york_df` is whatever the most recently run New York cell assigned.
regional_counts = []

for box in grid:
    # Box corners are stored in GeoJSON [lon, lat] order. They are unpacked
    # into their own names so the bounding-box variables above are not
    # overwritten with per-cell values.
    box_east, box_north = box["properties"]["upper_right"]
    box_west, box_south = box["properties"]["lower_left"]

    # Both bounds are inclusive.
    in_box = (
        new_york_df.lat.between(box_south, box_north) &
        new_york_df.lon.between(box_west, box_east)
    )

    regional_counts.append(int(in_box.sum()))

In [None]:
import branca

# Highest count over all cells, used to scale the colours; 0 when there are
# no cells at all.
worst_region = max(regional_counts, default=0)
m = generateBaseMap()
for box, region_count in zip(grid, regional_counts):
    geo_json = json.dumps(box)

    # Guard against dividing by zero when no cell contains any snap.
    intensity = region_count / worst_region if worst_region else 0.0
    color = matplotlib.colors.to_hex(plt.cm.Reds(intensity))

    # `color=color` binds the current colour as a default argument; without it
    # every style function would see the colour of the last cell.
    gj = folium.GeoJson(geo_json,
                        style_function=lambda feature, color=color: {
                            'fillColor': color,
                            'color': "black",
                            'weight': 2,
                            'dashArray': '5, 5',
                            'fillOpacity': 0.55,
                        })

    m.add_child(gj)

# Display the finished map as the cell output.
m