# Snapchat Results Analysis

### Importing Requisite Libraries

In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
from tqdm import tqdm

import os
import sys
import csv
import requests
import time
import json
import numpy as np
import pandas as pd

from pprint import pprint
from subprocess import call

### Querying for spatial base-map imagery using ArcGIS

In [None]:
from mpl_toolkits.basemap import Basemap
import matplotlib.pyplot as plt
import numpy as np

m = Basemap(llcrnrlon=76.838069, llcrnrlat=28.412593,
        urcrnrlon=77.348458, urcrnrlat=28.881338)
 
lons, lats, x, y = m.makegrid(30, 30, returnxy=True)

m.drawparallels(range(0, 90, 20))
# m.scatter(x, y)
m.drawcoastlines()

m.arcgisimage(service='ESRI_Imagery_World_2D', verbose=True)

plt.show()

### Querying ArcGIS Server for base-maps of all cities

In [None]:
import numpy as np
from math import pi, cos
import matplotlib.ticker as pltticker

fig = plt.figure()
ax = fig.gca()

RADIUS_EARTH = 6378 # in kilometers
TILE_SIZE = 2.500 # the tile size in the grid (in kilometers)

# cities_df = pd.read_csv('cities.csv')
# for i in range(len(cities_df)):
#     city = cities_df.loc[i]["City"]
#     SW_LAT = cities_df.loc[i]["Lat (Southwest)"]
#     NE_LAT = cities_df.loc[i]["Lat (Northeast)"]
#     SW_LNG = cities_df.loc[i]["Lon (Southwest)"]
#     NE_LNG = cities_df.loc[i]["Lon (Northeast)"]

SW_LAT = 76.838069
SW_LNG = 28.412593
NE_LAT = 77.348458
NE_LNG = 28.881338

lat_change = (TILE_SIZE / ((2 * pi/360) * RADIUS_EARTH))
lng_change = (TILE_SIZE / ((2 * pi/360) * RADIUS_EARTH)) / cos(((SW_LAT + NE_LAT) / 2) * pi/180)

basemap_url = "http://server.arcgisonline.com/ArcGIS/rest/services/ESRI_Imagery_World_2D/MapServer/export?bbox=%f,%f,%f,%f&bboxSR=104013&imageSR=104013&dpi=96&format=png32&f=image" % (SW_LAT, SW_LNG, NE_LAT, NE_LNG)

lats = np.arange(SW_LAT, NE_LAT, lat_change)
lngs = np.arange(SW_LNG, NE_LNG, lat_change)

fig.subplots_adjust(left=0,right=1,bottom=0,top=1)

ax.xaxis.set_major_locator(plticker.LinearLocator(numticks=len(lats)))
ax.yaxis.set_major_locator(plticker.LinearLocator(numticks=len(lngs)))

ax.set_xticklabels(lats, fontsize=9)
ax.set_yticklabels(lngs, fontsize=9)

plt.xticks(rotation=75)

plt.imshow(plt.imread(urlopen(basemap_url)), origin='upper') 
plt.grid()
plt.show() 

### Initializing Matplotlib

In [None]:
from __future__ import division, print_function
# In a notebook environment, display the plots inline
%matplotlib inline

# Set some parameters to apply to all plots. These can be overridden
# in each plot if desired
# import matplotlib
# Plot size to 14" x 7"
matplotlib.rc('figure', figsize = (10, 8))
# Font size to 14
matplotlib.rc('font', size = 24)
# Do not display top and right frame lines
matplotlib.rc('axes.spines', top = False, right = False)
# Remove grid lines
matplotlib.rc('axes', grid = False)
# Set backgound color to white
matplotlib.rc('axes', facecolor = 'white')

### Read Model Accuracy Data

In [None]:
df = pd.read_csv('/Users/shwetanshusingh/Desktop/Snapchat/robustness.csv')
df

In [None]:
colors = ['black', 'grey']
positions = [0, 1]

fig, ax = plt.subplots()  

for group, color, pos in zip(df.groupby('WideResNet'), colors, positions):
    key, group = group
    group.plot('Threshold', 'mean_accuracy', yerr='dev_accuracy', kind='bar', width=0.4, label=key, 
               position=pos, color=color, alpha=0.5, ax=ax, error_kw=dict(lw=3, capsize=5, capthick=1.5))

ax.yaxis.set_label_text('Accuracy')
ax.set_ylim(0, 1.05)    
ax.set_xlim(-0.71, 4.5)
plt.xticks(rotation=0)
fig.tight_layout()
plt.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3,
           ncol=2, mode="expand", borderaxespad=0.)
plt.savefig('/Users/shwetanshusingh/Desktop/accuracy.png', dpi=300, bbox_inches='tight')
plt.show()

### Import class-wise data

In [None]:
df = pd.read_csv('/Users/shwetanshusingh/Desktop/Snapchat/class-wise.csv')
df

In [None]:
colors = ['black', 'grey', 'blue']
positions = [0, 1, 2]

fig, ax = plt.subplots()  

for group, color, pos in zip(df.groupby('Models'), colors, positions):
    key, group = group
    group.plot('Type', 'mean_recall', yerr='dev_recall', kind='bar', width=0.3, label=key, 
               position=pos, color=color, alpha=0.5, ax=ax, error_kw=dict(lw=3, capsize=5, capthick=1.5))

ax.yaxis.set_label_text('Recall')
ax.set_ylim(0.85, 1.025)    
ax.set_xlim(-0.75, 1.5)
plt.xticks(rotation=0)
fig.tight_layout()
plt.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3,
           ncol=2, mode="expand", borderaxespad=0.)
plt.savefig('/Users/shwetanshusingh/Desktop/classwise-recall.png', dpi=300, bbox_inches='tight')
plt.show()

### Import prediction results data

In [2]:
df = pd.read_csv("combined_v5.csv")
df.head()

Unnamed: 0,id,city,lat,lon,utc_timestamp,local_timestamp,scraped_date,duration,url,non_driving,local_date,local_time,local_hour,local_minute,local_second,local_weekday,local_day
0,W7_EDlXWTBiXAEEniNoMPwAAY3AiPrLhH65pdAWmCqu4nA...,Delhi,28.674125,77.064029,1552674600000,2019-03-16 00:00:00,16_03_2019,9.296667,https://s.sc-cdn.net/c8AAKzM0ru6kt8ftIHTCod001...,1,2019-03-16,00:00:00,0,0,0,Saturday,16
1,W7_EDlXWTBiXAEEniNoMPwAAYmSj9OwxbeKXrAWmCqul9A...,Delhi,28.674125,77.064029,1552674600000,2019-03-16 00:00:00,16_03_2019,10.0,https://s.sc-cdn.net/MoZ5SufxR6oQNk32WBYBu8o4Z...,1,2019-03-16,00:00:00,0,0,0,Saturday,16
2,W7_EDlXWTBiXAEEniNoMPwAAYmJ1wsiKtCM_uAWmDYGmiA...,Amman,31.95327,35.912186,1552687200000,2019-03-16 00:00:00,16_03_2019,10.0,https://s.sc-cdn.net/FqdN13UMj9SOpvXTZAH_mp90-...,1,2019-03-16,00:00:00,0,0,0,Saturday,16
3,W7_EDlXWTBiXAEEniNoMPwAAY7tay4k7ISEREAWmCoJ7-A...,Mumbai,19.06464,72.87139,1552674600000,2019-03-16 00:00:00,16_03_2019,10.0,https://s.sc-cdn.net/qH7cPflldGHu88ettD-GWS3Uq...,1,2019-03-16,00:00:00,0,0,0,Saturday,16
4,W7_EDlXWTBiXAEEniNoMPwAAY3VSbhsZ7kE2ZAWmCns9sA...,Mumbai,19.046673,72.899907,1552674600000,2019-03-16 00:00:00,16_03_2019,10.0,https://s.sc-cdn.net/JI29SEAn7MWkkDtbsx70cDqES...,1,2019-03-16,00:00:00,0,0,0,Saturday,16


##### Compute data by class & hour

In [38]:
data = df.groupby(['non_driving', 'local_hour']).count()['id'].reset_index()

data.rename(columns={'id': 'count'}, inplace=True)
data.non_driving.replace(0, 'driving', inplace=True)
data.non_driving.replace(1, 'non-driving', inplace=True)
data.head()

Unnamed: 0,non_driving,local_hour,count
0,driving,0,81935
1,driving,1,64256
2,driving,2,49212
3,driving,3,34322
4,driving,4,21259


In [42]:
hours_list = [[0, 0] for a in range(24)]
for i, row in data.iterrows():
    hour = row["local_hour"]
    if row["non_driving"] == "driving":
        hours_list[hour][0] = row["count"]
    else:
        hours_list[hour][1] = row["count"]

hours_list = np.array(hours_list)
hours_list

array([[ 81935, 231464],
       [ 64256, 191337],
       [ 49212, 132524],
       [ 34322,  81808],
       [ 21259,  49673],
       [ 19952,  40451],
       [ 36214,  56197],
       [ 46024,  75695],
       [ 43962,  98167],
       [ 46085, 129073],
       [ 50940, 164400],
       [ 57933, 194310],
       [ 58806, 211742],
       [ 68355, 236904],
       [ 74139, 247603],
       [ 64073, 238103],
       [ 73780, 252610],
       [ 89399, 288834],
       [ 82655, 311395],
       [ 81502, 329552],
       [ 80421, 347832],
       [ 99634, 377141],
       [100303, 357780],
       [ 90189, 271608]])

In [43]:
np.corrcoef(hours_list[:, 0], hours_list[:, 1])

array([[1.        , 0.95451707],
       [0.95451707, 1.        ]])

#### Compute daytime and night-time driving ratio

In [25]:
new_data = data[(data["non_driving"] == "driving") & ((data["local_hour"] >= 18) | ((data["local_hour"] >= 0) & (data["local_hour"] <= 2)))]
new_data.head()

Unnamed: 0,non_driving,local_hour,count
0,driving,0,81935
1,driving,1,64256
2,driving,2,49212
18,driving,18,82655
19,driving,19,81502


#### Compute daytime and nigh-time ratio for all snaps

In [28]:
data = df.groupby(['local_hour']).count()['id'].reset_index()

data.rename(columns={'id': 'count'}, inplace=True)
data.head()

Unnamed: 0,local_hour,count
0,0,313399
1,1,255593
2,2,181736
3,3,116130
4,4,70932


In [30]:
new_data = data[(data["local_hour"] >= 18) | ((data["local_hour"] >= 0) & (data["local_hour"] <= 2))]
new_data.head()

Unnamed: 0,local_hour,count
0,0,313399
1,1,255593
2,2,181736
18,18,394050
19,19,411054


In [None]:
sns.set(font_scale=1.5)
sns.set_style("whitegrid", {'axes.grid' : False})

ax = sns.lmplot(data=data, x='local_hour', y='count', hue='non_driving', 
               fit_reg=True, order=3, legend=False, markers=['x', 'o'], height=7, aspect=1.2)
ax.set(xlabel='Hour of the day', ylabel='Number of Snaps')
plt.legend(loc='upper left')
plt.xlim([0, 24])

ax.set(yticks=[100000, 200000, 300000, 400000, 500000], yticklabels=['100k', '200k', '300k', '400k', '500k'])

ax.savefig('plots/diurnal.png', dpi=200, bbox_inches='tight')

### Day of the week analysis

In [None]:
num_saturdays = 5
num_sundays = 5
num_mondays = 5
num_tuesdays = 4
num_wednesdays = 4 
num_thursdays = 4
num_fridays = 4

In [None]:
data = df.groupby(['non_driving', 'local_weekday']).count()['id'].reset_index()

data.rename(columns={'id': 'count'}, inplace=True)
data.non_driving.replace(0, 'driving', inplace=True)
data.non_driving.replace(1, 'non-driving', inplace=True)
data.head(-1)

In [None]:
for i, row in data.iterrows():
    local_weekday = row['local_weekday']
    count = row['count']
    if local_weekday in ["Friday", "Thursday", "Wednesday", "Tuesday"]:
        data.at[i, 'count'] = count / 4
    else:
        data.at[i, 'count'] = count / 5
data.head()

In [None]:
a4_dims = (11.7, 8.27)
sns.set(rc={'figure.figsize':a4_dims}, font_scale=1.5)
sns.set_style("whitegrid", {'axes.grid' : False})

ax = sns.barplot(x="local_weekday", y="count", hue="non_driving", data=data)

ax.set(xlabel='Day of the Week', ylabel='Number of Snaps')
plt.legend(loc='upper left')

plt.savefig('/home/shashanks/Projects/analysis/dayodtheweek.png')

### Day of the month analysis

In [None]:
data = df.groupby(['non_driving', 'local_day']).count()['id'].reset_index()

data.rename(columns={'id': 'count'}, inplace=True)
data.non_driving.replace(0, 'driving', inplace=True)
data.non_driving.replace(1, 'non-driving', inplace=True)
data.head()

In [None]:
a4_dims = (20, 8.27)
sns.set(rc={'figure.figsize':a4_dims}, font_scale=1.5)
sns.set_style("whitegrid", {'axes.grid' : False})
order = sorted(range(1, 32))

ax = sns.barplot(x="local_day", y="count", hue="non_driving", data=data, order=order)
ax.set(xlabel='Day of the Month', ylabel='Number of Snaps')
plt.legend(loc='upper left')
plt.savefig('/home/shashanks/Projects/analysis/dayodthemoth.png')

##### Add secondary computations by various metrics

In [None]:
data = df.groupby(['non_driving', 'local_hour']).count()['id'].unstack(level=0)

# data.rename(index={'0': 'driving', '1': 'non-driving'}, inplace=True)
data['driving_percent_class'] = data.ix[:,0:].sum(axis=1)/np.sum(data.ix[:,0:].values) * 100
data['non_driving_percent_class'] = data.ix[:,1:].sum(axis=1)/np.sum(data.ix[:,1:].values) * 100
data['driving_percent_overall'] = data[0] * 100 / len(df)
data['non_driving_percent_overall'] = data[1] * 100 / len(df)
data['driving_percent_hourly'] = data[0] * 100 / (data[0] + data[1])
data['non_driving_percent_hourly'] = data[1] * 100 / (data[0] + data[1])

data.reset_index(level=0, inplace=True)
data

In [None]:
percent_class = data[['driving_percent_class', 'non_driving_percent_class']].unstack(level=1).reset_index()
percent_class.rename(columns={'level_1': 'hour', 0: 'percent'}, inplace=True)

ax = sns.lmplot(data=percent_class, x='hour', y='percent', hue='non_driving', 
               fit_reg=True, order=3, legend=False, markers=['x', 'o'])
ax.set(xlabel='Hour of the day', ylabel='Percent Class-wise')
plt.legend(loc='upper left')
ax.savefig('/home/shashanks/Projects/analysis/percent-class-wise.png')

In [None]:
sns.set_style("whitegrid", {'axes.grid' : False})
ax = data[['driving_percent_hourly', 'non_driving_percent_hourly']].plot(kind='bar', stacked=True)
plt.legend(loc='upper left')
plt.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3,
           ncol=2, mode="expand", borderaxespad=0.)

In [None]:
percent_class = data[['driving_percent_hourly', 'non_driving_percent_hourly']].unstack(level=1).reset_index()
percent_class.rename(columns={'level_1': 'hour', 0: 'percent'}, inplace=True)

ax = sns.lmplot(data=percent_class, x='hour', y='percent', hue='non_driving', 
               fit_reg=True, order=3, legend=False, markers=['x', 'o'])

plt.legend(loc='upper left')
plt.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3,
           ncol=1, mode="expand", borderaxespad=0.)
ax.savefig('/home/shashanks/Projects/analysis/percent-hourly.png', dpi=300)

In [None]:
percent_class = data[['driving_percent_overall', 'non_driving_percent_overall']].unstack(level=1).reset_index()
percent_class.rename(columns={'level_1': 'hour', 0: 'value'}, inplace=True)
percent_class.columns = ['non_driving', 'hour', 'percent']
percent_class.columns

ax = sns.lmplot(data=percent_class, x='hour', y='percent', hue='non_driving', 
               fit_reg=True, order=3, legend=False, markers=['x', 'o'])
ax.set(xlabel='Hour of the day', ylabel='Percent Overall')
plt.legend(loc='upper left')
ax.savefig('/home/shashanks/Projects/analysis/percent-overall.png')

### City analysis

In [44]:
city_data = df.groupby(['non_driving', 'city']).count()['id'].unstack(level=0)
city_data.reset_index(level=0, inplace=True)
city_data['driving_percent'] = city_data[0] / (city_data[0] + city_data[1])
city_data = city_data.sort_values(by=['driving_percent'], ascending=False)
city_data = city_data[city_data["driving_percent"] >= 0.197]
city_data.head()

non_driving,city,0,1,driving_percent
39,Chandigarh,9320,12437,0.428368
8,Amritsar,6472,9180,0.413493
143,Riyadh,398276,599051,0.399343
14,Baghdad,37522,59074,0.388443
3,Ahmedabad,4444,7372,0.3761


In [None]:
x, y = [], []
for i, row in city_data.iterrows():
    x.append(row['city'])
    y.append(row['driving_percent'])

In [None]:
a4_dims = (20, 10)
plt.figure(figsize=a4_dims)
plt.style.use('seaborn-white')

y_pos = range(len(x))

ax = plt.bar(x=y_pos, height=y)
plt.xlabel('City', fontsize=24)
plt.ylabel('Percentage of Driving Snaps', fontsize=24)
plt.xticks(y_pos, x, rotation=90, fontsize=24)
plt.yticks(fontsize=24)
plt.xlim([-1, 30])
plt.savefig('plots/city-percent.png', dpi=200, bbox_inches="tight")

### Compute Grid Centers for each grid tile in each city

In [None]:
grid_csv = pd.read_csv('grid_center.csv')
grid_csv.shape

In [None]:
grid_csv.head()

In [None]:
grid_data = df.groupby(['lat', 'lon']).count()['id'].reset_index()
grid_data.rename(columns={'id': 'snap_count'}, inplace=True)
grid_data.head()

In [None]:
driving_grid_data = df[df.non_driving == 0].groupby(['lat', 'lon']).count()['id'].reset_index()
driving_grid_data.rename(columns={'id': 'driving_count'}, inplace=True)
driving_grid_data.head()

In [None]:
grid_data = pd.merge(grid_data, driving_grid_data, on=['lat', 'lon'], how='left')
grid_data.driving_count.fillna(0.0, inplace=True)
grid_data.sum()

In [None]:
grid_data.head()

In [None]:
grid_data = grid_data.merge(df[['city', 'lat', 'lon']].drop_duplicates(), on=['lat', 'lon'], how='left')
grid_data.head()

In [None]:
grid_data_new = grid_data
grid_data_new['driving_tile_percent'] = grid_data_new.driving_count * 100 / grid_data_new.snap_count
grid_data_new['snap_city_percent'] = grid_data_new.snap_count * 100 / grid_data_new.groupby('city').snap_count.transform('sum')
grid_data_new['driving_city_percent'] = grid_data_new.driving_count * 100 / grid_data_new.groupby('city').driving_count.transform('sum')

In [None]:
grid_data_new = grid_data_new[['city', 'lat', 'lon', 'snap_count', 'driving_count', 'driving_tile_percent', 'snap_city_percent', 'driving_city_percent']]
grid_data_new.head()

In [None]:
grid_data_new.to_csv("grid_data.csv", index=False)

In [None]:
num_zero_tiles = grid_csv.groupby('city').tile_number.count() - grid_data.groupby('city').snap_count.count()
num_zero_tiles = num_zero_tiles.reset_index()
num_zero_tiles.rename(columns={0: "num_zero"}, inplace=True)
num_zero_tiles.fillna(0.0, inplace=True)
num_zero_tiles.head()

### Visual Analysis to check for Power Law

In [None]:
for city in tqdm(grid_data.city.unique()):
    plot_df = grid_data[grid_data.city == city]['snap_count'].reset_index().snap_count.value_counts().reset_index().sort_values(by='index', ascending=False)
    print(plot_df.head())
    break

In [None]:
for city in tqdm(grid_data.city.unique()):
    plot_df = grid_data[grid_data.city == city]['snap_count'].reset_index().snap_count.value_counts().reset_index().sort_values(by='index', ascending=False)
    plot_df.rename(columns={"index": "snap_count", "snap_count": "# of tiles"}, inplace=True)
    ax = sns.lmplot(data=plot_df, x='snap_count', y='# of tiles', legend=False, markers='o', fit_reg=False)
    ax.set(xlabel='Number of Snaps', ylabel='# of Tiles')
    ax.savefig('./power_law/snap_count/' + str(city) + "_power_law.png")
    plt.close()

In [None]:
for city in tqdm(grid_data.city.unique()):
    plot_df = grid_data[grid_data.city == city]['driving_count'].reset_index().driving_count.value_counts().reset_index().sort_values(by='index', ascending=False)
    plot_df.rename(columns={"index": "driving_count", "driving_count": "# of tiles"}, inplace=True)
    ax = sns.lmplot(data=plot_df, x='driving_count', y='# of tiles', legend=False, markers='o', fit_reg=False)
    ax.set(xlabel='Number of Driving Snaps', ylabel='# of Tiles')
    ax.savefig('./power_law/driving_count/' + str(city) + "_driving_power_law.png")

##### Evaluating Powerlaw Statistically

In [None]:
import powerlaw

with open('./power_law/driving_power_law_results.csv', 'w') as csv_file:
    csv_writer = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    csv_writer.writerow(['city', 'alpha', 'xmin', 'sigma'])

    for city in tqdm(grid_data.city.unique()):
        plot_df = grid_data[grid_data.city == city]['driving_count'].reset_index().driving_count.value_counts().reset_index()
        plot_df.rename(columns={"index": "driving_count", "driving_count": "# of tiles"}, inplace=True)
        results = powerlaw.Fit(plot_df.driving_count)
        csv_writer.writerow([city, results.power_law.alpha, results.power_law.xmin, results.power_law.sigma])

# print(results.distribution_compare('power_law', 'exponential'))

In [None]:
print(results.distribution_compare('power_law', 'exponential'))

In [None]:
powerlaw_df = pd.read_csv("./power_law/driving_power_law_results.csv")
powerlaw_df.head()

In [None]:
plot_df = pd.read_csv('./power_law/driving_power_law_results.csv')
ax = sns.lmplot(data=plot_df, x='xmin', y='alpha', legend=False, markers='o', fit_reg=False)
# ax.set(xlim=(0, 100), ylim=(0, 20))

# ax.savefig('/Users/shwetanshusingh/Desktop/Snapchat/power-law/power-law.png')

grid_csv = pd.merge(grid_csv, grid_data, on=['lat', 'lon'], how='left').fillna(0.0)
grid_csv.sum()

In [None]:
with open('./power_law/total_power_law_results.csv', 'w') as csv_file:
    csv_writer = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    csv_writer.writerow(['city', 'alpha', 'xmin', 'sigma'])

    for city in tqdm(grid_data.city.unique()):
        plot_df = grid_data[grid_data.city == city]['snap_count'].reset_index().snap_count.value_counts().reset_index()
        plot_df.rename(columns={"index": "snap_count", "snap_count": "# of tiles"}, inplace=True)
        results = powerlaw.Fit(plot_df.snap_count)
        csv_writer.writerow([city, results.power_law.alpha, results.power_law.xmin, results.power_law.sigma])

In [None]:
powerlaw_df = pd.read_csv("./power_law/total_power_law_results.csv")
powerlaw_df.head(10)

In [None]:
city_wise = grid_data.groupby('city').sum()[['snap_count', 'driving_count']]
city_wise['percent_driving'] = city_wise.driving_count * 100 / city_wise.snap_count
city_wise.reset_index(inplace=True)
city_wise = city_wise.merge(grid_data.groupby('city').lat.count().reset_index().rename(columns={'lat': 'tile_data_count'}))
city_wise = city_wise.merge(grid_csv.groupby('city').count()['tile_number'].reset_index())
city_wise['percent_data_tiles'] = city_wise.tile_data_count * 100 / city_wise.tile_number
city_wise.head()

In [None]:
ax = sns.lineplot(data=city_wise, x='percent_driving', y='percent_data_tiles', legend=False)
# ax = sns.lmplot(data=plot_df, x='snap_count', y='# of tiles', legend=False, markers='o', fit_reg=False)
ax.set(xscale="log", yscale="log")
ax.set(xlabel='% Driving', ylabel='% Tiles with Data')

In [None]:
ax = sns.lmplot(data=city_wise, x='driving_count', y='tile_data_count', legend=False)
# markers='o', fit_reg=False)
ax.set(xscale="log", yscale="log")
ax.set(xlabel='# Driving Snaps', ylabel='# Tiles with Data')

In [None]:
df[df.non_driving == 0].duration.describe()

In [None]:
sns.distplot(df[df.non_driving == 0].duration)
ax = sns.distplot(df[df.non_driving == 1].duration)
ax.set(xlabel='Duration')

In [None]:
df['local_date'] = df.local_date.apply(lambda x: dt.strftime(x, '%Y-%m-%d'))
df.local_date.head()

In [None]:
sns.catplot(x="duration", y="local_date", data=df)

In [None]:
sns.catplot(x="duration", y="local_date", data=df, hue='non_driving', kind="boxen")

In [None]:
sns.catplot(x="duration", y="local_date", data=df, hue='non_driving', kind='violin')

In [None]:
from scipy.stats import ks_2samp
from scipy.stats import pareto, powerlaw as spl

fw = open('/Users/shwetanshusingh/Desktop/Snapchat/power-law/SUMMARY_PLFits.csv','w')
for city in tqdm(grid_data.city.unique()):

#     print("City=" + str(city))
    data = grid_data[grid_data.city == city]['driving_count'].values
    params = spl.fit(data)
    rvs = spl.rvs(params[0], params[1], params[2], size=len(data))

    y, x = np.histogram(data)
    plt.plot(x[0:len(x)-1], y)
    y, x = np.histogram(rvs, bins = x)
    plt.plot(x[0:len(x)-1], y, color='red')
    plt.gca().set_yscale('log')
    plt.gca().set_xscale('log')
    plt.savefig('/Users/shwetanshusingh/Desktop/Snapchat/power-law/'+str(city)+'.pdf')

    D,pval = ks_2samp(data, rvs)
    fw.write(str(city)+','+str(params[0])+','+str(params[1])+','+str(params[2])+','+str(D)+','+
        str(pval)+','+str(np.sum(spl.logpdf(data, *params)))+'\n')

fw.close()

In [None]:
with open('/Users/shwetanshusingh/Desktop/Snapchat/tile_driving_stats.csv', 'w') as csv_file:
    csv_writer = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    csv_writer.writerow(['city', 'count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'])

    for city in tqdm(grid_data.city.unique()):
        data = grid_data[grid_data.city == city]
        csv_writer.writerow([city] + [value for value in data['driving_tile_percent'].describe()])

In [None]:
plot_df = pd.read_csv('/Users/shwetanshusingh/Desktop/Snapchat/tile_driving_stats.csv')
plot_df.head()

In [None]:
with open('/Users/shwetanshusingh/Desktop/Snapchat/tile_driving_stats.csv', 'w') as csv_file:
    csv_writer = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    csv_writer.writerow(['city', 'count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'])

    for city in tqdm(data.city.unique()):
        data = grid_data[grid_data.city == grid_data.city.unique()[0]]['driving_tile_percent'].values
        data = list(data) + [0] * (len(grid_csv[grid_csv.city == grid_data.city.unique()[0]]) - len(data))
        data = pd.DataFrame(data).describe()
        csv_writer.writerow([city] + [value for value in data])

In [None]:
plot_df = pd.read_csv('/Users/shwetanshusingh/Desktop/Snapchat/tile_driving_stats.csv')
plot_df.head()

In [None]:
plot_df[(plot_df['mean'] > plot_df['std']) & (plot_df['count'] > 50)][['city', 'mean', 'std', 'count']]

In [None]:
fig, ax = plt.subplots()
ax.scatter(plot_df[(plot_df['count'] > 50)]['mean'], plot_df[(plot_df['count'] > 50)]['std'], s=25, cmap=plt.cm.coolwarm, zorder=10)

lims = [
    np.min([ax.get_xlim(), ax.get_ylim()]),  # min of both axes
    np.max([ax.get_xlim(), ax.get_ylim()]),  # max of both axes
]

# now plot both limits against eachother
ax.plot(lims, lims, 'k-', alpha=0.75, zorder=0)
ax.set(xlabel='mean', ylabel='std')

In [None]:
ax = sns.lmplot(data=plot_df, x='mean', y='std', legend=False, markers='o', fit_reg=True)
ax.set(xscale="log", yscale="log")
ax.set(xlabel='mean', ylabel='std')

In [None]:
data = df.groupby('city').count()['non_driving'].reset_index().rename(columns={'non_driving': 'snap_count'})
data = data.merge(df[df.non_driving == 0].groupby('city').count()['non_driving'].reset_index().rename(columns={'non_driving': 'driving_count'}))
ax = sns.lmplot(data=data, x='snap_count', y='driving_count', legend=False, markers='o', fit_reg=True, order=2)
ax.set(xscale="log", yscale="log")
ax.set(xlabel='# Snaps', ylabel='# Driving Snaps')

In [None]:
grid_data[['snap_count', 'driving_count']].corr()

In [None]:
import pandas as pd
from datetime import datetime

unclean_df = pd.read_csv('/Users/shwetanshusingh/Downloads/combined_v3.csv')

df = unclean_df.dropna()

del df['Unnamed: 0']

df.rename(columns={"timestamp": "utc_timestamp", 
                   "local_time": "local_timestamp",
                    "driving": "non_driving"}, inplace=True)

df.local_timestamp = pd.to_datetime(df['local_timestamp'])

df['local_date'] = df.local_timestamp.dt.date
df['local_time'] = df.local_timestamp.dt.time
df['local_hour'] = df.local_timestamp.dt.hour
df['local_minute'] = df.local_timestamp.dt.minute
df['local_second'] = df.local_timestamp.dt.second

data = df.groupby('local_hour').count()['id'].reset_index().rename(columns={'id': 'snap_count'})
data = data.merge(df[df.non_driving == 0].groupby('local_hour').count()['id'].reset_index())
data.rename(columns={'id': 'driving_count'}, inplace=True)
data[['snap_count', 'driving_count']].corr()
data.sum()

In [None]:
(data[(data.local_hour >= 18) | (data.local_hour < 2)].sum() / 8) / (data[(data.local_hour >=2) & (data.local_hour <18)].sum() / 16)

In [None]:
data[['snap_count', 'driving_count']].corr()

In [None]:
grid_data[['snap_count', 'driving_count']].corr()

In [None]:
ax = sns.lmplot(data=grid_data, x='snap_count', y='driving_count', \
                legend=False, markers='o', fit_reg=True, order=2, scatter_kws={"s":1})
# ax.set(xscale="log", yscale="log")
ax.set(xlabel='Number of Snaps', ylabel='Number of Driving Snaps')

In [None]:
ax = sns.lmplot(data=grid_data, x='snap_count', y='driving_count', \
                legend=False, markers='o', fit_reg=True, order=2, scatter_kws={"s":1})
ax.set(xscale="log", yscale="log")
ax.set(xlabel='Number of Snaps', ylabel='Number of Driving Snaps')

In [None]:
with open('/Users/shwetanshusingh/Desktop/Snapchat/tile_driving_stats.csv', 'w') as csv_file:
    csv_writer = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    csv_writer.writerow(['city', 'count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max', 'var'])

    for city in tqdm(grid_data.city.unique()):
        data = grid_data[grid_data.city == city]['driving_tile_percent'].values
        data = list(data) + [0] * (len(grid_csv[grid_csv.city == city]) - len(data))
        d = pd.DataFrame(data).describe()
        csv_writer.writerow([city] + [value for value in d[0]] + [pd.DataFrame(data).var()[0]])

In [None]:
plot_df = pd.read_csv('/Users/shwetanshusingh/Desktop/Snapchat/tile_driving_stats.csv')
plot_df.head()

In [None]:
plot_df[(plot_df['mean'] > plot_df['std']) & (plot_df['count'] > 50)][['city', 'mean', 'std', 'count']]

In [None]:
data = grid_data.groupby('city').max().reset_index()[['city', 'driving_city_percent']]
ax = sns.scatterplot(data=data, x='city', y='driving_city_percent')
plt.xticks(rotation=90)

In [None]:
data.head()

In [None]:
data = data.merge(plot_df)
data.head()

In [None]:
ax = sns.lmplot(data=data, x='driving_city_percent', y='var', fit_reg=False)

In [None]:
ax = sns.lmplot(data=data, x='driving_city_percent', y='std', fit_reg=False)
#                fit_reg=True, order=3, legend=False)
# ax.set(xlabel='Hour of the day', ylabel='Number of Snaps')
# plt.legend(loc='upper left')

In [None]:
sns.lmplot(data=grid_data, x='driving_city_percent', y='snap_city_percent', fit_reg=False)

In [None]:
data.sort_values('var', inplace=True)
data.tail()['']

In [None]:
grid_data.head()

In [None]:
data.fillna(0, inplace=True)
data.head()

In [None]:
q = []
for city in data[data['count'] >= 50].city.unique():
    q.append(sum(sorted(grid_data[grid_data.city == city].driving_city_percent, reverse=True)[:int(0.2 * data[data.city == city]['count'])]))
    

In [None]:
min(q)

In [None]:
q = []
d = df.groupby('city').count()['id'].reset_index()
for city in d[d.id > 1000].city.unique():
    q.append(sum(sorted(grid_data[grid_data.city == city].driving_city_percent, reverse=True)[:int(0.1 * data[data.city == city]['count'])]))
   

In [None]:
d = df[df.non_driving == 0].groupby('city').count()['id'] / df.groupby('city').count()['id']
d.reset_index()['id'].describe()

In [None]:
df[df.non_driving == 0].count()['id'] / df.count()['id']

### Heatmap

In [None]:
import folium

In [None]:
def generateBaseMap(default_location=[40.6452228,-74.015037], default_zoom_start=10):
    base_map = folium.Map(location=default_location, control_scale=True, zoom_start=default_zoom_start)
    return base_map

#### New York

In [None]:
from folium.plugins import HeatMap
new_york_df = df[df.city == "New York"]
base_map = generateBaseMap()
HeatMap(data=new_york_df[['lat', 'lon', 'non_driving']].groupby(['lat', 'lon']).sum().reset_index().values.tolist(), radius=8, max_zoom=10).add_to(base_map)
base_map

In [None]:
new_york_df = df[(df.city == "New York") & (df.non_driving == 0)]
base_map = generateBaseMap()
HeatMap(data=new_york_df[['lat', 'lon', 'non_driving']].groupby(['lat', 'lon']).count().reset_index().values.tolist(), radius=8, max_zoom=10).add_to(base_map)
base_map

#### Riyadh

In [None]:
city_df = df[(df.city == "Riyadh") & (df.non_driving == 0)]
base_map = generateBaseMap(default_location=[24.7241504,46.2620616])
HeatMap(data=city_df[['lat', 'lon', 'non_driving']].groupby(['lat', 'lon']).count().reset_index().values.tolist(), radius=8, max_zoom=10).add_to(base_map)
base_map

In [None]:
city_df = df[(df.city == "Riyadh")]
base_map = generateBaseMap(default_location=[24.7241504,46.2620616])
HeatMap(data=city_df[['lat', 'lon', 'non_driving']].groupby(['lat', 'lon']).count().reset_index().values.tolist(), radius=8, max_zoom=10).add_to(base_map)
base_map

#### NYC Grid

In [None]:
def get_geojson_grid(upper_right, lower_left, n=6):
    """Returns a grid of geojson rectangles, and computes the exposure in each section of the grid based on the vessel data.

    Parameters
    ----------
    upper_right: array_like
        The upper right hand corner of "grid of grids" (the default is the upper right hand [lat, lon] of the USA).

    lower_left: array_like
        The lower left hand corner of "grid of grids"  (the default is the lower left hand [lat, lon] of the USA).

    n: integer
        The number of rows/columns in the (n,n) grid.

    Returns
    -------

    list
        List of "geojson style" dictionary objects   
    """

    all_boxes = []

    lat_steps = np.linspace(lower_left[0], upper_right[0], n+1)
    lon_steps = np.linspace(lower_left[1], upper_right[1], n+1)

    lat_stride = lat_steps[1] - lat_steps[0]
    lon_stride = lon_steps[1] - lon_steps[0]

    for lat in lat_steps[:-1]:
        for lon in lon_steps[:-1]:
            # Define dimensions of box in grid
            upper_left = [lon, lat + lat_stride]
            upper_right = [lon + lon_stride, lat + lat_stride]
            lower_right = [lon + lon_stride, lat]
            lower_left = [lon, lat]

            # Define json coordinates for polygon
            coordinates = [
                upper_left,
                upper_right,
                lower_right,
                lower_left,
                upper_left
            ]

            geo_json = {"type": "FeatureCollection",
                        "properties":{
                            "lower_left": lower_left,
                            "upper_right": upper_right
                        },
                        "features":[]}

            grid_feature = {
                "type":"Feature",
                "geometry":{
                    "type":"Polygon",
                    "coordinates": [coordinates],
                }
            }

            geo_json["features"].append(grid_feature)

            all_boxes.append(geo_json)

    return all_boxes

In [None]:
upper_right = [40.908600, -73.702201]
lower_left = [40.495366, -74.247241]
grid = get_geojson_grid(upper_right, lower_left, n=15)
counts_array = []

regional_counts = []

for box in grid:
    upper_right = box["properties"]["upper_right"]
    lower_left = box["properties"]["lower_left"]

    mask = (
        (new_york_df.lat <= upper_right[1]) & (new_york_df.lat >= lower_left[1]) &
        (new_york_df.lon <= upper_right[0]) & (new_york_df.lon >= lower_left[0])
           )

    region_incidents = len(new_york_df[mask])
    regional_counts.append(region_incidents)

In [None]:
import branca
worst_region = max(regional_counts)
m = generateBaseMap()
for i, box in enumerate(grid):
    geo_json = json.dumps(box)

    color = plt.cm.Reds(regional_counts[i] / worst_region)
    color = matplotlib.colors.to_hex(color)

    gj = folium.GeoJson(geo_json,
                        style_function=lambda feature, color=color: {
                                                                        'fillColor': color,
                                                                        'color':"black",
                                                                        'weight': 2,
                                                                        'dashArray': '5, 5',
                                                                        'fillOpacity': 0.55,
                                                                    })

    m.add_child(gj)
#     colormap = branca.colormap.linear.YlGn_09.scale(0, 1)
#     colormap = colormap.to_step(index=[0, 0.3, 0.6, 0.8 , 1]) 
#     colormap.add_to(m)

In [None]:
m.save("abcd.html")

### Classification

In [3]:
from datetime import datetime as dt
df = pd.read_csv("/scratch/talsperre/combine_v4.csv")
df.rename(columns={"timestamp": "utc_timestamp", 
                   "local_time": "local_timestamp",
                    "driving": "non_driving"}, inplace=True)

df.local_timestamp = pd.to_datetime(df['local_timestamp'])

df['local_date'] = df.local_timestamp.dt.date
df['local_time'] = df.local_timestamp.dt.time
df['local_hour'] = df.local_timestamp.dt.hour
df['local_minute'] = df.local_timestamp.dt.minute
df['local_second'] = df.local_timestamp.dt.second
df['local_weekday'] = df.local_timestamp.dt.weekday_name
df['local_day'] = df.local_timestamp.dt.day

df.head()

Unnamed: 0,id,city,lat,lon,utc_timestamp,local_timestamp,scraped_date,duration,url,non_driving,local_date,local_time,local_hour,local_minute,local_second,local_weekday,local_day
0,W7_EDlXWTBiXAEEniNoMPwAAYp-nmZ_iQhJ8ZAWnUYZt1A...,Jaipur,26.890944,75.751022,1554046320000,2019-03-31 21:02:00,01_04_2019,10.457,https://s.sc-cdn.net/fGnB8X2KoCi4dSuUBUYmCau84...,1,2019-03-31,21:02:00,21,2,0,Sunday,31
1,W7_EDlXWTBiXAEEniNoMPwAAYSaz9jitYR-SCAWnWtL-KA...,Chicago,42.003253,-87.891874,1554085028000,2019-03-31 21:17:08,01_04_2019,10.0,https://s.sc-cdn.net/CAlD3x7OwpfqIh0m9UuSjUvYC...,1,2019-03-31,21:17:08,21,17,8,Sunday,31
2,W7_EDlXWTBiXAEEniNoMPwAAYrcLsJ9As3tReAWnQLZd2A...,Medina,24.43692,39.609041,1553975701000,2019-03-30 22:55:01,01_04_2019,10.0,https://s.sc-cdn.net/rbHaV9-XXeCu1DYyZjnlRw4qH...,1,2019-03-30,22:55:01,22,55,1,Saturday,30
3,W7_EDlXWTBiXAEEniNoMPwAAYm5mVcCrKegBNAWnUqtxYA...,Dallas,32.703049,-96.776003,1554050860000,2019-03-31 11:47:40,01_04_2019,2.0,https://s.sc-cdn.net/WJ7w5lSCcs2LIn62Cv87ZYyVS...,1,2019-03-31,11:47:40,11,47,40,Sunday,31
4,W7_EDlXWTBiXAEEniNoMPwAAYB3BAlpAwCkmmAWnYSG3EA...,Dubai,25.242917,55.340233,1554111756000,2019-04-01 13:42:36,01_04_2019,10.0,https://s.sc-cdn.net/f3xl3b81sTrwnI04lVjrZRTnZ...,0,2019-04-01,13:42:36,13,42,36,Monday,1


In [4]:
train_df = pd.read_csv("train.csv", names=["id", "label"], sep=" ")
test_df = pd.read_csv("test-gt.csv", names=["id", "label"], sep=" ")
train_df.head()

Unnamed: 0,id,label
0,non-dangerous/W7_EDlXWTBiXAEEniNoMPwAAYZBYZksk...,2
1,dangerous/W7_EDlXWTBiXAEEniNoMPwAAYwsH034M7hVA...,1
2,non-dangerous/W7_EDlXWTBiXAEEniNoMPwAAYU4ljwEu...,2
3,non-dangerous/W7_EDlXWTBiXAEEniNoMPwAAYLlk0H8U...,2
4,non-dangerous/W7_EDlXWTBiXAEEniNoMPwAAYB5z-pIa...,2


In [5]:
for i, row in train_df.iterrows():
    cur_id = row['id']
    new_id = cur_id.split("/")[1]
    train_df.at[i, 'id'] = new_id[:-4]

train_df.head()

Unnamed: 0,id,label
0,W7_EDlXWTBiXAEEniNoMPwAAYZBYZksk_Aof_AWoO3JOeA...,2
1,W7_EDlXWTBiXAEEniNoMPwAAYwsH034M7hVAMAWmrnuCZA...,1
2,W7_EDlXWTBiXAEEniNoMPwAAYU4ljwEuz81xZAWmIO_iSA...,2
3,W7_EDlXWTBiXAEEniNoMPwAAYLlk0H8UKFo7oAWmWSaZwA...,2
4,W7_EDlXWTBiXAEEniNoMPwAAYB5z-pIaRycSuAWoEBDVYA...,2


In [6]:
for i, row in test_df.iterrows():
    cur_id = row['id']
    new_id = cur_id.split("/")[1]
    test_df.at[i, 'id'] = new_id[:-4]

test_df.head()

Unnamed: 0,id,label
0,W7_EDlXWTBiXAEEniNoMPwAAYQswOR5MGj4ZsAWmLW_0iA...,1
1,W7_EDlXWTBiXAEEniNoMPwAAYYX1439IMudCfAWmHZ3t5A...,1
2,W7_EDlXWTBiXAEEniNoMPwAAYe3UNt8vdH_yGAWmPSWyHA...,1
3,W7_EDlXWTBiXAEEniNoMPwAAYPi2k4bmCAFJaAWnuXATmA...,2
4,W7_EDlXWTBiXAEEniNoMPwAAYlJKEITtixxFUAWmswK_LA...,2


#### Random Forest location classifier

In [7]:
train_df = pd.merge(df, train_df, on=['id'], how='inner')
train_df.head()

Unnamed: 0,id,city,lat,lon,utc_timestamp,local_timestamp,scraped_date,duration,url,non_driving,local_date,local_time,local_hour,local_minute,local_second,local_weekday,local_day,label
0,W7_EDlXWTBiXAEEniNoMPwAAYwTAdatxvm2tYAWnT8Bh9A...,Detroit,42.407909,-83.178556,1553999967000,2019-03-30 22:39:27,01_04_2019,7.891678,https://s.sc-cdn.net/XGkFkyHgEkm78E3m4mas2f2Jf...,1,2019-03-30,22:39:27,22,39,27,Saturday,30,2
1,W7_EDlXWTBiXAEEniNoMPwAAYcpF6cH2Fb9x5AWnVR3EwA...,Riyadh,24.651636,46.604478,1554061345000,2019-03-31 22:42:25,01_04_2019,10.0,https://s.sc-cdn.net/4iDg-y4s4GTRBuAGuUvFEOacd...,0,2019-03-31,22:42:25,22,42,25,Sunday,31,1
2,W7_EDlXWTBiXAEEniNoMPwAAYOw7XDzYeKLg_AWnVRJfUA...,Riyadh,24.759436,46.732952,1554061029000,2019-03-31 22:37:09,01_04_2019,2.0,https://s.sc-cdn.net/RNvslaj8DIKiyEFSD6rHmd8Ph...,0,2019-03-31,22:37:09,22,37,9,Sunday,31,1
3,W7_EDlXWTBiXAEEniNoMPwAAYGH4fcyvlPF9mAWnWJYVsA...,Toronto,43.78687,-79.403115,1554075701000,2019-03-31 19:41:41,01_04_2019,10.0,https://s.sc-cdn.net/YM3nuGAEge6ACnlCgyYYXZCpw...,0,2019-03-31,19:41:41,19,41,41,Sunday,31,1
4,W7_EDlXWTBiXAEEniNoMPwAAYsOyMsfZxDGkFAWnXCkcXA...,Chicago,41.769686,-87.602511,1554090911000,2019-03-31 22:55:11,01_04_2019,10.0,https://s.sc-cdn.net/MwMYqmGmTZsAiSR7eVuZXhdOu...,1,2019-03-31,22:55:11,22,55,11,Sunday,31,2


In [8]:
test_df = pd.merge(df, test_df, on=['id'], how='inner')
test_df.head()

Unnamed: 0,id,city,lat,lon,utc_timestamp,local_timestamp,scraped_date,duration,url,non_driving,local_date,local_time,local_hour,local_minute,local_second,local_weekday,local_day,label
0,W7_EDlXWTBiXAEEniNoMPwAAYrdGw06ScXhj9AWnUcbEeA...,Riyadh,24.588753,46.693422,1554047317000,2019-03-31 18:48:37,01_04_2019,7.2,https://s.sc-cdn.net/6bZHWmBxWKo6Gqahr4PVLXmIF...,1,2019-03-31,18:48:37,18,48,37,Sunday,31,2
1,W7_EDlXWTBiXAEEniNoMPwAAYWbOiJNAgaSGGAWnWyqxsA...,Chicago,41.877486,-87.638681,1554086613000,2019-03-31 21:43:33,01_04_2019,10.003333,https://s.sc-cdn.net/S423RBGo1JAqslRxevgzmmjcz...,1,2019-03-31,21:43:33,21,43,33,Sunday,31,2
2,W7_EDlXWTBiXAEEniNoMPwAAY0MmnuWya-4gRAWnW6YZlA...,Dallas,32.7839,-96.797382,1554088752000,2019-03-31 22:19:12,01_04_2019,10.0,https://s.sc-cdn.net/ZghdfIYVx-ufQicHKiXQ7CQ9W...,0,2019-03-31,22:19:12,22,19,12,Sunday,31,1
3,W7_EDlXWTBiXAEEniNoMPwAAYGV_ny7hwC9pVAWnXPb-nA...,Riyadh,24.678586,46.772483,1554094062000,2019-04-01 07:47:42,01_04_2019,10.0,https://s.sc-cdn.net/6dpHGbghy9Ady_JbwBihMmR3b...,1,2019-04-01,07:47:42,7,47,42,Monday,1,2
4,W7_EDlXWTBiXAEEniNoMPwAAYN18ncibHUr8hAWnYsyzbA...,Amritsar,31.627025,74.840292,1554118724000,2019-04-01 17:08:44,01_04_2019,10.0,https://s.sc-cdn.net/_iz1ezsz2dbUfyeAaJTnA4l3m...,0,2019-04-01,17:08:44,17,8,44,Monday,1,1


#### Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels
def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    # Only use the labels that appear in the data
    classes = classes[unique_labels(y_true, y_pred)]
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax

np.set_printoptions(precision=2)

#### Location Features

In [9]:
X_train = train_df[["lat", "lon"]]
y_train = train_df[["non_driving"]]

In [10]:
X_test = test_df[["lat", "lon"]]
y_test = test_df[["non_driving"]]

In [11]:
from sklearn.ensemble import RandomForestClassifier

clf=RandomForestClassifier(n_estimators=50, verbose=2, n_jobs=4)
clf.fit(X_train, y_train)
y_pred=clf.predict(X_test)

  after removing the cwd from sys.path.
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    0.1s finished


building tree 1 of 50building tree 2 of 50building tree 3 of 50

building tree 4 of 50

building tree 5 of 50
building tree 6 of 50
building tree 7 of 50
building tree 8 of 50
building tree 9 of 50
building tree 10 of 50
building tree 11 of 50
building tree 12 of 50
building tree 13 of 50
building tree 14 of 50
building tree 15 of 50
building tree 16 of 50
building tree 17 of 50building tree 18 of 50

building tree 19 of 50
building tree 20 of 50
building tree 21 of 50
building tree 22 of 50
building tree 23 of 50building tree 24 of 50

building tree 25 of 50
building tree 26 of 50
building tree 27 of 50
building tree 28 of 50
building tree 29 of 50
building tree 30 of 50
building tree 31 of 50
building tree 32 of 50
building tree 33 of 50
building tree 34 of 50
building tree 35 of 50
building tree 36 of 50building tree 37 of 50

building tree 38 of 50
building tree 39 of 50
building tree 40 of 50
building tree 41 of 50
building tree 42 of 50
building tree 43 of 50
building tree 44 of 

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    0.0s finished


In [12]:
from sklearn import metrics
from sklearn.metrics import classification_report

print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))
print("F1 score: ", metrics.f1_score(y_test, y_pred))
print("Precision score: ", metrics.precision_score(y_test, y_pred))
print("Recall score: ", metrics.recall_score(y_test, y_pred))

Accuracy:  0.7153482082488167
F1 score:  0.8192357234864749
Precision score:  0.7976588628762542
Recall score:  0.8420123565754634


In [13]:
print(classification_report(y_test, y_pred, digits=3))

              precision    recall  f1-score   support

           0      0.367     0.301     0.331       346
           1      0.798     0.842     0.819      1133

    accuracy                          0.715      1479
   macro avg      0.583     0.571     0.575      1479
weighted avg      0.697     0.715     0.705      1479



In [None]:
plot_confusion_matrix(y_test, y_pred, classes=['Driving', 'Non-Driving'], title='Confusion matrix, without normalization')

#### Temporal Features

In [14]:
X_train = train_df[["local_hour", "local_minute", "local_weekday"]]
y_train = train_df[["non_driving"]]

X_train["local_weekday"] = X_train["local_weekday"].astype('category')
X_train["local_weekday"] = X_train["local_weekday"].cat.codes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [15]:
X_test = test_df[["local_hour", "local_minute", "local_weekday"]]
y_test = test_df[["non_driving"]]

X_test["local_weekday"] = X_test["local_weekday"].astype('category')
X_test["local_weekday"] = X_test["local_weekday"].cat.codes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [16]:
from sklearn.ensemble import RandomForestClassifier

clf=RandomForestClassifier(n_estimators=50, verbose=2, n_jobs=4)
clf.fit(X_train, y_train)
y_pred=clf.predict(X_test)

  after removing the cwd from sys.path.
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    0.1s finished


building tree 1 of 50
building tree 2 of 50
building tree 3 of 50
building tree 4 of 50
building tree 5 of 50
building tree 6 of 50
building tree 7 of 50
building tree 8 of 50
building tree 9 of 50
building tree 10 of 50
building tree 11 of 50building tree 12 of 50

building tree 13 of 50
building tree 14 of 50
building tree 15 of 50
building tree 16 of 50
building tree 17 of 50
building tree 18 of 50building tree 19 of 50
building tree 20 of 50

building tree 21 of 50building tree 22 of 50

building tree 23 of 50
building tree 24 of 50
building tree 25 of 50
building tree 26 of 50
building tree 27 of 50
building tree 28 of 50
building tree 29 of 50
building tree 30 of 50building tree 31 of 50

building tree 32 of 50
building tree 33 of 50
building tree 34 of 50
building tree 35 of 50
building tree 36 of 50
building tree 37 of 50
building tree 38 of 50
building tree 39 of 50
building tree 40 of 50
building tree 41 of 50
building tree 42 of 50
building tree 43 of 50building tree 44 of 5

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    0.0s finished


In [17]:
from sklearn import metrics
print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))
print("F1 score: ", metrics.f1_score(y_test, y_pred))
print("Precision score: ", metrics.precision_score(y_test, y_pred))
print("Recall score: ", metrics.recall_score(y_test, y_pred))

Accuracy:  0.6646382691007438
F1 score:  0.7867583834909716
Precision score:  0.7669740150880134
Recall score:  0.8075904677846425


In [19]:
print(classification_report(y_test, y_pred, digits=3))

              precision    recall  f1-score   support

           0      0.238     0.197     0.215       346
           1      0.767     0.808     0.787      1133

    accuracy                          0.665      1479
   macro avg      0.502     0.502     0.501      1479
weighted avg      0.643     0.665     0.653      1479



In [None]:
# Plot non-normalized confusion matrix
plot_confusion_matrix(y_test, y_pred, classes=class_names,
                      title='Confusion matrix, without normalization')