In [2]:
import datetime
import math
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import *
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import string

In [3]:
tweets = pd.read_csv("./data/london_tweets.csv", encoding="latin1")

In [4]:
def parse_tweets(tweet):

    tweet = re.sub('@[^\s]+','',tweet).strip()
    tweet = re.sub('#[^\s]+','',tweet).strip()
    tweet = re.sub(r'https?:\/\/.*[\r\n]*','',tweet).strip()
    tweet = " ".join(re.findall("[a-zA-Z]+", tweet))
    
    return tweet

In [5]:
tweets["parsed_tweets"] = tweets.MESSAGETEXT.apply(lambda tweet: parse_tweets(tweet))

In [6]:
tweets["datetime"] = pd.to_datetime(tweets.MESSAGEDATE)

In [7]:
tweets["datetime_to_nearest_hour"] = tweets.datetime.apply(lambda dt: datetime.datetime(dt.year, dt.month, dt.day, dt.hour))

In [8]:
tweets.head()

Unnamed: 0,id,Name,X,Y,LONGITUDE,LATITUDE,MESSAGEDATE,USERID,USERSCREENNAME,MESSAGETEXT,LOCATION,HASHTAGS,ISRETWEET,datetime,parsed_tweets,datetime_to_nearest_hour
0,550226818624942080,,-0.26215,51.391407,-0.26215,51.391407,31/12/2014 09:47:50,113918054,12Elbestreet,@julieo25 Not yet got tics for Liverpool waiti...,Kingston upon Thames,,0,2014-12-31 09:47:50,Not yet got tics for Liverpool waiting till Fr...,2014-12-31 09:00:00
1,550226827944660992,,-0.014891,51.354042,-0.014891,51.354042,31/12/2014 09:47:53,55872342,MissGee_Pee,If it doesn't make you happy by December 31st....,Croydon,,0,2014-12-31 09:47:53,If it doesn t make you happy by December st le...,2014-12-31 09:00:00
2,550226837537058816,,0.006698,51.53202,0.006698,51.532021,31/12/2014 09:47:55,147266450,vinita_ramtri,@westfieldstrat thanks; and are you open tomor...,Newham,,0,2014-12-31 09:47:55,thanks and are you open tomorrow at all,2014-12-31 09:00:00
3,550227214227505152,,0.214417,51.582535,0.214417,51.582535,31/12/2014 09:49:25,518928608,CallumGordon7,@WestHam_Central but he is,Havering,,0,2014-12-31 09:49:25,but he is,2014-12-31 09:00:00
4,550227602011856896,,-0.104869,51.50953,-0.104869,51.509529,31/12/2014 09:50:57,151094999,HevnKISZ,@JJJ_Jeeppy >> # XD,Southwark,,0,2014-12-31 09:50:57,XD,2014-12-31 09:00:00


In [16]:
# tweets.to_csv("./data/tweets_by_spatial_grid.csv", index=False)

In [9]:
data = tweets[["id", "LOCATION", "parsed_tweets", "datetime", "datetime_to_nearest_hour"]]
data.head()

Unnamed: 0,id,LOCATION,parsed_tweets,datetime,datetime_to_nearest_hour
0,550226818624942080,Kingston upon Thames,Not yet got tics for Liverpool waiting till Fr...,2014-12-31 09:47:50,2014-12-31 09:00:00
1,550226827944660992,Croydon,If it doesn t make you happy by December st le...,2014-12-31 09:47:53,2014-12-31 09:00:00
2,550226837537058816,Newham,thanks and are you open tomorrow at all,2014-12-31 09:47:55,2014-12-31 09:00:00
3,550227214227505152,Havering,but he is,2014-12-31 09:49:25,2014-12-31 09:00:00
4,550227602011856896,Southwark,XD,2014-12-31 09:50:57,2014-12-31 09:00:00


In [10]:
tweets_by_borough_and_hour = data.groupby(["LOCATION", "datetime_to_nearest_hour"])["id"].agg("count").reset_index()

In [11]:
tweets_by_borough_and_hour["norm_count"] = np.nan
tweets_by_borough_and_hour.head()

Unnamed: 0,LOCATION,datetime_to_nearest_hour,id,norm_count
0,Barking and Dagenham,2014-12-31 04:00:00,7,
1,Barking and Dagenham,2014-12-31 05:00:00,2,
2,Barking and Dagenham,2014-12-31 06:00:00,6,
3,Barking and Dagenham,2014-12-31 07:00:00,9,
4,Barking and Dagenham,2014-12-31 08:00:00,24,


In [17]:
for idx, row in tweets_by_borough_and_hour.iterrows():
    tweets_by_borough = tweets_by_borough_and_hour[tweets_by_borough_and_hour["LOCATION"] == row["LOCATION"]]
    norm = (row.id - tweets_by_borough.id.min()) / (tweets_by_borough.id.max() - tweets_by_borough.id.min())
    tweets_by_borough_and_hour.at[idx,'norm_count'] = norm

In [18]:
tweets_by_borough_and_hour.head()

Unnamed: 0,LOCATION,datetime_to_nearest_hour,id,norm_count
0,Barking and Dagenham,2014-12-31 04:00:00,7,0.044643
1,Barking and Dagenham,2014-12-31 05:00:00,2,0.0
2,Barking and Dagenham,2014-12-31 06:00:00,6,0.035714
3,Barking and Dagenham,2014-12-31 07:00:00,9,0.0625
4,Barking and Dagenham,2014-12-31 08:00:00,24,0.196429


In [26]:
tweets_by_grid_and_hour.Ref.unique()

array([11, 21, 31, 41, 51, 61, 71, 81, 91, 12, 22, 32, 42, 52, 62, 72, 82,
       92, 13, 23, 33, 43, 53, 63, 73, 83, 93, 14, 24, 34, 44, 54, 64, 74,
       84, 94, 15, 25, 35, 45, 55, 65, 75, 85, 95, 16, 26, 36, 46, 56, 66,
       76, 86, 96, 17, 27, 37, 47, 57, 67, 77, 87, 97, 18, 28, 38, 48, 58,
       68, 78, 88, 98, 19, 29, 39, 49, 59, 69, 79, 89, 99])

In [20]:
norm_count_by_borough = tweets_by_borough_and_hour[["LOCATION", "norm_count", "datetime_to_nearest_hour"]]
norm_count_by_borough.head()

Unnamed: 0,LOCATION,norm_count,datetime_to_nearest_hour
0,Barking and Dagenham,0.044643,2014-12-31 04:00:00
1,Barking and Dagenham,0.0,2014-12-31 05:00:00
2,Barking and Dagenham,0.035714,2014-12-31 06:00:00
3,Barking and Dagenham,0.0625,2014-12-31 07:00:00
4,Barking and Dagenham,0.196429,2014-12-31 08:00:00


In [28]:
df = pd.DataFrame(columns=norm_count_by_borough.datetime_to_nearest_hour.unique())

In [29]:
df["LOCATION"] = norm_count_by_borough.LOCATION.unique()

In [30]:
# df.columns[:-1] = 0
df.loc[:,:-1] = 0.0

In [31]:
df.head()

Unnamed: 0,2014-12-31 04:00:00,2014-12-31 05:00:00,2014-12-31 06:00:00,2014-12-31 07:00:00,2014-12-31 08:00:00,2014-12-31 09:00:00,2014-12-31 10:00:00,2014-12-31 11:00:00,2014-12-31 12:00:00,2014-12-31 13:00:00,...,2015-01-01 19:00:00,2015-01-01 20:00:00,2015-01-01 21:00:00,2015-01-01 22:00:00,2015-01-01 23:00:00,2015-02-01 00:00:00,2015-02-01 01:00:00,2015-02-01 02:00:00,2015-02-01 03:00:00,LOCATION
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Barking and Dagenham
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Barnet
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Bexley
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Brent
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Bromley


In [32]:
for idx, row in norm_count_by_borough.iterrows():
    rowIndex = df.index[df.LOCATION == row["LOCATION"]]
    column = row["datetime_to_nearest_hour"]
    value = row["norm_count"]
    df.at[rowIndex, column] = value

In [33]:
df.head()

Unnamed: 0,2014-12-31 04:00:00,2014-12-31 05:00:00,2014-12-31 06:00:00,2014-12-31 07:00:00,2014-12-31 08:00:00,2014-12-31 09:00:00,2014-12-31 10:00:00,2014-12-31 11:00:00,2014-12-31 12:00:00,2014-12-31 13:00:00,...,2015-01-01 19:00:00,2015-01-01 20:00:00,2015-01-01 21:00:00,2015-01-01 22:00:00,2015-01-01 23:00:00,2015-02-01 00:00:00,2015-02-01 01:00:00,2015-02-01 02:00:00,2015-02-01 03:00:00,LOCATION
0,0.0446429,0.0,0.0357143,0.0625,0.196429,0.526786,0.482143,0.294643,0.357143,0.491071,...,1.0,0.660714,0.6875,0.517857,0.464286,0.348214,0.258929,0.125,0.116071,Barking and Dagenham
1,0.374408,0.00473934,0.0,0.0379147,0.184834,0.165877,0.440758,0.587678,0.625592,0.597156,...,1.0,0.578199,0.933649,0.663507,0.729858,0.454976,0.322275,0.222749,0.0947867,Barnet
2,0.0,0.0,0.0275862,0.0413793,0.158621,0.42069,0.57931,0.544828,0.744828,0.482759,...,0.924138,0.606897,0.917241,0.655172,0.848276,0.462069,0.482759,0.206897,0.0482759,Bexley
3,0.0596026,0.0,0.00662252,0.0728477,0.15894,0.15894,0.225166,0.218543,0.264901,0.284768,...,0.456954,0.483444,0.655629,0.503311,0.483444,0.562914,0.443709,0.311258,0.125828,Brent
4,0.0301205,0.0180723,0.0,0.120482,0.120482,0.253012,0.343373,0.415663,0.355422,0.355422,...,0.89759,0.704819,0.89759,0.96988,0.692771,0.5,0.331325,0.060241,0.0903614,Bromley


In [35]:
df.to_csv("./data/norm_by_borough_dateime.csv", index=False)

In [22]:
tweets_by_grid_and_hour.to_csv("./data/norm_count_by_grid_and_interval.csv", index=False)

In [36]:
norm_count_by_borough.head()

Unnamed: 0,LOCATION,norm_count,datetime_to_nearest_hour
0,Barking and Dagenham,0.044643,2014-12-31 04:00:00
1,Barking and Dagenham,0.0,2014-12-31 05:00:00
2,Barking and Dagenham,0.035714,2014-12-31 06:00:00
3,Barking and Dagenham,0.0625,2014-12-31 07:00:00
4,Barking and Dagenham,0.196429,2014-12-31 08:00:00


In [36]:
tweets_by_grid_and_hour["datetime_to_nearest_hour"].unique()

array(['2014-12-31T04:00:00.000000000', '2014-12-31T05:00:00.000000000',
       '2014-12-31T09:00:00.000000000', '2014-12-31T10:00:00.000000000',
       '2014-12-31T11:00:00.000000000', '2014-12-31T12:00:00.000000000',
       '2014-12-31T13:00:00.000000000', '2014-12-31T14:00:00.000000000',
       '2014-12-31T16:00:00.000000000', '2014-12-31T17:00:00.000000000',
       '2014-12-31T18:00:00.000000000', '2014-12-31T19:00:00.000000000',
       '2014-12-31T20:00:00.000000000', '2014-12-31T22:00:00.000000000',
       '2014-12-31T23:00:00.000000000', '2015-01-01T00:00:00.000000000',
       '2015-01-01T01:00:00.000000000', '2015-01-01T02:00:00.000000000',
       '2015-01-01T03:00:00.000000000', '2015-01-01T04:00:00.000000000',
       '2015-01-01T09:00:00.000000000', '2015-01-01T10:00:00.000000000',
       '2015-01-01T11:00:00.000000000', '2015-01-01T12:00:00.000000000',
       '2015-01-01T13:00:00.000000000', '2015-01-01T14:00:00.000000000',
       '2015-01-01T15:00:00.000000000', '2015-01-01

In [37]:
tweets_by_grid_and_hour[tweets_by_grid_and_hour["norm_count"] == 1.0].datetime_to_nearest_hour.value_counts()

2015-01-01 21:00:00    15
2015-01-01 00:00:00    14
2015-01-01 18:00:00    13
2015-01-01 19:00:00    13
2015-01-01 22:00:00    10
2015-01-01 14:00:00     4
2015-01-01 12:00:00     3
2015-01-01 20:00:00     2
2014-12-31 18:00:00     2
2014-12-31 21:00:00     2
2015-01-01 16:00:00     2
2014-12-31 17:00:00     2
2015-01-01 03:00:00     1
2015-01-01 15:00:00     1
2015-02-01 00:00:00     1
2014-12-31 10:00:00     1
2015-01-01 01:00:00     1
2015-01-01 23:00:00     1
2014-12-31 15:00:00     1
Name: datetime_to_nearest_hour, dtype: int64

In [38]:
peak_tweet_time_by_spatial_grid = tweets_by_grid_and_hour[tweets_by_grid_and_hour["norm_count"] == 1.0]

In [39]:
peak_tweet_time_by_spatial_grid.to_csv("./data/peak_tweet_time_by_spatial_grid.csv", index=False)

In [32]:
tweets_by_spatial_grid.head()

Unnamed: 0,Space,Row,One,datetime_to_nearest_hour,id,norm_count,blah
3442,9,10,1,2014-12-31 09:00:00,1,0.0,19
3443,9,10,1,2014-12-31 10:00:00,1,0.0,19
3444,9,10,1,2014-12-31 11:00:00,2,0.25,19
3445,9,10,1,2014-12-31 12:00:00,3,0.5,19
3446,9,10,1,2014-12-31 13:00:00,2,0.25,19


In [32]:
ref_to_cluster = pd.read_csv("./data/ref_to_cluster.csv")

In [33]:
ref_to_cluster.head()

Unnamed: 0,Ref,Cluster
0,11,0
1,21,1
2,31,1
3,41,0
4,51,0


In [34]:
tweets_by_grid_and_hour = pd.read_csv("./data/norm_count_by_grid_and_interval.csv")

In [35]:
tweets_by_grid_and_hour["Ref"] = tweets_by_grid_and_hour["Space"] + (tweets_by_grid_and_hour["Row"] - 1) * 10
tweets_by_grid_and_hour.head()

Unnamed: 0,Space,Row,One,datetime_to_nearest_hour,id,norm_count,Ref
0,1,2,1,2014-12-31 04:00:00,3,0.222222,11
1,1,2,1,2014-12-31 05:00:00,1,0.0,11
2,1,2,1,2014-12-31 09:00:00,1,0.0,11
3,1,2,1,2014-12-31 10:00:00,4,0.333333,11
4,1,2,1,2014-12-31 11:00:00,9,0.888889,11


In [36]:
tweets_by_cluster = tweets_by_grid_and_hour.merge(ref_to_cluster, left_on='Ref', right_on='Ref', how='outer')
tweets_by_cluster.head()

Unnamed: 0,Space,Row,One,datetime_to_nearest_hour,id,norm_count,Ref,Cluster
0,1,2,1,2014-12-31 04:00:00,3,0.222222,11,0
1,1,2,1,2014-12-31 05:00:00,1,0.0,11,0
2,1,2,1,2014-12-31 09:00:00,1,0.0,11,0
3,1,2,1,2014-12-31 10:00:00,4,0.333333,11,0
4,1,2,1,2014-12-31 11:00:00,9,0.888889,11,0


In [37]:
tweets_by_cluster.to_csv("./data/tweets_by_cluster.csv", index=False)

In [38]:
tweets_by_cluster.tail()

Unnamed: 0,Space,Row,One,datetime_to_nearest_hour,id,norm_count,Ref,Cluster
3459,9,10,1,2015-01-01 17:00:00,1,0.0,99,1
3460,9,10,1,2015-01-01 18:00:00,3,0.5,99,1
3461,9,10,1,2015-01-01 19:00:00,1,0.0,99,1
3462,9,10,1,2015-01-01 20:00:00,1,0.0,99,1
3463,9,10,1,2015-01-01 21:00:00,1,0.0,99,1


In [39]:
average_by_cluster = tweets_by_cluster.groupby(["Cluster", "datetime_to_nearest_hour"])["norm_count"].mean().reset_index()

In [40]:
cluster_6 = tweets_by_cluster[tweets_by_cluster["Cluster"] == 6]
cluster_6[cluster_6["datetime_to_nearest_hour"] == "2014-12-31 04:00:00"]["norm_count"].mean()

nan

In [41]:
average_by_cluster.to_csv("./data/average_by_cluster.csv", index=False)

In [42]:
average_by_cluster.head()

Unnamed: 0,Cluster,datetime_to_nearest_hour,norm_count
0,0,2014-12-31 04:00:00,0.069024
1,0,2014-12-31 05:00:00,0.03738
2,0,2014-12-31 06:00:00,0.037828
3,0,2014-12-31 07:00:00,0.077087
4,0,2014-12-31 08:00:00,0.149669


In [43]:
average_by_cluster.dtypes

Cluster                       int64
datetime_to_nearest_hour     object
norm_count                  float64
dtype: object

In [52]:
average_by_cluster["renormed"] = 0

In [53]:
average_by_cluster.head()

Unnamed: 0,Cluster,datetime_to_nearest_hour,norm_count,renormed
0,0,2014-12-31 04:00:00,0.069024,0
1,0,2014-12-31 05:00:00,0.03738,0
2,0,2014-12-31 06:00:00,0.037828,0
3,0,2014-12-31 07:00:00,0.077087,0
4,0,2014-12-31 08:00:00,0.149669,0


In [57]:
max_cluster_0 = average_by_cluster[average_by_cluster.Cluster == 0].norm_count.max()

In [62]:
average_by_cluster[average_by_cluster.Cluster == 0].loc[:,3] = average_by_cluster[average_by_cluster.Cluster == 0].norm_count / max_cluster_0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [72]:
average_by_cluster[average_by_cluster.Cluster == 1].norm_count.max()

0.4989209874475744

In [70]:
average_by_cluster.to_csv("./data/average_by_cluster.csv", index=False)

In [13]:
borough_to_cluster = pd.read_csv("./data/borough_to_cluster.csv")

In [14]:
borough_to_cluster.head()

Unnamed: 0,LOCATION,Cluster
0,Barking and Dagenham,1
1,Barnet,1
2,Bexley,1
3,Brent,2
4,Bromley,1


In [44]:
# tweets_by_grid_and_hour = pd.read_csv("./data/norm_count_by_grid_and_interval.csv")
norm_count_by_borough.head()

Unnamed: 0,LOCATION,norm_count,datetime_to_nearest_hour
0,Barking and Dagenham,0.044643,2014-12-31 04:00:00
1,Barking and Dagenham,0.0,2014-12-31 05:00:00
2,Barking and Dagenham,0.035714,2014-12-31 06:00:00
3,Barking and Dagenham,0.0625,2014-12-31 07:00:00
4,Barking and Dagenham,0.196429,2014-12-31 08:00:00


In [80]:
tweets_by_cluster = norm_count_by_borough.merge(borough_to_cluster, left_on='LOCATION', right_on='LOCATION', how='outer')
tweets_by_cluster.head()

Unnamed: 0,LOCATION,norm_count,datetime_to_nearest_hour,Cluster
0,Barking and Dagenham,0.044643,2014-12-31 04:00:00,1
1,Barking and Dagenham,0.0,2014-12-31 05:00:00,1
2,Barking and Dagenham,0.035714,2014-12-31 06:00:00,1
3,Barking and Dagenham,0.0625,2014-12-31 07:00:00,1
4,Barking and Dagenham,0.196429,2014-12-31 08:00:00,1


In [46]:
tweets_by_cluster.to_csv("./data/tweets_by_borough_cluster.csv", index=False)

In [47]:
# Find Average by cluster
average_by_cluster = tweets_by_cluster.groupby(["Cluster", "datetime_to_nearest_hour"])["norm_count"].mean().reset_index()

In [49]:
tweets_by_borough_and_hour.head()

Unnamed: 0,LOCATION,datetime_to_nearest_hour,id,norm_count
0,Barking and Dagenham,2014-12-31 04:00:00,7,0.044643
1,Barking and Dagenham,2014-12-31 05:00:00,2,0.0
2,Barking and Dagenham,2014-12-31 06:00:00,6,0.035714
3,Barking and Dagenham,2014-12-31 07:00:00,9,0.0625
4,Barking and Dagenham,2014-12-31 08:00:00,24,0.196429


In [50]:
tweets_by_cluster = tweets_by_cluster.merge(tweets_by_borough_and_hour, left_on='LOCATION', right_on='LOCATION', how='outer')

In [74]:
tweets_by_cluster = tweets_by_cluster[["Cluster","datetime_to_nearest_hour_x", "id"]]
tweets_by_cluster.columns = ['Cluster', 'datetime_to_nearest_hour', 'tweets']
tweets_by_cluster.head()

Unnamed: 0,Cluster,datetime_to_nearest_hour,tweets
0,1,2014-12-31 04:00:00,7
1,1,2014-12-31 04:00:00,2
2,1,2014-12-31 04:00:00,6
3,1,2014-12-31 04:00:00,9
4,1,2014-12-31 04:00:00,24


In [79]:
# tweets_by_cluster.groupby(["Cluster", "tweets_by_cluster"]).sum("id")
# tweets_by_cluster.groupby(["Cluster", "datetime_to_nearest_hour_x"])["id"].agg("sum")

# For each cluster and datetime, find total tweets
tweets_by_cluster.groupby(["Cluster", "datetime_to_nearest_hour"])["tweets"].sum()

Cluster  datetime_to_nearest_hour
0        2014-12-31 04:00:00         15264
         2014-12-31 05:00:00         15264
         2014-12-31 06:00:00         15264
         2014-12-31 07:00:00         15264
         2014-12-31 08:00:00         15264
         2014-12-31 09:00:00         15264
         2014-12-31 10:00:00         15264
         2014-12-31 11:00:00         15264
         2014-12-31 12:00:00         15264
         2014-12-31 13:00:00         15264
         2014-12-31 14:00:00         15264
         2014-12-31 15:00:00         15264
         2014-12-31 16:00:00         15264
         2014-12-31 17:00:00         15264
         2014-12-31 18:00:00         15264
         2014-12-31 19:00:00         15264
         2014-12-31 20:00:00         15264
         2014-12-31 21:00:00         15264
         2014-12-31 22:00:00         15264
         2014-12-31 23:00:00         15264
         2015-01-01 00:00:00         15264
         2015-01-01 01:00:00         15264
         2015-01-01 

In [81]:
tweets_by_cluster.head()

Unnamed: 0,LOCATION,norm_count,datetime_to_nearest_hour,Cluster
0,Barking and Dagenham,0.044643,2014-12-31 04:00:00,1
1,Barking and Dagenham,0.0,2014-12-31 05:00:00,1
2,Barking and Dagenham,0.035714,2014-12-31 06:00:00,1
3,Barking and Dagenham,0.0625,2014-12-31 07:00:00,1
4,Barking and Dagenham,0.196429,2014-12-31 08:00:00,1


In [82]:
average_by_cluster = tweets_by_cluster.groupby(["Cluster", "datetime_to_nearest_hour"])["norm_count"].mean().reset_index()

In [83]:
average_by_cluster.shape

(239, 3)

In [84]:
average_by_cluster.head()

Unnamed: 0,Cluster,datetime_to_nearest_hour,norm_count
0,0,2014-12-31 04:00:00,0.066748
1,0,2014-12-31 05:00:00,0.039402
2,0,2014-12-31 06:00:00,0.017628
3,0,2014-12-31 07:00:00,0.092108
4,0,2014-12-31 08:00:00,0.09649


In [85]:
average_by_cluster["renormed"] = 0

In [86]:
average_by_cluster.to_csv("./data/average_by_cluster.csv", index=False)

In [91]:
average_by_cluster[average_by_cluster["Cluster"] == 4].norm_count.max()

1.0

In [12]:
tweets_by_borough_and_hour

Unnamed: 0,LOCATION,datetime_to_nearest_hour,id,norm_count
0,Barking and Dagenham,2014-12-31 04:00:00,7,
1,Barking and Dagenham,2014-12-31 05:00:00,2,
2,Barking and Dagenham,2014-12-31 06:00:00,6,
3,Barking and Dagenham,2014-12-31 07:00:00,9,
4,Barking and Dagenham,2014-12-31 08:00:00,24,
5,Barking and Dagenham,2014-12-31 09:00:00,61,
6,Barking and Dagenham,2014-12-31 10:00:00,56,
7,Barking and Dagenham,2014-12-31 11:00:00,35,
8,Barking and Dagenham,2014-12-31 12:00:00,42,
9,Barking and Dagenham,2014-12-31 13:00:00,57,


In [15]:
tweets_by_cluster = tweets_by_borough_and_hour.merge(borough_to_cluster, left_on='LOCATION', right_on='LOCATION', how='outer')

In [16]:
tweets_by_cluster.head()

Unnamed: 0,LOCATION,datetime_to_nearest_hour,id,norm_count,Cluster
0,Barking and Dagenham,2014-12-31 04:00:00,7,,1
1,Barking and Dagenham,2014-12-31 05:00:00,2,,1
2,Barking and Dagenham,2014-12-31 06:00:00,6,,1
3,Barking and Dagenham,2014-12-31 07:00:00,9,,1
4,Barking and Dagenham,2014-12-31 08:00:00,24,,1


In [42]:
total_tweets_by_cluster_datetime = tweets_by_cluster.groupby(["Cluster", "datetime_to_nearest_hour"])["id"].agg("sum").reset_index()

In [43]:
total_tweets_by_cluster_datetime.head()

Unnamed: 0,Cluster,datetime_to_nearest_hour,id
0,0,2014-12-31 04:00:00,91
1,0,2014-12-31 05:00:00,59
2,0,2014-12-31 06:00:00,32
3,0,2014-12-31 07:00:00,97
4,0,2014-12-31 08:00:00,103


In [45]:
total_tweets_by_cluster_datetime.tail()

Unnamed: 0,Cluster,datetime_to_nearest_hour,id
234,4,2015-01-01 23:00:00,24
235,4,2015-02-01 00:00:00,3
236,4,2015-02-01 01:00:00,6
237,4,2015-02-01 02:00:00,3
238,4,2015-02-01 03:00:00,1


In [35]:
total_tweets_by_cluster_datetime["norm_count"] = np.nan
total_tweets_by_cluster_datetime.head()

Unnamed: 0,Cluster,datetime_to_nearest_hour,id,norm_count
0,0,2014-12-31 04:00:00,91,
1,0,2014-12-31 05:00:00,59,
2,0,2014-12-31 06:00:00,32,
3,0,2014-12-31 07:00:00,97,
4,0,2014-12-31 08:00:00,103,


In [44]:
total_tweets_by_cluster_datetime.shape

(239, 3)

In [48]:
for idx, row in total_tweets_by_cluster_datetime.iterrows():
    tweets_by_cluster = total_tweets_by_cluster_datetime[total_tweets_by_cluster_datetime["Cluster"] == row["Cluster"]]
    norm = (row.id - tweets_by_cluster.id.min()) / (tweets_by_cluster.id.max() - tweets_by_cluster.id.min())
    total_tweets_by_cluster_datetime.at[idx,'norm_count'] = norm

In [51]:
total_tweets_by_cluster_datetime[total_tweets_by_cluster_datetime.Cluster == 0]

Unnamed: 0,Cluster,datetime_to_nearest_hour,id,norm_count
0,0,2014-12-31 04:00:00,91,0.072417
1,0,2014-12-31 05:00:00,59,0.038339
2,0,2014-12-31 06:00:00,32,0.009585
3,0,2014-12-31 07:00:00,97,0.078807
4,0,2014-12-31 08:00:00,103,0.085197
5,0,2014-12-31 09:00:00,176,0.162939
6,0,2014-12-31 10:00:00,271,0.264111
7,0,2014-12-31 11:00:00,264,0.256656
8,0,2014-12-31 12:00:00,245,0.236422
9,0,2014-12-31 13:00:00,284,0.277955


In [1]:
total_tweets_by_cluster_datetime

NameError: name 'total_tweets_by_cluster_datetime' is not defined

In [53]:
total_tweets_by_cluster_datetime.to_csv("./data/average_by_cluster.csv", index=False)

In [52]:
23/962

0.02390852390852391

In [9]:
total_tweets_by_cluster_datetime = pd.read_csv("./data/average_by_cluster.csv")

In [12]:
total_tweets_by_cluster_datetime[total_tweets_by_cluster_datetime.Cluster == 4]

Unnamed: 0,Cluster,datetime_to_nearest_hour,id,norm_count
192,4,2014-12-31 04:00:00,2,0.015625
193,4,2014-12-31 06:00:00,7,0.09375
194,4,2014-12-31 07:00:00,10,0.140625
195,4,2014-12-31 08:00:00,33,0.5
196,4,2014-12-31 09:00:00,34,0.515625
197,4,2014-12-31 10:00:00,39,0.59375
198,4,2014-12-31 11:00:00,53,0.8125
199,4,2014-12-31 12:00:00,57,0.875
200,4,2014-12-31 13:00:00,52,0.796875
201,4,2014-12-31 14:00:00,46,0.703125
