In [3]:
import datetime
import math
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import *
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import string

In [5]:
tweets = pd.read_csv("./data/tweets_all.csv", encoding="latin1")

In [6]:
min_lat = tweets.LATITUDE.min()
min_long = tweets.LONGITUDE.min()

In [7]:
lat_range = tweets.LATITUDE.max() - tweets.LATITUDE.min()
long_range = tweets.LONGITUDE.max() - tweets.LONGITUDE.min()

lat_bin_size = lat_range/9
long_bin_size= long_range/9

In [8]:
tweets["Space"] = tweets.LONGITUDE.apply(lambda long: (((long - min_long)//long_bin_size)+1))
tweets["Row"] = tweets.LATITUDE.apply(lambda lat: (9-((lat - min_lat)//lat_bin_size)+1))

In [9]:
tweets["Space"] = tweets["Space"].apply(lambda l: int(l))
tweets["Row"] = tweets["Row"].apply(lambda l: int(l))

In [10]:
tweets = tweets[~((tweets["Row"] == 1) & (tweets["Space"] == 8 ))]
tweets = tweets[~((tweets["Row"] == 3) & (tweets["Space"] == 10 ))]

In [11]:
tweets["One"] = 1

In [12]:
def parse_tweets(tweet):

    tweet = re.sub('@[^\s]+','',tweet).strip()
    tweet = re.sub('#[^\s]+','',tweet).strip()
    tweet = re.sub(r'https?:\/\/.*[\r\n]*','',tweet).strip()
    tweet = " ".join(re.findall("[a-zA-Z]+", tweet))
    
    return tweet

In [13]:
tweets["parsed_tweets"] = tweets.MESSAGETEXT.apply(lambda tweet: parse_tweets(tweet))

In [14]:
# need to get pos tag
class StemmerTokenizer(object):

    def __init__(self):
        self.stemmer = PorterStemmer()
        
    def __call__(self, doc):
        is_noun = lambda pos: pos[:2] == 'NN'
        doc = " ".join(re.findall("[a-zA-Z]+", doc))
        tokenized = nltk.word_tokenize(doc)
        nouns = [word for (word, pos) in nltk.pos_tag(tokenized) if is_noun(pos)] 
        return [self.stemmer.stem(t) for t in word_tokenize(doc)]

In [15]:
tweets["datetime"] = pd.to_datetime(tweets.MESSAGEDATE)

In [16]:
tweets["datetime_to_nearest_hour"] = tweets.datetime.apply(lambda dt: datetime.datetime(dt.year, dt.month, dt.day, dt.hour))

In [17]:
tweets.head()

Unnamed: 0,id,Name,X,Y,LONGITUDE,LATITUDE,MESSAGEDATE,USERID,USERSCREENNAME,MESSAGETEXT,LOCATION,HASHTAGS,ISRETWEET,Space,Row,One,parsed_tweets,datetime,datetime_to_nearest_hour
0,550226818624942080,,-0.26215,51.391407,-0.26215,51.391407,31/12/2014 09:47:50,113918054,12Elbestreet,@julieo25 Not yet got tics for Liverpool waiti...,Kingston upon Thames; London,,0,3,8,1,Not yet got tics for Liverpool waiting till Fr...,2014-12-31 09:47:50,2014-12-31 09:00:00
1,550226823314178048,,-0.149466,51.49256,-0.149466,51.492561,31/12/2014 09:47:52,465989904,alice_foster_95,So. Many. Accents #alicesadventureswithmegabus,London; England,alicesadventureswithmegabus,0,4,6,1,So Many Accents,2014-12-31 09:47:52,2014-12-31 09:00:00
2,550226827944660992,,-0.014891,51.354042,-0.014891,51.354042,31/12/2014 09:47:53,55872342,MissGee_Pee,If it doesn't make you happy by December 31st....,New Addington; London,,0,6,9,1,If it doesn t make you happy by December st le...,2014-12-31 09:47:53,2014-12-31 09:00:00
3,550226837537058816,,0.006698,51.53202,0.006698,51.532021,31/12/2014 09:47:55,147266450,vinita_ramtri,@westfieldstrat thanks; and are you open tomor...,Stratford; London,,0,6,5,1,thanks and are you open tomorrow at all,2014-12-31 09:47:55,2014-12-31 09:00:00
4,550227214227505152,,0.214417,51.582535,0.214417,51.582535,31/12/2014 09:49:25,518928608,CallumGordon7,@WestHam_Central but he is,Romford; London,,0,8,4,1,but he is,2014-12-31 09:49:25,2014-12-31 09:00:00


In [16]:
# tweets.to_csv("./data/tweets_by_spatial_grid.csv", index=False)

In [18]:
data = tweets[["id", "Space", "Row", "One", "parsed_tweets", "datetime", "datetime_to_nearest_hour"]]
data.head()

Unnamed: 0,id,Space,Row,One,parsed_tweets,datetime,datetime_to_nearest_hour
0,550226818624942080,3,8,1,Not yet got tics for Liverpool waiting till Fr...,2014-12-31 09:47:50,2014-12-31 09:00:00
1,550226823314178048,4,6,1,So Many Accents,2014-12-31 09:47:52,2014-12-31 09:00:00
2,550226827944660992,6,9,1,If it doesn t make you happy by December st le...,2014-12-31 09:47:53,2014-12-31 09:00:00
3,550226837537058816,6,5,1,thanks and are you open tomorrow at all,2014-12-31 09:47:55,2014-12-31 09:00:00
4,550227214227505152,8,4,1,but he is,2014-12-31 09:49:25,2014-12-31 09:00:00


In [19]:
tweets_by_grid_and_hour = data.groupby(["Space", "Row", "One", "datetime_to_nearest_hour"])["id"].agg("count").reset_index()

In [20]:
tweets_by_grid_and_hour["norm_count"] = np.nan
tweets_by_grid_and_hour.head()

Unnamed: 0,Space,Row,One,datetime_to_nearest_hour,id,norm_count
0,1,2,1,2014-12-31 04:00:00,3,
1,1,2,1,2014-12-31 05:00:00,1,
2,1,2,1,2014-12-31 09:00:00,1,
3,1,2,1,2014-12-31 10:00:00,4,
4,1,2,1,2014-12-31 11:00:00,9,


In [21]:
for idx, row in tweets_by_grid_and_hour.iterrows():
    tweets_by_spatial_grid = tweets_by_grid_and_hour[(tweets_by_grid_and_hour["Space"] == row["Space"]) & (tweets_by_grid_and_hour["Row"] == row["Row"])]
    norm = (row.id - tweets_by_spatial_grid.id.min()) / (tweets_by_spatial_grid.id.max() - tweets_by_spatial_grid.id.min())
    tweets_by_grid_and_hour.at[idx,'norm_count'] = norm

In [22]:
tweets_by_grid_and_hour.head()

Unnamed: 0,Space,Row,One,datetime_to_nearest_hour,id,norm_count
0,1,2,1,2014-12-31 04:00:00,3,0.222222
1,1,2,1,2014-12-31 05:00:00,1,0.0
2,1,2,1,2014-12-31 09:00:00,1,0.0
3,1,2,1,2014-12-31 10:00:00,4,0.333333
4,1,2,1,2014-12-31 11:00:00,9,0.888889


In [23]:
print(tweets_by_grid_and_hour.Space.unique())
print(tweets_by_grid_and_hour.Row.unique())

[1 2 3 4 5 6 7 8 9]
[ 2  3  4  5  6  7  8  9 10]


In [24]:
# Grid ref = Space + (Row // 10) * 10
tweets_by_grid_and_hour["Ref"] = tweets_by_grid_and_hour["Space"] + (tweets_by_grid_and_hour["Row"] - 1) * 10

In [25]:
tweets_by_grid_and_hour[tweets_by_grid_and_hour.Ref == 11]

Unnamed: 0,Space,Row,One,datetime_to_nearest_hour,id,norm_count,Ref
0,1,2,1,2014-12-31 04:00:00,3,0.222222,11
1,1,2,1,2014-12-31 05:00:00,1,0.0,11
2,1,2,1,2014-12-31 09:00:00,1,0.0,11
3,1,2,1,2014-12-31 10:00:00,4,0.333333,11
4,1,2,1,2014-12-31 11:00:00,9,0.888889,11
5,1,2,1,2014-12-31 12:00:00,6,0.555556,11
6,1,2,1,2014-12-31 13:00:00,2,0.111111,11
7,1,2,1,2014-12-31 14:00:00,6,0.555556,11
8,1,2,1,2014-12-31 16:00:00,6,0.555556,11
9,1,2,1,2014-12-31 17:00:00,6,0.555556,11


In [26]:
tweets_by_grid_and_hour.Ref.unique()

array([11, 21, 31, 41, 51, 61, 71, 81, 91, 12, 22, 32, 42, 52, 62, 72, 82,
       92, 13, 23, 33, 43, 53, 63, 73, 83, 93, 14, 24, 34, 44, 54, 64, 74,
       84, 94, 15, 25, 35, 45, 55, 65, 75, 85, 95, 16, 26, 36, 46, 56, 66,
       76, 86, 96, 17, 27, 37, 47, 57, 67, 77, 87, 97, 18, 28, 38, 48, 58,
       68, 78, 88, 98, 19, 29, 39, 49, 59, 69, 79, 89, 99])

In [27]:
norm_count_by_ref = tweets_by_grid_and_hour[["Ref", "norm_count", "datetime_to_nearest_hour"]]
norm_count_by_ref.Ref.unique()

array([11, 21, 31, 41, 51, 61, 71, 81, 91, 12, 22, 32, 42, 52, 62, 72, 82,
       92, 13, 23, 33, 43, 53, 63, 73, 83, 93, 14, 24, 34, 44, 54, 64, 74,
       84, 94, 15, 25, 35, 45, 55, 65, 75, 85, 95, 16, 26, 36, 46, 56, 66,
       76, 86, 96, 17, 27, 37, 47, 57, 67, 77, 87, 97, 18, 28, 38, 48, 58,
       68, 78, 88, 98, 19, 29, 39, 49, 59, 69, 79, 89, 99])

In [28]:
norm_count_by_ref.head()

Unnamed: 0,Ref,norm_count,datetime_to_nearest_hour
0,11,0.222222,2014-12-31 04:00:00
1,11,0.0,2014-12-31 05:00:00
2,11,0.0,2014-12-31 09:00:00
3,11,0.333333,2014-12-31 10:00:00
4,11,0.888889,2014-12-31 11:00:00


In [29]:
df = pd.DataFrame(columns=norm_count_by_ref.datetime_to_nearest_hour.unique())

In [30]:
df["Ref"] = norm_count_by_ref.Ref.unique()

In [31]:
# df.columns[:-1] = 0
df.loc[:,:-1] = 0.0

In [32]:
df.head()

Unnamed: 0,2014-12-31 04:00:00,2014-12-31 05:00:00,2014-12-31 09:00:00,2014-12-31 10:00:00,2014-12-31 11:00:00,2014-12-31 12:00:00,2014-12-31 13:00:00,2014-12-31 14:00:00,2014-12-31 16:00:00,2014-12-31 17:00:00,...,2014-12-31 07:00:00,2014-12-31 08:00:00,2014-12-31 15:00:00,2014-12-31 21:00:00,2015-01-01 05:00:00,2015-01-01 08:00:00,2015-02-01 03:00:00,2015-01-01 06:00:00,2015-01-01 07:00:00,Ref
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,31
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,41
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,51


In [33]:
for idx, row in norm_count_by_ref.iterrows():
    rowIndex = df.index[df.Ref == row["Ref"]]
    column = row["datetime_to_nearest_hour"]
    value = row["norm_count"]
    df.at[rowIndex, column] = value

In [34]:
df.head()

Unnamed: 0,2014-12-31 04:00:00,2014-12-31 05:00:00,2014-12-31 09:00:00,2014-12-31 10:00:00,2014-12-31 11:00:00,2014-12-31 12:00:00,2014-12-31 13:00:00,2014-12-31 14:00:00,2014-12-31 16:00:00,2014-12-31 17:00:00,...,2014-12-31 07:00:00,2014-12-31 08:00:00,2014-12-31 15:00:00,2014-12-31 21:00:00,2015-01-01 05:00:00,2015-01-01 08:00:00,2015-02-01 03:00:00,2015-01-01 06:00:00,2015-01-01 07:00:00,Ref
0,0.222222,0.0,0.0,0.333333,0.888889,0.555556,0.111111,0.555556,0.555556,0.555556,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11
1,0.0,0.0,0.076923,0.410256,0.282051,0.205128,0.128205,0.25641,0.230769,0.179487,...,0.0,0.076923,0.076923,0.128205,0.0,0.230769,0.076923,0.0,0.0,21
2,0.0,0.0,0.032258,0.096774,0.193548,0.16129,0.129032,0.16129,0.16129,0.129032,...,0.0,0.0,0.032258,0.0,0.064516,0.032258,0.0,0.129032,0.0,31
3,0.039216,0.0,0.196078,0.156863,0.529412,0.823529,0.490196,0.411765,0.392157,0.411765,...,0.058824,0.156863,0.705882,0.313725,0.0,0.0,0.019608,0.0,0.0,41
4,0.0,0.064516,0.645161,0.494624,0.311828,0.763441,0.462366,0.494624,0.301075,0.462366,...,0.290323,0.55914,0.569892,0.150538,0.064516,0.11828,0.11828,0.096774,0.053763,51


In [35]:
df.isnull().any()

2014-12-31 04:00:00    False
2014-12-31 05:00:00    False
2014-12-31 09:00:00    False
2014-12-31 10:00:00    False
2014-12-31 11:00:00    False
2014-12-31 12:00:00    False
2014-12-31 13:00:00    False
2014-12-31 14:00:00    False
2014-12-31 16:00:00    False
2014-12-31 17:00:00    False
2014-12-31 18:00:00    False
2014-12-31 19:00:00    False
2014-12-31 20:00:00    False
2014-12-31 22:00:00    False
2014-12-31 23:00:00    False
2015-01-01 00:00:00    False
2015-01-01 01:00:00    False
2015-01-01 02:00:00    False
2015-01-01 03:00:00    False
2015-01-01 04:00:00    False
2015-01-01 09:00:00    False
2015-01-01 10:00:00    False
2015-01-01 11:00:00    False
2015-01-01 12:00:00    False
2015-01-01 13:00:00    False
2015-01-01 14:00:00    False
2015-01-01 15:00:00    False
2015-01-01 16:00:00    False
2015-01-01 17:00:00    False
2015-01-01 18:00:00    False
2015-01-01 19:00:00    False
2015-01-01 20:00:00    False
2015-01-01 21:00:00    False
2015-01-01 22:00:00    False
2015-01-01 23:

In [64]:
df.to_csv("./data/norm_by_ref_datime.csv", index=False)

In [22]:
tweets_by_grid_and_hour.to_csv("./data/norm_count_by_grid_and_interval.csv", index=False)

In [36]:
tweets_by_grid_and_hour["datetime_to_nearest_hour"].unique()

array(['2014-12-31T04:00:00.000000000', '2014-12-31T05:00:00.000000000',
       '2014-12-31T09:00:00.000000000', '2014-12-31T10:00:00.000000000',
       '2014-12-31T11:00:00.000000000', '2014-12-31T12:00:00.000000000',
       '2014-12-31T13:00:00.000000000', '2014-12-31T14:00:00.000000000',
       '2014-12-31T16:00:00.000000000', '2014-12-31T17:00:00.000000000',
       '2014-12-31T18:00:00.000000000', '2014-12-31T19:00:00.000000000',
       '2014-12-31T20:00:00.000000000', '2014-12-31T22:00:00.000000000',
       '2014-12-31T23:00:00.000000000', '2015-01-01T00:00:00.000000000',
       '2015-01-01T01:00:00.000000000', '2015-01-01T02:00:00.000000000',
       '2015-01-01T03:00:00.000000000', '2015-01-01T04:00:00.000000000',
       '2015-01-01T09:00:00.000000000', '2015-01-01T10:00:00.000000000',
       '2015-01-01T11:00:00.000000000', '2015-01-01T12:00:00.000000000',
       '2015-01-01T13:00:00.000000000', '2015-01-01T14:00:00.000000000',
       '2015-01-01T15:00:00.000000000', '2015-01-01

In [37]:
tweets_by_grid_and_hour[tweets_by_grid_and_hour["norm_count"] == 1.0].datetime_to_nearest_hour.value_counts()

2015-01-01 21:00:00    15
2015-01-01 00:00:00    14
2015-01-01 18:00:00    13
2015-01-01 19:00:00    13
2015-01-01 22:00:00    10
2015-01-01 14:00:00     4
2015-01-01 12:00:00     3
2015-01-01 20:00:00     2
2014-12-31 18:00:00     2
2014-12-31 21:00:00     2
2015-01-01 16:00:00     2
2014-12-31 17:00:00     2
2015-01-01 03:00:00     1
2015-01-01 15:00:00     1
2015-02-01 00:00:00     1
2014-12-31 10:00:00     1
2015-01-01 01:00:00     1
2015-01-01 23:00:00     1
2014-12-31 15:00:00     1
Name: datetime_to_nearest_hour, dtype: int64

In [38]:
peak_tweet_time_by_spatial_grid = tweets_by_grid_and_hour[tweets_by_grid_and_hour["norm_count"] == 1.0]

In [39]:
peak_tweet_time_by_spatial_grid.to_csv("./data/peak_tweet_time_by_spatial_grid.csv", index=False)

In [32]:
tweets_by_spatial_grid.head()

Unnamed: 0,Space,Row,One,datetime_to_nearest_hour,id,norm_count,blah
3442,9,10,1,2014-12-31 09:00:00,1,0.0,19
3443,9,10,1,2014-12-31 10:00:00,1,0.0,19
3444,9,10,1,2014-12-31 11:00:00,2,0.25,19
3445,9,10,1,2014-12-31 12:00:00,3,0.5,19
3446,9,10,1,2014-12-31 13:00:00,2,0.25,19


In [32]:
ref_to_cluster = pd.read_csv("./data/ref_to_cluster.csv")

In [33]:
ref_to_cluster.head()

Unnamed: 0,Ref,Cluster
0,11,0
1,21,1
2,31,1
3,41,0
4,51,0


In [34]:
tweets_by_grid_and_hour = pd.read_csv("./data/norm_count_by_grid_and_interval.csv")

In [35]:
tweets_by_grid_and_hour["Ref"] = tweets_by_grid_and_hour["Space"] + (tweets_by_grid_and_hour["Row"] - 1) * 10
tweets_by_grid_and_hour.head()

Unnamed: 0,Space,Row,One,datetime_to_nearest_hour,id,norm_count,Ref
0,1,2,1,2014-12-31 04:00:00,3,0.222222,11
1,1,2,1,2014-12-31 05:00:00,1,0.0,11
2,1,2,1,2014-12-31 09:00:00,1,0.0,11
3,1,2,1,2014-12-31 10:00:00,4,0.333333,11
4,1,2,1,2014-12-31 11:00:00,9,0.888889,11


In [36]:
tweets_by_cluster = tweets_by_grid_and_hour.merge(ref_to_cluster, left_on='Ref', right_on='Ref', how='outer')
tweets_by_cluster.head()

Unnamed: 0,Space,Row,One,datetime_to_nearest_hour,id,norm_count,Ref,Cluster
0,1,2,1,2014-12-31 04:00:00,3,0.222222,11,0
1,1,2,1,2014-12-31 05:00:00,1,0.0,11,0
2,1,2,1,2014-12-31 09:00:00,1,0.0,11,0
3,1,2,1,2014-12-31 10:00:00,4,0.333333,11,0
4,1,2,1,2014-12-31 11:00:00,9,0.888889,11,0


In [37]:
tweets_by_cluster.to_csv("./data/tweets_by_cluster.csv", index=False)

In [38]:
tweets_by_cluster.tail()

Unnamed: 0,Space,Row,One,datetime_to_nearest_hour,id,norm_count,Ref,Cluster
3459,9,10,1,2015-01-01 17:00:00,1,0.0,99,1
3460,9,10,1,2015-01-01 18:00:00,3,0.5,99,1
3461,9,10,1,2015-01-01 19:00:00,1,0.0,99,1
3462,9,10,1,2015-01-01 20:00:00,1,0.0,99,1
3463,9,10,1,2015-01-01 21:00:00,1,0.0,99,1


In [39]:
average_by_cluster = tweets_by_cluster.groupby(["Cluster", "datetime_to_nearest_hour"])["norm_count"].mean().reset_index()

In [40]:
cluster_6 = tweets_by_cluster[tweets_by_cluster["Cluster"] == 6]
cluster_6[cluster_6["datetime_to_nearest_hour"] == "2014-12-31 04:00:00"]["norm_count"].mean()

nan

In [41]:
average_by_cluster.to_csv("./data/average_by_cluster.csv", index=False)

In [42]:
average_by_cluster.head()

Unnamed: 0,Cluster,datetime_to_nearest_hour,norm_count
0,0,2014-12-31 04:00:00,0.069024
1,0,2014-12-31 05:00:00,0.03738
2,0,2014-12-31 06:00:00,0.037828
3,0,2014-12-31 07:00:00,0.077087
4,0,2014-12-31 08:00:00,0.149669


In [43]:
average_by_cluster.dtypes

Cluster                       int64
datetime_to_nearest_hour     object
norm_count                  float64
dtype: object

In [52]:
average_by_cluster["renormed"] = 0

In [53]:
average_by_cluster.head()

Unnamed: 0,Cluster,datetime_to_nearest_hour,norm_count,renormed
0,0,2014-12-31 04:00:00,0.069024,0
1,0,2014-12-31 05:00:00,0.03738,0
2,0,2014-12-31 06:00:00,0.037828,0
3,0,2014-12-31 07:00:00,0.077087,0
4,0,2014-12-31 08:00:00,0.149669,0


In [57]:
max_cluster_0 = average_by_cluster[average_by_cluster.Cluster == 0].norm_count.max()

In [62]:
average_by_cluster[average_by_cluster.Cluster == 0].loc[:,3] = average_by_cluster[average_by_cluster.Cluster == 0].norm_count / max_cluster_0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [72]:
average_by_cluster[average_by_cluster.Cluster == 1].norm_count.max()

0.4989209874475744

In [70]:
average_by_cluster.to_csv("./data/average_by_cluster.csv", index=False)