In [1]:
import datetime
import math
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import *
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import string

In [2]:
tweets = pd.read_csv("./data/tweets_all.csv", encoding="latin1")

In [3]:
tweets.head()

Unnamed: 0,id,Name,X,Y,LONGITUDE,LATITUDE,MESSAGEDATE,USERID,USERSCREENNAME,MESSAGETEXT,LOCATION,HASHTAGS,ISRETWEET
0,550226818624942080,,-0.26215,51.391407,-0.26215,51.391407,31/12/2014 09:47:50,113918054,12Elbestreet,@julieo25 Not yet got tics for Liverpool waiti...,Kingston upon Thames; London,,0
1,550226823314178048,,-0.149466,51.49256,-0.149466,51.492561,31/12/2014 09:47:52,465989904,alice_foster_95,So. Many. Accents #alicesadventureswithmegabus,London; England,alicesadventureswithmegabus,0
2,550226827944660992,,-0.014891,51.354042,-0.014891,51.354042,31/12/2014 09:47:53,55872342,MissGee_Pee,If it doesn't make you happy by December 31st....,New Addington; London,,0
3,550226837537058816,,0.006698,51.53202,0.006698,51.532021,31/12/2014 09:47:55,147266450,vinita_ramtri,@westfieldstrat thanks; and are you open tomor...,Stratford; London,,0
4,550227214227505152,,0.214417,51.582535,0.214417,51.582535,31/12/2014 09:49:25,518928608,CallumGordon7,@WestHam_Central but he is,Romford; London,,0


In [4]:
tweets["datetime"] = pd.to_datetime(tweets.MESSAGEDATE)

In [5]:
tweets["datetime_to_nearest_hour"] = tweets.datetime.apply(lambda dt: datetime.datetime(dt.year, dt.month, dt.day, dt.hour))

Remove non-plaintext elements from tweets

In [6]:
def parse_tweets(tweet):

    tweet = re.sub('@[^\s]+','',tweet).strip()
    tweet = re.sub('#[^\s]+','',tweet).strip()
    tweet = re.sub(r'https?:\/\/.*[\r\n]*','',tweet).strip()
    tweet = " ".join(re.findall("[a-zA-Z]+", tweet))
    
    return tweet

In [7]:
tweets["parsed_tweets"] = tweets.MESSAGETEXT.apply(lambda tweet: parse_tweets(tweet))

In [8]:
tweets.head()

Unnamed: 0,id,Name,X,Y,LONGITUDE,LATITUDE,MESSAGEDATE,USERID,USERSCREENNAME,MESSAGETEXT,LOCATION,HASHTAGS,ISRETWEET,datetime,datetime_to_nearest_hour,parsed_tweets
0,550226818624942080,,-0.26215,51.391407,-0.26215,51.391407,31/12/2014 09:47:50,113918054,12Elbestreet,@julieo25 Not yet got tics for Liverpool waiti...,Kingston upon Thames; London,,0,2014-12-31 09:47:50,2014-12-31 09:00:00,Not yet got tics for Liverpool waiting till Fr...
1,550226823314178048,,-0.149466,51.49256,-0.149466,51.492561,31/12/2014 09:47:52,465989904,alice_foster_95,So. Many. Accents #alicesadventureswithmegabus,London; England,alicesadventureswithmegabus,0,2014-12-31 09:47:52,2014-12-31 09:00:00,So Many Accents
2,550226827944660992,,-0.014891,51.354042,-0.014891,51.354042,31/12/2014 09:47:53,55872342,MissGee_Pee,If it doesn't make you happy by December 31st....,New Addington; London,,0,2014-12-31 09:47:53,2014-12-31 09:00:00,If it doesn t make you happy by December st le...
3,550226837537058816,,0.006698,51.53202,0.006698,51.532021,31/12/2014 09:47:55,147266450,vinita_ramtri,@westfieldstrat thanks; and are you open tomor...,Stratford; London,,0,2014-12-31 09:47:55,2014-12-31 09:00:00,thanks and are you open tomorrow at all
4,550227214227505152,,0.214417,51.582535,0.214417,51.582535,31/12/2014 09:49:25,518928608,CallumGordon7,@WestHam_Central but he is,Romford; London,,0,2014-12-31 09:49:25,2014-12-31 09:00:00,but he is


In [9]:
# tweets.groupby("datetime_to_nearest_hour")["parsed_tweets"].apply(lambda s: "%s" % ' '.join(s))
# tweets_by_hour = pd.DataFrame({"message_text_for_hour" : tweets.groupby("datetime_to_nearest_hour")["parsed_tweets"].apply(lambda s: "%s" % ' '.join(s))}).reset_index()

In [10]:
# need to get pos tag
class StemmerTokenizer(object):

    def __init__(self):
        self.stemmer = PorterStemmer()
        
    def __call__(self, doc):
        is_noun = lambda pos: pos[:2] == 'NN'
        doc = " ".join(re.findall("[a-zA-Z]+", doc))
        tokenized = nltk.word_tokenize(doc)
        nouns = [word for (word, pos) in nltk.pos_tag(tokenized) if is_noun(pos)] 
        return [self.stemmer.stem(t) for t in word_tokenize(doc)]

In [11]:
idf_by_hourly_interval = {}

for hour_interval in tweets.datetime_to_nearest_hour.unique():
    clf = TfidfVectorizer(tokenizer=StemmerTokenizer(), min_df=5, max_df=0.7, ngram_range=(1, 1), stop_words=set(stopwords.words('english')))
    clf.fit(tweets[tweets["datetime_to_nearest_hour"] == hour_interval]["parsed_tweets"])
    word2idf = dict(zip(clf.get_feature_names(), clf.idf_))
    sorted_idf = sorted(word2idf.items(), key=lambda x: x[1], reverse=True)

    idf_by_hourly_interval[hour_interval] = {
        "hour_interval": hour_interval,
        "sorted_idf": sorted_idf
    }
    
    print("Completed", hour_interval)

print("Finished!")

Completed 2014-12-31T09:00:00.000000000
Completed 2014-12-31T10:00:00.000000000
Completed 2014-12-31T08:00:00.000000000
Completed 2014-12-31T06:00:00.000000000
Completed 2014-12-31T07:00:00.000000000
Completed 2014-12-31T04:00:00.000000000
Completed 2014-12-31T05:00:00.000000000
Completed 2014-12-31T14:00:00.000000000
Completed 2014-12-31T15:00:00.000000000
Completed 2014-12-31T11:00:00.000000000
Completed 2014-12-31T13:00:00.000000000
Completed 2014-12-31T12:00:00.000000000
Completed 2014-12-31T18:00:00.000000000
Completed 2014-12-31T19:00:00.000000000
Completed 2014-12-31T16:00:00.000000000
Completed 2014-12-31T17:00:00.000000000
Completed 2014-12-31T23:00:00.000000000
Completed 2015-01-01T00:00:00.000000000
Completed 2014-12-31T20:00:00.000000000
Completed 2014-12-31T22:00:00.000000000
Completed 2015-01-01T08:00:00.000000000
Completed 2015-01-01T09:00:00.000000000
Completed 2015-01-01T04:00:00.000000000
Completed 2015-01-01T05:00:00.000000000
Completed 2015-01-01T06:00:00.000000000


In [12]:
idf_by_hourly_interval[np.datetime64('2015-01-01T00:00:00.000000000')]

{'hour_interval': numpy.datetime64('2015-01-01T00:00:00.000000000'),
 'sorted_idf': [('ador', 7.809774870621934),
  ('alcohol', 7.809774870621934),
  ('apart', 7.809774870621934),
  ('appreci', 7.809774870621934),
  ('auguri', 7.809774870621934),
  ('awkward', 7.809774870621934),
  ('bath', 7.809774870621934),
  ('begin', 7.809774870621934),
  ('bell', 7.809774870621934),
  ('bon', 7.809774870621934),
  ('book', 7.809774870621934),
  ('bought', 7.809774870621934),
  ('british', 7.809774870621934),
  ('catch', 7.809774870621934),
  ('chanc', 7.809774870621934),
  ('class', 7.809774870621934),
  ('clear', 7.809774870621934),
  ('cross', 7.809774870621934),
  ('cup', 7.809774870621934),
  ('current', 7.809774870621934),
  ('david', 7.809774870621934),
  ('delight', 7.809774870621934),
  ('deserv', 7.809774870621934),
  ('di', 7.809774870621934),
  ('dinner', 7.809774870621934),
  ('ed', 7.809774870621934),
  ('either', 7.809774870621934),
  ('emot', 7.809774870621934),
  ('er', 7.80977487

In [13]:
idf_by_hourly_interval[np.datetime64('2015-01-01T18:00:00.000000000')]

{'hour_interval': numpy.datetime64('2015-01-01T18:00:00.000000000'),
 'sorted_idf': [('aliv', 7.728827955256198),
  ('alon', 7.728827955256198),
  ('android', 7.728827955256198),
  ('anyway', 7.728827955256198),
  ('asleep', 7.728827955256198),
  ('austin', 7.728827955256198),
  ('babi', 7.728827955256198),
  ('baller', 7.728827955256198),
  ('bbc', 7.728827955256198),
  ('bitch', 7.728827955256198),
  ('black', 7.728827955256198),
  ('bless', 7.728827955256198),
  ('bodi', 7.728827955256198),
  ('boo', 7.728827955256198),
  ('bt', 7.728827955256198),
  ('btw', 7.728827955256198),
  ('burger', 7.728827955256198),
  ('calm', 7.728827955256198),
  ('cant', 7.728827955256198),
  ('car', 7.728827955256198),
  ('centr', 7.728827955256198),
  ('central', 7.728827955256198),
  ('chees', 7.728827955256198),
  ('client', 7.728827955256198),
  ('cook', 7.728827955256198),
  ('cup', 7.728827955256198),
  ('delet', 7.728827955256198),
  ('detail', 7.728827955256198),
  ('dirti', 7.728827955256198)

In [14]:
idf_by_hourly_interval[np.datetime64('2015-01-01T19:00:00.000000000')]

{'hour_interval': numpy.datetime64('2015-01-01T19:00:00.000000000'),
 'sorted_idf': [('abl', 7.725833642209509),
  ('accept', 7.725833642209509),
  ('ahh', 7.725833642209509),
  ('aint', 7.725833642209509),
  ('angel', 7.725833642209509),
  ('app', 7.725833642209509),
  ('appar', 7.725833642209509),
  ('ass', 7.725833642209509),
  ('assist', 7.725833642209509),
  ('aswel', 7.725833642209509),
  ('avail', 7.725833642209509),
  ('averag', 7.725833642209509),
  ('badli', 7.725833642209509),
  ('bare', 7.725833642209509),
  ('bastard', 7.725833642209509),
  ('beaten', 7.725833642209509),
  ('block', 7.725833642209509),
  ('bottl', 7.725833642209509),
  ('build', 7.725833642209509),
  ('bum', 7.725833642209509),
  ('captain', 7.725833642209509),
  ('card', 7.725833642209509),
  ('career', 7.725833642209509),
  ('chant', 7.725833642209509),
  ('chat', 7.725833642209509),
  ('clearli', 7.725833642209509),
  ('cloth', 7.725833642209509),
  ('clue', 7.725833642209509),
  ('collect', 7.725833642

In [15]:
idf_by_hourly_interval[np.datetime64('2015-01-01T21:00:00.000000000')]

{'hour_interval': numpy.datetime64('2015-01-01T21:00:00.000000000'),
 'sorted_idf': [('abi', 7.687731855004171),
  ('acc', 7.687731855004171),
  ('ahhh', 7.687731855004171),
  ('albert', 7.687731855004171),
  ('allah', 7.687731855004171),
  ('allow', 7.687731855004171),
  ('alright', 7.687731855004171),
  ('ann', 7.687731855004171),
  ('ano', 7.687731855004171),
  ('avec', 7.687731855004171),
  ('awesom', 7.687731855004171),
  ('aww', 7.687731855004171),
  ('beal', 7.687731855004171),
  ('biggest', 7.687731855004171),
  ('birth', 7.687731855004171),
  ('blame', 7.687731855004171),
  ('bless', 7.687731855004171),
  ('bon', 7.687731855004171),
  ('bran', 7.687731855004171),
  ('brown', 7.687731855004171),
  ('calm', 7.687731855004171),
  ('ce', 7.687731855004171),
  ('challeng', 7.687731855004171),
  ('chines', 7.687731855004171),
  ('clean', 7.687731855004171),
  ('close', 7.687731855004171),
  ('coffe', 7.687731855004171),
  ('commentari', 7.687731855004171),
  ('con', 7.68773185500417

In [16]:
idf_by_hourly_interval[np.datetime64('2015-01-01T16:00:00.000000000')]

{'hour_interval': numpy.datetime64('2015-01-01T16:00:00.000000000'),
 'sorted_idf': [('abbey', 7.600323911145338),
  ('abl', 7.600323911145338),
  ('admit', 7.600323911145338),
  ('aim', 7.600323911145338),
  ('amen', 7.600323911145338),
  ('appear', 7.600323911145338),
  ('aswel', 7.600323911145338),
  ('awesom', 7.600323911145338),
  ('ba', 7.600323911145338),
  ('bank', 7.600323911145338),
  ('battl', 7.600323911145338),
  ('bench', 7.600323911145338),
  ('biggest', 7.600323911145338),
  ('bitch', 7.600323911145338),
  ('blow', 7.600323911145338),
  ('board', 7.600323911145338),
  ('boom', 7.600323911145338),
  ('brendan', 7.600323911145338),
  ('bromley', 7.600323911145338),
  ('bush', 7.600323911145338),
  ('buzz', 7.600323911145338),
  ('charact', 7.600323911145338),
  ('charg', 7.600323911145338),
  ('chat', 7.600323911145338),
  ('chicken', 7.600323911145338),
  ('corner', 7.600323911145338),
  ('cost', 7.600323911145338),
  ('count', 7.600323911145338),
  ('cover', 7.600323911

In [24]:
searchfor = ['new', 'year', "new year"]

tweets[tweets["parsed_tweets"].str.contains("|".join(searchfor))].groupby(['datetime_to_nearest_hour'])['id'] \
                             .count() \
                             .reset_index(name='count') \
                             .sort_values(['count'], ascending=False) \
                             .head(5)

Unnamed: 0,datetime_to_nearest_hour,count
20,2015-01-01 00:00:00,1437
21,2015-01-01 01:00:00,515
16,2014-12-31 20:00:00,465
19,2014-12-31 23:00:00,460
14,2014-12-31 18:00:00,353


In [28]:
searchfor = ['foot', 'chelsea', "tottenham"]

tweets[tweets["parsed_tweets"].str.contains("|".join(searchfor))].groupby(['datetime_to_nearest_hour'])['id'] \
                             .count() \
                             .reset_index(name='count') \
                             .sort_values(['count'], ascending=False) \
                             .head(5)

Unnamed: 0,datetime_to_nearest_hour,count
34,2015-01-01 19:00:00,94
33,2015-01-01 18:00:00,79
32,2015-01-01 17:00:00,41
31,2015-01-01 16:00:00,34
37,2015-01-01 22:00:00,26


In [30]:
searchfor = ['firework', 'fireworks']

tweets[tweets["parsed_tweets"].str.contains("|".join(searchfor))].groupby(['datetime_to_nearest_hour'])['id'] \
                             .count() \
                             .reset_index(name='count') \
                             .sort_values(['count'], ascending=False) \
                             .head(5)

Unnamed: 0,datetime_to_nearest_hour,count
18,2015-01-01 00:00:00,189
19,2015-01-01 01:00:00,52
14,2014-12-31 20:00:00,35
17,2014-12-31 23:00:00,34
13,2014-12-31 19:00:00,32


In [31]:
tweets[tweets["parsed_tweets"].str.contains("|".join(searchfor))].groupby(['datetime_to_nearest_hour'])['id'] \
                             .count() \
                             .reset_index(name='count') \
                             .sort_values(['count'], ascending=False) \
                             .head(5)

Unnamed: 0,datetime_to_nearest_hour,count
10,2015-01-01 00:00:00,4
13,2015-01-01 20:00:00,2
14,2015-02-01 00:00:00,2
0,2014-12-31 11:00:00,1
1,2014-12-31 14:00:00,1
