In [1]:
import datetime
import math
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import *
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import string

In [2]:
tweets = pd.read_csv("./data/london_tweets.csv", encoding="latin1")

In [3]:
tweets.head()

Unnamed: 0,id,Name,X,Y,LONGITUDE,LATITUDE,MESSAGEDATE,USERID,USERSCREENNAME,MESSAGETEXT,LOCATION,HASHTAGS,ISRETWEET,datetime
0,550226818624942080,,-0.26215,51.391407,-0.26215,51.391407,31/12/2014 09:47:50,113918054,12Elbestreet,@julieo25 Not yet got tics for Liverpool waiti...,Kingston upon Thames,,0,2014-12-31 09:47:50
1,550226827944660992,,-0.014891,51.354042,-0.014891,51.354042,31/12/2014 09:47:53,55872342,MissGee_Pee,If it doesn't make you happy by December 31st....,Croydon,,0,2014-12-31 09:47:53
2,550226837537058816,,0.006698,51.53202,0.006698,51.532021,31/12/2014 09:47:55,147266450,vinita_ramtri,@westfieldstrat thanks; and are you open tomor...,Newham,,0,2014-12-31 09:47:55
3,550227214227505152,,0.214417,51.582535,0.214417,51.582535,31/12/2014 09:49:25,518928608,CallumGordon7,@WestHam_Central but he is,Havering,,0,2014-12-31 09:49:25
4,550227602011856896,,-0.104869,51.50953,-0.104869,51.509529,31/12/2014 09:50:57,151094999,HevnKISZ,@JJJ_Jeeppy >> # XD,Southwark,,0,2014-12-31 09:50:57


In [4]:
tweets["datetime"] = pd.to_datetime(tweets.MESSAGEDATE)

In [5]:
tweets["datetime_to_nearest_hour"] = tweets.datetime.apply(lambda dt: datetime.datetime(dt.year, dt.month, dt.day, dt.hour))

Remove non-plaintext elements from tweets

In [6]:
def parse_tweets(tweet):

    tweet = re.sub('@[^\s]+','',tweet).strip()
    tweet = re.sub('#[^\s]+','',tweet).strip()
    tweet = re.sub(r'https?:\/\/.*[\r\n]*','',tweet).strip()
    tweet = " ".join(re.findall("[a-zA-Z]+", tweet))
    
    return tweet

In [7]:
tweets["parsed_tweets"] = tweets.MESSAGETEXT.apply(lambda tweet: parse_tweets(tweet))

In [9]:
tweets.head()

Unnamed: 0,id,Name,X,Y,LONGITUDE,LATITUDE,MESSAGEDATE,USERID,USERSCREENNAME,MESSAGETEXT,LOCATION,HASHTAGS,ISRETWEET,datetime,datetime_to_nearest_hour,parsed_tweets
0,550226818624942080,,-0.26215,51.391407,-0.26215,51.391407,31/12/2014 09:47:50,113918054,12Elbestreet,@julieo25 Not yet got tics for Liverpool waiti...,Kingston upon Thames,,0,2014-12-31 09:47:50,2014-12-31 09:00:00,Not yet got tics for Liverpool waiting till Fr...
1,550226827944660992,,-0.014891,51.354042,-0.014891,51.354042,31/12/2014 09:47:53,55872342,MissGee_Pee,If it doesn't make you happy by December 31st....,Croydon,,0,2014-12-31 09:47:53,2014-12-31 09:00:00,If it doesn t make you happy by December st le...
2,550226837537058816,,0.006698,51.53202,0.006698,51.532021,31/12/2014 09:47:55,147266450,vinita_ramtri,@westfieldstrat thanks; and are you open tomor...,Newham,,0,2014-12-31 09:47:55,2014-12-31 09:00:00,thanks and are you open tomorrow at all
3,550227214227505152,,0.214417,51.582535,0.214417,51.582535,31/12/2014 09:49:25,518928608,CallumGordon7,@WestHam_Central but he is,Havering,,0,2014-12-31 09:49:25,2014-12-31 09:00:00,but he is
4,550227602011856896,,-0.104869,51.50953,-0.104869,51.509529,31/12/2014 09:50:57,151094999,HevnKISZ,@JJJ_Jeeppy >> # XD,Southwark,,0,2014-12-31 09:50:57,2014-12-31 09:00:00,XD


In [10]:
# tweets.groupby("datetime_to_nearest_hour")["parsed_tweets"].apply(lambda s: "%s" % ' '.join(s))
# tweets_by_hour = pd.DataFrame({"message_text_for_hour" : tweets.groupby("datetime_to_nearest_hour")["parsed_tweets"].apply(lambda s: "%s" % ' '.join(s))}).reset_index()

In [10]:
# need to get pos tag
class StemmerTokenizer(object):

    def __init__(self):
        self.stemmer = PorterStemmer()
        
    def __call__(self, doc):
        is_noun = lambda pos: pos[:2] == 'NN'
        doc = " ".join(re.findall("[a-zA-Z]+", doc))
        tokenized = nltk.word_tokenize(doc)
        nouns = [word for (word, pos) in nltk.pos_tag(tokenized) if is_noun(pos)] 
        return [self.stemmer.stem(t) for t in word_tokenize(doc)]

In [11]:
idf_by_hourly_interval = {}

for hour_interval in tweets.datetime_to_nearest_hour.unique():
    clf = TfidfVectorizer(tokenizer=StemmerTokenizer(), ngram_range=(1, 3), stop_words=set(stopwords.words('english')))
    clf.fit(tweets[tweets["datetime_to_nearest_hour"] == hour_interval]["parsed_tweets"])
    word2idf = dict(zip(clf.get_feature_names(), clf.idf_))
    sorted_idf = sorted(word2idf.items(), key=lambda x: x[1], reverse=True)

    idf_by_hourly_interval[hour_interval] = {
        "hour_interval": hour_interval,
        "sorted_idf": sorted_idf
    }
    
    print("Completed", hour_interval)

print("Finished!")

Completed 2014-12-31T09:00:00.000000000
Completed 2014-12-31T10:00:00.000000000
Completed 2014-12-31T08:00:00.000000000
Completed 2014-12-31T06:00:00.000000000
Completed 2014-12-31T07:00:00.000000000
Completed 2014-12-31T04:00:00.000000000
Completed 2014-12-31T05:00:00.000000000
Completed 2014-12-31T14:00:00.000000000
Completed 2014-12-31T15:00:00.000000000
Completed 2014-12-31T11:00:00.000000000
Completed 2014-12-31T13:00:00.000000000
Completed 2014-12-31T12:00:00.000000000
Completed 2014-12-31T18:00:00.000000000
Completed 2014-12-31T19:00:00.000000000
Completed 2014-12-31T16:00:00.000000000
Completed 2014-12-31T17:00:00.000000000
Completed 2014-12-31T23:00:00.000000000
Completed 2015-01-01T00:00:00.000000000
Completed 2014-12-31T20:00:00.000000000
Completed 2014-12-31T22:00:00.000000000
Completed 2015-01-01T08:00:00.000000000
Completed 2015-01-01T09:00:00.000000000
Completed 2015-01-01T04:00:00.000000000
Completed 2015-01-01T05:00:00.000000000
Completed 2015-01-01T06:00:00.000000000


In [12]:
for interval in idf_by_hourly_interval.keys():
    print(interval)
    print(idf_by_hourly_interval[interval]["sorted_idf"][-5:])

2014-12-31T09:00:00.000000000
[('happi', 3.862553552419634), ('new year', 3.703125815491532), ('new', 3.503412515985708), ('thi', 3.4778792139805432), ('year', 3.181828891858246)]
2014-12-31T10:00:00.000000000
[('happi', 4.029398290247109), ('new year', 3.8507065015037334), ('thi', 3.7149049603446715), ('new', 3.64591208885772), ('year', 3.2184680740307807)]
2014-12-31T08:00:00.000000000
[('day', 3.8409524650491713), ('new year', 3.7508013680548737), ('thi', 3.7508013680548737), ('new', 3.5207849374529023), ('year', 3.300384872084443)]
2014-12-31T06:00:00.000000000
[('today', 3.878395566849449), ('new year', 3.627081138568543), ('new', 3.5217206229107165), ('thi', 3.472930458741285), ('year', 3.472930458741285)]
2014-12-31T07:00:00.000000000
[('work', 3.894478539848818), ('thi', 3.8581108956779433), ('new', 3.7245795030534206), ('london', 3.693807844386667), ('year', 3.5793974932089228)]
2014-12-31T04:00:00.000000000
[('like', 4.4094961844768505), ('sleep', 4.237645927550192), ('thi', 

In [13]:
idf_by_hourly_interval[np.datetime64('2015-01-01T21:00:00.000000000')]["sorted_idf"][-10:]

[('go', 4.488095834480246),
 ('kill', 4.488095834480246),
 ('die', 4.440467785490991),
 ('new', 4.289644895756409),
 ('like', 4.250424182603126),
 ('ronni', 4.032422030789336),
 ('wa', 3.934304174830812),
 ('year', 3.915785127063575),
 ('eastend', 3.7828272933879563),
 ('thi', 3.731935686091567)]

In [None]:
('eastend', 3.7828272933879563),
('ronni', 4.032422030789336),
('kill', 4.488095834480246),
('die', 4.440467785490991),
('emma', 4.735931998384828),
('luci', 4.767680696699408),
('phil', 5.023614070836609)
('nick', 5.539080074159101)
('denis', 5.110625447826238)
('dead', 5.292947004620193)

In [15]:
idf_by_hourly_interval[np.datetime64('2015-01-01T18:00:00.000000000')]["sorted_idf"][-10:]

[('wa', 4.331218802850445),
 ('spur', 4.324345923562682),
 ('harri', 4.317519958492283),
 ('happi', 4.284072024424743),
 ('new year', 4.109455514005193),
 ('new', 3.92345748248193),
 ('chelsea', 3.9007807738109004),
 ('kane', 3.8655348347492255),
 ('year', 3.7904996488063114),
 ('thi', 3.5179923743737658)]

In [37]:
idf_by_hourly_interval[np.datetime64('2015-01-01T17:00:00.000000000')]["sorted_idf"][-10:]

[('happi new', 4.403730402737885),
 ('get', 4.281128080645553),
 ('go', 4.281128080645553),
 ('london', 4.146395486675397),
 ('happi', 4.042717057200555),
 ('wa', 4.012864094050874),
 ('thi', 3.8950810583944904),
 ('new year', 3.8317687789801633),
 ('new', 3.7105832221779402),
 ('year', 3.453248306115451)]

In [41]:
idf_by_hourly_interval[np.datetime64('2015-01-01T18:00:00.000000000')]["sorted_idf"][-10:]

[('wa', 4.331218802850445),
 ('spur', 4.324345923562682),
 ('harri', 4.317519958492283),
 ('happi', 4.284072024424743),
 ('new year', 4.109455514005193),
 ('new', 3.92345748248193),
 ('chelsea', 3.9007807738109004),
 ('kane', 3.8655348347492255),
 ('year', 3.7904996488063114),
 ('thi', 3.5179923743737658)]

In [42]:
idf_by_hourly_interval[np.datetime64('2015-01-01T19:00:00.000000000')]["sorted_idf"][-100:]

[('wait', 5.802280832982721),
 ('ye', 5.802280832982721),
 ('could', 5.77242786983304),
 ('absolut', 5.743440332959787),
 ('believ', 5.743440332959787),
 ('chadli', 5.743440332959787),
 ('ever', 5.743440332959787),
 ('lane', 5.715269455993091),
 ('last night', 5.715269455993091),
 ('better', 5.687870481804977),
 ('home', 5.687870481804977),
 ('point', 5.687870481804977),
 ('pleas', 5.609908940335265),
 ('rt', 5.609908940335265),
 ('peopl', 5.585216327744893),
 ('player', 5.585216327744893),
 ('let', 5.5611187761658325),
 ('tonight', 5.5611187761658325),
 ('beat', 5.51459876053094),
 ('cahil', 5.51459876053094),
 ('score', 5.51459876053094),
 ('take', 5.448640792739143),
 ('much', 5.427587383541311),
 ('shit', 5.427587383541311),
 ('way', 5.427587383541311),
 ('hope', 5.386765389021055),
 ('na', 5.386765389021055),
 ('realli', 5.386765389021055),
 ('right', 5.386765389021055),
 ('best', 5.366962761724875),
 ('feel', 5.366962761724875),
 ('hi', 5.366962761724875),
 ('first', 5.3475446758

In [43]:
['arsen','goal','lampard','wenger','liverpool','team','chelsea','game','kane','spur','harri','tottenham','sunderland','player','leagu','footbal','arsen','mourinho','play','beat','cahil','score']

['arsen',
 'goal',
 'lampard',
 'wenger',
 'liverpool',
 'team',
 'chelsea',
 'game',
 'kane',
 'spur',
 'harri',
 'tottenham',
 'sunderland',
 'player',
 'leagu',
 'footbal',
 'arsen',
 'mourinho',
 'play',
 'beat',
 'cahil',
 'score']

In [45]:
idf_by_hourly_interval[np.datetime64('2014-12-31T19:00:00.000000000')]["sorted_idf"][-100:]

[('next', 5.541164856012179),
 ('still', 5.541164856012179),
 ('thing', 5.541164856012179),
 ('want', 5.541164856012179),
 ('de', 5.501944142858898),
 ('let', 5.501944142858898),
 ('need', 5.501944142858898),
 ('readi', 5.501944142858898),
 ('think', 5.501944142858898),
 ('ever', 5.464203814876051),
 ('us', 5.464203814876051),
 ('friend', 5.427836170705175),
 ('gon', 5.427836170705175),
 ('gon na', 5.427836170705175),
 ('good luck run', 5.427836170705175),
 ('hi', 5.427836170705175),
 ('luck run', 5.427836170705175),
 ('veri', 5.427836170705175),
 ('fuck', 5.392744850893906),
 ('last', 5.392744850893906),
 ('start', 5.392744850893906),
 ('day', 5.358843299218225),
 ('greater', 5.358843299218225),
 ('greater london', 5.358843299218225),
 ('way', 5.358843299218225),
 ('work', 5.358843299218225),
 ('would', 5.358843299218225),
 ('follow', 5.326053476395233),
 ('forev matter', 5.326053476395233),
 ('forev matter wat', 5.326053476395233),
 ('good luck', 5.326053476395233),
 ('keep thi', 5.3