In [1]:
import datetime
import math
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import *
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import string

In [2]:
tweets = pd.read_csv("./data/london_tweets.csv", encoding="latin1")

In [3]:
tweets.head()

Unnamed: 0,id,Name,X,Y,LONGITUDE,LATITUDE,MESSAGEDATE,USERID,USERSCREENNAME,MESSAGETEXT,LOCATION,HASHTAGS,ISRETWEET,datetime
0,550226818624942080,,-0.26215,51.391407,-0.26215,51.391407,31/12/2014 09:47:50,113918054,12Elbestreet,@julieo25 Not yet got tics for Liverpool waiti...,Kingston upon Thames,,0,2014-12-31 09:47:50
1,550226827944660992,,-0.014891,51.354042,-0.014891,51.354042,31/12/2014 09:47:53,55872342,MissGee_Pee,If it doesn't make you happy by December 31st....,Croydon,,0,2014-12-31 09:47:53
2,550226837537058816,,0.006698,51.53202,0.006698,51.532021,31/12/2014 09:47:55,147266450,vinita_ramtri,@westfieldstrat thanks; and are you open tomor...,Newham,,0,2014-12-31 09:47:55
3,550227214227505152,,0.214417,51.582535,0.214417,51.582535,31/12/2014 09:49:25,518928608,CallumGordon7,@WestHam_Central but he is,Havering,,0,2014-12-31 09:49:25
4,550227602011856896,,-0.104869,51.50953,-0.104869,51.509529,31/12/2014 09:50:57,151094999,HevnKISZ,@JJJ_Jeeppy >> # XD,Southwark,,0,2014-12-31 09:50:57


In [3]:
borough_to_cluster = pd.read_csv("./data/borough_to_cluster.csv")

In [4]:
tweets = tweets.merge(borough_to_cluster, left_on='LOCATION', right_on='LOCATION', how='outer')

Remove non-plaintext elements from tweets

In [5]:
def parse_tweets(tweet):

    tweet = re.sub('@[^\s]+','',tweet).strip()
    tweet = re.sub('#[^\s]+','',tweet).strip()
    tweet = re.sub(r'https?:\/\/.*[\r\n]*','',tweet).strip()
    tweet = " ".join(re.findall("[a-zA-Z]+", tweet))
    
    return tweet

In [6]:
tweets["parsed_tweets"] = tweets.MESSAGETEXT.apply(lambda tweet: parse_tweets(tweet))

In [7]:
tweets.head()

Unnamed: 0,id,Name,X,Y,LONGITUDE,LATITUDE,MESSAGEDATE,USERID,USERSCREENNAME,MESSAGETEXT,LOCATION,HASHTAGS,ISRETWEET,datetime,Cluster,parsed_tweets
0,550226818624942080,,-0.26215,51.391407,-0.26215,51.391407,31/12/2014 09:47:50,113918054,12Elbestreet,@julieo25 Not yet got tics for Liverpool waiti...,Kingston upon Thames,,0,2014-12-31 09:47:50,1,Not yet got tics for Liverpool waiting till Fr...
1,550228733014331392,,-0.305024,51.427956,-0.305024,51.427956,31/12/2014 09:55:27,275233035,2bcmel,@pjbish hey dude from bude; U 2! Big hugs to t...,Kingston upon Thames,,0,2014-12-31 09:55:27,1,hey dude from bude U Big hugs to the family an...
2,550212608746606594,,-0.261681,51.39176,-0.261681,51.391762,31/12/2014 08:51:23,113918054,12Elbestreet,One of the greatest singers of all time good m...,Kingston upon Thames,,0,2014-12-31 08:51:23,1,One of the greatest singers of all time good m...
3,550218811904114688,,-0.298933,51.42021,-0.298933,51.420212,31/12/2014 09:16:01,2463405963,mrsclaireismail,@theshores124 We were planning to go down to B...,Kingston upon Thames,,0,2014-12-31 09:16:01,1,We were planning to go down to Bournemouth Fri...
4,550225718060519424,,-0.293391,51.416702,-0.293391,51.416702,31/12/2014 09:43:28,450719269,manda10110,@joemcelderry91 Wishing you Health; Love and ...,Kingston upon Thames,,0,2014-12-31 09:43:28,1,Wishing you Health Love and Laughter for and s...


In [7]:
# tweets.groupby("datetime_to_nearest_hour")["parsed_tweets"].apply(lambda s: "%s" % ' '.join(s))
# tweets_by_hour = pd.DataFrame({"message_text_for_hour" : tweets.groupby("datetime_to_nearest_hour")["parsed_tweets"].apply(lambda s: "%s" % ' '.join(s))}).reset_index()

In [8]:
# need to get pos tag
class StemmerTokenizer(object):

    def __init__(self):
        self.stemmer = PorterStemmer()
        
    def __call__(self, doc):
        is_noun = lambda pos: pos[:2] == 'NN'
        doc = " ".join(re.findall("[a-zA-Z]+", doc))
        tokenized = nltk.word_tokenize(doc)
        nouns = [word for (word, pos) in nltk.pos_tag(tokenized) if is_noun(pos)] 
        return [self.stemmer.stem(t) for t in word_tokenize(doc)]

In [19]:
idf_by_cluster = {}

for cluster in tweets.Cluster.unique():
    clf = TfidfVectorizer(tokenizer=StemmerTokenizer(), ngram_range=(1, 3), stop_words=set(stopwords.words('english')))
    try:
        clf.fit(tweets[tweets["Cluster"] == cluster]["parsed_tweets"])
        word2idf = dict(zip(clf.get_feature_names(), clf.idf_))
        sorted_idf = sorted(word2idf.items(), key=lambda x: x[1], reverse=True)
    except ValueError:
        sorted_idf = []

    idf_by_cluster[cluster] = {
        "cluster": cluster,
        "sorted_idf": sorted_idf
    }
    
    print("Completed", cluster)

print("Finished!")

Completed 1
Completed 2
Completed 3
Completed 4
Completed 0
Finished!


In [9]:
idf_by_borough = {}

for borough in tweets.LOCATION.unique():
    clf = TfidfVectorizer(tokenizer=StemmerTokenizer(), ngram_range=(1, 3), stop_words=set(stopwords.words('english')))
    try:
        clf.fit(tweets[tweets["LOCATION"] == borough]["parsed_tweets"])
        word2idf = dict(zip(clf.get_feature_names(), clf.idf_))
        sorted_idf = sorted(word2idf.items(), key=lambda x: x[1], reverse=True)
    except ValueError:
        sorted_idf = []

    idf_by_borough[borough] = {
        "borough": borough,
        "sorted_idf": sorted_idf
    }
    
    print("Completed", borough)

print("Finished!")

Completed Kingston upon Thames
Completed Croydon
Completed Newham
Completed Havering
Completed Southwark
Completed Hackney
Completed Wandsworth
Completed Westminster
Completed Bexley
Completed Kensington and Chelsea
Completed Islington
Completed Hillingdon
Completed Camden
Completed Barnet
Completed Hounslow
Completed City of London
Completed Ealing
Completed Lambeth
Completed Waltham Forest
Completed Tower Hamlets
Completed Harrow
Completed Bromley
Completed Redbridge
Completed Brent
Completed Greenwich
Completed Richmond upon Thames
Completed Enfield
Completed Sutton
Completed Lewisham
Completed Hammersmith and Fulham
Completed Barking and Dagenham
Completed Haringey
Completed Merton
Finished!


In [21]:
df = pd.DataFrame(columns = ["Borough", "1st N-gram", "2nd N-gram", "3rd N-gram", "4th N-gram", "5th N-gram"])

In [22]:
for borough in idf_by_borough.keys():
    df["Borough"] = borough
    df["1st N-gram"] = idf_by_borough[borough]["sorted_idf"][-1][0]
    df["2nd N-gram"] = idf_by_borough[borough]["sorted_idf"][-2][0]
    df["3rd N-gram"] = idf_by_borough[borough]["sorted_idf"][-3][0]
    df["4th N-gram"] = idf_by_borough[borough]["sorted_idf"][-4][0]
    df["5th N-gram"] = idf_by_borough[borough]["sorted_idf"][-5][0]

In [23]:
df.head()

Unnamed: 0,Borough,1st N-gram,2nd N-gram,3rd N-gram,4th N-gram,5th N-gram


In [25]:
for borough in idf_by_borough.keys():
    print(borough)
    print(idf_by_borough[borough]["sorted_idf"][-5:])

Kingston upon Thames
[('happi new', 3.3971791968637266), ('happi', 3.123372470905563), ('new year', 3.0595827330302328), ('new', 2.9430489167742815), ('year', 2.7775344782967077)]
Croydon
[('thi', 3.761498035545623), ('happi', 3.732510498672371), ('new year', 3.5069209616299006), ('new', 3.3644717960833233), ('year', 3.1245211264527324)]
Newham
[('happi new', 3.635650171293644), ('new year', 3.495728980570351), ('happi', 3.4250390361262424), ('new', 3.2561605495887407), ('year', 3.1204385755843993)]
Havering
[('happi', 3.5690435289941997), ('thi', 3.5601939137172174), ('new year', 3.4798123952662574), ('new', 3.3978952727983707), ('year', 3.074347287158093)]
Southwark
[('happi new', 3.2311011599111614), ('happi', 3.096727425526879), ('new year', 2.947359424671304), ('new', 2.8384519075748944), ('year', 2.7037830740694457)]
Hackney
[('happi new', 3.866274206317104), ('happi', 3.650299144481507), ('new year', 3.389042299839263), ('new', 3.159823060130796), ('year', 3.0835148670674717)]
W

In [37]:
for cluster in idf_by_cluster.keys():
    print(cluster)
    print(idf_by_cluster[cluster]["sorted_idf"][-100:])

1
[('someon', 5.974339144231222), ('goal', 5.944486181081541), ('miss', 5.93149898555473), ('parti', 5.9250680952244394), ('de', 5.915498644208289), ('veri', 5.906019900253745), ('birthday', 5.893519737489513), ('alway', 5.8781111121366685), ('actual', 5.853940751208856), ('ever', 5.853940751208856), ('chelsea', 5.795953493558506), ('guy', 5.7931405521818915), ('friend', 5.7736684487690715), ('girl', 5.730535077951496), ('next', 5.72790003031349), ('home', 5.712233913569091), ('mate', 5.699363723048556), ('wish', 5.699363723048556), ('right', 5.679109866144058), ('never', 5.6568041086297605), ('game', 5.64462369007289), ('play', 5.63498506123512), ('nye', 5.627816571756508), ('realli', 5.625438449351541), ('oh', 5.604286074346314), ('wait', 5.604286074346314), ('take', 5.599645694789812), ('gon na', 5.592725251945238), ('life', 5.585852372657476), ('us', 5.576761400956224), ('gon', 5.565512688420354), ('first', 5.563278050618937), ('rt', 5.543387892139718), ('thing', 5.543387892139718)

In [24]:
tweets[(tweets.Cluster == 0)].shape

(15264, 16)

In [32]:
relevant = tweets[(tweets.Cluster == 0) & (tweets.datetime > "2015-01-01 18:00:00") & (tweets.datetime < "2015-01-01 20:00:00")]

In [35]:
for idx, rows in relevant.iterrows():
    print(rows.parsed_tweets)

Harry Kane is a god
arsene leaves subs to late aswell to long to make changes wen we need to
KANEEEEEEE
What a goal Harry Kane
Incredible
Some senior Cong leaders uneasy abt hijack of party by US as successor to British colonialism overruled told Modi threatens its survival
Celebrating New Years in Style with my donny chandz Maldives
thank you for the favourite Happy new year sir
thank you for the RT Happy new year
it still feels like it is lol happy New Years Woop Woop lol
thank you for the RT and favourite Happy new year
awwh thank you for your kind words and support I hope you have a prosperous year as well You deserve it
Tottenham scummy trampy cunts fuck off with ur Europa league players how is this fucking real Chelsea are a shambles
And we all thought Chelsea would go unbeaten
good to hear happy new year my lovely
This match is hurting my fantasy squad the rest been good but nah
because you re a Wycombe fan
US agencies church fronts lost confidence on Cong ability to act after m

Es gibt schlechtere Orte um das Jahr zu begr en zur Halbzeit
hi twitter
Too much
How is it a troll tweet because I don t approve of it
God said it s wrong to fornicate be you teaching kids to use protection this is what they will do
From when my brother said putting on a condom is like putting a sock on your leg I thought enough is enough I m speaking to his teacher
Erm lol
I swear these rent boys are better than this wtf
Pizza in the oven and time to watch a film
Just hope we can keep it up We re usually better in the second half so anything could happen
Tonight the last ever episode of Miranda YEEESSS Please please please don t repeat it Yes I know I can change the channel
At the ally pally
HE S ONE OF OUR OWN
Ain t that pretty
Can Chelsea just score more goal please
Bonne annive ga
Terrible embarrassing
highlight of
Matee
Big shock potentially here But Spurs no way deserve to be winning
damn we all thought Chelsea would walk the prem
Londra merkez kafas na g re herkesssss
Yes Signs 

In [36]:
relevant.shape

(1845, 16)

In [40]:
tweets[tweets.parsed_tweets.str.contains("oxford")]

Unnamed: 0,id,Name,X,Y,LONGITUDE,LATITUDE,MESSAGEDATE,USERID,USERSCREENNAME,MESSAGETEXT,LOCATION,HASHTAGS,ISRETWEET,datetime,parsed_tweets,Cluster
5793,550273198638776320,,-0.003368,51.541485,-0.003368,51.541485,31/12/2014 12:52:08,1363118065,siobhanese,oxford circus station is basically the 9th cir...,Newham,,0,2014-12-31 12:52:08,oxford circus station is basically the th circ...,1
24323,550365102793588736,,-0.188181,51.519054,-0.188181,51.519054,31/12/2014 18:57:20,500568545,DahafLa,Off to oxford st. #NewYearsEve2014,Westminster,NewYearsEve2014,0,2014-12-31 18:57:20,Off to oxford st,3
24381,550317080000872450,,-0.143723,51.51677,-0.143723,51.516769,31/12/2014 15:46:30,159003477,crystaltantan,Happy New Year everyone!!!!! May 2015 be good ...,Westminster,,0,2014-12-31 15:46:30,Happy New Year everyone May be good to ALL OF ...,3
24519,550265533623721984,,-0.146461,51.51475,-0.146461,51.514751,31/12/2014 12:21:41,233219054,NqbZhrm,omnomnomoxford. #feelgood #QuibTravels #london...,Westminster,feelgood QuibTravels london thisislondon ootdm...,0,2014-12-31 12:21:41,omnomnomoxford,3
31258,550514100887748608,,-0.200736,51.506454,-0.200736,51.506454,01/01/2015 04:49:24,478402262,alexalqausar,@HollieBarberx well long celebration huh; oxfo...,Kensington and Chelsea,,0,2015-01-01 04:49:24,well long celebration huh oxford st,2
58493,550826231273635841,,0.007019,51.402733,0.007019,51.402733,02/01/2015 01:29:42,1056357588,kathrynlucy,@GemStafford it's gorgeous I was going to s...,Lambeth,,0,2015-02-01 01:29:42,it s gorgeous I was going to say def didn t se...,2
64508,550742197658984448,,-0.023569,51.50206,-0.023569,51.50206,01/01/2015 19:55:46,104704999,pablocasts,oxford st ahora http://t.co/1djokzYwh4,Tower Hamlets,,0,2015-01-01 19:55:46,oxford st ahora,2


In [41]:
tweets.shape

(97778, 16)

In [42]:
tweets.LONGITUDE.unique().shape

(75078,)

In [16]:
tweets[(tweets.parsed_tweets.str.contains("rt", regex=False)) & (tweets.LOCATION == "Barking and Dagenham")]

Unnamed: 0,id,Name,X,Y,LONGITUDE,LATITUDE,MESSAGEDATE,USERID,USERSCREENNAME,MESSAGETEXT,LOCATION,HASHTAGS,ISRETWEET,datetime,Cluster,parsed_tweets
90352,550225072255152128,,0.144982,51.530544,0.144982,51.530544,31/12/2014 09:40:54,195715912,blatter77,@Ayourb: Unfortunately I can't get through to ...,Barking and Dagenham,,0,2014-12-31 09:40:54,1,Unfortunately I can t get through to contribut...
90382,550228526289678337,,0.073499,51.541374,0.073499,51.541374,31/12/2014 09:54:38,243312543,Sethski1,@Cameron_Overend happy birthday man,Barking and Dagenham,,0,2014-12-31 09:54:38,1,happy birthday man
90387,550226192339853313,,0.141035,51.547894,0.141035,51.547894,31/12/2014 09:45:21,156073950,Skibobo,Just look at RT @Amara_fob: Might just fart @...,Barking and Dagenham,IfWeDate,0,2014-12-31 09:45:21,1,Just look at RT Might just fart
90390,550220014935367680,,0.141066,51.547910,0.141066,51.547909,31/12/2014 09:20:48,156073950,Skibobo,Hmmmmm RT @swaynkaayyy: Best party you went to...,Barking and Dagenham,,0,2014-12-31 09:20:48,1,Hmmmmm RT Best party you went to this year
90395,550229270019444736,,0.141065,51.547910,0.141065,51.547909,31/12/2014 09:57:35,156073950,Skibobo,"Should be a white top RT @Whodeeny_: The ""danc...",Barking and Dagenham,,0,2014-12-31 09:57:35,1,Should be a white top RT The dance on every ni...
90399,550220462072934400,,0.141066,51.547910,0.141066,51.547909,31/12/2014 09:22:35,156073950,Skibobo,"Jozi RT @Gunslow: ""@swaynkaayyy: ...",Barking and Dagenham,,0,2014-12-31 09:22:35,1,Jozi RT Best party you went to this year
90459,550192349775949826,,0.144965,51.530530,0.144965,51.530529,31/12/2014 07:30:52,195715912,blatter77,@ogundamisi:If GMB & Sambo submitted certain c...,Barking and Dagenham,,0,2014-12-31 07:30:52,1,GMB Sambo submitted certain certificates yrs a...
90467,550205297466556416,,0.143453,51.531900,0.143453,51.531898,31/12/2014 08:22:19,195715912,blatter77,@ironmandas:For me; It is sheer incompetence f...,Barking and Dagenham,,0,2014-12-31 08:22:19,1,me It is sheer incompetence for asking for cer...
90469,550172270376009729,,0.111597,51.550777,0.111597,51.550777,31/12/2014 06:11:05,514628605,MonicaAmaox,It's actually on site for certain man when me ...,Barking and Dagenham,,0,2014-12-31 06:11:05,1,It s actually on site for certain man when me ...
90470,550191486835650561,,0.142933,51.532215,0.142933,51.532215,31/12/2014 07:27:27,195715912,blatter77,@ogundamisi: Why is no one asking @inecnigeria...,Barking and Dagenham,,0,2014-12-31 07:27:27,1,Why is no one asking about the certificates su...
