In [1]:
import datetime
import math
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import *
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import string

In [2]:
tweets = pd.read_csv("./data/london_tweets.csv", encoding="latin1")

In [3]:
tweets.head()

Unnamed: 0,id,Name,X,Y,LONGITUDE,LATITUDE,MESSAGEDATE,USERID,USERSCREENNAME,MESSAGETEXT,LOCATION,HASHTAGS,ISRETWEET,datetime
0,550226818624942080,,-0.26215,51.391407,-0.26215,51.391407,31/12/2014 09:47:50,113918054,12Elbestreet,@julieo25 Not yet got tics for Liverpool waiti...,Kingston upon Thames,,0,2014-12-31 09:47:50
1,550226827944660992,,-0.014891,51.354042,-0.014891,51.354042,31/12/2014 09:47:53,55872342,MissGee_Pee,If it doesn't make you happy by December 31st....,Croydon,,0,2014-12-31 09:47:53
2,550226837537058816,,0.006698,51.53202,0.006698,51.532021,31/12/2014 09:47:55,147266450,vinita_ramtri,@westfieldstrat thanks; and are you open tomor...,Newham,,0,2014-12-31 09:47:55
3,550227214227505152,,0.214417,51.582535,0.214417,51.582535,31/12/2014 09:49:25,518928608,CallumGordon7,@WestHam_Central but he is,Havering,,0,2014-12-31 09:49:25
4,550227602011856896,,-0.104869,51.50953,-0.104869,51.509529,31/12/2014 09:50:57,151094999,HevnKISZ,@JJJ_Jeeppy >> # XD,Southwark,,0,2014-12-31 09:50:57


Remove non-plaintext elements from tweets

In [4]:
def parse_tweets(tweet):

    tweet = re.sub('@[^\s]+','',tweet).strip()
    tweet = re.sub('#[^\s]+','',tweet).strip()
    tweet = re.sub(r'https?:\/\/.*[\r\n]*','',tweet).strip()
    tweet = " ".join(re.findall("[a-zA-Z]+", tweet))
    
    return tweet

In [5]:
tweets["parsed_tweets"] = tweets.MESSAGETEXT.apply(lambda tweet: parse_tweets(tweet))

In [6]:
tweets.head()

Unnamed: 0,id,Name,X,Y,LONGITUDE,LATITUDE,MESSAGEDATE,USERID,USERSCREENNAME,MESSAGETEXT,LOCATION,HASHTAGS,ISRETWEET,datetime,parsed_tweets
0,550226818624942080,,-0.26215,51.391407,-0.26215,51.391407,31/12/2014 09:47:50,113918054,12Elbestreet,@julieo25 Not yet got tics for Liverpool waiti...,Kingston upon Thames,,0,2014-12-31 09:47:50,Not yet got tics for Liverpool waiting till Fr...
1,550226827944660992,,-0.014891,51.354042,-0.014891,51.354042,31/12/2014 09:47:53,55872342,MissGee_Pee,If it doesn't make you happy by December 31st....,Croydon,,0,2014-12-31 09:47:53,If it doesn t make you happy by December st le...
2,550226837537058816,,0.006698,51.53202,0.006698,51.532021,31/12/2014 09:47:55,147266450,vinita_ramtri,@westfieldstrat thanks; and are you open tomor...,Newham,,0,2014-12-31 09:47:55,thanks and are you open tomorrow at all
3,550227214227505152,,0.214417,51.582535,0.214417,51.582535,31/12/2014 09:49:25,518928608,CallumGordon7,@WestHam_Central but he is,Havering,,0,2014-12-31 09:49:25,but he is
4,550227602011856896,,-0.104869,51.50953,-0.104869,51.509529,31/12/2014 09:50:57,151094999,HevnKISZ,@JJJ_Jeeppy >> # XD,Southwark,,0,2014-12-31 09:50:57,XD


In [7]:
# tweets.groupby("datetime_to_nearest_hour")["parsed_tweets"].apply(lambda s: "%s" % ' '.join(s))
# tweets_by_hour = pd.DataFrame({"message_text_for_hour" : tweets.groupby("datetime_to_nearest_hour")["parsed_tweets"].apply(lambda s: "%s" % ' '.join(s))}).reset_index()

In [8]:
# need to get pos tag
class StemmerTokenizer(object):

    def __init__(self):
        self.stemmer = PorterStemmer()
        
    def __call__(self, doc):
        is_noun = lambda pos: pos[:2] == 'NN'
        doc = " ".join(re.findall("[a-zA-Z]+", doc))
        tokenized = nltk.word_tokenize(doc)
        nouns = [word for (word, pos) in nltk.pos_tag(tokenized) if is_noun(pos)] 
        return [self.stemmer.stem(t) for t in word_tokenize(doc)]

In [9]:
idf_by_borough = {}

for borough in tweets.LOCATION.unique():
    clf = TfidfVectorizer(tokenizer=StemmerTokenizer(), ngram_range=(1, 3), stop_words=set(stopwords.words('english')))
    try:
        clf.fit(tweets[tweets["LOCATION"] == borough]["parsed_tweets"])
        word2idf = dict(zip(clf.get_feature_names(), clf.idf_))
        sorted_idf = sorted(word2idf.items(), key=lambda x: x[1], reverse=True)
    except ValueError:
        sorted_idf = []

    idf_by_borough[borough] = {
        "borough": borough,
        "sorted_idf": sorted_idf
    }
    
    print("Completed", borough)

print("Finished!")

Completed Kingston upon Thames
Completed Croydon
Completed Newham
Completed Havering
Completed Southwark
Completed Hackney
Completed Wandsworth
Completed Westminster
Completed Bexley
Completed Kensington and Chelsea
Completed Islington
Completed Hillingdon
Completed Camden
Completed Barnet
Completed Hounslow
Completed City of London
Completed Ealing
Completed Lambeth
Completed Waltham Forest
Completed Tower Hamlets
Completed Harrow
Completed Bromley
Completed Redbridge
Completed Brent
Completed Greenwich
Completed Richmond upon Thames
Completed Enfield
Completed Sutton
Completed Lewisham
Completed Hammersmith and Fulham
Completed Barking and Dagenham
Completed Haringey
Completed Merton
Finished!


In [11]:
for borough in idf_by_borough.keys():
    print(borough)
    print(idf_by_borough[borough]["sorted_idf"][-10:])

Kingston upon Thames
[('ha', 4.180923272775899), ('wa', 4.163828839416599), ('love', 4.06697901342668), ('thi', 3.708766790169062), ('happi new year', 3.4210847177172807), ('happi new', 3.3971791968637266), ('happi', 3.123372470905563), ('new year', 3.0595827330302328), ('new', 2.9430489167742815), ('year', 2.7775344782967077)]
Croydon
[('love', 4.263955409641097), ('go', 4.257034966796523), ('get', 4.152364344232635), ('happi new year', 4.0242825563756774), ('happi new', 4.0134128841387735), ('thi', 3.761498035545623), ('happi', 3.732510498672371), ('new year', 3.5069209616299006), ('new', 3.3644717960833233), ('year', 3.1245211264527324)]
Newham
[('one', 4.400934880020013), ('wa', 4.369924643277452), ('u', 4.08033599255509), ('thi', 3.9418170584195282), ('happi new year', 3.8514944201262424), ('happi new', 3.635650171293644), ('new year', 3.495728980570351), ('happi', 3.4250390361262424), ('new', 3.2561605495887407), ('year', 3.1204385755843993)]
Havering
[('get', 4.231553740092256),

In [37]:
for borough in idf_by_borough.keys():
    print(idf_by_borough[borough]["sorted_idf"][1])

('aadam said', 7.548219102762372)
('aa gracia', 8.540621528657152)
('aa mii', 8.55197478591866)
('aaaaaannnnd sleep', 8.291996750638674)
('aah', 8.636028261517456)
('aaaaaaaaaaaaaaooo dalston', 8.584773077612198)
('aaah mai', 8.523751400353667)
('aaaaaw thank', 8.526448273617135)
('aaaaah want', 8.242440205029254)
('aa angol', 8.178164022763461)
('aaron ramsey', 8.39018142822643)
('aa loung', 8.648024566610818)
('aaaaaaah ai', 8.639642287858013)
('aaaaahahahahahaha putangina', 8.671127904431579)
('aaaaa wait', 7.673929879321067)
('aaaargh x', 7.275703197782002)
('aasra veri', 8.07791990550546)
('aa white', 8.786967002614873)
('aa gya', 8.169350016670599)
('ab', 8.629246932551226)
('aamir clap woo', 7.915723448631314)
('aaaaa made', 8.503013958722956)
('aa nikla', 7.987951820976414)
('aa ke', 8.164720378771857)
('aayyoo hurri', 8.324160327128636)
('aan op', 7.837869151517971)
('aaron lennon', 8.496652438168283)
('aath gov', 7.643138479461352)
('aaaaaand action', 8.071997722376185)
('aa 