In [1]:
import datetime
import math
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import *
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import string

In [2]:
borough_to_cluster = pd.read_csv("./data/borough_to_cluster.csv")

In [4]:
borough_to_cluster.head()

Unnamed: 0,LOCATION,Cluster
0,Barking and Dagenham,1
1,Barnet,1
2,Bexley,1
3,Brent,2
4,Bromley,1


In [3]:
tweets = pd.read_csv("./data/london_tweets.csv", encoding="latin1")

In [5]:
tweets.head()

Unnamed: 0,id,Name,X,Y,LONGITUDE,LATITUDE,MESSAGEDATE,USERID,USERSCREENNAME,MESSAGETEXT,LOCATION,HASHTAGS,ISRETWEET,datetime
0,550226818624942080,,-0.26215,51.391407,-0.26215,51.391407,31/12/2014 09:47:50,113918054,12Elbestreet,@julieo25 Not yet got tics for Liverpool waiti...,Kingston upon Thames,,0,2014-12-31 09:47:50
1,550226827944660992,,-0.014891,51.354042,-0.014891,51.354042,31/12/2014 09:47:53,55872342,MissGee_Pee,If it doesn't make you happy by December 31st....,Croydon,,0,2014-12-31 09:47:53
2,550226837537058816,,0.006698,51.53202,0.006698,51.532021,31/12/2014 09:47:55,147266450,vinita_ramtri,@westfieldstrat thanks; and are you open tomor...,Newham,,0,2014-12-31 09:47:55
3,550227214227505152,,0.214417,51.582535,0.214417,51.582535,31/12/2014 09:49:25,518928608,CallumGordon7,@WestHam_Central but he is,Havering,,0,2014-12-31 09:49:25
4,550227602011856896,,-0.104869,51.50953,-0.104869,51.509529,31/12/2014 09:50:57,151094999,HevnKISZ,@JJJ_Jeeppy >> # XD,Southwark,,0,2014-12-31 09:50:57


In [6]:
data = tweets.merge(borough_to_cluster, left_on='LOCATION', right_on='LOCATION', how='outer')

In [7]:
data.head()

Unnamed: 0,id,Name,X,Y,LONGITUDE,LATITUDE,MESSAGEDATE,USERID,USERSCREENNAME,MESSAGETEXT,LOCATION,HASHTAGS,ISRETWEET,datetime,Cluster
0,550226818624942080,,-0.26215,51.391407,-0.26215,51.391407,31/12/2014 09:47:50,113918054,12Elbestreet,@julieo25 Not yet got tics for Liverpool waiti...,Kingston upon Thames,,0,2014-12-31 09:47:50,1
1,550228733014331392,,-0.305024,51.427956,-0.305024,51.427956,31/12/2014 09:55:27,275233035,2bcmel,@pjbish hey dude from bude; U 2! Big hugs to t...,Kingston upon Thames,,0,2014-12-31 09:55:27,1
2,550212608746606594,,-0.261681,51.39176,-0.261681,51.391762,31/12/2014 08:51:23,113918054,12Elbestreet,One of the greatest singers of all time good m...,Kingston upon Thames,,0,2014-12-31 08:51:23,1
3,550218811904114688,,-0.298933,51.42021,-0.298933,51.420212,31/12/2014 09:16:01,2463405963,mrsclaireismail,@theshores124 We were planning to go down to B...,Kingston upon Thames,,0,2014-12-31 09:16:01,1
4,550225718060519424,,-0.293391,51.416702,-0.293391,51.416702,31/12/2014 09:43:28,450719269,manda10110,@joemcelderry91 Wishing you Health; Love and ...,Kingston upon Thames,,0,2014-12-31 09:43:28,1


In [8]:
def parse_tweets(tweet):

    tweet = re.sub('@[^\s]+','',tweet).strip()
    tweet = re.sub('#[^\s]+','',tweet).strip()
    tweet = re.sub(r'https?:\/\/.*[\r\n]*','',tweet).strip()
    tweet = " ".join(re.findall("[a-zA-Z]+", tweet))
    
    return tweet

In [9]:
data["parsed_tweets"] = data.MESSAGETEXT.apply(lambda tweet: parse_tweets(tweet))

In [18]:
# need to get pos tag
class StemmerTokenizer(object):

    def __init__(self):
        self.stemmer = PorterStemmer()
        
    def __call__(self, doc):
        doc = doc.lower()
        doc = " ".join(re.findall("[a-zA-Z]+", doc))
        tokenized = nltk.word_tokenize(doc)
        nouns = [word for (word, pos) in nltk.pos_tag(tokenized)] 
        return [self.stemmer.stem(t) for t in word_tokenize(doc)]

In [19]:
stemmer = PorterStemmer()

def get_stemmed_tokens(doc):
    is_noun = lambda pos: pos[:2] == 'NN'
    doc = " ".join(re.findall("[a-zA-Z]+", doc))
    tokenized = nltk.word_tokenize(doc)
    nouns = [word for (word, pos) in nltk.pos_tag(tokenized) if is_noun(pos)] 
    return [stemmer.stem(t) for t in word_tokenize(doc)]

In [20]:
data["stemmed_tokens"] = data.parsed_tweets.apply(lambda tweet: get_stemmed_tokens(tweet))

In [30]:
data.head(15)

Unnamed: 0,id,Name,X,Y,LONGITUDE,LATITUDE,MESSAGEDATE,USERID,USERSCREENNAME,MESSAGETEXT,LOCATION,HASHTAGS,ISRETWEET,datetime,Cluster,parsed_tweets,stemmed_tokens,is_new_years_tweet
0,550226818624942080,,-0.26215,51.391407,-0.26215,51.391407,31/12/2014 09:47:50,113918054,12Elbestreet,@julieo25 Not yet got tics for Liverpool waiti...,Kingston upon Thames,,0,2014-12-31 09:47:50,1,Not yet got tics for Liverpool waiting till Fr...,"[not, yet, got, tic, for, liverpool, wait, til...",0
1,550228733014331392,,-0.305024,51.427956,-0.305024,51.427956,31/12/2014 09:55:27,275233035,2bcmel,@pjbish hey dude from bude; U 2! Big hugs to t...,Kingston upon Thames,,0,2014-12-31 09:55:27,1,hey dude from bude U Big hugs to the family an...,"[hey, dude, from, bude, U, big, hug, to, the, ...",0
2,550212608746606594,,-0.261681,51.39176,-0.261681,51.391762,31/12/2014 08:51:23,113918054,12Elbestreet,One of the greatest singers of all time good m...,Kingston upon Thames,,0,2014-12-31 08:51:23,1,One of the greatest singers of all time good m...,"[one, of, the, greatest, singer, of, all, time...",0
3,550218811904114688,,-0.298933,51.42021,-0.298933,51.420212,31/12/2014 09:16:01,2463405963,mrsclaireismail,@theshores124 We were planning to go down to B...,Kingston upon Thames,,0,2014-12-31 09:16:01,1,We were planning to go down to Bournemouth Fri...,"[We, were, plan, to, go, down, to, bournemouth...",0
4,550225718060519424,,-0.293391,51.416702,-0.293391,51.416702,31/12/2014 09:43:28,450719269,manda10110,@joemcelderry91 Wishing you Health; Love and ...,Kingston upon Thames,,0,2014-12-31 09:43:28,1,Wishing you Health Love and Laughter for and s...,"[wish, you, health, love, and, laughter, for, ...",0
5,550226427069865984,,-0.261647,51.391495,-0.261647,51.391495,31/12/2014 09:46:17,113918054,12Elbestreet,@julieo25 the secret is safe with me good job ...,Kingston upon Thames,,0,2014-12-31 09:46:17,1,the secret is safe with me good job he is not ...,"[the, secret, is, safe, with, me, good, job, h...",0
6,550232427348897792,,-0.293364,51.41669,-0.293364,51.416691,31/12/2014 10:10:08,450719269,manda10110,@joemcelderry91 We'll have to do it again then...,Kingston upon Thames,EvolutionTour,0,2014-12-31 10:10:08,1,We ll have to do it again then won t we,"[We, ll, have, to, do, it, again, then, won, t...",0
7,550232636313325568,,-0.27239,51.365818,-0.27239,51.365818,31/12/2014 10:10:57,37974717,MaybeBeth,@alex26stallard You want the puppy; you deal w...,Kingston upon Thames,,0,2014-12-31 10:10:57,1,You want the puppy you deal with the poop,"[you, want, the, puppi, you, deal, with, the, ...",0
8,550219313672900608,,-0.295426,51.37761,-0.295426,51.377609,31/12/2014 09:18:01,483244499,martinpowell4,@sammyp333 Will be fine thanks,Kingston upon Thames,,0,2014-12-31 09:18:01,1,Will be fine thanks,"[will, be, fine, thank]",0
9,550220685797515265,,-0.297143,51.416954,-0.297143,51.416954,31/12/2014 09:23:28,391440046,jessssicatz,mama knows http://t.co/dXXQ2NRrpd,Kingston upon Thames,,0,2014-12-31 09:23:28,1,mama knows,"[mama, know]",0


In [21]:
target_tokens = ["happi", "new", "year"]

In [31]:
data["is_new_years_tweet"] = data.stemmed_tokens.apply(lambda tokens: any(x in tokens for x in target_tokens))

In [32]:
data.head()

Unnamed: 0,id,Name,X,Y,LONGITUDE,LATITUDE,MESSAGEDATE,USERID,USERSCREENNAME,MESSAGETEXT,LOCATION,HASHTAGS,ISRETWEET,datetime,Cluster,parsed_tweets,stemmed_tokens,is_new_years_tweet
0,550226818624942080,,-0.26215,51.391407,-0.26215,51.391407,31/12/2014 09:47:50,113918054,12Elbestreet,@julieo25 Not yet got tics for Liverpool waiti...,Kingston upon Thames,,0,2014-12-31 09:47:50,1,Not yet got tics for Liverpool waiting till Fr...,"[not, yet, got, tic, for, liverpool, wait, til...",False
1,550228733014331392,,-0.305024,51.427956,-0.305024,51.427956,31/12/2014 09:55:27,275233035,2bcmel,@pjbish hey dude from bude; U 2! Big hugs to t...,Kingston upon Thames,,0,2014-12-31 09:55:27,1,hey dude from bude U Big hugs to the family an...,"[hey, dude, from, bude, U, big, hug, to, the, ...",False
2,550212608746606594,,-0.261681,51.39176,-0.261681,51.391762,31/12/2014 08:51:23,113918054,12Elbestreet,One of the greatest singers of all time good m...,Kingston upon Thames,,0,2014-12-31 08:51:23,1,One of the greatest singers of all time good m...,"[one, of, the, greatest, singer, of, all, time...",False
3,550218811904114688,,-0.298933,51.42021,-0.298933,51.420212,31/12/2014 09:16:01,2463405963,mrsclaireismail,@theshores124 We were planning to go down to B...,Kingston upon Thames,,0,2014-12-31 09:16:01,1,We were planning to go down to Bournemouth Fri...,"[We, were, plan, to, go, down, to, bournemouth...",False
4,550225718060519424,,-0.293391,51.416702,-0.293391,51.416702,31/12/2014 09:43:28,450719269,manda10110,@joemcelderry91 Wishing you Health; Love and ...,Kingston upon Thames,,0,2014-12-31 09:43:28,1,Wishing you Health Love and Laughter for and s...,"[wish, you, health, love, and, laughter, for, ...",False


In [33]:
new_years_tweets = data[data.is_new_years_tweet == True]

In [34]:
new_years_tweets.head()

Unnamed: 0,id,Name,X,Y,LONGITUDE,LATITUDE,MESSAGEDATE,USERID,USERSCREENNAME,MESSAGETEXT,LOCATION,HASHTAGS,ISRETWEET,datetime,Cluster,parsed_tweets,stemmed_tokens,is_new_years_tweet
14,550214484628082688,,-0.295413,51.377613,-0.295413,51.377613,31/12/2014 08:58:50,483244499,martinpowell4,@digdeepdolly @JulieBond1 @BrightonMarathn @Ru...,Kingston upon Thames,,0,2014-12-31 08:58:50,1,Friend at work did Brighton last year it was a...,"[friend, at, work, did, brighton, last, year, ...",True
25,550218423515762688,,-0.273753,51.39938,-0.273753,51.39938,31/12/2014 09:14:29,365780632,gazzadexy40,@iamjujun my christmas card from the mighty ye...,Kingston upon Thames,,0,2014-12-31 09:14:29,1,my christmas card from the mighty year ones Ha...,"[my, christma, card, from, the, mighti, year, ...",True
29,550232802722336768,,-0.29335,51.416683,-0.29335,51.416683,31/12/2014 10:11:37,450719269,manda10110,@greesyduck Health and Happiness to you in 201...,Kingston upon Thames,,0,2014-12-31 10:11:37,1,Health and Happiness to you in too Pete xxx,"[health, and, happi, to, you, in, too, pete, xxx]",True
31,550216489132101632,,-0.273675,51.399395,-0.273675,51.399395,31/12/2014 09:06:48,365780632,gazzadexy40,@iamjujun mine has been doing voluntary work f...,Kingston upon Thames,,0,2014-12-31 09:06:48,1,mine has been doing voluntary work for a chari...,"[mine, ha, been, do, voluntari, work, for, a, ...",True
37,550230273670262785,,-0.295399,51.37757,-0.295399,51.377571,31/12/2014 10:01:34,483244499,martinpowell4,@AdamPrav @AndreTrill @UKRunChat @parkrunUK @O...,Kingston upon Thames,,0,2014-12-31 10:01:34,1,That s something I would like to do next year ...,"[that, s, someth, I, would, like, to, do, next...",True


In [35]:
cluster_0 = new_years_tweets[new_years_tweets.Cluster == 0]
cluster_1 = new_years_tweets[new_years_tweets.Cluster == 1]
cluster_2 = new_years_tweets[new_years_tweets.Cluster == 2]
cluster_3 = new_years_tweets[new_years_tweets.Cluster == 3]
cluster_4 = new_years_tweets[new_years_tweets.Cluster == 4]

In [36]:
new_years_tweets.to_csv("./data/new_years_tweets_by_cluster.csv")

In [40]:
for idx, row in cluster_2.iterrows():
    print(row)

id                                                   550226469356855296
Name                                                                NaN
X                                                             -0.082229
Y                                                               51.4904
LONGITUDE                                                     -0.082229
LATITUDE                                                        51.4904
MESSAGEDATE                                         31/12/2014 09:46:27
USERID                                                         19785093
USERSCREENNAME                                            Sonic_Screwup
MESSAGETEXT           Sorry just thought I'd offer a balanced view o...
LOCATION                                                      Southwark
HASHTAGS                                                            NaN
ISRETWEET                                                             0
datetime                                            2014-12-31 0

Name: 13188, dtype: object
id                                                   550421017731272704
Name                                                                NaN
X                                                             -0.033828
Y                                                               51.5002
LONGITUDE                                                     -0.033828
LATITUDE                                                        51.5002
MESSAGEDATE                                         31/12/2014 22:39:31
USERID                                                        290345467
USERSCREENNAME                                                 REISSMFC
MESSAGETEXT           New Year's Eve in my room watching Tears of th...
LOCATION                                                      Southwark
HASHTAGS                                                            NaN
ISRETWEET                                                             0
datetime                             

Name: 14065, dtype: object
id                                                   550373058452942849
Name                                                                NaN
X                                                             -0.076372
Y                                                               51.5036
LONGITUDE                                                     -0.076372
LATITUDE                                                        51.5036
MESSAGEDATE                                         31/12/2014 19:28:57
USERID                                                        546823454
USERSCREENNAME                                                JTAH_1990
MESSAGETEXT           This time last year I had so many hopes and dr...
LOCATION                                                      Southwark
HASHTAGS                                                            NaN
ISRETWEET                                                             0
datetime                             

id                                          550741844347613184
Name                                                       NaN
X                                                    -0.060027
Y                                                      51.4959
LONGITUDE                                            -0.060027
LATITUDE                                               51.4959
MESSAGEDATE                                01/01/2015 19:54:22
USERID                                               125108379
USERSCREENNAME                                  molliewiddows_
MESSAGETEXT           New year selfie   http://t.co/JA1LRjHpFp
LOCATION                                             Southwark
HASHTAGS                                                   NaN
ISRETWEET                                                    0
datetime                                   2015-01-01 19:54:22
Cluster                                                      2
parsed_tweets                                  New year

Name: 20268, dtype: object
id                                      550242313973559296
Name                                                   NaN
X                                                -0.195356
Y                                                  51.4561
LONGITUDE                                        -0.195356
LATITUDE                                           51.4561
MESSAGEDATE                            31/12/2014 10:49:25
USERID                                            29823208
USERSCREENNAME                             JeremyGriffiths
MESSAGETEXT           2015 is going to be my year         
LOCATION                                        Wandsworth
HASHTAGS                                               NaN
ISRETWEET                                                0
datetime                               2014-12-31 10:49:25
Cluster                                                  2
parsed_tweets                       is going to be my year
stemmed_tokens               

Name: 20921, dtype: object
id                                        550441299481198593
Name                                                     NaN
X                                                  -0.189875
Y                                                    51.4504
LONGITUDE                                          -0.189875
LATITUDE                                             51.4504
MESSAGEDATE                              01/01/2015 00:00:07
USERID                                              27891207
USERSCREENNAME                                    PaulBoross
MESSAGETEXT           Happy New Year! http://t.co/uMK15ntUNo
LOCATION                                          Wandsworth
HASHTAGS                                                 NaN
ISRETWEET                                                  0
datetime                                 2015-01-01 00:00:07
Cluster                                                    2
parsed_tweets                                 Happy New Ye

Name: 21681, dtype: object
id                                                   550476662530658304
Name                                                                NaN
X                                                             -0.150158
Y                                                               51.4704
LONGITUDE                                                     -0.150158
LATITUDE                                                        51.4704
MESSAGEDATE                                         01/01/2015 02:20:38
USERID                                                        130811704
USERSCREENNAME                                             bombaylychee
MESSAGETEXT           Seeing in the New Year with a Calvados from Al...
LOCATION                                                     Wandsworth
HASHTAGS                                             StartAsIMeanToGoOn
ISRETWEET                                                             0
datetime                             

Name: 30590, dtype: object
id                                                   550369702296715264
Name                                                                NaN
X                                                             -0.191549
Y                                                               51.4991
LONGITUDE                                                     -0.191549
LATITUDE                                                        51.4991
MESSAGEDATE                                         31/12/2014 19:15:37
USERID                                                        260815294
USERSCREENNAME                                                   zap_ol
MESSAGETEXT           @realmartinkemp Happy New Year @Shirliekemp  F...
LOCATION                                         Kensington and Chelsea
HASHTAGS                                                            NaN
ISRETWEET                                                             0
datetime                             

id                                                   550471100485079040
Name                                                                NaN
X                                                             -0.195451
Y                                                               51.5108
LONGITUDE                                                     -0.195451
LATITUDE                                                        51.5108
MESSAGEDATE                                         01/01/2015 01:58:32
USERID                                                        285545487
USERSCREENNAME                                          AisyahShahminan
MESSAGETEXT           New year; new me? Screw that! New year; new na...
LOCATION                                         Kensington and Chelsea
HASHTAGS                                                            NaN
ISRETWEET                                                             0
datetime                                            2015-01-01 0

id                                                   550754536596701185
Name                                                                NaN
X                                                             -0.194755
Y                                                               51.4888
LONGITUDE                                                     -0.194755
LATITUDE                                                        51.4888
MESSAGEDATE                                         01/01/2015 20:44:48
USERID                                                        307358185
USERSCREENNAME                                          DeanConstantino
MESSAGETEXT           @DagiPapa6 smashing way for the YIDS to start ...
LOCATION                                         Kensington and Chelsea
HASHTAGS                                                            NaN
ISRETWEET                                                             0
datetime                                            2015-01-01 2

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.



id                                                   550326658264498176
Name                                                                NaN
X                                                              0.007059
Y                                                               51.5541
LONGITUDE                                                      0.007059
LATITUDE                                                        51.5541
MESSAGEDATE                                         31/12/2014 16:24:34
USERID                                                       1281662814
USERSCREENNAME                                           ItsJessiara_xo
MESSAGETEXT           People always say "New year. New me." ...2 min...
LOCATION                                                 Waltham Forest
HASHTAGS                                                            NaN
ISRETWEET                                                             0
datetime                                            2014-12-31 

Name: 59744, dtype: object
id                                                   550476976377823232
Name                                                                NaN
X                                                             -0.008601
Y                                                               51.6268
LONGITUDE                                                     -0.008601
LATITUDE                                                        51.6268
MESSAGEDATE                                         01/01/2015 02:21:53
USERID                                                       2918509551
USERSCREENNAME                                           muke_cashton96
MESSAGETEXT           @5SOS have good year; have fun on the tour; ca...
LOCATION                                                 Waltham Forest
HASHTAGS                                                            NaN
ISRETWEET                                                             0
datetime                             

Name: 61496, dtype: object
id                                             550244143134691329
Name                                                          NaN
X                                                       -0.073398
Y                                                         51.5097
LONGITUDE                                               -0.073398
LATITUDE                                                  51.5097
MESSAGEDATE                                   31/12/2014 10:56:41
USERID                                                  102722892
USERSCREENNAME                                       thetombyford
MESSAGETEXT           @gaydio @tobywhitehouse I look so happy lol
LOCATION                                            Tower Hamlets
HASHTAGS                                                      NaN
ISRETWEET                                                       0
datetime                                      2014-12-31 10:56:41
Cluster                                          

Name: 62357, dtype: object
id                                                   550388001172652032
Name                                                                NaN
X                                                             -0.053574
Y                                                               51.5275
LONGITUDE                                                     -0.053574
LATITUDE                                                        51.5275
MESSAGEDATE                                         31/12/2014 20:28:19
USERID                                                       2532587552
USERSCREENNAME                                          ohmyllamatwaimz
MESSAGETEXT           @twaimz Check out my Llamas new music video it...
LOCATION                                                  Tower Hamlets
HASHTAGS                                                      llamasong
ISRETWEET                                                             0
datetime                             

Name: 62880, dtype: object
id                                                   550555322226855936
Name                                                                NaN
X                                                             -0.065397
Y                                                               51.5214
LONGITUDE                                                     -0.065397
LATITUDE                                                        51.5214
MESSAGEDATE                                         01/01/2015 07:33:12
USERID                                                        245222174
USERSCREENNAME                                             jayjaydeepee
MESSAGETEXT           @andymcscouse "That shepherd's pie was stunnin...
LOCATION                                                  Tower Hamlets
HASHTAGS                                                            NaN
ISRETWEET                                                             0
datetime                             

Name: 63542, dtype: object
id                                                   550371807673716736
Name                                                                NaN
X                                                             -0.037168
Y                                                               51.5191
LONGITUDE                                                     -0.037168
LATITUDE                                                        51.5191
MESSAGEDATE                                         31/12/2014 19:23:59
USERID                                                         83952844
USERSCREENNAME                                                BeatWoven
MESSAGETEXT           Thank-u to all that have been part of my 2014!...
LOCATION                                                  Tower Hamlets
HASHTAGS                                                            NaN
ISRETWEET                                                             0
datetime                             

Name: 73389, dtype: object
id                                                   550355746979061760
Name                                                                NaN
X                                                             -0.317128
Y                                                               51.5786
LONGITUDE                                                     -0.317128
LATITUDE                                                        51.5786
MESSAGEDATE                                         31/12/2014 18:20:09
USERID                                                        188788669
USERSCREENNAME                                             HollieeSarah
MESSAGETEXT           Today is the day millions of people make New Y...
LOCATION                                                          Brent
HASHTAGS                                                            NaN
ISRETWEET                                                             0
datetime                             

Name: 74332, dtype: object
id                                                   550476352500269056
Name                                                                NaN
X                                                             -0.255857
Y                                                               51.5663
LONGITUDE                                                     -0.255857
LATITUDE                                                        51.5663
MESSAGEDATE                                         01/01/2015 02:19:24
USERID                                                        215363298
USERSCREENNAME                                             weare_br0ken
MESSAGETEXT           @yelyahwilliams @XChadballX I am so so so so s...
LOCATION                                                          Brent
HASHTAGS                                                            NaN
ISRETWEET                                                             0
datetime                             

Name: 96650, dtype: object
id                                                   550449682678546432
Name                                                                NaN
X                                                              -0.16778
Y                                                               51.4192
LONGITUDE                                                      -0.16778
LATITUDE                                                        51.4192
MESSAGEDATE                                         01/01/2015 00:33:25
USERID                                                        414735661
USERSCREENNAME                                                rebelle82
MESSAGETEXT           Happy New Year!!!!       #drunk #fun #selfie #...
LOCATION                                                         Merton
HASHTAGS                             drunk fun selfie love happynewyear
ISRETWEET                                                             0
datetime                             

Name: 97406, dtype: object
id                                                   550726262122364929
Name                                                                NaN
X                                                             -0.176275
Y                                                                 51.42
LONGITUDE                                                     -0.176275
LATITUDE                                                          51.42
MESSAGEDATE                                         01/01/2015 18:52:27
USERID                                                         40254901
USERSCREENNAME                                              thefashfeed
MESSAGETEXT           A look back over last years popular posts (inc...
LOCATION                                                         Merton
HASHTAGS                                                            NaN
ISRETWEET                                                             0
datetime                             

In [41]:
eastender_tokens = ['eastend','ronni', 'kill','die', 'emma','luci','phil','nick','denis', 'dead']

In [42]:
data["is_eastender_tweet"] = np.nan

In [43]:
data["is_eastender_tweet"] = data.stemmed_tokens.apply(lambda tokens: any(x in tokens for x in eastender_tokens))

In [45]:
eastender_tweets = data[data.is_eastender_tweet == True]

In [46]:
eastender_tweets.to_csv("./data/eastender_tweets.csv")

In [47]:
football_tokens = ['arsen','goal','lampard','wenger','liverpool','team','chelsea','game','kane','spur','harri','tottenham','sunderland','player','leagu','footbal','arsen','mourinho','play','beat','cahil','score']

In [48]:
data["is_football_tweet"] = np.nan

In [49]:
data["is_football_tweet"] = data.stemmed_tokens.apply(lambda tokens: any(x in tokens for x in football_tokens))

In [50]:
football_tweets = data[data.is_football_tweet == True]

In [51]:
football_tweets.to_csv("./data/football_tweets.csv")

In [52]:
football_tweets.head()

Unnamed: 0,id,Name,X,Y,LONGITUDE,LATITUDE,MESSAGEDATE,USERID,USERSCREENNAME,MESSAGETEXT,LOCATION,HASHTAGS,ISRETWEET,datetime,Cluster,parsed_tweets,stemmed_tokens,is_new_years_tweet,is_eastender_tweet,is_football_tweet
0,550226818624942080,,-0.26215,51.391407,-0.26215,51.391407,31/12/2014 09:47:50,113918054,12Elbestreet,@julieo25 Not yet got tics for Liverpool waiti...,Kingston upon Thames,,0,2014-12-31 09:47:50,1,Not yet got tics for Liverpool waiting till Fr...,"[not, yet, got, tic, for, liverpool, wait, til...",False,False,True
61,550211786700750849,,-0.272859,51.39866,-0.272859,51.398659,31/12/2014 08:48:07,1348662787,Deccers_8,@SkySportsNewsHQ #ssnhqSchurrle The Build Up; ...,Kingston upon Thames,ssnhqSchurrle,0,2014-12-31 08:48:07,1,The Build Up The Pass The Goal Best For Me,"[the, build, Up, the, pass, the, goal, best, f...",False,False,True
128,550252286908596224,,-0.295655,51.38408,-0.295655,51.384079,31/12/2014 11:29:03,121374134,aewparsons,Hope ECB is watching. Only play T20 blast matc...,Kingston upon Thames,lessonslearned,0,2014-12-31 11:29:03,1,Hope ECB is watching Only play T blast matches...,"[hope, ecb, is, watch, onli, play, T, blast, m...",False,False,True
147,550245732125777920,,-0.257089,51.40669,-0.257089,51.406689,31/12/2014 11:03:00,1530791,niallomalley,@thameswater @Ofwat Hi Tom; any sign of an upd...,Kingston upon Thames,,0,2014-12-31 11:03:00,1,Hi Tom any sign of an update from the attendin...,"[Hi, tom, ani, sign, of, an, updat, from, the,...",False,False,True
257,550317344313344000,,-0.305016,51.36993,-0.305016,51.36993,31/12/2014 15:47:33,335423694,_rachelanstee,Decided in advance that on valentines day I'm ...,Kingston upon Thames,,0,2014-12-31 15:47:33,1,Decided in advance that on valentines day I m ...,"[decid, in, advanc, that, on, valentin, day, I...",False,False,True
