In [1]:
import datetime
import math
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import *
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import string

In [2]:
tweets = pd.read_csv("./data/tweets_all.csv", encoding="latin1")

In [3]:
tweets.head()

Unnamed: 0,id,Name,X,Y,LONGITUDE,LATITUDE,MESSAGEDATE,USERID,USERSCREENNAME,MESSAGETEXT,LOCATION,HASHTAGS,ISRETWEET
0,550226818624942080,,-0.26215,51.391407,-0.26215,51.391407,31/12/2014 09:47:50,113918054,12Elbestreet,@julieo25 Not yet got tics for Liverpool waiti...,Kingston upon Thames; London,,0
1,550226823314178048,,-0.149466,51.49256,-0.149466,51.492561,31/12/2014 09:47:52,465989904,alice_foster_95,So. Many. Accents #alicesadventureswithmegabus,London; England,alicesadventureswithmegabus,0
2,550226827944660992,,-0.014891,51.354042,-0.014891,51.354042,31/12/2014 09:47:53,55872342,MissGee_Pee,If it doesn't make you happy by December 31st....,New Addington; London,,0
3,550226837537058816,,0.006698,51.53202,0.006698,51.532021,31/12/2014 09:47:55,147266450,vinita_ramtri,@westfieldstrat thanks; and are you open tomor...,Stratford; London,,0
4,550227214227505152,,0.214417,51.582535,0.214417,51.582535,31/12/2014 09:49:25,518928608,CallumGordon7,@WestHam_Central but he is,Romford; London,,0


In [4]:
min_lat = tweets.LATITUDE.min()
min_long = tweets.LONGITUDE.min()

In [5]:
lat_range = tweets.LATITUDE.max() - tweets.LATITUDE.min()
long_range = tweets.LONGITUDE.max() - tweets.LONGITUDE.min()

lat_bin_size = lat_range/9
long_bin_size= long_range/9

In [29]:
print(lat_bin_size)
print(long_bin_size)

0.045694986979166664
0.09580022096633911


In [6]:
tweets["long_bin"] = tweets.LONGITUDE.apply(lambda long: (min_long + (((long - min_long)//long_bin_size)*long_bin_size)))
tweets["lat_bin"] = tweets.LATITUDE.apply(lambda lat: (min_lat + (((lat - min_lat)//lat_bin_size)*lat_bin_size)))

In [7]:
tweets["long_bin"] = tweets["long_bin"].apply(lambda l: str(l))
tweets["lat_bin"] = tweets["lat_bin"].apply(lambda l: str(l))

In [8]:
tweets["lat_long_bin"] = tweets["lat_bin"] + "," + tweets["long_bin"]

In [9]:
tweets.head()

Unnamed: 0,id,Name,X,Y,LONGITUDE,LATITUDE,MESSAGEDATE,USERID,USERSCREENNAME,MESSAGETEXT,LOCATION,HASHTAGS,ISRETWEET,long_bin,lat_bin,lat_long_bin
0,550226818624942080,,-0.26215,51.391407,-0.26215,51.391407,31/12/2014 09:47:50,113918054,12Elbestreet,@julieo25 Not yet got tics for Liverpool waiti...,Kingston upon Thames; London,,0,-0.3293425440788269,51.375291188557945,"51.375291188557945,-0.3293425440788269"
1,550226823314178048,,-0.149466,51.49256,-0.149466,51.492561,31/12/2014 09:47:52,465989904,alice_foster_95,So. Many. Accents #alicesadventureswithmegabus,London; England,alicesadventureswithmegabus,0,-0.2335423231124878,51.466681162516274,"51.466681162516274,-0.2335423231124878"
2,550226827944660992,,-0.014891,51.354042,-0.014891,51.354042,31/12/2014 09:47:53,55872342,MissGee_Pee,If it doesn't make you happy by December 31st....,New Addington; London,,0,-0.0419418811798095,51.32959620157877,"51.329596201578774,-0.04194188117980957"
3,550226837537058816,,0.006698,51.53202,0.006698,51.532021,31/12/2014 09:47:55,147266450,vinita_ramtri,@westfieldstrat thanks; and are you open tomor...,Stratford; London,,0,-0.0419418811798095,51.512376149495445,"51.512376149495445,-0.04194188117980957"
4,550227214227505152,,0.214417,51.582535,0.214417,51.582535,31/12/2014 09:49:25,518928608,CallumGordon7,@WestHam_Central but he is,Romford; London,,0,0.1496585607528686,51.55807113647461,"51.55807113647461,0.14965856075286865"


Remove non-plaintext elements from tweets

In [10]:
def parse_tweets(tweet):

    tweet = re.sub('@[^\s]+','',tweet).strip()
    tweet = re.sub('#[^\s]+','',tweet).strip()
    tweet = re.sub(r'https?:\/\/.*[\r\n]*','',tweet).strip()
    tweet = " ".join(re.findall("[a-zA-Z]+", tweet))
    
    return tweet

In [11]:
tweets["parsed_tweets"] = tweets.MESSAGETEXT.apply(lambda tweet: parse_tweets(tweet))

In [12]:
tweets.head()

Unnamed: 0,id,Name,X,Y,LONGITUDE,LATITUDE,MESSAGEDATE,USERID,USERSCREENNAME,MESSAGETEXT,LOCATION,HASHTAGS,ISRETWEET,long_bin,lat_bin,lat_long_bin,parsed_tweets
0,550226818624942080,,-0.26215,51.391407,-0.26215,51.391407,31/12/2014 09:47:50,113918054,12Elbestreet,@julieo25 Not yet got tics for Liverpool waiti...,Kingston upon Thames; London,,0,-0.3293425440788269,51.375291188557945,"51.375291188557945,-0.3293425440788269",Not yet got tics for Liverpool waiting till Fr...
1,550226823314178048,,-0.149466,51.49256,-0.149466,51.492561,31/12/2014 09:47:52,465989904,alice_foster_95,So. Many. Accents #alicesadventureswithmegabus,London; England,alicesadventureswithmegabus,0,-0.2335423231124878,51.466681162516274,"51.466681162516274,-0.2335423231124878",So Many Accents
2,550226827944660992,,-0.014891,51.354042,-0.014891,51.354042,31/12/2014 09:47:53,55872342,MissGee_Pee,If it doesn't make you happy by December 31st....,New Addington; London,,0,-0.0419418811798095,51.32959620157877,"51.329596201578774,-0.04194188117980957",If it doesn t make you happy by December st le...
3,550226837537058816,,0.006698,51.53202,0.006698,51.532021,31/12/2014 09:47:55,147266450,vinita_ramtri,@westfieldstrat thanks; and are you open tomor...,Stratford; London,,0,-0.0419418811798095,51.512376149495445,"51.512376149495445,-0.04194188117980957",thanks and are you open tomorrow at all
4,550227214227505152,,0.214417,51.582535,0.214417,51.582535,31/12/2014 09:49:25,518928608,CallumGordon7,@WestHam_Central but he is,Romford; London,,0,0.1496585607528686,51.55807113647461,"51.55807113647461,0.14965856075286865",but he is


In [13]:
# tweets.groupby("datetime_to_nearest_hour")["parsed_tweets"].apply(lambda s: "%s" % ' '.join(s))
# tweets_by_hour = pd.DataFrame({"message_text_for_hour" : tweets.groupby("datetime_to_nearest_hour")["parsed_tweets"].apply(lambda s: "%s" % ' '.join(s))}).reset_index()

In [14]:
# need to get pos tag
class StemmerTokenizer(object):

    def __init__(self):
        self.stemmer = PorterStemmer()
        
    def __call__(self, doc):
        doc = " ".join(re.findall("[a-zA-Z]+", doc))
        return [self.stemmer.stem(t) for t in word_tokenize(doc)]

In [15]:
tweets.lat_bin.unique()

array(['51.375291188557945', '51.466681162516274', '51.329596201578774',
       '51.512376149495445', '51.55807113647461', '51.649461110432945',
       '51.603766123453774', '51.42098617553711', '51.28390121459961',
       '51.69515609741211'], dtype=object)

In [16]:
tweets.long_bin.unique()

array(['-0.3293425440788269', '-0.2335423231124878',
       '-0.04194188117980957', '0.14965856075286865',
       '-0.425142765045166', '-0.13774210214614868',
       '0.05385833978652954', '-0.5209429860115051',
       '0.24545878171920776', '0.3412590026855469'], dtype=object)

In [17]:
idf_by_lat_long_bin = {}

for lat_long_bin in tweets.lat_long_bin.unique():
    clf = TfidfVectorizer(tokenizer=StemmerTokenizer(), min_df=1, max_df=0.7, ngram_range=(1, 3), stop_words=set(stopwords.words('english')))
    try:
        clf.fit(tweets[tweets["lat_long_bin"] == lat_long_bin]["parsed_tweets"])
        word2idf = dict(zip(clf.get_feature_names(), clf.idf_))
        sorted_idf = sorted(word2idf.items(), key=lambda x: x[1], reverse=True)
    except ValueError:
        sorted_idf = []

    idf_by_lat_long_bin[lat_long_bin] = {
        "lat_long_bin": lat_long_bin,
        "sorted_idf": sorted_idf
    }
    
    print("Completed", lat_long_bin)

print("Finished!")

Completed 51.375291188557945,-0.3293425440788269
Completed 51.466681162516274,-0.2335423231124878
Completed 51.329596201578774,-0.04194188117980957
Completed 51.512376149495445,-0.04194188117980957
Completed 51.55807113647461,0.14965856075286865
Completed 51.649461110432945,-0.3293425440788269
Completed 51.603766123453774,-0.425142765045166
Completed 51.466681162516274,-0.13774210214614868
Completed 51.512376149495445,-0.13774210214614868
Completed 51.603766123453774,0.14965856075286865
Completed 51.329596201578774,-0.13774210214614868
Completed 51.42098617553711,-0.2335423231124878
Completed 51.512376149495445,-0.2335423231124878
Completed 51.42098617553711,0.05385833978652954
Completed 51.42098617553711,-0.3293425440788269
Completed 51.512376149495445,-0.5209429860115051
Completed 51.603766123453774,-0.2335423231124878
Completed 51.42098617553711,-0.5209429860115051
Completed 51.42098617553711,-0.425142765045166
Completed 51.466681162516274,-0.3293425440788269
Completed 51.5123761494

In [30]:
idf_by_lat_long_bin["51.512376149495445,-0.2335423231124878"]

{'lat_long_bin': '51.512376149495445,-0.2335423231124878',
 'sorted_idf': [('aa', 9.120142581909091),
  ('aa ke', 9.120142581909091),
  ('aa ke tu', 9.120142581909091),
  ('aaaaand', 9.120142581909091),
  ('aaaaand knee', 9.120142581909091),
  ('aaaaand knee joint', 9.120142581909091),
  ('aaaaaw', 9.120142581909091),
  ('aaaaaw thank', 9.120142581909091),
  ('aaaaaw thank dear', 9.120142581909091),
  ('aaah', 9.120142581909091),
  ('aaah je', 9.120142581909091),
  ('aaah je pourrai', 9.120142581909091),
  ('aah', 9.120142581909091),
  ('aah bhari', 9.120142581909091),
  ('aah bhari ghut', 9.120142581909091),
  ('aamir', 9.120142581909091),
  ('aamir liaqat', 9.120142581909091),
  ('aamir liaqat murder', 9.120142581909091),
  ('aan', 9.120142581909091),
  ('aan aa', 9.120142581909091),
  ('aan aa ke', 9.120142581909091),
  ('aan hai', 9.120142581909091),
  ('aan hai ni', 9.120142581909091),
  ('aan tenu', 9.120142581909091),
  ('aan tenu ki', 9.120142581909091),
  ('aaww', 9.1201425819

In [33]:
idf_by_lat_long_bin["51.466681162516274,-0.13774210214614868"]

{'lat_long_bin': '51.466681162516274,-0.13774210214614868',
 'sorted_idf': [('aaaaalllllll', 9.634175906901852),
  ('aaaaalllllll chicken', 9.634175906901852),
  ('aaaaalllllll chicken gees', 9.634175906901852),
  ('aaaallliiivvvve', 9.634175906901852),
  ('aah', 9.634175906901852),
  ('aah hell', 9.634175906901852),
  ('aah hell smh', 9.634175906901852),
  ('aan de', 9.634175906901852),
  ('aan de blije', 9.634175906901852),
  ('aan londen', 9.634175906901852),
  ('aan londen voor', 9.634175906901852),
  ('aarika', 9.634175906901852),
  ('aarika adam', 9.634175906901852),
  ('aarika adam item', 9.634175906901852),
  ('aaron', 9.634175906901852),
  ('aaron tveit', 9.634175906901852),
  ('aaron tveit wot', 9.634175906901852),
  ('aaronkelli', 9.634175906901852),
  ('aaronkelli croydon', 9.634175906901852),
  ('aaronkelli croydon london', 9.634175906901852),
  ('abaslesarab', 9.634175906901852),
  ('abaslesarab tottenham', 9.634175906901852),
  ('abaslesarab tottenham london', 9.63417590

In [32]:
51.466681162516274,-0.13774210214614868

Unnamed: 0,id,Name,X,Y,LONGITUDE,LATITUDE,MESSAGEDATE,USERID,USERSCREENNAME,MESSAGETEXT,LOCATION,HASHTAGS,ISRETWEET,long_bin,lat_bin,lat_long_bin,parsed_tweets
6687,550304934621904896,,-0.092266,51.48789,-0.092266,51.487888,31/12/2014 14:58:15,2776733714,toodank4you,bugun london eye dolmadan erkenden gidip yer k...,London; England,,0,-0.1377421021461486,51.466681162516274,"51.466681162516274,-0.13774210214614868",bugun london eye dolmadan erkenden gidip yer k...
15780,550362323282849792,,0.01856,51.51498,0.01856,51.51498,31/12/2014 18:46:17,1373196584,_waleskavieira,Todo mundo vai pra london eye ver a queima de ...,East Ham; London,friopracaraca,0,-0.0419418811798095,51.512376149495445,"51.512376149495445,-0.04194188117980957",Todo mundo vai pra london eye ver a queima de ...
17702,550367893012377602,,-0.092372,51.48793,-0.092372,51.48793,31/12/2014 19:08:25,2776733714,toodank4you,abiniz london eye gidip fireworks izlicek ...,Camberwell; London,,0,-0.1377421021461486,51.466681162516274,"51.466681162516274,-0.13774210214614868",abiniz london eye gidip fireworks izlicek
20741,550314697447444481,,-0.119467,51.503426,-0.119467,51.503426,31/12/2014 15:37:02,490420282,jugnuishiqui,The london eye @ The Official London Eye http:...,Lambeth; London,,0,-0.1377421021461486,51.466681162516274,"51.466681162516274,-0.13774210214614868",The london eye The Official London Eye
22372,550257670314541056,,-0.415433,51.688892,-0.415433,51.688892,31/12/2014 11:50:26,278911482,Yi2Tan,didn't manage to get tickets for the london ey...,Leavesden; England,,0,-0.425142765045166,51.64946111043295,"51.649461110432945,-0.425142765045166",didn t manage to get tickets for the london ey...
29929,550345650970173440,,-0.101129,51.51396,-0.101129,51.513962,31/12/2014 17:40:02,273611605,ClaudiaMartinsG,@ london eye http://t.co/h6xGO3qa1Y,City of London; London,,0,-0.1377421021461486,51.512376149495445,"51.512376149495445,-0.13774210214614868",london eye
31737,550439141545369600,,-0.122351,51.504734,-0.122351,51.504734,31/12/2014 23:51:32,33514924,paul2v,This is my view opposite the london eye; Londo...,London; England,,0,-0.1377421021461486,51.466681162516274,"51.466681162516274,-0.13774210214614868",This is my view opposite the london eye London...
34604,550384136410894337,,-0.119467,51.503426,-0.119467,51.503426,31/12/2014 20:12:58,46111530,agungpryo,This iconic london eye will be so sparking and...,Lambeth; London,london,0,-0.1377421021461486,51.466681162516274,"51.466681162516274,-0.13774210214614868",This iconic london eye will be so sparking and...
41356,550420920842878977,,-0.124968,51.48742,-0.124968,51.487419,31/12/2014 22:39:08,51819706,crustacean12,Not long now......all in situ waiting @ the lo...,Lambeth; London,,0,-0.1377421021461486,51.466681162516274,"51.466681162516274,-0.13774210214614868",Not long now all in situ waiting the london eye
43151,550584972608696321,,-0.047929,51.42537,-0.047929,51.425369,01/01/2015 09:31:01,276435464,affaniator,Fire works by the london eye. Happy 2015 @ ...,Lewisham; London,,0,-0.1377421021461486,51.42098617553711,"51.42098617553711,-0.13774210214614868",Fire works by the london eye Happy London Eye ...
