In [30]:
import datetime
import math
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import *
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import string

In [2]:
tweets = pd.read_csv("./data/tweets_all.csv", encoding="latin1")

In [3]:
tweets.head()

Unnamed: 0,id,Name,X,Y,LONGITUDE,LATITUDE,MESSAGEDATE,USERID,USERSCREENNAME,MESSAGETEXT,LOCATION,HASHTAGS,ISRETWEET
0,550226818624942080,,-0.26215,51.391407,-0.26215,51.391407,31/12/2014 09:47:50,113918054,12Elbestreet,@julieo25 Not yet got tics for Liverpool waiti...,Kingston upon Thames; London,,0
1,550226823314178048,,-0.149466,51.49256,-0.149466,51.492561,31/12/2014 09:47:52,465989904,alice_foster_95,So. Many. Accents #alicesadventureswithmegabus,London; England,alicesadventureswithmegabus,0
2,550226827944660992,,-0.014891,51.354042,-0.014891,51.354042,31/12/2014 09:47:53,55872342,MissGee_Pee,If it doesn't make you happy by December 31st....,New Addington; London,,0
3,550226837537058816,,0.006698,51.53202,0.006698,51.532021,31/12/2014 09:47:55,147266450,vinita_ramtri,@westfieldstrat thanks; and are you open tomor...,Stratford; London,,0
4,550227214227505152,,0.214417,51.582535,0.214417,51.582535,31/12/2014 09:49:25,518928608,CallumGordon7,@WestHam_Central but he is,Romford; London,,0


In [4]:
tweets["datetime"] = pd.to_datetime(tweets.MESSAGEDATE)

In [5]:
tweets["datetime_to_nearest_hour"] = tweets.datetime.apply(lambda dt: datetime.datetime(dt.year, dt.month, dt.day, dt.hour))

In [6]:
tweets.head()

Unnamed: 0,id,Name,X,Y,LONGITUDE,LATITUDE,MESSAGEDATE,USERID,USERSCREENNAME,MESSAGETEXT,LOCATION,HASHTAGS,ISRETWEET,datetime,datetime_to_nearest_hour
0,550226818624942080,,-0.26215,51.391407,-0.26215,51.391407,31/12/2014 09:47:50,113918054,12Elbestreet,@julieo25 Not yet got tics for Liverpool waiti...,Kingston upon Thames; London,,0,2014-12-31 09:47:50,2014-12-31 09:00:00
1,550226823314178048,,-0.149466,51.49256,-0.149466,51.492561,31/12/2014 09:47:52,465989904,alice_foster_95,So. Many. Accents #alicesadventureswithmegabus,London; England,alicesadventureswithmegabus,0,2014-12-31 09:47:52,2014-12-31 09:00:00
2,550226827944660992,,-0.014891,51.354042,-0.014891,51.354042,31/12/2014 09:47:53,55872342,MissGee_Pee,If it doesn't make you happy by December 31st....,New Addington; London,,0,2014-12-31 09:47:53,2014-12-31 09:00:00
3,550226837537058816,,0.006698,51.53202,0.006698,51.532021,31/12/2014 09:47:55,147266450,vinita_ramtri,@westfieldstrat thanks; and are you open tomor...,Stratford; London,,0,2014-12-31 09:47:55,2014-12-31 09:00:00
4,550227214227505152,,0.214417,51.582535,0.214417,51.582535,31/12/2014 09:49:25,518928608,CallumGordon7,@WestHam_Central but he is,Romford; London,,0,2014-12-31 09:49:25,2014-12-31 09:00:00


Remove non-plaintext elements from tweets

In [7]:
def parse_tweets(tweet):

    tweet = re.sub('@[^\s]+','',tweet).strip()
    tweet = re.sub('#[^\s]+','',tweet).strip()
    tweet = re.sub(r'https?:\/\/.*[\r\n]*','',tweet).strip()
    tweet = " ".join(re.findall("[a-zA-Z]+", tweet))
    
    return tweet

In [8]:
tweets["parsed_tweets"] = tweets.MESSAGETEXT.apply(lambda tweet: parse_tweets(tweet))

In [9]:
tweets.head()

Unnamed: 0,id,Name,X,Y,LONGITUDE,LATITUDE,MESSAGEDATE,USERID,USERSCREENNAME,MESSAGETEXT,LOCATION,HASHTAGS,ISRETWEET,datetime,datetime_to_nearest_hour,parsed_tweets
0,550226818624942080,,-0.26215,51.391407,-0.26215,51.391407,31/12/2014 09:47:50,113918054,12Elbestreet,@julieo25 Not yet got tics for Liverpool waiti...,Kingston upon Thames; London,,0,2014-12-31 09:47:50,2014-12-31 09:00:00,Not yet got tics for Liverpool waiting till Fr...
1,550226823314178048,,-0.149466,51.49256,-0.149466,51.492561,31/12/2014 09:47:52,465989904,alice_foster_95,So. Many. Accents #alicesadventureswithmegabus,London; England,alicesadventureswithmegabus,0,2014-12-31 09:47:52,2014-12-31 09:00:00,So Many Accents
2,550226827944660992,,-0.014891,51.354042,-0.014891,51.354042,31/12/2014 09:47:53,55872342,MissGee_Pee,If it doesn't make you happy by December 31st....,New Addington; London,,0,2014-12-31 09:47:53,2014-12-31 09:00:00,If it doesn t make you happy by December st le...
3,550226837537058816,,0.006698,51.53202,0.006698,51.532021,31/12/2014 09:47:55,147266450,vinita_ramtri,@westfieldstrat thanks; and are you open tomor...,Stratford; London,,0,2014-12-31 09:47:55,2014-12-31 09:00:00,thanks and are you open tomorrow at all
4,550227214227505152,,0.214417,51.582535,0.214417,51.582535,31/12/2014 09:49:25,518928608,CallumGordon7,@WestHam_Central but he is,Romford; London,,0,2014-12-31 09:49:25,2014-12-31 09:00:00,but he is


All parsed text concatenated by hourly intervals

In [11]:
tweets.datetime_to_nearest_hour.unique()

array(['2014-12-31T09:00:00.000000000', '2014-12-31T10:00:00.000000000',
       '2014-12-31T08:00:00.000000000', '2014-12-31T06:00:00.000000000',
       '2014-12-31T07:00:00.000000000', '2014-12-31T04:00:00.000000000',
       '2014-12-31T05:00:00.000000000', '2014-12-31T14:00:00.000000000',
       '2014-12-31T15:00:00.000000000', '2014-12-31T11:00:00.000000000',
       '2014-12-31T13:00:00.000000000', '2014-12-31T12:00:00.000000000',
       '2014-12-31T18:00:00.000000000', '2014-12-31T19:00:00.000000000',
       '2014-12-31T16:00:00.000000000', '2014-12-31T17:00:00.000000000',
       '2014-12-31T23:00:00.000000000', '2015-01-01T00:00:00.000000000',
       '2014-12-31T20:00:00.000000000', '2014-12-31T22:00:00.000000000',
       '2015-01-01T08:00:00.000000000', '2015-01-01T09:00:00.000000000',
       '2015-01-01T04:00:00.000000000', '2015-01-01T05:00:00.000000000',
       '2015-01-01T06:00:00.000000000', '2015-01-01T07:00:00.000000000',
       '2014-12-31T21:00:00.000000000', '2015-01-01

In [None]:
# tweets.groupby("datetime_to_nearest_hour")["parsed_tweets"].apply(lambda s: "%s" % ' '.join(s))
# tweets_by_hour = pd.DataFrame({"message_text_for_hour" : tweets.groupby("datetime_to_nearest_hour")["parsed_tweets"].apply(lambda s: "%s" % ' '.join(s))}).reset_index()

In [14]:
# need to get pos tag
class StemmerTokenizer(object):

    def __init__(self):
        self.stemmer = PorterStemmer()
        
    def __call__(self, doc):
        doc = " ".join(re.findall("[a-zA-Z]+", doc))
        return [self.stemmer.stem(t) for t in word_tokenize(doc)]

In [25]:
idf_by_hourly_interval = {}

for hour_interval in tweets.datetime_to_nearest_hour.unique():
    clf = TfidfVectorizer(tokenizer=StemmerTokenizer(), min_df=5, max_df=0.7, stop_words=set(stopwords.words('english')))
    clf.fit(tweets[tweets["datetime_to_nearest_hour"] == hour_interval]["parsed_tweets"])
    word2idf = dict(zip(clf.get_feature_names(), clf.idf_))
    sorted_idf = sorted(word2idf.items(), key=lambda x: x[1], reverse=True)

    idf_by_hourly_interval[hour_interval] = {
        "hour_interval": hour_interval,
        "sorted_idf": sorted_idf
    }
    
    print("Completed", hour_interval)

print("Finished!")

Completed 2014-12-31T09:00:00.000000000
Completed 2014-12-31T10:00:00.000000000
Completed 2014-12-31T08:00:00.000000000
Completed 2014-12-31T06:00:00.000000000
Completed 2014-12-31T07:00:00.000000000
Completed 2014-12-31T04:00:00.000000000
Completed 2014-12-31T05:00:00.000000000
Completed 2014-12-31T14:00:00.000000000
Completed 2014-12-31T15:00:00.000000000
Completed 2014-12-31T11:00:00.000000000
Completed 2014-12-31T13:00:00.000000000
Completed 2014-12-31T12:00:00.000000000
Completed 2014-12-31T18:00:00.000000000
Completed 2014-12-31T19:00:00.000000000
Completed 2014-12-31T16:00:00.000000000
Completed 2014-12-31T17:00:00.000000000
Completed 2014-12-31T23:00:00.000000000
Completed 2015-01-01T00:00:00.000000000
Completed 2014-12-31T20:00:00.000000000
Completed 2014-12-31T22:00:00.000000000
Completed 2015-01-01T08:00:00.000000000
Completed 2015-01-01T09:00:00.000000000
Completed 2015-01-01T04:00:00.000000000
Completed 2015-01-01T05:00:00.000000000
Completed 2015-01-01T06:00:00.000000000


In [26]:
idf_by_hourly_interval

{numpy.datetime64('2014-12-31T04:00:00.000000000'): {'hour_interval': numpy.datetime64('2014-12-31T04:00:00.000000000'),
  'sorted_idf': [('becaus', 5.509026325435189),
   ('boy', 5.509026325435189),
   ('dm', 5.509026325435189),
   ('end', 5.509026325435189),
   ('first', 5.509026325435189),
   ('funni', 5.509026325435189),
   ('home', 5.509026325435189),
   ('left', 5.509026325435189),
   ('live', 5.509026325435189),
   ('look', 5.509026325435189),
   ('men', 5.509026325435189),
   ('mi', 5.509026325435189),
   ('nash', 5.509026325435189),
   ('need', 5.509026325435189),
   ('rt', 5.509026325435189),
   ('shit', 5.509026325435189),
   ('someth', 5.509026325435189),
   ('tonight', 5.509026325435189),
   ('wato', 5.509026325435189),
   ('way', 5.509026325435189),
   ('back', 5.3548756456079305),
   ('best', 5.3548756456079305),
   ('e', 5.3548756456079305),
   ('friend', 5.3548756456079305),
   ('gmt', 5.3548756456079305),
   ('good', 5.3548756456079305),
   ('hand', 5.3548756456079305

In [33]:
idf_by_hourly_interval[np.datetime64('2014-12-31T23:00:00.000000000')]

{'hour_interval': numpy.datetime64('2014-12-31T23:00:00.000000000'),
 'sorted_idf': [('af', 7.275075845189281),
  ('ah', 7.275075845189281),
  ('al', 7.275075845189281),
  ('also', 7.275075845189281),
  ('amen', 7.275075845189281),
  ('amor', 7.275075845189281),
  ('ancora', 7.275075845189281),
  ('apart', 7.275075845189281),
  ('asleep', 7.275075845189281),
  ('await', 7.275075845189281),
  ('bare', 7.275075845189281),
  ('bc', 7.275075845189281),
  ('biggest', 7.275075845189281),
  ('black', 7.275075845189281),
  ('boyfriend', 7.275075845189281),
  ('build', 7.275075845189281),
  ('car', 7.275075845189281),
  ('cat', 7.275075845189281),
  ('caus', 7.275075845189281),
  ('channel', 7.275075845189281),
  ('chelsea', 7.275075845189281),
  ('close', 7.275075845189281),
  ('cocktail', 7.275075845189281),
  ('cup', 7.275075845189281),
  ('dan', 7.275075845189281),
  ('definit', 7.275075845189281),
  ('doubl', 7.275075845189281),
  ('dream', 7.275075845189281),
  ('drop', 7.275075845189281)