In [1]:
%%time
import re
import string
from couchdb import Server
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from datetime import datetime
from nltk.tokenize import TweetTokenizer
from nltk import bigrams 
from nltk import trigrams
from textblob import TextBlob

# server = Server('http://43.240.96.132:5984/')

# db = server['testdb']
# print(type(db))
# print(len(db))

# f = open('tweets.txt', 'w+')

# count = 0
# for doc_id in db:
#     text = db[doc_id]['text']
#     text = re.sub('\n+',' ',text)
#     time = db[doc_id]['created_at']
#     f.write('{}<=>{}\n'.format(text, time))
#     count += 1
#     if count >= 10000:
#         break

punctuation = set(string.punctuation)
# stop words
stop = set(stopwords.words('english')).union(punctuation)

tweets = []
# read data from file
with open('tweets.txt') as f:
    for line in f:
        try:
            string = line.split('<=>')
            text = string[0]
            time = string[1].strip()
            tweets.append([text, time])
        except:
            continue

time_format = '%a %b %d %H:%M:%S %z %Y'
time_zone = []
for i in range(0, 24, 2):
    time_zone.append([i, i+2])

# set the lemmatizer
lemmatizer = WordNetLemmatizer()
    
def lemmatize(text):
    lemma = lemmatizer.lemmatize(text,'v')
    if lemma == text:
        lemma = lemmatizer.lemmatize(text,'n')
    return lemma

# set the tokenizer for tweets
tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)

# dict sort time_zone and words pairs
# {(0,2): {word1: 2, word2: 1, ...}, (2,4):..., ....}
time_words = {}

for i in tweets:
    time_object = datetime.strptime(i[1], time_format)
    time = time_object.hour + float(time_object.minute / 60)
    for zone in time_zone:
        if zone[0] <= time <= zone[1]:
            tokens = tknzr.tokenize(i[0])
            for j in tokens:
                word = lemmatize(j.lower())
                if word in stop or word[:5] == 'https' or word[:4] == 'http':
                    continue
                else:
                    word_dict = time_words.get((zone[0], zone[1]), {})
                    word_dict[word] = word_dict.get(word, 0) + 1
                    time_words[(zone[0], zone[1])] = word_dict
        else:
            continue

def getHotWord(time_zone, time_word_dict):
    '''
        The data type of time_zone is tuple
        e.g. (2, 4) means from 2-4 in 24H
    '''
    dict = time_word_dict[time_zone]
    sorted_list = sorted(dict.items(), key=lambda x:x[1], reverse=True)
    return sorted_list[:20]

# zone set to (0,2)
hot_word = getHotWord((0,2), time_words)
print(hot_word)

[('…', 654), ('’', 296), ('...', 114), ('get', 110), ('like', 108), ('go', 83), ('one', 76), ('see', 69), ('time', 66), ('new', 64), ('look', 62), ('😂', 61), ('think', 60), ('thank', 59), ('love', 57), ('u', 57), ('work', 56), ('great', 56), ('say', 51), ('day', 51)]
CPU times: user 5.38 s, sys: 211 ms, total: 5.59 s
Wall time: 5.61 s


In [2]:
bigram_time_words = {}

for i in tweets:
    time_object = datetime.strptime(i[1], time_format)
    time = time_object.hour + float(time_object.minute / 60)
    sentence = i[0].split()
    bigram = list(bigrams(sentence))
    for zone in time_zone:
        if zone[0] <= time <= zone[1]:
            for gram in bigram:
                word_dict = bigram_time_words.get((zone[0], zone[1]), {})
                word_dict[gram] = word_dict.get(gram, 0) + 1
                bigram_time_words[(zone[0], zone[1])] = word_dict


bigram_hot_word = getHotWord((0,2), bigram_time_words)
print(bigram_hot_word)

[(('of', 'the'), 59), (('in', 'the'), 53), (('for', 'the'), 38), (('on', 'the'), 34), (('is', 'a'), 34), (('for', 'a'), 31), (('to', 'be'), 28), (('at', 'the'), 28), (('to', 'the'), 26), (('I', 'just'), 21), (('have', 'to'), 20), (('I', 'have'), 19), (('is', 'the'), 18), (('with', 'the'), 18), (('via', '@YouTube'), 17), (('a', 'great'), 17), (('a', '@YouTube'), 17), (('one', 'of'), 17), (('This', 'is'), 16), (('have', 'a'), 16)]


In [3]:
print(len(bigram_time_words[(0,2)]))

23864


In [4]:
trigram_time_words = {}

for i in tweets:
    time_object = datetime.strptime(i[1], time_format)
    time = time_object.hour + float(time_object.minute / 60)
    sentence = i[0].split()
    trigram = list(trigrams(sentence))
    for zone in time_zone:
        if zone[0] <= time <= zone[1]:
            for gram in trigram:
                word_dict = trigram_time_words.get((zone[0], zone[1]), {})
                word_dict[gram] = word_dict.get(gram, 0) + 1
                trigram_time_words[(zone[0], zone[1])] = word_dict

trigram_hot_word = getHotWord((0,2), trigram_time_words)
print(trigram_hot_word)

[(('I', 'liked', 'a'), 15), (('liked', 'a', '@YouTube'), 15), (('a', '@YouTube', 'video'), 15), (('BONG!', 'BONG!', 'BONG!'), 10), (('DING', 'DING', 'DING'), 10), (('one', 'of', 'the'), 9), (('lol', 'lol', 'lol'), 6), (('you', 'have', 'to'), 5), (('I', 'love', 'the'), 5), (('a', 'lot', 'of'), 5), (('going', 'to', 'be'), 5), (('thanks', 'for', 'the'), 5), (('This', 'is', 'a'), 5), (('Looking', 'forward', 'to'), 5), (('I', 'need', 'to'), 4), (('just', 'want', 'to'), 4), (('of', 'the', 'best'), 4), (('@australian', '@sallyrugg', '@yassmin_a'), 4), (('@The_Real_BiM', '@radicapitalist', '@ChadCottle…'), 4), (('https://t.co/2QWGIs8JwY', '#funkykidsradio', '#music4kids'), 4)]


In [5]:
print(len(trigram_time_words[(0,2)]))

24469


In [6]:
%%time
# sentiment analysis----------------

from textblob import Blobber
from textblob.sentiments import NaiveBayesAnalyzer
import string

punctuation = set(string.punctuation)
pattern = re.compile('(@)|(#)')

tb = Blobber(analyzer=NaiveBayesAnalyzer())

for i in tweets[:20]:
#     print(i[0])
#     print(tb(i[0]).sentiment.p_pos)
    # remove @ and # 
    tweet = re.sub(pattern,' ', i[0]).strip()
    # remove http...
    tweet = ' '.join([i for i in tweet.split() if not i.startswith('http')])
    # remove puncutation
    tweet = ''.join([i for i in tweet if i not in punctuation])
    print(tweet)
    print(tb(tweet).sentiment.p_pos)
    

# for i in tweets[:20]:
#     testimonial = TextBlob(i[0])
#     print(i[0])
#     print(testimonial.sentiment)
#     print('\n')

theuntzpodcast ethanglassmusic he got shucker punched
0.40642458100558687
Kiama’s popularity appears set to continue as a “top end” Hamptonsinspired property hits the market…
0.5445573290292737
Trust carefully
0.5805716023107329
Rivalm8 MF up 10
0.5
guys humidifiers are cool im in an office but i feel like im breathing fresh air from outside
0.5139646576639431
Camiliasilf Hope you do
0.4921981004070555
Hi hello i have broken out in a rash all over my body Everything burns
0.02988286956907639
ahhh bbb
0.2499999999999997
Telstra Hi Tim unfortunately I dont have time to deal with this right now so will have to wait until I get home…
0.07500773467107623
Can you see the little 8 legged creature we found this week Can you…
0.5327828811146877
Kidman Hugh Jackman top Time most influential 100 list
0.5873215241205519
XxKaRLyKiTTeNxX Pretty sure Id remember it Also it would be such a good comeback for Louis just saying
0.6188829811519586
SeadPetovic Saints are gonna be like James harden and be 2

In [33]:
%%time
# topic modelling----------------

import string
import numpy as np
from nltk.corpus import wordnet as wn
import nltk

punctuation = set(string.punctuation)
stop = set(stopwords.words('english'))

import gensim
from gensim import corpora

lemma = WordNetLemmatizer()

corpus = []
pattern = re.compile('(#)|(@[A-Za-z0-9]+)')

for i in tweets[:50]:
    # remove @ and # 
    tweet = re.sub(pattern,' ', i[0]).strip()
    print(i)
    # remove http...
    tweet = [i for i in tweet.lower().split() if not i.startswith('http')]
    
    # remove verb
    pos_tag = nltk.pos_tag(tweet)
    tweet = [i[0] for i in pos_tag if (not i[1].startswith('V')) and (i[1] != 'IN') ]
    
    # remove stop words
    tweet = " ".join([i for i in tweet if i not in stop])
    # remove puncutation
    tweet = ''.join([i for i in tweet if i not in punctuation])
    # normalise the word
    tweet = [lemmatize(word) for word in tweet.split()]
   
    corpus.append(tweet)
    # print(tweet)

# for i in corpus:
#     print(i)


# Creating the term dictionary of our courpus, where every unique term is assigned an index.
dictionary = corpora.Dictionary(corpus)

# filter the extreme tokens
dictionary.filter_extremes(no_below=2, no_above=0.9)

# Converting list of corpus into Document Term Matrix using dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in corpus]

# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel

# Running and Trainign LDA model on the document term matrix.
ldamodel = Lda(doc_term_matrix, num_topics=5, id2word = dictionary, passes=50, dtype=np.float64)

# get probability distribution for each tweet
# probability = ldamodel.get_document_topics(doc_term_matrix)
# print(len(probability))
# for i in probability:
#     print(i)

print('\n')
for i in ldamodel.show_topics(num_words=7):
    print(i)

['@theuntzpodcast @ethanglassmusic he got shucker punched', 'Fri Apr 20 01:50:51 +0000 2018']
['Kiama’s popularity appears set to continue, as a “top end”, Hamptons-inspired property hits the market...… https://t.co/LnbuOBxDAt', 'Fri Apr 20 01:50:52 +0000 2018']
['#Trust carefully https://t.co/Wxo8PMSJcY https://t.co/rzv8tDIxYL', 'Fri Apr 20 01:50:52 +0000 2018']
['@Rivalm8 MF up 1-0', 'Fri Apr 20 01:50:52 +0000 2018']
["guys humidifiers are cool. i'm in an office but i feel like i'm breathing fresh air from outside", 'Fri Apr 20 01:50:53 +0000 2018']
['@Camiliasilf Hope you do', 'Fri Apr 20 01:50:53 +0000 2018']
['Hi, hello, i have broken out in a rash all over my body. Everything burns https://t.co/BR3XijnuhD', 'Fri Apr 20 01:50:54 +0000 2018']
['ahhh bbb', 'Fri Apr 20 01:50:54 +0000 2018']
["@Telstra Hi Tim, unfortunately I don't have time to deal with this right now, so will have to wait until I get home… https://t.co/LmrfRnT7tC", 'Fri Apr 20 01:50:55 +0000 2018']
['Can you see the

In [23]:
# # analysis the most tweeting user
# import draft

# # get top 10 users
# users = draft.get_top_n_user(10)

In [40]:
# get user tweets
print(users)
# 133042870
user_tweets = draft.get_user_tweets(971914328982011900)

[{'key': 256478435, 'value': 2464}, {'key': 931566656115032000, 'value': 1211}, {'key': 3958911379, 'value': 1093}, {'key': 979984755125928000, 'value': 958}, {'key': 133042870, 'value': 871}, {'key': 971914328982011900, 'value': 823}, {'key': 57314631, 'value': 746}, {'key': 2427740995, 'value': 717}, {'key': 378439995, 'value': 686}, {'key': 347883924, 'value': 655}]
getting response from the server...


In [41]:
%%time
# user topic modelling----------------

import string
import numpy as np
import re
from nltk.corpus import wordnet as wn
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

punctuation = set(string.punctuation)
stop = set(stopwords.words('english'))

import gensim
from gensim import corpora

# set the lemmatizer
lemmatizer = WordNetLemmatizer()
    
def lemmatize(text):
    lemma = lemmatizer.lemmatize(text,'v')
    if lemma == text:
        lemma = lemmatizer.lemmatize(text,'n')
    return lemma

corpus = []
pattern = re.compile('(#)|(@[A-Za-z0-9]+)')
for i in user_tweets:
    # remove @ and # 
    tweet = re.sub(pattern,' ', i).strip()
    # remove http...
    tweet = [i for i in tweet.lower().split() if not i.startswith('http')]
    
    # remove verb
    pos_tag = nltk.pos_tag(tweet)
    tweet = [i[0] for i in pos_tag if (not i[1].startswith('V')) and (i[1] != 'IN') ]
    
    # remove stop words
    tweet = " ".join([i for i in tweet if i not in stop])
    # remove puncutation
    tweet = ''.join([i for i in tweet if i not in punctuation])
    # normalise the word
    tweet = [lemmatize(word) for word in tweet.split()]
    # tweet = [word for word in tweet.split()]
    corpus.append(tweet)
    # print(tweet)

# for i in corpus:
#     print(i)


# Creating the term dictionary of our courpus, where every unique term is assigned an index.
dictionary = corpora.Dictionary(corpus)

# filter the extreme tokens
dictionary.filter_extremes(no_below=2, no_above=0.9)

# Converting list of corpus into Document Term Matrix using dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in corpus]

# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel

# Running and Trainign LDA model on the document term matrix.
ldamodel = Lda(doc_term_matrix, num_topics=5, id2word = dictionary, passes=50, dtype=np.float64)

# get probability distribution for each tweet
# probability = ldamodel.get_document_topics(doc_term_matrix)
# print(len(probability))
# for i in probability:
#     print(i)

print('\n')
for i in ldamodel.show_topics(num_words=7):
    print(i)



(0, '0.032*"you" + 0.026*"right" + 0.020*"that’s" + 0.012*"also" + 0.012*"good" + 0.012*"time" + 0.011*"that"')
(1, '0.036*"russia" + 0.022*"syria" + 0.018*"u" + 0.016*"belgium" + 0.016*"russian" + 0.012*"people" + 0.010*"it’s"')
(2, '0.032*"video" + 0.029*"official" + 0.021*"love" + 0.020*"so" + 0.018*"—" + 0.015*"word" + 0.015*"michelle"')
(3, '0.021*"—" + 0.015*"this" + 0.015*"still" + 0.014*"never" + 0.012*"even" + 0.012*"way" + 0.011*"well"')
(4, '0.023*"would" + 0.019*"much" + 0.017*"year" + 0.012*"past" + 0.010*"netherlands" + 0.010*"spain" + 0.010*"opcw"')
CPU times: user 15.5 s, sys: 30.5 ms, total: 15.5 s
Wall time: 15.6 s


In [15]:
cleanedTweet=' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|(RT)", " ", text).split())

from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
s0 = 'Kiama’s popularity appears set to waaaayyyy continue, as a “top end”, #Hamptons-inspired property hits the market...… https://t.co/LnbuOBxDAt'
a = tknzr.tokenize(s0)
print(a)

['Kiama', '’', 's', 'popularity', 'appears', 'set', 'to', 'waaayyy', 'continue', ',', 'as', 'a', '“', 'top', 'end', '”', '!', '!', '!', ',', '#Hamptons-inspired', 'property', 'hits', 'the', 'market', '...', '…', 'https://t.co/LnbuOBxDAt']
