In [1]:
import os
import pandas as pd
import matplotlib
import math
from matplotlib import pyplot as plt
from matplotlib import figure
%matplotlib inline

font = {'family' : 'sans-serif',
        'weight' : 'bold',
        'size'   : 10}

matplotlib.rc('font', **font)


__location__ = os.path.realpath((os.getcwd()))

raw_data = pd.read_csv(os.path.join(__location__, 'data/cleaned_data.csv'))
raw_data['date_time']  = pd.to_datetime(raw_data['date_time'])

user_messages = raw_data[raw_data['user']!='WhatsApp']
notifications = raw_data[raw_data['user']=='WhatsApp']

print 'loaded',len(user_messages),'user messages and',len(notifications),'WhatsApp notifications from file'

user_names = set(user_messages['user'])
min_day = raw_data['date_time'].min()
max_day = raw_data['date_time'].max()

print raw_data

loaded 19974 user messages and 136 WhatsApp notifications from file
                date_time                                            message  \
0     2013-09-28 17:29:00          Shaggy changed the subject to “The hoard”   
1     2013-09-28 17:29:00                              USER_ENTERED : Shaggy   
2     2013-09-28 17:29:00                               USER_ENTERED : Marth   
3     2013-09-28 17:29:00                                USER_ENTERED : Dave   
4     2013-09-28 17:30:00                                           [MEDIA]    
5     2013-09-28 17:33:00                                USER_ENTERED : Bunn   
6     2013-09-28 17:33:00                                USER_ENTERED : Cate   
7     2013-09-28 17:33:00                                USER_ENTERED : Phil   
8     2013-09-28 17:31:00                                           [MEDIA]    
9     2013-09-28 17:33:00                   Shaggy changed this group's icon   
10    2013-09-28 17:51:00                           

split messages and write to files for ease of access.

In [2]:
import nltk 
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.corpus import stopwords
from collections import Counter
from nltk.stem.porter import *
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import os
import string
import re

stop_words = {'im','a','able','about','across','after','all','almost','also','am','among','an','and','any','are','as','at','be','because','been','but','by','can','cannot','could','dear','did','do','does','either','else','ever','every','for','from','get','got','had','has','have','he','her','hers','him','his','how','however','i','if','in','into','is','it','its','just','least','let','like','likely','may','me','might','most','must','my','neither','no','nor','not','of','off','often','on','only','or','other','our','own','rather','said','say','says','she','should','since','so','some','than','that','the','their','them','then','there','these','they','this','tis','to','too','twas','us','wants','was','we','were','what','when','where','which','while','who','whom','why','will','with','would','yet','you','your'}
exclude = set(string.punctuation)
exclude.remove('\'')

token_dict = {}
stemmer = PorterStemmer()

def replace_punctiation_char(ch):
    if ch in exclude:
        return ' '
    else :
        return ch
    
def convert_utf(in_):
    try :
        return str(in_)
    except UnicodeEncodeError:
        return str(in_.encode('ascii', 'ignore'))
  
def remove_websites(text):
    return re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)      

def remove_punctiation(s):    
    return ''.join(replace_punctiation_char(ch) for ch in s )
    
def remove_emojis(s):
    return re.sub(r'EMOJI\[[a-z\d]*\]', ' ', s)

def remove_media(s):
    return re.sub(r'MEDIA', ' ', s)

def replace_repeats(s):
    return re.sub(r'(.)\1{3,}', r'\1', s)
    
def stem_tokens(tokens, stemmer):
    stemmed = [stemmer.stem(item) for item in tokens]
    return ' '.join(stemmed)

def remove_whitespace(s):
    return ' '.join(s.split())

def remove_numbers(s):
    return re.sub("\d+", "", s)

def scrub_text(text):
    #text =  fin.read().strip() # Prints the content of the file
    just_text = convert_utf(remove_websites(remove_media(remove_emojis(text)).lower()))
    no_punctuation = remove_numbers(remove_punctiation(just_text))
    no_repeats = replace_repeats(no_punctuation)
    return remove_whitespace(no_repeats) 

def concatenate_list(input_strings):
    return reduce(lambda x,y : x + "." + y,input_strings)

user_messages['scrubbed_text'] = [scrub_text(text) for text in user_messages.message]
print user_messages

def get_messages(user_name):    
    this_user_messages = user_messages[user_messages['user']==user_name]
    return concatenate_list(this_user_messages.scrubbed_text.values)

def write_messages(messages, user_name):
    output_file = open(os.path.join(__location__, 'nlp/user_messages/' + user_name + '.txt'),'w')
    output_file.write(messages)
    output_file.close()
    
for user_name in user_names:
    messages = get_messages(user_name)
    write_messages(messages, user_name) 

                date_time                                            message  \
4     2013-09-28 17:30:00                                           [MEDIA]    
8     2013-09-28 17:31:00                                           [MEDIA]    
10    2013-09-28 17:51:00                                [MEDIA] .  [MEDIA]    
11    2013-09-28 17:53:00                                           [MEDIA]    
12    2013-09-28 17:53:00                                           [MEDIA]    
13    2013-09-28 17:55:00                                       Add vanessa!   
14    2013-09-28 18:51:00                                           [MEDIA]    
15    2013-09-28 20:13:00  With the help of Tegs, Angie and Gee xx.  [MED...   
16    2013-09-28 20:15:00                                           [MEDIA]    
17    2013-09-28 20:40:00                                OMG I'm so excited!   
18    2013-09-28 21:00:00                           Desde mexico!.  [MEDIA]    
19    2013-09-28 21:01:00               

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [3]:
corpusdir= 'nlp/user_messages/'
newcorpus = PlaintextCorpusReader(corpusdir, '.*')

def tokenize(text):
    #text =  fin.read().strip() # Prints the content of the file
    just_text = remove_websites(remove_media(remove_emojis(text)).lower())
    no_punctuation = remove_punctiation(just_text)
    no_repeats = replace_repeats(no_punctuation)
    tokens = nltk.word_tokenize(text)
    stems = stem_tokens(tokens, stemmer)
    return stems  

def remove_whitespace(s):
    return ' '.join(s.split())
    
for infile in sorted(newcorpus.fileids()):
    #print infile # The fileids of each file.
    fin =  newcorpus.open(infile)  # Opens the file.
    text =  fin.read().strip() # Prints the content of the file
    token_dict[infile] = convert_utf(text)


In [4]:
import numpy as np
    
tvf = TfidfVectorizer(min_df=3,  max_features=None, strip_accents='unicode',  
        analyzer='word',token_pattern=r'\w{1,}',ngram_range=(0,1), use_idf=1,smooth_idf=1,
        sublinear_tf=1, stop_words = 'english')

for key in token_dict.keys():
    tvf.fit(get_other_docs(key))
    feature_names = tvf.get_feature_names()
    tfidf_array = tvf.transform([token_dict[key]])
    scores = np.ravel(tfidf_array.todense())
    with_index = zip(feature_names, scores)

    

NameError: name 'get_other_docs' is not defined

In [5]:
user_messages['month_name'] = [date.strftime('%b-%Y') for date in user_messages.date_time]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [6]:
def concatenate_documents(group):
    return '.'.join(group.scrubbed_text)

month_docs = user_messages.groupby('month_name').apply(concatenate_documents)

In [7]:
from itertools import groupby

def find_subset_words(words):
    single_words = set()
    double_words = set()
    return_words = []
    for word in words:
        word_split = word.split()
        if(len(word_split) == 1):
            single_words.update(word_split)
        else:
            double_words.update(word_split)
            return_words.append(word)
    def is_subset_word(single):
        for double in double_words:
            return single in double
                
    return [single for single in single_words if not is_subset_word(single)] + return_words

print find_subset_words(['a','c', 'a b','a d'])

['c', 'a b', 'a d']


In [8]:
import pyximport
pyximport.install()
from wordcloud.wordcloud import WordCloud
    
tvf = TfidfVectorizer(min_df=2,  max_features=None, strip_accents='unicode',  
    analyzer='word', #token_pattern=r'\w{1,}',
    ngram_range=(0,2), use_idf=1,smooth_idf=1,
    sublinear_tf=1, stop_words = 'english')

#tvf = CountVectorizer(min_df=2,  max_features=None, strip_accents='unicode',  
#    analyzer='word',token_pattern=r'\w{1,}',ngram_range=(0,2), stop_words = 'english')

WIDTH = 800
HEIGHT = 400

wc = WordCloud(background_color="white", max_words=2000, width= WIDTH, height = HEIGHT)

tvf.fit(month_docs)

sorted_dates = sorted(month_docs.index.values)

In [21]:
for month in sorted_dates:
    print month
    feature_names = tvf.get_feature_names()
    tfidf_array = tvf.transform([month_docs[month]])
    scores = np.ravel(tfidf_array.todense())
    with_index = zip(feature_names, scores)
    words = sorted(with_index , key = lambda pair : - pair[1])
    print words
    wc.generate_from_word_sizes(words)
    wc.to_file('wordcloud/months/' + month + '.png')
    plt.axis("off")

Apr-2014


ImportError: The _imagingft C module is not installed

In [9]:
def image_from_text(text):
    return

import PIL
from PIL import ImageFont
from PIL import Image
from PIL import ImageDraw

def make_text_image(text):
    font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 40)
    img=Image.new("RGBA", (HEIGHT,50),(255,255,255))
    draw = ImageDraw.Draw(img)
    w, h = font.getsize(text)
    draw.text(((HEIGHT-w)/2,0), text, fill="black", font = font)
    out = img.rotate(270)
    out.save('wordcloud/months/text_image_' + text + '.png')
    return out
from IPython.display import Image as ipImage
import Image
from matplotlib.pyplot import imshow

concat_im = Image.new('RGB', (WIDTH + 50, HEIGHT * len(sorted_dates)))

for index, month in enumerate(sorted_dates):
    im = Image.open("wordcloud/months/" + month + '.png')
    text_im = make_text_image(month)
    concat_im.paste(im, (50, HEIGHT * index))
    concat_im.paste(text_im,(0, HEIGHT * index))
big_file_name = 'wordcloud/months/big_file.png'
concat_im.save(big_file_name)

ipImage(big_file_name)

IOError: cannot identify image file 'wordcloud/months/Apr-2014.png'