In [1]:
import pandas as pd
import numpy as np
import sklearn.metrics
import pytz
import os
from sklearn import preprocessing
from sklearn.externals import joblib
import multiprocessing as mp
from datetime import datetime, timedelta
import pickle

#import PatternVectorizer and SimpleClassifier
from pattern_classifier import  SimpleClassifier, PatternVectorizer

In [2]:
# Locations of the two persisted model artifacts (paths are relative to the
# notebook's working directory).
cls_persistence = 'data/simple_classifier_model.pkl.compressed'
pv_persistence = 'data/pattern_vectorizer.pkl.compressed'

# NOTE(review): joblib.load unpickles the files, which can execute arbitrary
# code -- only load artifacts that come from a trusted source.
# `cls` and `pv` are module-level objects used later by emotion_query();
# presumably a SimpleClassifier and a PatternVectorizer -- confirm.
cls = joblib.load(cls_persistence)
pv = joblib.load(pv_persistence)

## Load Tweets

In [3]:
# Maps the time-zone label found in the patient timezone file to a tz-database
# name that pytz.timezone() accepts.
#
# Corrections relative to the earlier version of this table:
#   * 'Mountain Time (US & Canada)' was listed twice ('Mountain/US', which is
#     not a tz-database name, and then 'US/Central', which silently won).
#   * Several labels pointed at zones with a different UTC offset or DST rule
#     (Atlantic Time (Canada), Midway Island, Quito, Wellington, Santiago,
#     Edinburgh, Kyiv); they now point at the zone of the named place.
timezone_location_dict = {
    'Pacific Time (US & Canada)':'US/Pacific',
    'Central Time (US & Canada)':'US/Central',
    'Eastern Time (US & Canada)':'US/Eastern',
    'London':'Europe/London',
    'Sydney':'Australia/Sydney',
    'Tokyo':'Asia/Tokyo',
    'Africa/Nairobi':'Africa/Nairobi',
    'Arizona':'US/Arizona',
    'Kyiv':'Europe/Kiev',
    'Europe/London':'Europe/London',
    'Atlantic Time (Canada)':'America/Halifax',
    'Midway Island':'Pacific/Midway',
    'Auckland':'Pacific/Auckland',
    'Amsterdam':'Europe/Amsterdam',
    'Baghdad':'Asia/Riyadh',
    'Riyadh':'Asia/Riyadh',
    'Belgrade':'Europe/Belgrade',
    'Quito':'America/Guayaquil',
    'Pretoria':'Africa/Johannesburg',
    'Beijing':'Asia/Shanghai',
    'Hong Kong':'Asia/Shanghai',
    'Dublin':'Europe/Dublin',
    'Mountain Time (US & Canada)':'US/Mountain',
    'Hawaii':'Pacific/Honolulu',
    'Brisbane':'Australia/Brisbane',
    'Vienna':'Europe/Vienna',
    'Islamabad':'Asia/Karachi',
    'Casablanca':'Africa/Casablanca',
    'Yakutsk':'Asia/Yakutsk',
    'Tijuana':'America/Tijuana',
    'Johannesburg':'Africa/Johannesburg',
    'Bangkok': 'Asia/Bangkok',
    'Harare': 'Africa/Harare',
    'Chennai': 'Asia/Kolkata',
    'Kolkata': 'Asia/Kolkata',
    'Brussels': 'Europe/Brussels',
    'Melbourne': 'Australia/Melbourne',
    'Alaska': 'US/Alaska',
    'Perth': 'Australia/Perth',
    'Denver':'America/Denver',
    'Los Angeles': 'America/Los_Angeles',
    'indiana': 'US/East-Indiana',
    'Vancouver': 'America/Vancouver',
    'Berlin': 'Europe/Berlin',
    'Kentucky': 'America/Kentucky/Louisville',
    'Copenhagen':'Europe/Copenhagen',
    'Athens': 'Europe/Athens',
    'Edinburgh': 'Europe/London',
    'Wellington': 'Pacific/Auckland',
    'Santiago': 'America/Santiago',
    'America/New_York': 'US/Eastern',
    'Paris': 'Europe/Paris',
    'Jakarta': 'Asia/Jakarta',
    'New Delhi': 'Asia/Calcutta',
    'Chicago': 'US/Central',
    'America/Chicago': 'US/Central',
    'Fiji': 'Pacific/Fiji',
    'US/Mountain': 'US/Mountain'
}


### Load Patient TimeZone

In [4]:
def readPatientTimezone(folder, filename, start_user='bernievassallo'):
    """Read a tab-separated user/timezone file into a ``{screen_name: timezone}`` dict.

    Every line is split on tabs; field 1 is taken as the screen name and
    field 2 as the timezone label. Lines that appear before the first line
    whose screen name equals ``start_user`` are ignored, as are lines whose
    timezone is the literal string ``"None"``. Blank lines and lines with
    fewer than three fields are skipped instead of raising ``IndexError``.

    Parameters
    ----------
    folder : str
        Prefix concatenated directly in front of ``filename`` (include a
        trailing path separator when it is a directory).
    filename : str
        Name of the file to read.
    start_user : str or None, optional
        Screen name at which collection starts. Pass ``None`` to collect from
        the first line of the file.

    Returns
    -------
    dict
        Screen name -> timezone label. If a screen name occurs more than once,
        the last occurrence wins.
    """
    time_dict = {}
    started = start_user is None
    with open(folder + filename, 'r') as openfile:
        # Iterate lazily instead of materialising the whole file with readlines().
        for line in openfile:
            fields = line.strip().split('\t')
            if len(fields) < 3:
                # Blank or truncated line: nothing usable to record.
                continue
            if fields[1] == start_user:
                started = True
            if started and fields[2] != "None":
                time_dict[fields[1]] = fields[2]
    return time_dict

# Build the screen-name -> timezone-label lookup from the file in the current
# working directory (empty folder prefix), then report how many users have a
# usable timezone.
patient_timezone_dict = readPatientTimezone('','bipolar_user_timezone')
print(len(patient_timezone_dict))

27


### Load Patient Ill Time

In [5]:
# Diagnosis date per patient, restricted to patients with a known timezone.
bipolar_diagnosed_dict = {}

with open('bipolar_list') as f:
    for line in f:
        name, date = line.strip().split('\t')
        if name not in patient_timezone_dict:
            continue
        # Dates are written either as "YYYY/MM/DD" or as "YYYY/MM";
        # any other shape is skipped.
        n_parts = len(date.split('/'))
        if n_parts == 3:
            date_format = "%Y/%m/%d"
        elif n_parts == 2:
            date_format = "%Y/%m"
        else:
            continue
        bipolar_diagnosed_dict[name] = datetime.strptime(date, date_format)

print(len(bipolar_diagnosed_dict))

26


### Read Patient Tweets

In [6]:
# Start from : bernievassallo
# One tab-separated, header-less file is read per diagnosed patient; the file
# name is the dict key (the patient's screen name) and `col` supplies the
# column names.
file_dir = 'patient_tweets/'
col = ['screen', 'id', 'datetime', 'text']
user_tweet_list = [pd.read_csv(file_dir+file, delimiter = '\t', names = col) for file in bipolar_diagnosed_dict]
print(len(user_tweet_list))

26


### Convert datetime_str to datetime and Adjust Datetime by Timezone

In [7]:
def to_local_timezone(local, orig_time, source_utc_offset_hours=8):
    """Convert a naive timestamp recorded on the collecting machine to ``local`` time.

    Parameters
    ----------
    local : str
        tz-database name understood by ``pytz.timezone`` (e.g. ``'US/Pacific'``).
    orig_time : datetime-like
        Naive timestamp expressed in the collecting machine's wall-clock time.
    source_utc_offset_hours : int or float, optional
        UTC offset of the collecting machine. Defaults to 8 because the data
        was collected in Taiwan (UTC+8).

    Returns
    -------
    datetime-like
        The same instant as a timezone-aware value in zone ``local``.
    """
    # Undo the collector's offset to get UTC wall-clock time (UTC+8 -> subtract 8h).
    utc_time = orig_time - timedelta(hours=source_utc_offset_hours)
    tz = pytz.timezone(local)
    return pytz.utc.localize(utc_time, is_dst=None).astimezone(tz)

In [8]:
# Before Adjust: the first tweet's timestamp, still the raw string read from the file
user_tweet_list[0].datetime[0]

'2017-04-27 11:57:06'

In [9]:
# Replace every user's timestamp strings with timezone-aware local times.
for i, user_tweets in enumerate(user_tweet_list):
    # The first row's screen name identifies the owner of the whole frame.
    profile_timezone = patient_timezone_dict.get(user_tweets.screen[0])
    parsed_times = user_tweets.datetime.astype('datetime64[ns]')
    # Resolve the tz-database name once per user rather than once per tweet.
    tz_name = timezone_location_dict[profile_timezone]
    user_tweet_list[i].datetime = parsed_times.map(lambda stamp: to_local_timezone(tz_name, stamp))

In [10]:
# After Adjust: the same tweet, now a timezone-aware Timestamp in the user's local zone
user_tweet_list[0].datetime[0]

Timestamp('2017-04-26 20:57:06-0700', tz='US/Pacific')

### Filter Time

In [11]:
# Keep only the tweets from about one year before to about three months after
# the diagnosis (roughly a 15-month window).
def filterDiagnosedPeriod(user_tweets, diagnosed_time, weeks_before=54, weeks_after=14):
    """Return the tweets that fall inside the window around ``diagnosed_time``.

    Parameters
    ----------
    user_tweets : pandas.DataFrame
        Must contain a ``datetime`` column of datetime values.
    diagnosed_time : datetime-like
        Diagnosis date. If it is naive and the ``datetime`` column is
        timezone-aware, it is interpreted in the column's timezone so the two
        can be compared (pandas refuses to compare naive with aware values).
    weeks_before : int, optional
        Weeks kept before the diagnosis (default 54, about one year).
    weeks_after : int, optional
        Weeks kept after the diagnosis (default 14, about three months).

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) holding the rows inside the
        window, with ``datetime`` formatted as ``"%Y-%m-%d %H:%M:%S"`` strings.

    Raises
    ------
    ValueError
        If ``diagnosed_time`` is ``None``.
    """
    if diagnosed_time is None:
        raise ValueError("diagnosed_time is required to filter tweets")

    times = user_tweets['datetime']
    diagnosed = pd.Timestamp(diagnosed_time)
    # Only timezone-aware datetime dtypes expose `.tz`.
    column_tz = getattr(times.dtype, 'tz', None)
    if column_tz is not None and diagnosed.tzinfo is None:
        diagnosed = diagnosed.tz_localize(column_tz)

    window_start = diagnosed - timedelta(weeks=weeks_before)
    window_end = diagnosed + timedelta(weeks=weeks_after)
    in_window = (times >= window_start) & (times <= window_end)

    # Copy so that the assignment below never writes into a view of the input.
    filtered = user_tweets[in_window].copy()
    filtered['datetime'] = filtered['datetime'].map(lambda x: x.strftime("%Y-%m-%d %H:%M:%S"))
    return filtered

In [12]:
# Apply the diagnosis window to every user; users left with no tweets are dropped.
time_filter_tweet_list = []
for user_tweets in user_tweet_list:
    # The first row's screen name identifies the owner of the frame.
    screen_name = user_tweets.screen[0]
    tweets_in_window = filterDiagnosedPeriod(user_tweets, bipolar_diagnosed_dict.get(screen_name))
    if len(tweets_in_window) > 0:
        time_filter_tweet_list.append(tweets_in_window)

In [13]:
# Release the unfiltered frames; the bare expression displays how many users
# still have tweets after the time filter.
del user_tweet_list
len(time_filter_tweet_list)

24

## Query Emotion

In [14]:
def emotion_query(user_tweets):
    """Attach the classifier's emotion guesses to a frame of tweets.

    The ``text`` column is vectorised with the module-level ``pv`` and passed
    to the module-level ``cls``. Columns 0, 1 and 2 of the classifier's result
    are written to the new columns ``emotion1``, ``emotion2`` and
    ``ambiguous``. The frame is modified in place and also returned.
    """
    pattern_vectors = pv.transform(user_tweets.text)

    # Ask for two emotions per tweet, ranked in ascending order.
    guesses = np.array(cls.get_top_emotion_ambiguous(pattern_vectors, ascending=True, n=2))

    for position, column in enumerate(('emotion1', 'emotion2', 'ambiguous')):
        user_tweets[column] = guesses[:, position]

    return user_tweets

### Multi-Processing

In [15]:
# Classify each user's tweets in a separate worker process.
# The context manager shuts the pool down once every result has been
# collected, so worker processes are not left running after the cell finishes.
with mp.Pool(processes=mp.cpu_count()) as pool:
    multi_res = [pool.apply_async(emotion_query, (user_tweets,)) for user_tweets in time_filter_tweet_list]
    # .get() blocks until the task is done and re-raises any exception it hit.
    emo_user_tweet_list = [res.get() for res in multi_res]

In [16]:
# Release the pre-classification frames and report how many users were classified.
del time_filter_tweet_list
print(len(emo_user_tweet_list))

24


In [17]:
# Preview the first user's frame, now carrying the emotion1 / emotion2 / ambiguous columns.
emo_user_tweet_list[0]

Unnamed: 0,screen,id,datetime,text,emotion1,emotion2,ambiguous
0,sobertony,sobertony,2017-04-26 20:57:06,Interesting... I had no clue looked like some ...,fear,surprise,false
1,sobertony,sobertony,2017-04-26 20:21:13,You're crushing it Rev,disgust,anticipation,true
2,sobertony,sobertony,2017-04-26 20:15:44,Thanks man. I felt really good after 5th step ...,surprise,joy,false
3,sobertony,sobertony,2017-04-26 19:26:01,I love the #sobercount because it's one way to...,trust,anger,false
4,sobertony,sobertony,2017-04-26 19:13:38,Very isolated kind of work. Especially when hi...,fear,anger,false
5,sobertony,sobertony,2017-04-26 19:12:58,It's gets worse when your theology changes and...,disgust,anger,true
6,sobertony,sobertony,2017-04-26 18:49:40,When I worked as a pastor (before things blew ...,sadness,surprise,false
7,sobertony,sobertony,2017-04-26 18:46:44,My psychologist was very glad that I'd found a...,surprise,fear,false
8,sobertony,sobertony,2017-04-26 18:44:21,@boner_sober bro. What's happening?,anticipation,surprise,false
9,sobertony,sobertony,2017-04-26 18:42:35,I would think burnout is a hugh problem. Espec...,anticipation,anger,false


### Normal Way (single process, without multiprocessing)

In [None]:
# Sequential fallback: run this instead of (not after) the multiprocessing cell.
# emo_user_tweet_list = []
# for i, user_tweets in enumerate(time_filter_tweet_list):
#     try:
#         emo_user_tweet_list.append(emotion_query(user_tweets))
#     except:
#         print(i)


## Query Sentiment 140

In [18]:
import re
import urllib
import json

In [19]:
def del_url(line):
    """Strip URLs from ``line``.

    Removes everything from the first ``http://`` / ``https://`` link, or from
    the first whitespace-delimited token containing ``.com``, up to the end of
    that line of text.
    """
    url_pattern = re.compile(r'(\S*(\.com).*)|(https?:\/\/.*)')
    return url_pattern.sub("", line)
def checktag(line):
    """Remove every ``@`` and ``#`` character, keeping the mention / hashtag text."""
    tag_marker = re.compile(r'\@|\#')
    return tag_marker.sub("", line)
def checkSpecial(line):
    """Normalise special characters: drop quotes and bullets, spell out the rest in ASCII."""
    # (character to find, replacement), applied one after another in this order.
    replacements = (
        ('♡', 'love '),
        ('\"', ''),
        ('“', ''),
        ('”', ''),
        ('…', '...'),
        ('’', '\''),
        ('•', ''),
        ('–', '-'),
    )
    for old, new in replacements:
        line = line.replace(old, new)
    return line

def sendto140(line):
    """Classify one tweet with the Sentiment140 bulk API and return its mapped polarity.

    The text is cleaned with ``del_url``, ``checktag`` and ``checkSpecial``,
    sent as a single-item JSON request, and the returned ``polarity`` code is
    translated through the module-level ``sentiment_dict``.

    Parameters
    ----------
    line : object
        The tweet text; it is converted with ``str()`` and stripped.

    Returns
    -------
    int
        ``sentiment_dict[polarity]`` on success, or ``sentiment_dict[2]`` if
        the request or the response parsing fails (the failure is printed).
    """
    # `import urllib` alone does not guarantee the `urllib.request` submodule is loaded.
    import urllib.request

    cleaned_text = checkSpecial(checktag(del_url(str(line).strip())))
    # json.dumps escapes backslashes, control characters and non-ASCII
    # characters, which hand-built string concatenation could not, so the
    # request body is always valid JSON.
    query_string = json.dumps({"data": [{"text": cleaned_text}]})
    try:
        data = query_string.encode('utf-8')
        # The context manager closes the HTTP response even if reading fails.
        with urllib.request.urlopen('http://www.sentiment140.com/api/bulkClassifyJson', data) as response:
            page = response.read()
        query_result = json.loads(page.decode('utf-8'))  # the result is in JSON format
        return sentiment_dict[int(query_result["data"][0]["polarity"])]
    except Exception:
        # Best effort: report the failing payload and fall back to code 2.
        # KeyboardInterrupt / SystemExit are deliberately not caught.
        print('\n{} - Fail : {}'.format(datetime.now().strftime("%Y/%m/%d %H:%M:%S"), query_string))
        return sentiment_dict[2]



# Translates the API's polarity code (0, 2 or 4) into a signed score (-1, 0 or 1).
sentiment_dict = {0: -1, 2: 0, 4: 1}

In [20]:
# Add a `sentiment` column to every user's frame, one API request per tweet.
for i, emo_user_tweets in enumerate(emo_user_tweet_list):
    try:
        print('{}. {} : {}'.format(i, emo_user_tweets['screen'][0], len(emo_user_tweets)))
        emo_user_tweet_list[i]['sentiment'] = emo_user_tweets.text.apply(sendto140)
    except Exception as error:
        # Keep going with the next user, but say what went wrong instead of
        # printing only the index. Interrupting the kernel still stops the loop.
        print('{} - failed: {!r}'.format(i, error))

0. sobertony : 1038

2017/05/08 22:47:25 - Fail : {"data": [{"text": "That's what make it awesome. I'm so new to this but I'm​amazed that the hard days come and go. The good ones are worth it."}]}

2017/05/08 22:47:26 - Fail : {"data": [{"text": "Thank you for sharing. That's awesome​."}]}

2017/05/08 22:47:29 - Fail : {"data": [{"text": "That's a huge​ number. You're getting it done."}]}

2017/05/08 22:47:37 - Fail : {"data": [{"text": "Thank you. I'm​ glad for Twitter friends like you."}]}

2017/05/08 22:47:47 - Fail : {"data": [{"text": "Start with love. Then use I statements not accusations. I am worrying​ about you."}]}

2017/05/08 22:47:51 - Fail : {"data": [{"text": "I'm​ glad for that because I'm not as rich as I used to imagine when I was drinking"}]}

2017/05/08 22:48:25 - Fail : {"data": [{"text": "Clever advice doesn't help much, but I'll welcome any friend who's​ been through it too. "}]}

2017/05/08 22:48:32 - Fail : {"data": [{"text": "A friend said she's​ my new addicti

## Output

In [109]:
# Reload the per-user result files from disk.
# NOTE(review): the step that wrote the files in 'new_bipolar/' is not shown in
# this notebook, and the column order here ('id' first) differs from the order
# used when the raw tweets were read -- confirm against the writer.
file_dir = 'new_bipolar/'
col = ['id', 'screen', 'datetime', 'text','emotion1','emotion2','ambiguous','sentiment']
# os.listdir returns entries in arbitrary order; sorting makes the list order
# (and therefore the numbering of the exported pickles) reproducible.
emo_user_tweet_list = [pd.read_csv(file_dir+file, delimiter = '\t', names = col) for file in sorted(os.listdir(file_dir))]

In [111]:
# Index every frame by tweet time, drop the columns that are no longer needed
# and switch to the column names used in the exported files.
column_renames = {'screen':'name','emotion1':'emotion','emotion2':'emotion_2','sentiment':'polarity'}
for position, frame in enumerate(emo_user_tweet_list):
    frame.index = pd.to_datetime(frame.datetime)
    for unwanted in ('id', 'datetime'):
        del frame[unwanted]
    emo_user_tweet_list[position] = frame.rename(columns=column_renames)

In [124]:
# For every user: sort chronologically, fill missing values with 0 and add
# `dt`, the number of minutes between a tweet and that user's next tweet
# (0 for the last tweet).
for i in range(len(emo_user_tweet_list)):
    sorted_tweets = emo_user_tweet_list[i].sort_index().fillna(0)
    minutes_to_next = np.zeros(len(sorted_tweets), dtype=float)
    if len(sorted_tweets) > 1:
        # The gaps must come from the *sorted* index; taking them from the
        # frame as it was before sorting pairs each row with the wrong gap.
        gaps = np.diff(sorted_tweets.index.values).astype('timedelta64[s]')
        minutes_to_next[:-1] = gaps / np.timedelta64(60, 's')
    # Assign the whole column positionally instead of label-slicing a
    # DatetimeIndex with an integer (`.loc[:-1]`).
    sorted_tweets['dt'] = minutes_to_next
    emo_user_tweet_list[i] = sorted_tweets

In [128]:
# export: one pickle per user, numbered consecutively starting at 400
for file_number, frame in enumerate(emo_user_tweet_list, start=400):
    frame.to_pickle('new_bipolar_pickle/bipolar_clean'+str(file_number)+'.p')