In [17]:
import sys
sys.path.append('./library') # age_gender_predictor
sys.path.append('../.env/lib/python2.7/site-packages') # make sure it can get virtualenv lib
from datetime import datetime, timedelta
import numpy as np
import re
import pandas as pd
import json
import os
from collections import defaultdict
from datetime import datetime, timedelta

In [18]:
def checkFolderFile(folder):
    """Return the names of the entries inside ``folder`` in sorted order.

    ``os.listdir`` gives no ordering guarantee, so the names are sorted to
    make every run process the files in the same order.

    Parameters
    ----------
    folder : str
        Path of the directory to list.

    Returns
    -------
    list of str
        Entry names (not full paths), sorted ascending.
    """
    return sorted(os.listdir(folder))

# ## Read the tweets of the BD (bipolar) users
def loadTweets(folder, filename, tweets_dict):
    """Read one tab-separated tweet file into ``tweets_dict``.

    Every well-formed line has exactly eight tab-separated fields:
    username, date, datetime, content, sentiment, emotion1, emotion2,
    ambiguous. Lines with a different number of fields are printed and
    skipped, so they can no longer re-append the previous line's values
    (or fail on undefined names when the first line is malformed).

    Parameters
    ----------
    folder : str
        Directory prefix; it is concatenated with ``filename`` as-is, so it
        must end with a path separator.
    filename : str
        Name of the file inside ``folder``.
    tweets_dict : mapping
        Nested mapping ``tweets_dict[username][int(date)] -> list``; missing
        keys must be created on access (e.g. nested ``defaultdict``).

    Returns
    -------
    The same ``tweets_dict`` object, updated in place.

    Raises
    ------
    ValueError
        If the date field of a well-formed line is not an integer.
    """
    with open(folder + filename, 'r') as openfile:
        for line in openfile:
            fields = line.split('\t')
            try:
                username, date, timestamp, content, sentiment, emotion1, emotion2, ambiguous = fields
            except ValueError:
                # Wrong number of columns: report the line and move on.
                print(fields)
                continue
            tweets_dict[username][int(date)].append(
                (timestamp, content, sentiment, emotion1, emotion2, ambiguous.strip()))
    return tweets_dict

In [19]:
def date_not_in_range(user, ill_time_dict, date):
    """Tell whether ``date`` falls outside the window around a user's ill time.

    The window spans from 6 weeks before ``ill_time_dict[user]`` to 3 weeks
    after it, both ends included.

    Returns ``False`` when ``date`` is inside the window, ``True`` otherwise.
    Raises ``KeyError`` when ``user`` has no entry in ``ill_time_dict``.
    """
    ill_time = ill_time_dict[user]
    inside_window = (date >= ill_time - timedelta(weeks=6)) and (date <= ill_time + timedelta(weeks=3))
    return not inside_window
    
def timeSeriesTransform(usersEmotions):
    """Turn each user's ``{timestamp: fields}`` mapping into a DataFrame.

    For every user the inner mapping becomes a DataFrame indexed by
    timestamp (missing values filled with 0), sorted chronologically, with
    an extra ``dt`` column holding the gap in minutes between a row and the
    next one. The last row of each frame keeps ``dt == 0``.

    The gap is written positionally. The previous ``.loc[:-1, 'dt']`` was a
    label-based slice, which does not mean "all rows but the last" on a
    datetime index. The explicit sort guarantees the chronological order
    that the gap computation relies on, whatever order the mapping was
    filled in.

    Parameters
    ----------
    usersEmotions : dict
        ``{userID: {timestamp: {column: value}}}``. Modified in place: each
        value is replaced by the DataFrame built from it.

    Returns
    -------
    list of pandas.DataFrame
        One frame per user, in the iteration order of ``usersEmotions``.
    """
    for userID in usersEmotions:
        frame = pd.DataFrame.from_dict(usersEmotions[userID], orient='index').fillna(0)
        frame = frame.sort_index()
        minutes_to_next = np.zeros(frame.shape[0], dtype=float)
        if frame.shape[0] > 1:
            stamps = frame.index.values
            # Whole seconds between consecutive rows, expressed in minutes.
            minutes_to_next[:-1] = (stamps[1:] - stamps[:-1]).astype('timedelta64[s]') / np.timedelta64(60, 's')
        frame['dt'] = minutes_to_next
        usersEmotions[userID] = frame
    return list(usersEmotions.values())

def getHTTPRows(timeSeries):
    """Flag the rows whose ``text`` contains an ``http://`` or ``https://`` link.

    Returns the row-wise mask as an array, one entry per row of
    ``timeSeries``.
    """
    text_column = timeSeries['text']
    link_prefixes = ['http://','https://']
    # Start from the first prefix, then OR in every remaining one.
    has_link = text_column.str.contains(link_prefixes[0])
    for prefix in link_prefixes[1:]:
        has_link = has_link | text_column.str.contains(prefix)
    return has_link.values

def userFilter(group, spam_threshold=0.5,tweets_threshold=25, time_filter = False):    #Spam and inactive user filter
    """Drop time series that look like spam or have too few tweets.

    A series is kept when the fraction of its rows flagged by
    ``getHTTPRows`` is strictly below ``spam_threshold`` and its row count
    is strictly above ``tweets_threshold``. Empty series are never kept and
    no longer raise.

    Parameters
    ----------
    group : iterable of pandas.DataFrame
        Time-indexed frames with a ``text`` column.
    spam_threshold : float
        Upper bound (exclusive) on the fraction of rows containing a link.
    tweets_threshold : int
        Lower bound (exclusive) on the number of rows.
    time_filter : bool
        When true, each series is first cut down to the 8 weeks that precede
        its last index value.

    Returns
    -------
    list of pandas.DataFrame
        The series that passed both checks, in their original order.
    """
    if time_filter:
        recent_group = []
        for timeSeries in group:
            if timeSeries.shape[0] == 0:
                # Nothing to anchor the 8-week cut on; an empty series is dropped anyway.
                continue
            eight_week_period = timeSeries.index[-1] - timedelta(weeks=8)
            recent_group.append(timeSeries[timeSeries.index > eight_week_period])
        group = recent_group

    new_group = []
    for timeSeries in group:
        tweet_count = timeSeries.shape[0]
        if tweet_count == 0:
            continue
        # float() keeps this a true division under Python 2 as well, where
        # integer / integer would truncate the ratio.
        link_ratio = np.sum(getHTTPRows(timeSeries)) / float(tweet_count)
        if (link_ratio < spam_threshold) and (tweet_count > tweets_threshold):
            new_group.append(timeSeries)
    return new_group

def TweetsFormating(tweets_dict, ill_time_dict,en_threshold=0.9):
    """Reshape raw tweets into ``{user: {timestamp: fields}}``.

    Only days for which ``date_not_in_range`` returns false are kept. Each
    tweet timestamp is parsed and moved back by 8 hours (the tweets were
    collected with Taiwan +8 timestamps, not +0).

    Parameters
    ----------
    tweets_dict : dict
        ``{user: {yyyymmdd: [(datetime_str, content, polarity, emotion1,
        emotion2, ambiguous), ...]}}``.
    ill_time_dict : dict
        Passed through to ``date_not_in_range`` together with the user.
    en_threshold : float
        Not used by this function.

    Returns
    -------
    dict
        ``{user: {datetime: {'name', 'text', 'polarity', 'emotion',
        'emotion_2', 'ambiguous'}}}``; users without any kept tweet are
        absent. Two tweets of one user with the same timestamp collapse
        into the later one.
    """
    ambiguous_flags = {'yes':True, 'no':False, 'None': True}
    formatted = {}
    for user, tweets_by_day in tweets_dict.items():
        for day, day_tweets in tweets_by_day.items():
            day_start = datetime.strptime(str(day), "%Y%m%d")
            if date_not_in_range(user, ill_time_dict, day_start):
                continue

            for posted_at, content, polarity, emotion1, emotion2, ambiguous in day_tweets:
                # Shift from Taiwan time (+8) back to +0.
                shifted = datetime.strptime(str(posted_at), "%Y-%m-%d %H:%M:%S") - timedelta(hours=8)
                formatted.setdefault(user, {})[shifted] = {
                    'name': user,
                    'text': content,
                    'polarity': int(polarity.strip()),
                    'emotion': emotion1,
                    'emotion_2': emotion2,
                    'ambiguous': ambiguous_flags[ambiguous],
                }

    return formatted
    

In [20]:
def readPatientIllTime(folder, filename):
    """Read a tab-separated file and return its rows.

    ``folder`` and ``filename`` are concatenated as-is to form the path.
    Each line is stripped of surrounding whitespace and split on tabs.

    Returns a list with one list of fields per line of the file.
    """
    rows = []
    with open(folder + filename, 'r') as openfile:
        for raw_line in openfile:
            rows.append(raw_line.strip().split('\t'))
    return rows
    
# ## Get ill time information
# Each row of the list file: column 0 is the patient, column 1 the ill time
# written as slash-separated numbers.
patient_ill_time_list = readPatientIllTime('./', 'bipolar_list')
patient_ill_time_dict = dict((row[0], row[1]) for row in patient_ill_time_list)

# dict[user] -> diagnosed time (datetime)
patient_month_time_dict = {}

for patient, ill_time in patient_ill_time_dict.items():
    date_parts = ill_time.split('/')
    if len(date_parts) < 2:
        # No month given: this patient gets no entry.
        continue
    year = int(date_parts[0])
    month = int(date_parts[1])
    # Without a day component the first day of the month is used.
    day = int(date_parts[2]) if len(date_parts) > 2 else 1
    patient_month_time_dict[patient] = datetime(year, month, day)
print( 'Patient Counts from month ill time:' + str(len(patient_month_time_dict)))

Patient Counts from month ill time:272


In [21]:
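# Sliding windows measured in weeks; each following window starts at the first tweet more than 1 week + 1 day after the previous window's first tweet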
def timeSeriesSlide(timeSeries_list, window_week_size):
    """Apply ``slideWindows`` to every series and concatenate the results.

    Returns one flat list holding the windows of all series, in input order.
    """
    all_windows = []
    for series in timeSeries_list:
        all_windows.extend(slideWindows(series, window_week_size))
    return all_windows
        
def slideWindows(timeline, window_week_size):
    """Cut a time-indexed frame into successive windows.

    Each window holds the rows strictly earlier than
    ``first index + window_week_size weeks + 1 day``. While rows remain at
    or beyond that limit, the next window is built from the rows strictly
    later than ``first index + 1 week + 1 day``.

    Implemented as a loop, so long timelines cannot hit the recursion limit.
    An empty timeline (or an empty remainder after a step) ends the slicing
    instead of raising ``IndexError``.

    Parameters
    ----------
    timeline : pandas.DataFrame or pandas.Series
        Data whose index supports adding a ``timedelta``; it is expected to
        be in ascending order, since the first and last index values are
        taken as the time bounds.
    window_week_size : int or float
        Window length in weeks.

    Returns
    -------
    list
        The windows in chronological order; ``[]`` for an empty timeline.
    """
    window_span = timedelta(weeks=window_week_size) + timedelta(days=1)
    step = timedelta(weeks=1) + timedelta(days=1)

    windows = []
    remaining = timeline
    while len(remaining.index) > 0:
        start = remaining.index[0]
        limit = start + window_span
        windows.append(remaining[remaining.index < limit])
        if remaining.index[-1] < limit:
            # Everything left fitted in this window.
            break
        remaining = remaining[remaining.index > start + step]
    return windows
    

In [27]:
# Folder holding one tweet file per user.
user_folder = 'patient emo_senti/'
# Layout: {username: {int(date): [(datetime, content, sentiment, emotion1, emotion2, ambiguous), ...]}}
bd_tweets_dict = defaultdict(lambda: defaultdict(list))
for user_file in checkFolderFile(user_folder):
    # loadTweets fills the mapping and hands it back.
    bd_tweets_dict = loadTweets(user_folder, user_file, bd_tweets_dict)

In [28]:
# Keep only the tweets on days inside each patient's window around the ill time
# and reshape them into {user: {timestamp: fields}}.
bd_tweets = TweetsFormating(bd_tweets_dict, patient_month_time_dict)

In [29]:
# One DataFrame per user; note that timeSeriesTransform also replaces the
# values of bd_tweets with these frames in place.
bd_timeSeries = timeSeriesTransform(bd_tweets)


In [30]:
# Number of per-user time series.
len(bd_timeSeries)

170

In [31]:
# Cut every user's series into windows with a window size of 8 weeks.
bd_8weeks_timeSeries = timeSeriesSlide(bd_timeSeries, 8)

In [32]:
# Number of windows over all users.
len(bd_8weeks_timeSeries)

303

In [33]:
# Keep the windows with more than 40 tweets whose link ratio is below 0.7.
bd_clean = userFilter(bd_8weeks_timeSeries, spam_threshold=0.7, tweets_threshold=40)

In [34]:
# Number of windows left after filtering.
len(bd_clean)

276

In [37]:
# Export: pickle every kept window under its position in bd_clean.
for i, clean_window in enumerate(bd_clean):
    clean_window.to_pickle('Bipolar/bd_shift2week/'+str(i)+'.p')