In [32]:
import pandas as pd
import numpy as np
import pickle
import os

# File syntax standardize

In [45]:
def convert_to_pandas_pickle(file, dir_path='./', out_dir_path = './pickle/'):
    # Read and organlize files
    with open(dir_path+file) as f:
        # Define table structure
        content_dict = {'uid':[],
                        'screen_name':[],
                        'tweetime':[],
                        'tweet':[]}
        
        for line in f.readlines():
            uid, screen_name, tweetime, tweet = line.split('\t')
            content_dict['uid'].append(uid)
            content_dict['screen_name'].append(screen_name)
            content_dict['tweetime'].append(tweetime)
            content_dict['tweet'].append(tweet.strip()) # Strip() to remove '\n'

    df = pd.DataFrame(content_dict, columns = ['uid', 'screen_name', 'tweetime', 'tweet'])
    # df['datetime_int'] = pd.to_datetime(df['tweetime']).astype(np.int64)
    # df = df.set_index('datetime_int')

    # Convert to UTC-0 timezone
    df['tweetime'] = pd.to_datetime(df['tweetime']) - pd.Timedelta('8 hours')
    df = df.set_index('tweetime')
    
    # Save file
    df.to_pickle(out_dir_path+file)
    
    return True

In [47]:
dir_path = 'datasets/depression_users_tweets(diagnosed_today)/'
out_dir_path = 'datasets/depression_raw_pickle/'
files = os.listdir(dir_path)

# Convert all files to pickle
for file in files:
    convert_to_pandas_pickle(file, dir_path, out_dir_path)


# Get timeline before diagnosis

In [48]:
import re

In [49]:
def replace_Token(tweet):
    tweet = re.sub('(https?:\/\/)|(pic)\S+','URLTOK',tweet.lower().strip()) # url
    tweet = re.sub('@(?:[a-zA-Z0-9_]+)', '<M>', tweet) # mention
#     tweet = re.sub('#(?:[a-zA-Z0-9_]+)', '<H>', tweet) # hashtag
    return tweet

In [56]:
out_dir_path = 'datasets/depression_raw_pickle/'
files = os.listdir(out_dir_path)

# Read all timelines from pickles
depression_timelines = [pd.read_pickle(out_dir_path+file) for file in files]

In [57]:
for timeline in depression_timelines:
    timeline['repl_tweet'] = timeline.tweet.map(replace_Token)

depression_timelines[0].tweet.head(5)

tweetime
2014-09-19 11:07:53         Hugging trees hbu pic.twitter.com/PIuyEYmFmM
2014-09-10 13:18:22             Pretty clouds pic.twitter.com/0qVIOT5OEJ
2014-09-07 13:08:33    " @illhueminati : share a picture you took rec...
2014-09-06 16:49:38                           pic.twitter.com/tiyAdEwG1T
2014-09-05 15:09:13    Today was one of the best days of my life. pic...
Name: tweet, dtype: object

In [53]:
depression_timelines[0].head(5)

Unnamed: 0_level_0,uid,screen_name,tweet,repl_tweet
tweetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2014-09-19 11:07:53,2759738561,Autumn_Rxin,Hugging trees hbu pic.twitter.com/PIuyEYmFmM,hugging trees hbu URLTOK
2014-09-10 13:18:22,2759738561,Autumn_Rxin,Pretty clouds pic.twitter.com/0qVIOT5OEJ,pretty clouds URLTOK
2014-09-07 13:08:33,2759738561,Autumn_Rxin,""" @illhueminati : share a picture you took rec...",""" <M> : share a URLTOK you took recently” URLTOK"
2014-09-06 16:49:38,2759738561,Autumn_Rxin,pic.twitter.com/tiyAdEwG1T,URLTOK
2014-09-05 15:09:13,2759738561,Autumn_Rxin,Today was one of the best days of my life. pic...,today was one of the best days of my life. URLTOK


## Get diagnosed day

In [58]:
diagnosed_day_dict = {}

# Go through every person to get diagnosed date
for timeline in depression_timelines:

    diagnosed_dates = timeline[timeline.tweet.str.contains('Diagnosed|diagnosed') &
                               timeline.tweet.str.contains('Depression|depression') &
                               timeline.tweet.str.contains('Today|today')].index
    if len(diagnosed_dates) == 1:
        diagnosed_day_dict[timeline.uid[0]] = diagnosed_dates[0]

    elif len(diagnosed_dates) == 0:
        print("\n{}:\t{}".format("Not Found", timeline.uid[0]))
    else:
        print("\n{}:\t{}".format("Multiple Result", timeline.uid[0]))
        print(set([each_date.date() for each_date in diagnosed_dates]))
        diagnosed_day_dict[timeline.uid[0]] = diagnosed_dates[0]


Multiple Result:	715408711
{datetime.date(2012, 7, 25), datetime.date(2012, 7, 26)}

Multiple Result:	196699727
{datetime.date(2012, 1, 13)}

Multiple Result:	1580882760
{datetime.date(2013, 7, 9)}

Multiple Result:	2245701436
{datetime.date(2014, 3, 12)}

Multiple Result:	59537566
{datetime.date(2013, 4, 12)}

Multiple Result:	3474584555
{datetime.date(2015, 12, 10)}

Multiple Result:	173935927
{datetime.date(2012, 2, 10)}


## Dump file

In [65]:
if len(diagnosed_day_dict) == len(depression_timelines):
    # open a file, where you ant to store the data
    file = open('datasets/depression_diagnosed_time', 'wb')

    # dump information to that file
    pickle.dump(diagnosed_day_dict, file)

    # close the file
    file.close()
    print('Dump Finished')

Dump Finished
